Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 16
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java | 9
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java | 9
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java | 20
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 44
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java | 17
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java | 14
8 files changed, 111 insertions, 22 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index f9ff2f08375..766bc688c62 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -7,6 +7,7 @@ import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
+import com.yahoo.slime.SlimeUtils;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
@@ -109,7 +110,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
for (Node node : activeNodes) {
Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
- if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
+ if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !undergoingCmr(node)) {
// Allow a grace period after node re-activation
if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
@@ -157,6 +158,19 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
}
+ private boolean undergoingCmr(Node node) {
+ return node.reports().getReport("vcmr")
+ .map(report ->
+ SlimeUtils.entriesStream(report.getInspector().field("upcoming"))
+ .anyMatch(cmr -> {
+ var startTime = cmr.field("plannedStartTime").asLong();
+ var endTime = cmr.field("plannedEndTime").asLong();
+ var now = clock().instant().getEpochSecond();
+ return now > startTime && now < endTime;
+ })
+ ).orElse(false);
+ }
+
/** Is the node and all active children suspended? */
private boolean allSuspended(Node node, NodeList activeNodes) {
if (!nodeRepository().nodes().suspended(node)) return false;
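
Hedged aside (not part of the patch): the window check added in undergoingCmr() can be exercised on its own against a hand-built "vcmr" payload shaped like the one used in NodeFailerTest further down. The class and method names below are illustrative only; the Slime calls are the ones already used in this diff.

import com.yahoo.slime.Inspector;
import com.yahoo.slime.SlimeUtils;

import java.time.Instant;

class VcmrWindowSketch {

    // True if 'now' falls strictly inside any upcoming CMR window in the report payload.
    static boolean undergoingCmr(Inspector vcmrReport, Instant now) {
        return SlimeUtils.entriesStream(vcmrReport.field("upcoming"))
                         .anyMatch(cmr -> now.getEpochSecond() > cmr.field("plannedStartTime").asLong()
                                       && now.getEpochSecond() < cmr.field("plannedEndTime").asLong());
    }

    public static void main(String[] args) {
        Instant now = Instant.now();
        String json = String.format("""
                {"upcoming": [{"id": "id-42", "plannedStartTime": %d, "plannedEndTime": %d}]}
                """, now.minusSeconds(60).getEpochSecond(), now.plusSeconds(3600).getEpochSecond());
        System.out.println(undergoingCmr(SlimeUtils.jsonToSlime(json).get(), now)); // prints true
    }
}
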
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index eff39dddf42..06c1916dd4f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -21,7 +21,6 @@ import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;
-import java.util.stream.Collectors;
/**
* Performs preparation of node activation changes for a single host group in an application.
@@ -124,6 +123,14 @@ public class GroupPreparer {
hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant()));
throw e;
}
+ } else if (allocation.hostDeficit().isPresent() && requestedNodes.canFail() &&
+ allocation.hasRetiredJustNow() && requestedNodes instanceof NodeSpec.CountNodeSpec cns) {
+ // Non-dynamically provisioned zone with a deficit because we just now retired some nodes.
+ // Try again, but without retiring
+ indices.resetProbe();
+ List<Node> accepted = prepareWithLocks(application, cluster, cns.withoutRetiring(), surplusActiveNodes, indices, wantedGroups);
+ log.warning("Prepared " + application + " " + cluster.id() + " without retirement due to lack of capacity");
+ return accepted;
}
if (! allocation.fulfilled() && requestedNodes.canFail())
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
index 890d190c24e..b3198a72d1b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
@@ -193,8 +193,8 @@ public class LoadBalancerProvisioner {
Optional<LoadBalancer> loadBalancer = db.readLoadBalancer(id);
LoadBalancer newLoadBalancer;
LoadBalancer.State fromState = loadBalancer.map(LoadBalancer::state).orElse(null);
- boolean recreateLoadBalancer = loadBalancer.isPresent() && (!inAccount(cloudAccount, loadBalancer.get())
- || !hasCorrectVisibility(loadBalancer.get(), zoneEndpoint));
+ boolean recreateLoadBalancer = loadBalancer.isPresent() && ( ! inAccount(cloudAccount, loadBalancer.get())
+ || ! hasCorrectVisibility(loadBalancer.get(), zoneEndpoint));
if (recreateLoadBalancer) {
// We have a load balancer, but with the wrong account or visibility.
// Load balancer must be removed before we can provision a new one with the wanted visibility
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
index c6971f0fe02..3af63125474 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
@@ -307,10 +307,15 @@ class NodeAllocation {
}
/** Returns true if this allocation was already fulfilled and resulted in no new changes */
- public boolean fulfilledAndNoChanges() {
+ boolean fulfilledAndNoChanges() {
return fulfilled() && reservableNodes().isEmpty() && newNodes().isEmpty();
}
+ /** Returns true if this allocation retired nodes just now */
+ boolean hasRetiredJustNow() {
+ return wasRetiredJustNow > 0;
+ }
+
/**
* Returns {@link HostDeficit} describing the host deficit for the given {@link NodeSpec}.
*
@@ -451,7 +456,7 @@ class NodeAllocation {
.toList();
}
- public String allocationFailureDetails() {
+ String allocationFailureDetails() {
List<String> reasons = new ArrayList<>();
if (rejectedDueToExclusivity > 0)
reasons.add("host exclusivity constraints");
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java
index 9edfa221abf..28d1e7c1c68 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java
@@ -3,8 +3,6 @@ package com.yahoo.vespa.hosted.provision.provisioning;
import com.yahoo.config.provision.CloudAccount;
import com.yahoo.config.provision.ClusterSpec;
-import com.yahoo.config.provision.Flavor;
-import com.yahoo.config.provision.NodeFlavors;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.Node;
@@ -79,7 +77,7 @@ public interface NodeSpec {
}
static NodeSpec from(int nodeCount, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) {
- return new CountNodeSpec(nodeCount, resources, exclusive, canFail, cloudAccount);
+ return new CountNodeSpec(nodeCount, resources, exclusive, canFail, canFail, cloudAccount);
}
static NodeSpec from(NodeType type, CloudAccount cloudAccount) {
@@ -93,14 +91,19 @@ public interface NodeSpec {
private final NodeResources requestedNodeResources;
private final boolean exclusive;
private final boolean canFail;
+ private final boolean considerRetiring;
private final CloudAccount cloudAccount;
- private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) {
+ private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, boolean considerRetiring, CloudAccount cloudAccount) {
this.count = count;
this.requestedNodeResources = Objects.requireNonNull(resources, "Resources must be specified");
this.exclusive = exclusive;
this.canFail = canFail;
+ this.considerRetiring = considerRetiring;
this.cloudAccount = Objects.requireNonNull(cloudAccount);
+
+ if (!canFail && considerRetiring)
+ throw new IllegalArgumentException("Cannot consider retiring nodes if we cannot fail");
}
@Override
@@ -127,8 +130,7 @@ public interface NodeSpec {
@Override
public boolean considerRetiring() {
- // If we cannot fail we cannot retire as we may end up without sufficient replacement capacity
- return canFail();
+ return considerRetiring;
}
@Override
@@ -143,7 +145,11 @@ public interface NodeSpec {
@Override
public NodeSpec fraction(int divisor) {
- return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, cloudAccount);
+ return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, considerRetiring, cloudAccount);
+ }
+
+ public NodeSpec withoutRetiring() {
+ return new CountNodeSpec(count, requestedNodeResources, exclusive, canFail, false, cloudAccount);
}
@Override
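
Hedged aside (not part of the patch): the new considerRetiring flag is only meaningful when the allocation may fail, which the constructor now enforces, and withoutRetiring() is what GroupPreparer uses for its retry when retirement just created a host deficit. The simplified stand-in below (illustrative names, not the real CountNodeSpec) shows that invariant and the retry-friendly copy.

class NodeSpecRetirementSketch {

    record CountSpecSketch(int count, boolean canFail, boolean considerRetiring) {
        CountSpecSketch {
            // Mirrors the new constructor check: retiring is only considered when allocation is allowed to fail.
            if (!canFail && considerRetiring)
                throw new IllegalArgumentException("Cannot consider retiring nodes if we cannot fail");
        }

        // Same spec with retirement disabled, as used for the GroupPreparer retry.
        CountSpecSketch withoutRetiring() {
            return new CountSpecSketch(count, canFail, false);
        }
    }

    public static void main(String[] args) {
        var spec = new CountSpecSketch(3, true, true);
        System.out.println(spec.withoutRetiring()); // CountSpecSketch[count=3, canFail=true, considerRetiring=false]
        new CountSpecSketch(3, false, true);        // throws IllegalArgumentException
    }
}
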
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 491485b78fc..c63be6d5dc5 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.slime.SlimeUtils;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
@@ -627,6 +628,49 @@ public class NodeFailerTest {
assertFalse(badNode(1, 3, 1, 2));
}
+ @Test
+ public void nodes_undergoing_cmr_are_not_failed() {
+ var tester = NodeFailTester.withTwoApplications(6);
+ var clock = tester.clock;
+ var slime = SlimeUtils.jsonToSlime(
+ String.format("""
+ {
+ "upcoming":[{
+ "id": "id-42",
+ "status": "some-status",
+ "plannedStartTime": %d,
+ "plannedEndTime": %d
+ }]
+ }
+ """, clock.instant().getEpochSecond(), clock.instant().plus(Duration.ofMinutes(90)).getEpochSecond())
+ );
+ var cmrReport = Report.fromSlime("vcmr", slime.get());
+ var downHost = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname();
+
+ var node = tester.nodeRepository.nodes().node(downHost).get();
+ tester.nodeRepository.nodes().write(node.with(node.reports().withReport(cmrReport)), () -> {});
+
+ tester.serviceMonitor.setHostDown(downHost);
+ tester.runMaintainers();
+ node = tester.nodeRepository.nodes().node(downHost).get();
+ assertTrue(node.isDown());
+ assertEquals(Node.State.active, node.state());
+
+ // CMR still ongoing, don't fail yet
+ clock.advance(Duration.ofHours(1));
+ tester.runMaintainers();
+ node = tester.nodeRepository.nodes().node(downHost).get();
+ assertTrue(node.isDown());
+ assertEquals(Node.State.active, node.state());
+
+ // No ongoing CMR anymore, host should be failed
+ clock.advance(Duration.ofHours(1));
+ tester.runMaintainers();
+ node = tester.nodeRepository.nodes().node(downHost).get();
+ assertTrue(node.isDown());
+ assertEquals(Node.State.failed, node.state());
+ }
+
private void addServiceInstances(List<ServiceInstance> list, ServiceStatus status, int num) {
for (int i = 0; i < num; ++i) {
ServiceInstance service = mock(ServiceInstance.class);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
index 978edf3f7e4..2cd0e84c356 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
@@ -757,6 +757,23 @@ public class ProvisioningTest {
}
@Test
+ public void ignore_retirement_if_no_capacity() {
+ ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build();
+ tester.makeReadyHosts(3, defaultResources).activateTenantHosts();
+
+ ApplicationId application = ProvisioningTester.applicationId();
+ ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("music")).vespaVersion("4.5.6").build();
+ tester.activate(application, tester.prepare(application, cluster, 3, 1, defaultResources));
+
+ // Mark the nodes as want to retire
+ NodeList nodes = tester.getNodes(application);
+ tester.patchNodes(nodes.asList(), node -> node.withWantToRetire(true, Agent.system, tester.clock().instant()));
+
+ tester.activate(application, tester.prepare(application, cluster, 3, 1, defaultResources));
+ assertEquals(3, tester.getNodes(application).state(Node.State.active).not().retired().size());
+ }
+
+ @Test
public void highest_node_indexes_are_retired_first() {
ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java
index fb773f19b8a..0744d82c85b 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java
@@ -34,7 +34,6 @@ import java.util.stream.Stream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -171,7 +170,7 @@ public class VirtualNodeProvisioningTest {
@Test
public void indistinct_distribution_with_known_ready_nodes() {
ProvisioningTester tester = new ProvisioningTester.Builder().build();
- tester.makeReadyChildren(3, resources1);
+ tester.makeReadyChildren(4, resources1);
int contentNodeCount = 3;
int groups = 1;
@@ -191,13 +190,10 @@ public class VirtualNodeProvisioningTest {
tester.makeReadyChildren(1, resources1, "parentHost1");
tester.makeReadyChildren(2, resources1, "parentHost2");
- NodeAllocationException expectedException = null;
- try {
- tester.prepare(applicationId, contentClusterSpec, contentNodeCount, groups, resources1);
- } catch (NodeAllocationException e) {
- expectedException = e;
- }
- assertNotNull(expectedException);
+ tester.activate(applicationId, tester.prepare(applicationId, contentClusterSpec, contentNodeCount, groups, resources1));
+ nodes = tester.getNodes(applicationId, Node.State.active);
+ assertEquals(4, nodes.size());
+ assertEquals(1, nodes.retired().size());
}
@Test