diff options
Diffstat (limited to 'node-repository')
8 files changed, 111 insertions, 22 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index f9ff2f08375..766bc688c62 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -7,6 +7,7 @@ import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.TransientException; import com.yahoo.jdisc.Metric; +import com.yahoo.slime.SlimeUtils; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; @@ -109,7 +110,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { for (Node node : activeNodes) { Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); - if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { + if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node) && !undergoingCmr(node)) { // Allow a grace period after node re-activation if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit)); @@ -157,6 +158,19 @@ public class NodeFailer extends NodeRepositoryMaintainer { } } + private boolean undergoingCmr(Node node) { + return node.reports().getReport("vcmr") + .map(report -> + SlimeUtils.entriesStream(report.getInspector().field("upcoming")) + .anyMatch(cmr -> { + var startTime = cmr.field("plannedStartTime").asLong(); + var endTime = cmr.field("plannedEndTime").asLong(); + var now = clock().instant().getEpochSecond(); + return now > startTime && now < endTime; + }) + ).orElse(false); + } + /** Is the node and all active children suspended? */ private boolean allSuspended(Node node, NodeList activeNodes) { if (!nodeRepository().nodes().suspended(node)) return false; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index eff39dddf42..06c1916dd4f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -21,7 +21,6 @@ import java.util.function.Consumer; import java.util.function.Supplier; import java.util.logging.Level; import java.util.logging.Logger; -import java.util.stream.Collectors; /** * Performs preparation of node activation changes for a single host group in an application. @@ -124,6 +123,14 @@ public class GroupPreparer { hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant())); throw e; } + } else if (allocation.hostDeficit().isPresent() && requestedNodes.canFail() && + allocation.hasRetiredJustNow() && requestedNodes instanceof NodeSpec.CountNodeSpec cns) { + // Non-dynamically provisioned zone with a deficit because we just now retired some nodes. + // Try again, but without retiring + indices.resetProbe(); + List<Node> accepted = prepareWithLocks(application, cluster, cns.withoutRetiring(), surplusActiveNodes, indices, wantedGroups); + log.warning("Prepared " + application + " " + cluster.id() + " without retirement due to lack of capacity"); + return accepted; } if (! allocation.fulfilled() && requestedNodes.canFail()) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java index 890d190c24e..b3198a72d1b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java @@ -193,8 +193,8 @@ public class LoadBalancerProvisioner { Optional<LoadBalancer> loadBalancer = db.readLoadBalancer(id); LoadBalancer newLoadBalancer; LoadBalancer.State fromState = loadBalancer.map(LoadBalancer::state).orElse(null); - boolean recreateLoadBalancer = loadBalancer.isPresent() && (!inAccount(cloudAccount, loadBalancer.get()) - || !hasCorrectVisibility(loadBalancer.get(), zoneEndpoint)); + boolean recreateLoadBalancer = loadBalancer.isPresent() && ( ! inAccount(cloudAccount, loadBalancer.get()) + || ! hasCorrectVisibility(loadBalancer.get(), zoneEndpoint)); if (recreateLoadBalancer) { // We have a load balancer, but with the wrong account or visibility. // Load balancer must be removed before we can provision a new one with the wanted visibility diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index c6971f0fe02..3af63125474 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -307,10 +307,15 @@ class NodeAllocation { } /** Returns true if this allocation was already fulfilled and resulted in no new changes */ - public boolean fulfilledAndNoChanges() { + boolean fulfilledAndNoChanges() { return fulfilled() && reservableNodes().isEmpty() && newNodes().isEmpty(); } + /** Returns true if this allocation has retired nodes */ + boolean hasRetiredJustNow() { + return wasRetiredJustNow > 0; + } + /** * Returns {@link HostDeficit} describing the host deficit for the given {@link NodeSpec}. * @@ -451,7 +456,7 @@ class NodeAllocation { .toList(); } - public String allocationFailureDetails() { + String allocationFailureDetails() { List<String> reasons = new ArrayList<>(); if (rejectedDueToExclusivity > 0) reasons.add("host exclusivity constraints"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java index 9edfa221abf..28d1e7c1c68 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java @@ -3,8 +3,6 @@ package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.config.provision.CloudAccount; import com.yahoo.config.provision.ClusterSpec; -import com.yahoo.config.provision.Flavor; -import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; @@ -79,7 +77,7 @@ public interface NodeSpec { } static NodeSpec from(int nodeCount, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) { - return new CountNodeSpec(nodeCount, resources, exclusive, canFail, cloudAccount); + return new CountNodeSpec(nodeCount, resources, exclusive, canFail, canFail, cloudAccount); } static NodeSpec from(NodeType type, CloudAccount cloudAccount) { @@ -93,14 +91,19 @@ public interface NodeSpec { private final NodeResources requestedNodeResources; private final boolean exclusive; private final boolean canFail; + private final boolean considerRetiring; private final CloudAccount cloudAccount; - private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, CloudAccount cloudAccount) { + private CountNodeSpec(int count, NodeResources resources, boolean exclusive, boolean canFail, boolean considerRetiring, CloudAccount cloudAccount) { this.count = count; this.requestedNodeResources = Objects.requireNonNull(resources, "Resources must be specified"); this.exclusive = exclusive; this.canFail = canFail; + this.considerRetiring = considerRetiring; this.cloudAccount = Objects.requireNonNull(cloudAccount); + + if (!canFail && considerRetiring) + throw new IllegalArgumentException("Cannot consider retiring nodes if we cannot fail"); } @Override @@ -127,8 +130,7 @@ public interface NodeSpec { @Override public boolean considerRetiring() { - // If we cannot fail we cannot retire as we may end up without sufficient replacement capacity - return canFail(); + return considerRetiring; } @Override @@ -143,7 +145,11 @@ public interface NodeSpec { @Override public NodeSpec fraction(int divisor) { - return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, cloudAccount); + return new CountNodeSpec(count/divisor, requestedNodeResources, exclusive, canFail, considerRetiring, cloudAccount); + } + + public NodeSpec withoutRetiring() { + return new CountNodeSpec(count, requestedNodeResources, exclusive, canFail, false, cloudAccount); } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 491485b78fc..c63be6d5dc5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; +import com.yahoo.slime.SlimeUtils; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.vespa.hosted.provision.Node; @@ -627,6 +628,49 @@ public class NodeFailerTest { assertFalse(badNode(1, 3, 1, 2)); } + @Test + public void nodes_undergoing_cmr_are_not_failed() { + var tester = NodeFailTester.withTwoApplications(6); + var clock = tester.clock; + var slime = SlimeUtils.jsonToSlime( + String.format(""" + { + "upcoming":[{ + "id": "id-42", + "status": "some-status", + "plannedStartTime": %d, + "plannedEndTime": %d + }] + } + """, clock.instant().getEpochSecond(), clock.instant().plus(Duration.ofMinutes(90)).getEpochSecond()) + ); + var cmrReport = Report.fromSlime("vcmr", slime.get()); + var downHost = tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1).asList().get(1).hostname(); + + var node = tester.nodeRepository.nodes().node(downHost).get(); + tester.nodeRepository.nodes().write(node.with(node.reports().withReport(cmrReport)), () -> {}); + + tester.serviceMonitor.setHostDown(downHost); + tester.runMaintainers(); + node = tester.nodeRepository.nodes().node(downHost).get(); + assertTrue(node.isDown()); + assertEquals(Node.State.active, node.state()); + + // CMR still ongoing, don't fail yet + clock.advance(Duration.ofHours(1)); + tester.runMaintainers(); + node = tester.nodeRepository.nodes().node(downHost).get(); + assertTrue(node.isDown()); + assertEquals(Node.State.active, node.state()); + + // No ongoing CMR anymore, host should be failed + clock.advance(Duration.ofHours(1)); + tester.runMaintainers(); + node = tester.nodeRepository.nodes().node(downHost).get(); + assertTrue(node.isDown()); + assertEquals(Node.State.failed, node.state()); + } + private void addServiceInstances(List<ServiceInstance> list, ServiceStatus status, int num) { for (int i = 0; i < num; ++i) { ServiceInstance service = mock(ServiceInstance.class); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java index 978edf3f7e4..2cd0e84c356 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java @@ -757,6 +757,23 @@ public class ProvisioningTest { } @Test + public void ignore_retirement_if_no_capacity() { + ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); + tester.makeReadyHosts(3, defaultResources).activateTenantHosts(); + + ApplicationId application = ProvisioningTester.applicationId(); + ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("music")).vespaVersion("4.5.6").build(); + tester.activate(application, tester.prepare(application, cluster, 3, 1, defaultResources)); + + // Mark the nodes as want to retire + NodeList nodes = tester.getNodes(application); + tester.patchNodes(nodes.asList(), node -> node.withWantToRetire(true, Agent.system, tester.clock().instant())); + + tester.activate(application, tester.prepare(application, cluster, 3, 1, defaultResources)); + assertEquals(3, tester.getNodes(application).state(Node.State.active).not().retired().size()); + } + + @Test public void highest_node_indexes_are_retired_first() { ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java index fb773f19b8a..0744d82c85b 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java @@ -34,7 +34,6 @@ import java.util.stream.Stream; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -171,7 +170,7 @@ public class VirtualNodeProvisioningTest { @Test public void indistinct_distribution_with_known_ready_nodes() { ProvisioningTester tester = new ProvisioningTester.Builder().build(); - tester.makeReadyChildren(3, resources1); + tester.makeReadyChildren(4, resources1); int contentNodeCount = 3; int groups = 1; @@ -191,13 +190,10 @@ public class VirtualNodeProvisioningTest { tester.makeReadyChildren(1, resources1, "parentHost1"); tester.makeReadyChildren(2, resources1, "parentHost2"); - NodeAllocationException expectedException = null; - try { - tester.prepare(applicationId, contentClusterSpec, contentNodeCount, groups, resources1); - } catch (NodeAllocationException e) { - expectedException = e; - } - assertNotNull(expectedException); + tester.activate(applicationId, tester.prepare(applicationId, contentClusterSpec, contentNodeCount, groups, resources1)); + nodes = tester.getNodes(applicationId, Node.State.active); + assertEquals(4, nodes.size()); + assertEquals(1, nodes.retired().size()); } @Test |