From c3d3a4c606dce5d43ebe825b341f63237c5280ee Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Mon, 13 Dec 2021 12:57:52 +0100 Subject: Do not re-activate nodes that are preferred to retire --- .../maintenance/MaintenanceDeployment.java | 3 + .../provision/provisioning/NodeCandidate.java | 4 ++ .../hosted/provision/provisioning/Preparer.java | 2 +- .../maintenance/SwitchRebalancerTest.java | 74 +++++++++++++++++++--- 4 files changed, 72 insertions(+), 11 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java index 0c78a9adade..511d3efe313 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java @@ -206,6 +206,9 @@ class MaintenanceDeployment implements Closeable { if (nodeLock.node().status().preferToRetire() == preferToRetire) return false; + // Node is retiring, keep preferToRetire + if (nodeLock.node().allocation().get().membership().retired() && !preferToRetire) return false; + nodeRepository.nodes().write(nodeLock.node().withPreferToRetire(preferToRetire, agent, nodeRepository.clock().instant()), nodeLock); return true; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java index 62ac1f0d0e6..6f7eeb75c03 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java @@ -139,6 +139,10 @@ public abstract class NodeCandidate implements Nodelike, Comparable findNodesInRemovableGroups(NodeList appNodes, ClusterSpec requestedCluster, int wantedGroups) { - List surplusNodes = new ArrayList<>(0); + List surplusNodes = new ArrayList<>(); for (Node node : appNodes.state(Node.State.active)) { ClusterSpec nodeCluster = node.allocation().get().membership().cluster(); if ( ! nodeCluster.id().equals(requestedCluster.id())) continue; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java index f2f5869ece3..6be07f6f702 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java @@ -30,6 +30,7 @@ import java.util.stream.Stream; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertSame; /** * @author mpolden @@ -92,11 +93,7 @@ public class SwitchRebalancerTest { rebalancedClusters.add(cluster); // Retired node becomes inactive and makes zone stable - try (var lock = tester.provisioner().lock(app)) { - NestedTransaction removeTransaction = new NestedTransaction(); - tester.nodeRepository().nodes().deactivate(retired.asList(), new ApplicationTransaction(lock, removeTransaction)); - removeTransaction.commit(); - } + deactivate(tester, retired); } assertEquals("Rebalanced all clusters", clusters, rebalancedClusters); @@ -134,21 +131,78 @@ public class SwitchRebalancerTest { // Rebalance tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); rebalancer.maintain(); - NodeList activeNodes = tester.nodeRepository().nodes().list().owner(app).cluster(spec.id()).state(Node.State.active); + NodeList activeNodes = nodesIn(spec.id(), tester).state(Node.State.active); NodeList retired = activeNodes.retired(); assertEquals("Node is retired", 1, retired.size()); assertFalse("Retired node was not on exclusive switch", nodesOnExclusiveSwitch.contains(retired.first().get())); tester.assertSwitches(Set.of(switch0, switch1, switch2), app, spec.id()); // Retired node becomes inactive and makes zone stable + deactivate(tester, retired); + + // Next iteration does nothing + tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); + assertNoMoves(rebalancer, tester); + } + + @Test + public void rebalancing_does_not_reuse_inactive_nodes() { + ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); + ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("c1")).vespaVersion("1").build(); + Capacity capacity = Capacity.from(new ClusterResources(4, 1, new NodeResources(4, 8, 50, 1))); + MockDeployer deployer = deployer(tester, capacity, spec); + SwitchRebalancer rebalancer = new SwitchRebalancer(tester.nodeRepository(), Duration.ofDays(1), new TestMetric(), deployer); + + // Provision initial hosts on two switches + NodeResources hostResources = new NodeResources(8, 16, 500, 10); + String switch0 = "switch0"; + String switch1 = "switch1"; + provisionHosts(2, switch0, hostResources, tester); + provisionHosts(2, switch1, hostResources, tester); + + // Deploy application + deployer.deployFromLocalActive(app).get().activate(); + assertEquals("Nodes on " + switch0, 2, tester.activeNodesOn(switch0, app, spec.id()).size()); + assertEquals("Nodes on " + switch1, 2, tester.activeNodesOn(switch1, app, spec.id()).size()); + + // Two new hosts becomes available on a new switches + String switch2 = "switch2"; + String switch3 = "switch3"; + provisionHost(switch2, hostResources, tester); + provisionHost(switch3, hostResources, tester); + + // Rebalance retires one node and allocates another + tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); + rebalancer.maintain(); + tester.assertSwitches(Set.of(switch0, switch1, switch2), app, spec.id()); + NodeList retired = nodesIn(spec.id(), tester).state(Node.State.active).retired(); + assertEquals("Node is retired", 1, retired.size()); + deactivate(tester, retired); + + // Next rebalance does not reuse inactive node + tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); + rebalancer.maintain(); + assertSame("Inactive node is not re-activated", + Node.State.inactive, + nodesIn(spec.id(), tester).node(retired.first().get().hostname()).get().state()); + tester.assertSwitches(Set.of(switch0, switch1, switch2, switch3), app, spec.id()); + retired = nodesIn(spec.id(), tester).state(Node.State.active).retired(); + deactivate(tester, retired); + + // Next iteration does nothing + tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); + assertNoMoves(rebalancer, tester); + } + + private NodeList nodesIn(ClusterSpec.Id cluster, ProvisioningTester tester) { + return tester.nodeRepository().nodes().list().owner(app).cluster(cluster); + } + + private void deactivate(ProvisioningTester tester, NodeList retired) { try (var lock = tester.provisioner().lock(app)) { NestedTransaction removeTransaction = new NestedTransaction(); tester.nodeRepository().nodes().deactivate(retired.asList(), new ApplicationTransaction(lock, removeTransaction)); removeTransaction.commit(); } - - // Next iteration does nothing - tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); - assertNoMoves(rebalancer, tester); } private void provisionHost(String switchHostname, NodeResources hostResources, ProvisioningTester tester) { -- cgit v1.2.3