aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2021-12-13 12:57:52 +0100
committerMartin Polden <mpolden@mpolden.no>2021-12-13 12:58:17 +0100
commitc3d3a4c606dce5d43ebe825b341f63237c5280ee (patch)
tree287a08bc003bd656a3a2a2095319429468a55abc /node-repository
parent65f43531e1103b974b7a81b1dd0a8dac17310cec (diff)
Do not re-activate nodes that are preferred to retire
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java74
4 files changed, 72 insertions, 11 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
index 0c78a9adade..511d3efe313 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
@@ -206,6 +206,9 @@ class MaintenanceDeployment implements Closeable {
if (nodeLock.node().status().preferToRetire() == preferToRetire) return false;
+ // Node is retiring, keep preferToRetire
+ if (nodeLock.node().allocation().get().membership().retired() && !preferToRetire) return false;
+
nodeRepository.nodes().write(nodeLock.node().withPreferToRetire(preferToRetire, agent, nodeRepository.clock().instant()), nodeLock);
return true;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
index 62ac1f0d0e6..6f7eeb75c03 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
@@ -139,6 +139,10 @@ public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidat
if (this.isInNodeRepoAndReserved() && ! other.isInNodeRepoAndReserved()) return -1;
if (other.isInNodeRepoAndReserved() && ! this.isInNodeRepoAndReserved()) return 1;
+ // Choose nodes that are not preferred to retire
+ if (!this.preferToRetire() && other.preferToRetire()) return -1;
+ if (!other.preferToRetire() && this.preferToRetire()) return 1;
+
// Choose inactive nodes
if (this.state() == Node.State.inactive && other.state() != Node.State.inactive) return -1;
if (other.state() == Node.State.inactive && this.state() != Node.State.inactive) return 1;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
index b12368b2834..c98ff31ecb6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
@@ -96,7 +96,7 @@ class Preparer {
* in groups with index number above or equal the group count
*/
private List<Node> findNodesInRemovableGroups(NodeList appNodes, ClusterSpec requestedCluster, int wantedGroups) {
- List<Node> surplusNodes = new ArrayList<>(0);
+ List<Node> surplusNodes = new ArrayList<>();
for (Node node : appNodes.state(Node.State.active)) {
ClusterSpec nodeCluster = node.allocation().get().membership().cluster();
if ( ! nodeCluster.id().equals(requestedCluster.id())) continue;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java
index f2f5869ece3..6be07f6f702 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java
@@ -30,6 +30,7 @@ import java.util.stream.Stream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertSame;
/**
* @author mpolden
@@ -92,11 +93,7 @@ public class SwitchRebalancerTest {
rebalancedClusters.add(cluster);
// Retired node becomes inactive and makes zone stable
- try (var lock = tester.provisioner().lock(app)) {
- NestedTransaction removeTransaction = new NestedTransaction();
- tester.nodeRepository().nodes().deactivate(retired.asList(), new ApplicationTransaction(lock, removeTransaction));
- removeTransaction.commit();
- }
+ deactivate(tester, retired);
}
assertEquals("Rebalanced all clusters", clusters, rebalancedClusters);
@@ -134,21 +131,78 @@ public class SwitchRebalancerTest {
// Rebalance
tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
rebalancer.maintain();
- NodeList activeNodes = tester.nodeRepository().nodes().list().owner(app).cluster(spec.id()).state(Node.State.active);
+ NodeList activeNodes = nodesIn(spec.id(), tester).state(Node.State.active);
NodeList retired = activeNodes.retired();
assertEquals("Node is retired", 1, retired.size());
assertFalse("Retired node was not on exclusive switch", nodesOnExclusiveSwitch.contains(retired.first().get()));
tester.assertSwitches(Set.of(switch0, switch1, switch2), app, spec.id());
// Retired node becomes inactive and makes zone stable
+ deactivate(tester, retired);
+
+ // Next iteration does nothing
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ assertNoMoves(rebalancer, tester);
+ }
+
+ @Test
+ public void rebalancing_does_not_reuse_inactive_nodes() {
+ ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build();
+ ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("c1")).vespaVersion("1").build();
+ Capacity capacity = Capacity.from(new ClusterResources(4, 1, new NodeResources(4, 8, 50, 1)));
+ MockDeployer deployer = deployer(tester, capacity, spec);
+ SwitchRebalancer rebalancer = new SwitchRebalancer(tester.nodeRepository(), Duration.ofDays(1), new TestMetric(), deployer);
+
+ // Provision initial hosts on two switches
+ NodeResources hostResources = new NodeResources(8, 16, 500, 10);
+ String switch0 = "switch0";
+ String switch1 = "switch1";
+ provisionHosts(2, switch0, hostResources, tester);
+ provisionHosts(2, switch1, hostResources, tester);
+
+ // Deploy application
+ deployer.deployFromLocalActive(app).get().activate();
+ assertEquals("Nodes on " + switch0, 2, tester.activeNodesOn(switch0, app, spec.id()).size());
+ assertEquals("Nodes on " + switch1, 2, tester.activeNodesOn(switch1, app, spec.id()).size());
+
+ // Two new hosts becomes available on a new switches
+ String switch2 = "switch2";
+ String switch3 = "switch3";
+ provisionHost(switch2, hostResources, tester);
+ provisionHost(switch3, hostResources, tester);
+
+ // Rebalance retires one node and allocates another
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ rebalancer.maintain();
+ tester.assertSwitches(Set.of(switch0, switch1, switch2), app, spec.id());
+ NodeList retired = nodesIn(spec.id(), tester).state(Node.State.active).retired();
+ assertEquals("Node is retired", 1, retired.size());
+ deactivate(tester, retired);
+
+ // Next rebalance does not reuse inactive node
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ rebalancer.maintain();
+ assertSame("Inactive node is not re-activated",
+ Node.State.inactive,
+ nodesIn(spec.id(), tester).node(retired.first().get().hostname()).get().state());
+ tester.assertSwitches(Set.of(switch0, switch1, switch2, switch3), app, spec.id());
+ retired = nodesIn(spec.id(), tester).state(Node.State.active).retired();
+ deactivate(tester, retired);
+
+ // Next iteration does nothing
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ assertNoMoves(rebalancer, tester);
+ }
+
+ private NodeList nodesIn(ClusterSpec.Id cluster, ProvisioningTester tester) {
+ return tester.nodeRepository().nodes().list().owner(app).cluster(cluster);
+ }
+
+ private void deactivate(ProvisioningTester tester, NodeList retired) {
try (var lock = tester.provisioner().lock(app)) {
NestedTransaction removeTransaction = new NestedTransaction();
tester.nodeRepository().nodes().deactivate(retired.asList(), new ApplicationTransaction(lock, removeTransaction));
removeTransaction.commit();
}
-
- // Next iteration does nothing
- tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
- assertNoMoves(rebalancer, tester);
}
private void provisionHost(String switchHostname, NodeResources hostResources, ProvisioningTester tester) {