aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@oath.com>2021-12-13 13:15:30 +0100
committerGitHub <noreply@github.com>2021-12-13 13:15:30 +0100
commit52d01199ab74a7a6be6be0fb8fdb519b9838f6af (patch)
treee691c6b8f87afd5fbf0ffefccb1bb7098d745f6b /node-repository
parenta37abfa9d4f72d520f7d62729de21df9d0cd71d7 (diff)
parentc3d3a4c606dce5d43ebe825b341f63237c5280ee (diff)
Merge pull request #20486 from vespa-engine/mpolden/skip-reactivation
Do not re-activate nodes that are preferred to retire
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java16
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java74
5 files changed, 88 insertions, 11 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
index 0c78a9adade..511d3efe313 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
@@ -206,6 +206,9 @@ class MaintenanceDeployment implements Closeable {
if (nodeLock.node().status().preferToRetire() == preferToRetire) return false;
+ // Node is retiring, keep preferToRetire
+ if (nodeLock.node().allocation().get().membership().retired() && !preferToRetire) return false;
+
nodeRepository.nodes().write(nodeLock.node().withPreferToRetire(preferToRetire, agent, nodeRepository.clock().instant()), nodeLock);
return true;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
index 62ac1f0d0e6..6f7eeb75c03 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
@@ -139,6 +139,10 @@ public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidat
if (this.isInNodeRepoAndReserved() && ! other.isInNodeRepoAndReserved()) return -1;
if (other.isInNodeRepoAndReserved() && ! this.isInNodeRepoAndReserved()) return 1;
+ // Choose nodes that are not preferred to retire
+ if (!this.preferToRetire() && other.preferToRetire()) return -1;
+ if (!other.preferToRetire() && this.preferToRetire()) return 1;
+
// Choose inactive nodes
if (this.state() == Node.State.inactive && other.state() != Node.State.inactive) return -1;
if (other.state() == Node.State.inactive && this.state() != Node.State.inactive) return 1;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
index b12368b2834..c98ff31ecb6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
@@ -96,7 +96,7 @@ class Preparer {
* in groups with index number above or equal the group count
*/
private List<Node> findNodesInRemovableGroups(NodeList appNodes, ClusterSpec requestedCluster, int wantedGroups) {
- List<Node> surplusNodes = new ArrayList<>(0);
+ List<Node> surplusNodes = new ArrayList<>();
for (Node node : appNodes.state(Node.State.active)) {
ClusterSpec nodeCluster = node.allocation().get().membership().cluster();
if ( ! nodeCluster.id().equals(requestedCluster.id())) continue;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
index 27233791cf1..3dd76c76cac 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
@@ -18,12 +18,15 @@ import com.yahoo.config.provision.RegionName;
import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.Zone;
import com.yahoo.config.provisioning.FlavorsConfig;
+import com.yahoo.jdisc.test.MockMetric;
import com.yahoo.transaction.NestedTransaction;
import com.yahoo.vespa.config.ConfigPayload;
+import com.yahoo.vespa.hosted.provision.maintenance.SwitchRebalancer;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.persistence.DnsNameResolver;
import com.yahoo.vespa.hosted.provision.persistence.NodeSerializer;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
+import com.yahoo.vespa.hosted.provision.testutils.MockDeployer;
import com.yahoo.vespa.model.builder.xml.dom.DomConfigPayloadBuilder;
import org.junit.Ignore;
import org.junit.Test;
@@ -35,8 +38,11 @@ import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.time.Duration;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
+import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.logging.Logger;
@@ -97,6 +103,16 @@ public class RealDataScenarioTest {
deploy(tester, app, specs, capacities);
tester.nodeRepository().nodes().list().owner(app).cluster(specs[1].id()).forEach(System.out::println);
+
+ // Perform a node move
+ tester.clock().advance(Duration.ofHours(1)); // Enough time for deployment to not be considered deployed recently
+ List<MockDeployer.ClusterContext> contexts = new ArrayList<>();
+ for (int i = 0; i < specs.length; i++) {
+ contexts.add(new MockDeployer.ClusterContext(app, specs[i], capacities[i]));
+ }
+ MockDeployer deployer = new MockDeployer(tester.provisioner(), tester.clock(), Map.of(app, new MockDeployer.ApplicationContext(app, contexts)));
+ SwitchRebalancer rebalancer = new SwitchRebalancer(tester.nodeRepository(), Duration.ofDays(1), new MockMetric(), deployer);
+ rebalancer.run();
}
private void deploy(ProvisioningTester tester, ApplicationId app, ClusterSpec[] specs, Capacity[] capacities) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java
index f2f5869ece3..6be07f6f702 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java
@@ -30,6 +30,7 @@ import java.util.stream.Stream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertSame;
/**
* @author mpolden
@@ -92,11 +93,7 @@ public class SwitchRebalancerTest {
rebalancedClusters.add(cluster);
// Retired node becomes inactive and makes zone stable
- try (var lock = tester.provisioner().lock(app)) {
- NestedTransaction removeTransaction = new NestedTransaction();
- tester.nodeRepository().nodes().deactivate(retired.asList(), new ApplicationTransaction(lock, removeTransaction));
- removeTransaction.commit();
- }
+ deactivate(tester, retired);
}
assertEquals("Rebalanced all clusters", clusters, rebalancedClusters);
@@ -134,21 +131,78 @@ public class SwitchRebalancerTest {
// Rebalance
tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
rebalancer.maintain();
- NodeList activeNodes = tester.nodeRepository().nodes().list().owner(app).cluster(spec.id()).state(Node.State.active);
+ NodeList activeNodes = nodesIn(spec.id(), tester).state(Node.State.active);
NodeList retired = activeNodes.retired();
assertEquals("Node is retired", 1, retired.size());
assertFalse("Retired node was not on exclusive switch", nodesOnExclusiveSwitch.contains(retired.first().get()));
tester.assertSwitches(Set.of(switch0, switch1, switch2), app, spec.id());
// Retired node becomes inactive and makes zone stable
+ deactivate(tester, retired);
+
+ // Next iteration does nothing
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ assertNoMoves(rebalancer, tester);
+ }
+
+ @Test
+ public void rebalancing_does_not_reuse_inactive_nodes() {
+ ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build();
+ ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("c1")).vespaVersion("1").build();
+ Capacity capacity = Capacity.from(new ClusterResources(4, 1, new NodeResources(4, 8, 50, 1)));
+ MockDeployer deployer = deployer(tester, capacity, spec);
+ SwitchRebalancer rebalancer = new SwitchRebalancer(tester.nodeRepository(), Duration.ofDays(1), new TestMetric(), deployer);
+
+ // Provision initial hosts on two switches
+ NodeResources hostResources = new NodeResources(8, 16, 500, 10);
+ String switch0 = "switch0";
+ String switch1 = "switch1";
+ provisionHosts(2, switch0, hostResources, tester);
+ provisionHosts(2, switch1, hostResources, tester);
+
+ // Deploy application
+ deployer.deployFromLocalActive(app).get().activate();
+ assertEquals("Nodes on " + switch0, 2, tester.activeNodesOn(switch0, app, spec.id()).size());
+ assertEquals("Nodes on " + switch1, 2, tester.activeNodesOn(switch1, app, spec.id()).size());
+
+ // Two new hosts becomes available on a new switches
+ String switch2 = "switch2";
+ String switch3 = "switch3";
+ provisionHost(switch2, hostResources, tester);
+ provisionHost(switch3, hostResources, tester);
+
+ // Rebalance retires one node and allocates another
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ rebalancer.maintain();
+ tester.assertSwitches(Set.of(switch0, switch1, switch2), app, spec.id());
+ NodeList retired = nodesIn(spec.id(), tester).state(Node.State.active).retired();
+ assertEquals("Node is retired", 1, retired.size());
+ deactivate(tester, retired);
+
+ // Next rebalance does not reuse inactive node
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ rebalancer.maintain();
+ assertSame("Inactive node is not re-activated",
+ Node.State.inactive,
+ nodesIn(spec.id(), tester).node(retired.first().get().hostname()).get().state());
+ tester.assertSwitches(Set.of(switch0, switch1, switch2, switch3), app, spec.id());
+ retired = nodesIn(spec.id(), tester).state(Node.State.active).retired();
+ deactivate(tester, retired);
+
+ // Next iteration does nothing
+ tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
+ assertNoMoves(rebalancer, tester);
+ }
+
+ private NodeList nodesIn(ClusterSpec.Id cluster, ProvisioningTester tester) {
+ return tester.nodeRepository().nodes().list().owner(app).cluster(cluster);
+ }
+
+ private void deactivate(ProvisioningTester tester, NodeList retired) {
try (var lock = tester.provisioner().lock(app)) {
NestedTransaction removeTransaction = new NestedTransaction();
tester.nodeRepository().nodes().deactivate(retired.asList(), new ApplicationTransaction(lock, removeTransaction));
removeTransaction.commit();
}
-
- // Next iteration does nothing
- tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment);
- assertNoMoves(rebalancer, tester);
}
private void provisionHost(String switchHostname, NodeResources hostResources, ProvisioningTester tester) {