author    Jon Bratseth <bratseth@oath.com>   2019-11-19 15:36:52 +0100
committer GitHub <noreply@github.com>        2019-11-19 15:36:52 +0100
commit    ebe2949751df9a0654e806049240c39a56b48add (patch)
tree      7443357b571d5f4e886a9fda663a911e2f19c1d0
parent    05e647af99a5907310714a85e92bf4675d449a28 (diff)
parent    fd5012f00f242d2611a9dd9b5dea2a2d6d982ed5 (diff)
Merge pull request #11355 from vespa-engine/bratseth/faster-rebalancing-cleanup
Fast-track cleanup after failed rebalancing
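
The gist of the change: after a successful prepare, Rebalancer remembers which node it expects to have been reserved on the target host, and if activation then fails it moves that still-reserved node to dirty right away instead of waiting for the reservation to expire. A minimal sketch of that flow is below; Deployment and NodeRepository here are simplified stand-ins for the real Vespa interfaces, and reservedChildOf/isReserved are hypothetical helpers used only for illustration.

import java.util.Optional;

public class FastTrackCleanupSketch {

    // Simplified stand-in for the real Deployment interface
    interface Deployment {
        boolean prepare();
        boolean activate();
    }

    // Simplified stand-in for the node repository; these methods are
    // hypothetical helpers, not the real API
    interface NodeRepository {
        Optional<String> reservedChildOf(String parentHostname);
        boolean isReserved(String hostname);
        void setDirty(String hostname, String reason);
    }

    /** Attempts the move; returns true only if it was both prepared and activated. */
    static boolean tryMove(Deployment deployment, NodeRepository nodes, String targetHostname) {
        Optional<String> expectedNewNode = Optional.empty();
        try {
            if ( ! deployment.prepare()) return false;

            // prepare() should have reserved a child node on the target host;
            // remember it so the finally block can clean it up if activation fails
            expectedNewNode = nodes.reservedChildOf(targetHostname);
            if (expectedNewNode.isEmpty()) return false;

            return deployment.activate();
        }
        finally {
            // Fast-track cleanup: if we reserved a node but never activated it,
            // move it to dirty immediately instead of waiting for expiry
            if (expectedNewNode.isPresent() && nodes.isReserved(expectedNewNode.get()))
                nodes.setDirty(expectedNewNode.get(), "Expired by Rebalancer");
        }
    }
}

The test change below exercises exactly this path: MockDeployer.setFailActivate(true) makes activation throw, and RebalancerTest then asserts that wantToRetire is reset and that the reserved node has been moved to dirty.
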
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java      15
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java      10
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java  19
3 files changed, 36 insertions, 8 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index 7d8949502d4..7160edb96f4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -121,7 +121,7 @@ public class Rebalancer extends Maintainer {
if (nodeToMove.isEmpty()) return false;
if (nodeToMove.get().state() != Node.State.active) return false;
- if (node.status().wantToRetire() == wantToRetire) return false;
+ if (nodeToMove.get().status().wantToRetire() == wantToRetire) return false;
nodeRepository().write(nodeToMove.get().withWantToRetire(wantToRetire, Agent.system, clock.instant()), lock);
return true;
@@ -142,10 +142,15 @@ public class Rebalancer extends Maintainer {
boolean couldMarkRetiredNow = markWantToRetire(move.node, true);
if ( ! couldMarkRetiredNow) return false;
+ Optional<Node> expectedNewNode = Optional.empty();
try {
if ( ! deployment.prepare()) return false;
- if (nodeRepository().getNodes(application, Node.State.reserved).stream().noneMatch(node -> node.hasParent(move.toHost.hostname())))
- return false; // Deployment is not moving the from node to the target we identified for some reason
+ expectedNewNode =
+ nodeRepository().getNodes(application, Node.State.reserved).stream()
+ .filter(node -> node.hasParent(move.toHost.hostname()))
+ .filter(node -> node.allocation().get().membership().cluster().id().equals(move.node.allocation().get().membership().cluster().id()))
+ .findAny();
+ if (expectedNewNode.isEmpty()) return false;
if ( ! deployment.activate()) return false;
log.info("Rebalancer redeployed " + application + " to " + move);
@@ -153,6 +158,10 @@ public class Rebalancer extends Maintainer {
}
finally {
markWantToRetire(move.node, false); // Necessary if this failed, no-op otherwise
+ if (expectedNewNode.isPresent()) { // Immediately clean up if we reserved the node but could not activate
+ Optional<Node> reservedNewNode = nodeRepository().getNode(expectedNewNode.get().hostname(), Node.State.reserved);
+ reservedNewNode.ifPresent(reserved -> nodeRepository().setDirty(reserved, Agent.system, "Expired by Rebalancer"));
+ }
}
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
index 898e1343ea3..7ab42093bab 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
@@ -37,6 +37,8 @@ public class MockDeployer implements Deployer {
private final Clock clock;
private final ReentrantLock lock = new ReentrantLock();
+ private boolean failActivate = false;
+
@Inject
@SuppressWarnings("unused")
public MockDeployer() {
@@ -55,9 +57,9 @@ public class MockDeployer implements Deployer {
this.applications = new HashMap<>(applications);
}
- public ReentrantLock lock() {
- return lock;
- }
+ public ReentrantLock lock() { return lock; }
+
+ public void setFailActivate(boolean failActivate) { this.failActivate = failActivate; }
@Override
public Optional<Deployment> deployFromLocalActive(ApplicationId id, boolean bootstrap) {
@@ -119,6 +121,8 @@ public class MockDeployer implements Deployer {
if (preparedHosts == null)
prepare();
redeployments++;
+ if (failActivate)
+ throw new IllegalStateException("failActivate is true");
try (NestedTransaction t = new NestedTransaction()) {
provisioner.activate(t, application.id(), preparedHosts);
t.commit();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
index fc0e0e68570..d1a330a3bd6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
@@ -39,6 +39,7 @@ public class RebalancerTest {
@Test
public void testRebalancing() {
+ // --- Setup
ApplicationId cpuApp = makeApplicationId("t1", "a1");
ApplicationId memApp = makeApplicationId("t2", "a2");
NodeResources cpuResources = new NodeResources(8, 4, 1, 0.1);
@@ -64,7 +65,7 @@ public class RebalancerTest {
tester.makeReadyNodes(3, "flt", NodeType.host, 8);
tester.deployZoneApp();
- // Cpu heavy application - causing 1 of these nodes to be skewed
+ // --- Deploying a cpu heavy application - causing 1 of these nodes to be skewed
deployApp(cpuApp, clusterSpec("c"), cpuResources, tester, 1);
Node cpuSkewedNode = tester.nodeRepository().getNodes(cpuApp).get(0);
@@ -73,6 +74,7 @@ public class RebalancerTest {
tester.nodeRepository().getNode(cpuSkewedNode.hostname()).get().status().wantToRetire());
assertEquals(0.00325, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
+ // --- Making a more suitable node configuration available causes rebalancing
Node newCpuHost = tester.makeReadyNodes(1, "cpu", NodeType.host, 8).get(0);
tester.deployZoneApp();
@@ -84,14 +86,15 @@ public class RebalancerTest {
assertEquals("Skew is reduced",
0.00244, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
+ // --- Deploying a mem heavy application - allocated to the best option and causing increased skew
deployApp(memApp, clusterSpec("c"), memResources, tester, 1);
assertEquals("Assigned to a flat node as that causes least skew", "flt",
tester.nodeRepository().list().parentOf(tester.nodeRepository().getNodes(memApp).get(0)).get().flavor().name());
-
rebalancer.maintain();
assertEquals("Deploying the mem skewed app increased skew",
0.00734, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
+ // --- Adding a more suitable node reconfiguration causes no action as the system is not stable
Node memSkewedNode = tester.nodeRepository().getNodes(memApp).get(0);
Node newMemHost = tester.makeReadyNodes(1, "mem", NodeType.host, 8).get(0);
tester.deployZoneApp();
@@ -100,10 +103,22 @@ public class RebalancerTest {
assertFalse("No rebalancing happens because cpuSkewedNode is still retired",
tester.nodeRepository().getNode(memSkewedNode.hostname()).get().allocation().get().membership().retired());
+ // --- Making the system stable enables rebalancing
NestedTransaction tx = new NestedTransaction();
tester.nodeRepository().deactivate(List.of(cpuSkewedNode), tx);
tx.commit();
+ // ... if activation fails when trying, we clean up the state
+ deployer.setFailActivate(true);
+ rebalancer.maintain();
+ assertTrue("Want to retire is reset", tester.nodeRepository().getNodes(Node.State.active).stream().noneMatch(node -> node.status().wantToRetire()));
+ assertEquals("Reserved node was moved to dirty", 1, tester.nodeRepository().getNodes(Node.State.dirty).size());
+ String reservedHostname = tester.nodeRepository().getNodes(Node.State.dirty).get(0).hostname();
+ tester.nodeRepository().setReady(reservedHostname, Agent.system, "Cleanup");
+ tester.nodeRepository().removeRecursively(reservedHostname);
+
+ // ... otherwise we successfully rebalance, again reducing skew
+ deployer.setFailActivate(false);
rebalancer.maintain();
assertTrue("Rebalancer retired the node we wanted to move away from",
tester.nodeRepository().getNode(memSkewedNode.hostname()).get().allocation().get().membership().retired());