diff options
author | Jon Bratseth <bratseth@oath.com> | 2019-11-19 15:36:52 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-11-19 15:36:52 +0100 |
commit | ebe2949751df9a0654e806049240c39a56b48add (patch) | |
tree | 7443357b571d5f4e886a9fda663a911e2f19c1d0 | |
parent | 05e647af99a5907310714a85e92bf4675d449a28 (diff) | |
parent | fd5012f00f242d2611a9dd9b5dea2a2d6d982ed5 (diff) |
Merge pull request #11355 from vespa-engine/bratseth/faster-rebalancing-cleanup
Fast-track cleanup after failed rebalancing
3 files changed, 36 insertions, 8 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 7d8949502d4..7160edb96f4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -121,7 +121,7 @@ public class Rebalancer extends Maintainer { if (nodeToMove.isEmpty()) return false; if (nodeToMove.get().state() != Node.State.active) return false; - if (node.status().wantToRetire() == wantToRetire) return false; + if (nodeToMove.get().status().wantToRetire() == wantToRetire) return false; nodeRepository().write(nodeToMove.get().withWantToRetire(wantToRetire, Agent.system, clock.instant()), lock); return true; @@ -142,10 +142,15 @@ public class Rebalancer extends Maintainer { boolean couldMarkRetiredNow = markWantToRetire(move.node, true); if ( ! couldMarkRetiredNow) return false; + Optional<Node> expectedNewNode = Optional.empty(); try { if ( ! deployment.prepare()) return false; - if (nodeRepository().getNodes(application, Node.State.reserved).stream().noneMatch(node -> node.hasParent(move.toHost.hostname()))) - return false; // Deployment is not moving the from node to the target we identified for some reason + expectedNewNode = + nodeRepository().getNodes(application, Node.State.reserved).stream() + .filter(node -> node.hasParent(move.toHost.hostname())) + .filter(node -> node.allocation().get().membership().cluster().id().equals(move.node.allocation().get().membership().cluster().id())) + .findAny(); + if (expectedNewNode.isEmpty()) return false; if ( ! deployment.activate()) return false; log.info("Rebalancer redeployed " + application + " to " + move); @@ -153,6 +158,10 @@ public class Rebalancer extends Maintainer { } finally { markWantToRetire(move.node, false); // Necessary if this failed, no-op otherwise + if (expectedNewNode.isPresent()) { // Immediately clean up if we reserved the node but could not activate + Optional<Node> reservedNewNode = nodeRepository().getNode(expectedNewNode.get().hostname(), Node.State.reserved); + reservedNewNode.ifPresent(reserved -> nodeRepository().setDirty(reserved, Agent.system, "Expired by Rebalancer")); + } } } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java index 898e1343ea3..7ab42093bab 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java @@ -37,6 +37,8 @@ public class MockDeployer implements Deployer { private final Clock clock; private final ReentrantLock lock = new ReentrantLock(); + private boolean failActivate = false; + @Inject @SuppressWarnings("unused") public MockDeployer() { @@ -55,9 +57,9 @@ public class MockDeployer implements Deployer { this.applications = new HashMap<>(applications); } - public ReentrantLock lock() { - return lock; - } + public ReentrantLock lock() { return lock; } + + public void setFailActivate(boolean failActivate) { this.failActivate = failActivate; } @Override public Optional<Deployment> deployFromLocalActive(ApplicationId id, boolean bootstrap) { @@ -119,6 +121,8 @@ public class MockDeployer implements Deployer { if (preparedHosts == null) prepare(); redeployments++; + if (failActivate) + throw new IllegalStateException("failActivate is true"); try (NestedTransaction t = new NestedTransaction()) { provisioner.activate(t, application.id(), preparedHosts); t.commit(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java index fc0e0e68570..d1a330a3bd6 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java @@ -39,6 +39,7 @@ public class RebalancerTest { @Test public void testRebalancing() { + // --- Setup ApplicationId cpuApp = makeApplicationId("t1", "a1"); ApplicationId memApp = makeApplicationId("t2", "a2"); NodeResources cpuResources = new NodeResources(8, 4, 1, 0.1); @@ -64,7 +65,7 @@ public class RebalancerTest { tester.makeReadyNodes(3, "flt", NodeType.host, 8); tester.deployZoneApp(); - // Cpu heavy application - causing 1 of these nodes to be skewed + // --- Deploying a cpu heavy application - causing 1 of these nodes to be skewed deployApp(cpuApp, clusterSpec("c"), cpuResources, tester, 1); Node cpuSkewedNode = tester.nodeRepository().getNodes(cpuApp).get(0); @@ -73,6 +74,7 @@ public class RebalancerTest { tester.nodeRepository().getNode(cpuSkewedNode.hostname()).get().status().wantToRetire()); assertEquals(0.00325, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001); + // --- Making a more suitable node configuration available causes rebalancing Node newCpuHost = tester.makeReadyNodes(1, "cpu", NodeType.host, 8).get(0); tester.deployZoneApp(); @@ -84,14 +86,15 @@ public class RebalancerTest { assertEquals("Skew is reduced", 0.00244, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001); + // --- Deploying a mem heavy application - allocated to the best option and causing increased skew deployApp(memApp, clusterSpec("c"), memResources, tester, 1); assertEquals("Assigned to a flat node as that causes least skew", "flt", tester.nodeRepository().list().parentOf(tester.nodeRepository().getNodes(memApp).get(0)).get().flavor().name()); - rebalancer.maintain(); assertEquals("Deploying the mem skewed app increased skew", 0.00734, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001); + // --- Adding a more suitable node reconfiguration causes no action as the system is not stable Node memSkewedNode = tester.nodeRepository().getNodes(memApp).get(0); Node newMemHost = tester.makeReadyNodes(1, "mem", NodeType.host, 8).get(0); tester.deployZoneApp(); @@ -100,10 +103,22 @@ public class RebalancerTest { assertFalse("No rebalancing happens because cpuSkewedNode is still retired", tester.nodeRepository().getNode(memSkewedNode.hostname()).get().allocation().get().membership().retired()); + // --- Making the system stable enables rebalancing NestedTransaction tx = new NestedTransaction(); tester.nodeRepository().deactivate(List.of(cpuSkewedNode), tx); tx.commit(); + // ... if activation fails when trying, we clean up the state + deployer.setFailActivate(true); + rebalancer.maintain(); + assertTrue("Want to retire is reset", tester.nodeRepository().getNodes(Node.State.active).stream().noneMatch(node -> node.status().wantToRetire())); + assertEquals("Reserved node was moved to dirty", 1, tester.nodeRepository().getNodes(Node.State.dirty).size()); + String reservedHostname = tester.nodeRepository().getNodes(Node.State.dirty).get(0).hostname(); + tester.nodeRepository().setReady(reservedHostname, Agent.system, "Cleanup"); + tester.nodeRepository().removeRecursively(reservedHostname); + + // ... otherwise we successfully rebalance, again reducing skew + deployer.setFailActivate(false); rebalancer.maintain(); assertTrue("Rebalancer retired the node we wanted to move away from", tester.nodeRepository().getNode(memSkewedNode.hostname()).get().allocation().get().membership().retired()); |