diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-06-10 14:04:29 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-06-10 14:04:29 +0200 |
commit | 9fc05281d6a79c26efe04edeb7604300f0c05845 (patch) | |
tree | d2c8e6abc6bfe02d564815aa6c83715eb880e432 /node-repository | |
parent | 49ad958565e4db19872bfb1a069a0209a652c011 (diff) |
Refactor - no functional changes
Diffstat (limited to 'node-repository')
4 files changed, 116 insertions, 73 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java index 0ab343f0795..97253df900b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java @@ -197,7 +197,8 @@ public class CapacityChecker { /** * Tests whether it's possible to remove the provided hosts. * Does not mutate any input variable. - * @return Empty optional if removal is possible, information on what caused the failure otherwise + * + * @return empty optional if removal is possible, information on what caused the failure otherwise */ private Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts, Map<Node, List<Node>> nodechildren, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java index 18471637da7..b006b2f964b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java @@ -11,6 +11,7 @@ import java.util.logging.Level; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.yolean.Exceptions; import java.io.Closeable; @@ -128,4 +129,79 @@ class MaintenanceDeployment implements Closeable { return "deployment of " + application; } + public static class Move { + + final Node node; + final Node toHost; + + Move(Node node, Node toHost) { + this.node = node; + this.toHost = toHost; + } + + 
/** + * Try to deploy to make this move. + * + * @return true if the move was done, false if it couldn't be + */ + public boolean execute(Agent agent, Deployer deployer, Metric metric, NodeRepository nodeRepository) { + if (isEmpty()) return false; + ApplicationId application = node.allocation().get().owner(); + try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository)) { + if ( ! deployment.isValid()) return false; + + boolean couldMarkRetiredNow = markWantToRetire(node, true, agent, nodeRepository); + if ( ! couldMarkRetiredNow) return false; + + Optional<Node> expectedNewNode = Optional.empty(); + try { + if ( ! deployment.prepare()) return false; + expectedNewNode = + nodeRepository.getNodes(application, Node.State.reserved).stream() + .filter(n -> !n.hostname().equals(node.hostname())) + .filter(n -> n.allocation().get().membership().cluster().id().equals(node.allocation().get().membership().cluster().id())) + .findAny(); + if (expectedNewNode.isEmpty()) return false; + if ( ! expectedNewNode.get().hasParent(toHost.hostname())) return false; + if ( ! 
deployment.activate()) return false; + + log.info(agent + " redeployed " + application + " to " + this); + return true; + } + finally { + markWantToRetire(node, false, agent, nodeRepository); // Necessary if this failed, no-op otherwise + + // Immediately clean up if we reserved the node but could not activate or reserved a node on the wrong host + expectedNewNode.flatMap(node -> nodeRepository.getNode(node.hostname(), Node.State.reserved)) + .ifPresent(node -> nodeRepository.setDirty(node, Agent.Rebalancer, "Expired by " + agent)); + } + } + } + + /** Returns true only if this operation changes the state of the wantToRetire flag */ + private boolean markWantToRetire(Node node, boolean wantToRetire, Agent agent, NodeRepository nodeRepository) { + try (Mutex lock = nodeRepository.lock(node)) { + Optional<Node> nodeToMove = nodeRepository.getNode(node.hostname()); + if (nodeToMove.isEmpty()) return false; + if (nodeToMove.get().state() != Node.State.active) return false; + + if (nodeToMove.get().status().wantToRetire() == wantToRetire) return false; + + nodeRepository.write(nodeToMove.get().withWantToRetire(wantToRetire, agent, nodeRepository.clock().instant()), lock); + return true; + } + } + + public boolean isEmpty() { return node == null; } + + @Override + public String toString() { + return "move " + + ( isEmpty() ? 
"none" : (node.hostname() + " to " + toHost)); + } + + public static Move empty() { return new Move(null, null); } + + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 2cb46a6a78e..da3b9bd0e65 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -42,18 +42,13 @@ public class Rebalancer extends NodeRepositoryMaintainer { @Override protected void maintain() { if ( ! nodeRepository().zone().getCloud().allowHostSharing()) return; // Rebalancing not necessary - if (nodeRepository().zone().environment().isTest()) return; // Test zones have short lived deployments, no need to rebalance + if (nodeRepository().zone().environment().isTest()) return; // Short lived deployments; no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed NodeList allNodes = nodeRepository().list(); - updateSkewMetric(allNodes); - if ( ! 
zoneIsStable(allNodes)) return; - - Move bestMove = findBestMove(allNodes); - if (bestMove == Move.none) return; - deployTo(bestMove); + findBestMove(allNodes).execute(Agent.Rebalancer, deployer, metric, nodeRepository()); } /** We do this here rather than in MetricsReporter because it is expensive and frequent updates are unnecessary */ @@ -81,7 +76,7 @@ public class Rebalancer extends NodeRepositoryMaintainer { */ private Move findBestMove(NodeList allNodes) { DockerHostCapacity capacity = new DockerHostCapacity(allNodes, nodeRepository().resourcesCalculator()); - Move bestMove = Move.none; + Move bestMove = Move.empty(); for (Node node : allNodes.nodeType(NodeType.tenant).state(Node.State.active)) { if (node.parentHostname().isEmpty()) continue; ApplicationId applicationId = node.allocation().get().owner(); @@ -101,59 +96,6 @@ public class Rebalancer extends NodeRepositoryMaintainer { return bestMove; } - /** Returns true only if this operation changes the state of the wantToRetire flag */ - private boolean markWantToRetire(Node node, boolean wantToRetire) { - try (Mutex lock = nodeRepository().lock(node)) { - Optional<Node> nodeToMove = nodeRepository().getNode(node.hostname()); - if (nodeToMove.isEmpty()) return false; - if (nodeToMove.get().state() != Node.State.active) return false; - - if (nodeToMove.get().status().wantToRetire() == wantToRetire) return false; - - nodeRepository().write(nodeToMove.get().withWantToRetire(wantToRetire, Agent.Rebalancer, clock.instant()), lock); - return true; - } - } - - /** - * Try a redeployment to effect the chosen move. - * If it can be done, that's ok; we'll try this or another move later. - * - * @return true if the move was done, false if it couldn't be - */ - private boolean deployTo(Move move) { - ApplicationId application = move.node.allocation().get().owner(); - try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { - if ( ! 
deployment.isValid()) return false; - - boolean couldMarkRetiredNow = markWantToRetire(move.node, true); - if ( ! couldMarkRetiredNow) return false; - - Optional<Node> expectedNewNode = Optional.empty(); - try { - if ( ! deployment.prepare()) return false; - expectedNewNode = - nodeRepository().getNodes(application, Node.State.reserved).stream() - .filter(node -> !node.hostname().equals(move.node.hostname())) - .filter(node -> node.allocation().get().membership().cluster().id().equals(move.node.allocation().get().membership().cluster().id())) - .findAny(); - if (expectedNewNode.isEmpty()) return false; - if ( ! expectedNewNode.get().hasParent(move.toHost.hostname())) return false; - if ( ! deployment.activate()) return false; - - log.info("Rebalancer redeployed " + application + " to " + move); - return true; - } - finally { - markWantToRetire(move.node, false); // Necessary if this failed, no-op otherwise - - // Immediately clean up if we reserved the node but could not activate or reserved a node on the wrong host - expectedNewNode.flatMap(node -> nodeRepository().getNode(node.hostname(), Node.State.reserved)) - .ifPresent(node -> nodeRepository().setDirty(node, Agent.Rebalancer, "Expired by Rebalancer")); - } - } - } - private double skewReductionByRemoving(Node node, Node fromHost, DockerHostCapacity capacity) { NodeResources freeHostCapacity = capacity.freeCapacityOf(fromHost); double skewBefore = Node.skew(fromHost.flavor().resources(), freeHostCapacity); @@ -176,17 +118,12 @@ public class Rebalancer extends NodeRepositoryMaintainer { .orElse(true); } - private static class Move { - - static final Move none = new Move(null, null, 0); + private static class Move extends MaintenanceDeployment.Move { - final Node node; - final Node toHost; final double netSkewReduction; Move(Node node, Node toHost, double netSkewReduction) { - this.node = node; - this.toHost = toHost; + super(node, toHost); this.netSkewReduction = netSkewReduction; } @@ -197,6 +134,10 @@ public 
class Rebalancer extends NodeRepositoryMaintainer { (node.hostname() + " to " + toHost + " [skew reduction " + netSkewReduction + "]")); } + public static Move empty() { + return new Move(null, null, 0); + } + } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 4f7902d4dbc..c0e39b39e94 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -63,19 +63,44 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size(); metric.set("spareHostCapacity", worstCaseHostLoss - 1, null); if (worstCaseHostLoss == 1) { // Try to get back to needing 2 hosts to fail in the worst case - Optional<Node> moveCandidate = identifyMoveCandidate(failurePath.get()); + Optional<Move> moveCandidate = identifyMoveCandidate(failurePath.get()); if (moveCandidate.isPresent()) move(moveCandidate.get()); } } } - private Optional<Node> identifyMoveCandidate(CapacityChecker.HostFailurePath failurePath) { - Node host = failurePath.hostsCausingFailure.get(0); + private Optional<Move> identifyMoveCandidate(CapacityChecker.HostFailurePath failurePath) { + Optional<Node> nodeWhichCantMove = failurePath.failureReason.tenant; + if (nodeWhichCantMove.isEmpty()) return Optional.empty(); + return findMoveWhichMakesRoomFor(nodeWhichCantMove.get()); + } + private Optional<Move> findMoveWhichMakesRoomFor(Node node) { + return Optional.empty(); } - private void move(Node node) { + private void move(Move move) { + + } + + private static class Move { + + static final Move none = new Move(null, null); + + final Node node; + final Node toHost; + + Move(Node node, Node toHost) { + 
this.node = node; + this.toHost = toHost; + } + + @Override + public String toString() { + return "move " + + ( node == null ? "none" : (node.hostname() + " to " + toHost)); + } } |