diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-06-23 19:57:39 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-06-23 19:57:39 +0200 |
commit | 37f352aac0f48a38df9e9d36dd5f7f8c662049bf (patch) | |
tree | 3d4638b53c90161d7b6f1488013db8f718dbfa9a | |
parent | c1313421e3f7ec8b10c61d8e317e190edcd2be8c (diff) |
Log on mitigation
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java | 26 |
1 file changed, 16 insertions, 10 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 54899372397..9c042c6cdb9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -80,8 +80,8 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { if (failurePath.isPresent()) { int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1; if (spareHostCapacity == 0) { - Move move = findMitigatingMove(failurePath.get()); - if (moving(move)) { + List<Move> mitigation = findMitigation(failurePath.get()); + if (execute(mitigation)) { // We succeeded or are in the process of taking a step to mitigate. // Report with the assumption this will eventually succeed to avoid alerting before we're stuck spareHostCapacity++; @@ -91,15 +91,21 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } } - private boolean moving(Move move) { - if (move.isEmpty()) return false; - if (move.node().allocation().get().membership().retired()) return true; // Move already in progress - return move.execute(false, Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository()); + private boolean execute(List<Move> mitigation) { + if (mitigation.isEmpty()) { + log.warning("Out of spare capacity. No mitigation could be found"); + return false; + } + Move firstMove = mitigation.get(0); + if (firstMove.node().allocation().get().membership().retired()) return true; // Already in progress + boolean success = firstMove.execute(false, Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository()); + log.info("Out of spare capacity. Mitigation plan: " + mitigation + ". 
First move successful: " + success); + return success; } - private Move findMitigatingMove(CapacityChecker.HostFailurePath failurePath) { + private List<Move> findMitigation(CapacityChecker.HostFailurePath failurePath) { Optional<Node> nodeWhichCantMove = failurePath.failureReason.tenant; - if (nodeWhichCantMove.isEmpty()) return Move.empty(); + if (nodeWhichCantMove.isEmpty()) return List.of(); Node node = nodeWhichCantMove.get(); NodeList allNodes = nodeRepository().list(); @@ -117,8 +123,8 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { if (shortestMitigation == null || shortestMitigation.size() > mitigation.size()) shortestMitigation = mitigation; } - if (shortestMitigation == null || shortestMitigation.isEmpty()) return Move.empty(); - return shortestMitigation.get(0); + if (shortestMitigation == null || shortestMitigation.isEmpty()) return List.of(); + return shortestMitigation; } private static class CapacitySolver { |