summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2020-06-23 19:57:39 +0200
committer Jon Bratseth <bratseth@gmail.com> 2020-06-23 19:57:39 +0200
commit 37f352aac0f48a38df9e9d36dd5f7f8c662049bf (patch)
tree 3d4638b53c90161d7b6f1488013db8f718dbfa9a
parent c1313421e3f7ec8b10c61d8e317e190edcd2be8c (diff)
Log on mitigation
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java 26
1 files changed, 16 insertions, 10 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
index 54899372397..9c042c6cdb9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java
@@ -80,8 +80,8 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
if (failurePath.isPresent()) {
int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1;
if (spareHostCapacity == 0) {
- Move move = findMitigatingMove(failurePath.get());
- if (moving(move)) {
+ List<Move> mitigation = findMitigation(failurePath.get());
+ if (execute(mitigation)) {
// We succeeded or are in the process of taking a step to mitigate.
// Report with the assumption this will eventually succeed to avoid alerting before we're stuck
spareHostCapacity++;
@@ -91,15 +91,21 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
}
}
- private boolean moving(Move move) {
- if (move.isEmpty()) return false;
- if (move.node().allocation().get().membership().retired()) return true; // Move already in progress
- return move.execute(false, Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository());
+ private boolean execute(List<Move> mitigation) {
+ if (mitigation.isEmpty()) {
+ log.warning("Out of spare capacity. No mitigation could be found");
+ return false;
+ }
+ Move firstMove = mitigation.get(0);
+ if (firstMove.node().allocation().get().membership().retired()) return true; // Already in progress
+ boolean success = firstMove.execute(false, Agent.SpareCapacityMaintainer, deployer, metric, nodeRepository());
+ log.info("Out of spare capacity. Mitigation plan: " + mitigation + ". First move successful: " + success);
+ return success;
}
- private Move findMitigatingMove(CapacityChecker.HostFailurePath failurePath) {
+ private List<Move> findMitigation(CapacityChecker.HostFailurePath failurePath) {
Optional<Node> nodeWhichCantMove = failurePath.failureReason.tenant;
- if (nodeWhichCantMove.isEmpty()) return Move.empty();
+ if (nodeWhichCantMove.isEmpty()) return List.of();
Node node = nodeWhichCantMove.get();
NodeList allNodes = nodeRepository().list();
@@ -117,8 +123,8 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer {
if (shortestMitigation == null || shortestMitigation.size() > mitigation.size())
shortestMitigation = mitigation;
}
- if (shortestMitigation == null || shortestMitigation.isEmpty()) return Move.empty();
- return shortestMitigation.get(0);
+ if (shortestMitigation == null || shortestMitigation.isEmpty()) return List.of();
+ return shortestMitigation;
}
private static class CapacitySolver {