aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2021-04-08 14:56:34 +0200
committer Jon Bratseth <bratseth@gmail.com> 2021-04-08 14:56:34 +0200
commit e969caac12895478af9ef627c84abce90d2f21ef (patch)
tree e428a8b796fb25ec101622e00933ce311e440826 /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
parent 0f8387d7f21d392f9d7ab5ee60b10acce1dff4d8 (diff)
Move nodes to 'failed' during activate
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 17
1 files changed, 10 insertions, 7 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index eb9c8300724..ac6ecd98fac 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -277,7 +277,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
if (! allTenantNodesFailedOutSuccessfully) return false;
- node = nodeRepository().nodes().fail(node.hostname(), Agent.NodeFailer, reason);
+ wantToFail(node, true, reason, lock);
try {
deployment.get().activate();
return true;
@@ -287,17 +287,20 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Exceptions.toMessageString(e));
return true;
} catch (RuntimeException e) {
- // The expected reason for deployment to fail here is that there is no capacity available to redeploy.
- // In that case we should leave the node in the active state to avoid failing additional nodes.
- nodeRepository().nodes().reactivate(node.hostname(), Agent.NodeFailer,
- "Failed to redeploy after being failed by NodeFailer");
- log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() +
- ", but redeploying without the node failed", e);
+ // Reset want to fail: We'll retry failing unless it heals in the meantime
+ nodeRepository().nodes().node(node.hostname())
+ .ifPresent(n -> wantToFail(n, false, "Could not fail", lock));
+ log.log(Level.WARNING, "Could not fail " + node + " for " + node.allocation().get().owner() +
+ " for " + reason + ": " + Exceptions.toMessageString(e));
return false;
}
}
}
+ private void wantToFail(Node node, boolean wantToFail, String reason, Mutex lock) {
+ nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, reason, clock().instant()), lock);
+ }
+
/** Returns true if node failing should be throttled */
private boolean throttle(Node node) {
if (throttlePolicy == ThrottlePolicy.disabled) return false;