diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-04-08 14:56:34 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-04-08 14:56:34 +0200 |
commit | e969caac12895478af9ef627c84abce90d2f21ef (patch) | |
tree | e428a8b796fb25ec101622e00933ce311e440826 /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | |
parent | 0f8387d7f21d392f9d7ab5ee60b10acce1dff4d8 (diff) |
Move nodes to 'failed' during activate
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 17 |
1 file changed, 10 insertions, 7 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index eb9c8300724..ac6ecd98fac 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -277,7 +277,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { } if (! allTenantNodesFailedOutSuccessfully) return false; - node = nodeRepository().nodes().fail(node.hostname(), Agent.NodeFailer, reason); + wantToFail(node, true, reason, lock); try { deployment.get().activate(); return true; @@ -287,17 +287,20 @@ public class NodeFailer extends NodeRepositoryMaintainer { Exceptions.toMessageString(e)); return true; } catch (RuntimeException e) { - // The expected reason for deployment to fail here is that there is no capacity available to redeploy. - // In that case we should leave the node in the active state to avoid failing additional nodes. 
- nodeRepository().nodes().reactivate(node.hostname(), Agent.NodeFailer, - "Failed to redeploy after being failed by NodeFailer"); - log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() + - ", but redeploying without the node failed", e); + // Reset want to fail: We'll retry failing unless it heals in the meantime + nodeRepository().nodes().node(node.hostname()) + .ifPresent(n -> wantToFail(n, false, "Could not fail", lock)); + log.log(Level.WARNING, "Could not fail " + node + " for " + node.allocation().get().owner() + + " for " + reason + ": " + Exceptions.toMessageString(e)); return false; } } } + private void wantToFail(Node node, boolean wantToFail, String reason, Mutex lock) { + nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, reason, clock().instant()), lock); + } + /** Returns true if node failing should be throttled */ private boolean throttle(Node node) { if (throttlePolicy == ThrottlePolicy.disabled) return false; |