authorMartin Polden <mpolden@mpolden.no>2023-03-09 13:26:35 +0100
committerMartin Polden <mpolden@mpolden.no>2023-03-10 11:26:29 +0100
commitf6702b43e47bfdc925b3d9187766cdd7d4f82bf9 (patch)
treeefef95543cd301f4e511d9fb73d8d66901877aa5
parent73465b058689a0c7ae5599083c4ae9f31557584b (diff)
Do not hold application lock while replacing failing node
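The core of the change is lock scoping: under the node lock obtained with lockAndGetRequired, the node is now only marked want-to-fail, and the redeployment that replaces it is deferred until after that lock has been released, since (as the comments in the diff note) failing nodes in a dynamically provisioned zone may require provisioning new hosts that need other locks. A condensed sketch of the new flow, with the sanity checks and host-child handling from the actual diff elided:

    private void failActive(FailingNode failing) {
        Optional<Deployment> deployment =
                deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5));
        if (deployment.isEmpty()) return;              // no capacity to replace the node: take no further action

        boolean redeploy = false;
        try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) {
            // ... re-read the node under the lock and sanity-check that failing it still makes sense ...
            markWantToFail(lock.node(), true, lock);   // only the flag flip happens while the lock is held
            redeploy = true;
        }                                              // node lock released here

        if (redeploy) redeploy(deployment.get(), failing); // activate the replacing deployment without holding the lock
    }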
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 63
1 file changed, 36 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 84a45de39d7..afea08711fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -182,27 +182,26 @@ public class NodeFailer extends NodeRepositoryMaintainer {
* Called when a node should be moved to the failed state: Do that if it seems safe,
* which is when the node repo has available capacity to replace the node (and all its tenant nodes if host).
* Otherwise not replacing the node ensures (by Orchestrator check) that no further action will be taken.
- *
- * @return whether node was successfully failed
*/
- private boolean failActive(FailingNode failing) {
+ private void failActive(FailingNode failing) {
Optional<Deployment> deployment =
deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5));
- if (deployment.isEmpty()) return false;
+ if (deployment.isEmpty()) return;
// If the active node that we are trying to fail is of type host, we need to successfully fail all
// the children nodes running on it before we fail the host. Failing a child node in a dynamically
// provisioned zone may require provisioning new hosts that require the host application lock to be held,
// so we must release ours before failing the children.
List<FailingNode> activeChildrenToFail = new ArrayList<>();
+ boolean redeploy = false;
try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) {
// Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail
if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner)))
- return false;
+ return;
if (lock.node().state() == Node.State.failed)
- return true;
+ return;
if (!Objects.equals(failing.node().state(), lock.node().state()))
- return false;
+ return;
failing = new FailingNode(lock.node(), failing.reason);
String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
@@ -216,36 +215,46 @@ public class NodeFailer extends NodeRepositoryMaintainer {
if (activeChildrenToFail.isEmpty()) {
log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
- wantToFail(failing.node(), true, lock);
- try {
- deployment.get().activate();
- return true;
- } catch (TransientException | UncheckedTimeoutException e) {
- log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
- " with a transient error, will be retried by application maintainer: " +
- Exceptions.toMessageString(e));
- return true;
- } catch (RuntimeException e) {
- // Reset want to fail: We'll retry failing unless it heals in the meantime
- nodeRepository().nodes().node(failing.node().hostname())
- .ifPresent(n -> wantToFail(n, false, lock));
- log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
- " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
- return false;
- }
+ markWantToFail(failing.node(), true, lock);
+ redeploy = true;
}
}
+ // Redeploy to replace failing node
+ if (redeploy) {
+ redeploy(deployment.get(), failing);
+ return;
+ }
+
// In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned,
// so failActive() may take a long time to complete, but the remaining children should be fast.
activeChildrenToFail.forEach(this::failActive);
- return false;
}
- private void wantToFail(Node node, boolean wantToFail, Mutex lock) {
- if (!node.status().wantToFail())
+ private void redeploy(Deployment deployment, FailingNode failing) {
+ try {
+ deployment.activate();
+ } catch (TransientException | UncheckedTimeoutException e) {
+ log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
+ " with a transient error, will be retried by application maintainer: " +
+ Exceptions.toMessageString(e));
+ } catch (RuntimeException e) {
+ // Reset want to fail: We'll retry failing unless it heals in the meantime
+ Optional<NodeMutex> optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node());
+ if (optionalNodeMutex.isEmpty()) return;
+ try (var nodeMutex = optionalNodeMutex.get()) {
+ markWantToFail(nodeMutex.node(), false, nodeMutex);
+ log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
+ " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
+ }
+ }
+ }
+
+ private void markWantToFail(Node node, boolean wantToFail, Mutex lock) {
+ if (node.status().wantToFail() != wantToFail) {
nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
+ }
}
/** Returns true if node failing should be throttled */
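For reference, the error handling in the extracted redeploy() above follows this pattern: a transient activation failure is only logged, since the application maintainer will retry the redeployment, while any other runtime failure re-takes the node lock (if the node still exists) to roll the want-to-fail flag back, so failing is retried on a later run unless the node heals. A condensed restatement of that pattern, with logging elided:

    private void redeploy(Deployment deployment, FailingNode failing) {
        try {
            deployment.activate();                          // activate the deployment that replaces the failing node
        } catch (TransientException | UncheckedTimeoutException e) {
            // Transient: nothing to undo, the application maintainer retries the redeployment later
        } catch (RuntimeException e) {
            // Non-transient: clear want-to-fail again so failing is retried unless the node heals.
            // The node lock must be re-acquired here because the one taken in failActive() is already released.
            Optional<NodeMutex> mutex = nodeRepository().nodes().lockAndGet(failing.node());
            if (mutex.isEmpty()) return;                    // node is gone; nothing to reset
            try (NodeMutex nodeMutex = mutex.get()) {
                markWantToFail(nodeMutex.node(), false, nodeMutex);
            }
        }
    }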