authorMartin Polden <mpolden@mpolden.no>2023-03-09 13:26:35 +0100
committerMartin Polden <mpolden@mpolden.no>2023-03-10 11:26:29 +0100
commitf6702b43e47bfdc925b3d9187766cdd7d4f82bf9 (patch)
treeefef95543cd301f4e511d9fb73d8d66901877aa5
parent73465b058689a0c7ae5599083c4ae9f31557584b (diff)
Do not hold application lock while replacing failing node
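The core of the change is lock scoping: under the node lock obtained with lockAndGetRequired, the node is now only marked want-to-fail, and the redeployment that replaces it is deferred until after that lock has been released, since (as the comments in the diff note) failing nodes in a dynamically provisioned zone may require provisioning new hosts that need other locks. A condensed sketch of the new flow, with the sanity checks and host-child handling from the actual diff elided:

    private void failActive(FailingNode failing) {
        Optional<Deployment> deployment =
                deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5));
        if (deployment.isEmpty()) return;              // no capacity to replace the node: take no further action

        boolean redeploy = false;
        try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) {
            // ... re-read the node under the lock and sanity-check that failing it still makes sense ...
            markWantToFail(lock.node(), true, lock);   // only the flag flip happens while the lock is held
            redeploy = true;
        }                                              // node lock released here

        if (redeploy) redeploy(deployment.get(), failing); // activate the replacing deployment without holding the lock
    }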
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 63
1 file changed, 36 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 84a45de39d7..afea08711fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -182,27 +182,26 @@ public class NodeFailer extends NodeRepositoryMaintainer {
* Called when a node should be moved to the failed state: Do that if it seems safe,
* which is when the node repo has available capacity to replace the node (and all its tenant nodes if host).
* Otherwise not replacing the node ensures (by Orchestrator check) that no further action will be taken.
- *
- * @return whether node was successfully failed
*/
- private boolean failActive(FailingNode failing) {
+ private void failActive(FailingNode failing) {
Optional<Deployment> deployment =
deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5));
- if (deployment.isEmpty()) return false;
+ if (deployment.isEmpty()) return;
// If the active node that we are trying to fail is of type host, we need to successfully fail all
// the children nodes running on it before we fail the host. Failing a child node in a dynamically
// provisioned zone may require provisioning new hosts that require the host application lock to be held,
// so we must release ours before failing the children.
List<FailingNode> activeChildrenToFail = new ArrayList<>();
+ boolean redeploy = false;
try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) {
// Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail
if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner)))
- return false;
+ return;
if (lock.node().state() == Node.State.failed)
- return true;
+ return;
if (!Objects.equals(failing.node().state(), lock.node().state()))
- return false;
+ return;
failing = new FailingNode(lock.node(), failing.reason);
String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
@@ -216,36 +215,46 @@ public class NodeFailer extends NodeRepositoryMaintainer {
if (activeChildrenToFail.isEmpty()) {
log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
- wantToFail(failing.node(), true, lock);
- try {
- deployment.get().activate();
- return true;
- } catch (TransientException | UncheckedTimeoutException e) {
- log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
- " with a transient error, will be retried by application maintainer: " +
- Exceptions.toMessageString(e));
- return true;
- } catch (RuntimeException e) {
- // Reset want to fail: We'll retry failing unless it heals in the meantime
- nodeRepository().nodes().node(failing.node().hostname())
- .ifPresent(n -> wantToFail(n, false, lock));
- log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
- " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
- return false;
- }
+ markWantToFail(failing.node(), true, lock);
+ redeploy = true;
}
}
+ // Redeploy to replace failing node
+ if (redeploy) {
+ redeploy(deployment.get(), failing);
+ return;
+ }
+
// In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned,
// so failActive() may take a long time to complete, but the remaining children should be fast.
activeChildrenToFail.forEach(this::failActive);
- return false;
}
- private void wantToFail(Node node, boolean wantToFail, Mutex lock) {
- if (!node.status().wantToFail())
+ private void redeploy(Deployment deployment, FailingNode failing) {
+ try {
+ deployment.activate();
+ } catch (TransientException | UncheckedTimeoutException e) {
+ log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
+ " with a transient error, will be retried by application maintainer: " +
+ Exceptions.toMessageString(e));
+ } catch (RuntimeException e) {
+ // Reset want to fail: We'll retry failing unless it heals in the meantime
+ Optional<NodeMutex> optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node());
+ if (optionalNodeMutex.isEmpty()) return;
+ try (var nodeMutex = optionalNodeMutex.get()) {
+ markWantToFail(nodeMutex.node(), false, nodeMutex);
+ log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
+ " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
+ }
+ }
+ }
+
+ private void markWantToFail(Node node, boolean wantToFail, Mutex lock) {
+ if (node.status().wantToFail() != wantToFail) {
nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
+ }
}
/** Returns true if node failing should be throttled */
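For reference, the error handling in the extracted redeploy() above follows this pattern: a transient activation failure is only logged, since the application maintainer will retry the redeployment, while any other runtime failure re-takes the node lock (if the node still exists) to roll the want-to-fail flag back, so failing is retried on a later run unless the node heals. A condensed restatement of that pattern, with logging elided:

    private void redeploy(Deployment deployment, FailingNode failing) {
        try {
            deployment.activate();                          // activate the deployment that replaces the failing node
        } catch (TransientException | UncheckedTimeoutException e) {
            // Transient: nothing to undo, the application maintainer retries the redeployment later
        } catch (RuntimeException e) {
            // Non-transient: clear want-to-fail again so failing is retried unless the node heals.
            // The node lock must be re-acquired here because the one taken in failActive() is already released.
            Optional<NodeMutex> mutex = nodeRepository().nodes().lockAndGet(failing.node());
            if (mutex.isEmpty()) return;                    // node is gone; nothing to reset
            try (NodeMutex nodeMutex = mutex.get()) {
                markWantToFail(nodeMutex.node(), false, nodeMutex);
            }
        }
    }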