author     Martin Polden <mpolden@mpolden.no>    2023-07-17 14:52:19 +0200
committer  GitHub <noreply@github.com>           2023-07-17 14:52:19 +0200
commit     49153129db2c3a77ccb27b5dab9d698668c0a1bd (patch)
tree       a0c25baa248e77a7cf92bb0c9abbabd29dba6785 /node-repository
parent     bb7411992b5a4b1e1841d2aa5b574babc55e9592 (diff)
parent     b9bb81bf92599ae630e706924bc269fc08fa06af (diff)
Merge pull request #27788 from vespa-engine/jonmv/ensure-correct-lock-order
Ensure correct lock order when failing tenant hosts
Diffstat (limited to 'node-repository')
-rw-r--r--    node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java      65
-rw-r--r--    node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java    2
2 files changed, 41 insertions, 26 deletions
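
The merged change replaces NodeFailer's single-node lock on the host (with children handled through separate code paths afterwards) by nodes().lockAndGetRecursively(), which acquires the host and all of its children in one place so every caller observes the same lock order. Below is a minimal, self-contained sketch of that ordering rule; HostLocks and the ReentrantLock-based model are illustrative stand-ins, not Vespa's NodeMutex or node-repository locking API.

import java.util.List;
import java.util.concurrent.locks.ReentrantLock;

// Illustrative model only, none of these types are Vespa's. It shows the ordering rule
// the PR enforces when failing a tenant host: the parent (host) lock and all child
// locks are acquired together, parent first, instead of the host lock now and the
// children via separate code paths later.
class LockOrderSketch {

    record HostLocks(ReentrantLock parent, List<ReentrantLock> children) implements AutoCloseable {
        static HostLocks acquire(ReentrantLock parent, List<ReentrantLock> children) {
            parent.lock();                          // parent always first ...
            children.forEach(ReentrantLock::lock);  // ... then children, in a fixed order
            return new HostLocks(parent, children);
        }
        @Override
        public void close() {
            for (int i = children.size() - 1; i >= 0; i--) children.get(i).unlock(); // reverse order
            parent.unlock();
        }
    }

    public static void main(String[] args) {
        ReentrantLock host = new ReentrantLock();
        List<ReentrantLock> tenants = List.of(new ReentrantLock(), new ReentrantLock());
        try (HostLocks locks = HostLocks.acquire(host, tenants)) {
            // fail children and mark the host here, under one consistent lock order
            System.out.println("holding host lock + " + locks.children().size() + " child locks");
        }
    }
}
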
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index a16290361fb..585a7f341b5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -195,39 +195,48 @@ public class NodeFailer extends NodeRepositoryMaintainer {
/**
* Called when a node should be moved to the failed state: Do that if it seems safe,
* which is when the node repo has available capacity to replace the node (and all its tenant nodes if host).
- * Otherwise not replacing the node ensures (by Orchestrator check) that no further action will be taken.
+ * Otherwise, not replacing the node ensures (by Orchestrator check) that no further action will be taken.
*/
private void failActive(FailingNode failing) {
Optional<Deployment> deployment =
deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5));
if (deployment.isEmpty()) return;
+ boolean redeploy = false;
// If the active node that we are trying to fail is of type host, we need to successfully fail all
// the children nodes running on it before we fail the host. Failing a child node in a dynamically
// provisioned zone may require provisioning new hosts that require the host application lock to be held,
// so we must release ours before failing the children.
- List<FailingNode> activeChildrenToFail = new ArrayList<>();
- boolean redeploy = false;
- try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) { // TODO: recursive lock for right order, only for hosts though
- // Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail
- if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner)))
- return;
- if (lock.node().state() == Node.State.failed)
- return;
- if (!Objects.equals(failing.node().state(), lock.node().state()))
- return;
- failing = new FailingNode(lock.node(), failing.reason);
-
- String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
- for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(failing.node())) {
- if (failingTenantNode.state() == Node.State.active) {
- activeChildrenToFail.add(new FailingNode(failingTenantNode, reasonForChildFailure));
- } else if (failingTenantNode.state() != Node.State.failed) {
- nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure);
+ if (failing.node.type().isHost()) {
+ List<FailingNode> activeChildrenToFail = new ArrayList<>();
+ try (var lock = nodeRepository().nodes().lockAndGetRecursively(failing.node.hostname(), Optional.empty())) {
+ failing = shouldFail(lock.parent().node(), failing);
+ if (failing == null) return;
+
+ String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
+ for (var failingTenantNode : lock.children()) {
+ if (failingTenantNode.node().state() == Node.State.active) {
+ activeChildrenToFail.add(new FailingNode(failingTenantNode.node(), reasonForChildFailure));
+ } else if (failingTenantNode.node().state() != Node.State.failed) {
+ nodeRepository().nodes().fail(failingTenantNode.node().hostname(), Agent.NodeFailer, reasonForChildFailure);
+ }
+ }
+
+ if (activeChildrenToFail.isEmpty()) {
+ log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
+ markWantToFail(failing.node(), true, lock.parent());
+ redeploy = true;
}
}
+ // In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned,
+ // so failActive() may take a long time to complete, but the remaining children should be fast.
+ activeChildrenToFail.forEach(this::failActive);
+ }
+ else {
+ try (var lock = nodeRepository().nodes().lockAndGetRequired(failing.node)) {
+ failing = shouldFail(lock.node(), failing);
+ if (failing == null) return;
- if (activeChildrenToFail.isEmpty()) {
log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
markWantToFail(failing.node(), true, lock);
redeploy = true;
@@ -237,13 +246,19 @@ public class NodeFailer extends NodeRepositoryMaintainer {
// Redeploy to replace failing node
if (redeploy) {
redeploy(deployment.get(), failing);
- return;
}
+ }
- // In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned,
- // so failActive() may take a long time to complete, but the remaining children should be fast.
- activeChildrenToFail.forEach(this::failActive);
-
+ // Returns an updated FailingNode if we should still fail the node, otherwise null
+ private static FailingNode shouldFail(Node fresh, FailingNode stale) {
+ // Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail
+ if (!Objects.equals(stale.node.allocation().map(Allocation::owner), fresh.allocation().map(Allocation::owner)))
+ return null;
+ if (fresh.state() == Node.State.failed)
+ return null;
+ if (!Objects.equals(stale.node.state(), fresh.state()))
+ return null;
+ return new FailingNode(fresh, stale.reason);
}
private void redeploy(Deployment deployment, FailingNode failing) {
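
The hunk above also extracts the sanity checks that used to be inlined under the lock into a shouldFail(fresh, stale) helper: once the lock is held, the decision to fail is re-checked against the freshly read node, since its owner or state may have changed after the maintainer selected it. A small stand-alone sketch of that check-after-lock step follows; NodeView, State and the owner string are placeholders, not Vespa's Node API.

import java.util.Objects;
import java.util.Optional;

// Stand-alone sketch of the check performed by the new shouldFail() helper once the
// lock is held. All types here are placeholders used for illustration.
final class ShouldFailSketch {

    enum State { active, failed }

    record NodeView(Optional<String> owner, State state) {}

    /** Returns the fresh view if failing it still makes sense, otherwise empty. */
    static Optional<NodeView> shouldFail(NodeView fresh, NodeView stale) {
        if (!Objects.equals(stale.owner(), fresh.owner())) return Optional.empty(); // re-allocated meanwhile
        if (fresh.state() == State.failed) return Optional.empty();                 // already failed
        if (stale.state() != fresh.state()) return Optional.empty();                // state moved on
        return Optional.of(fresh);
    }

    public static void main(String[] args) {
        NodeView stale = new NodeView(Optional.of("tenant:app:default"), State.active);
        NodeView fresh = new NodeView(Optional.of("tenant:app:default"), State.active);
        System.out.println(shouldFail(fresh, stale).isPresent()); // true: still safe to fail
    }
}
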
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index bf046c09899..1ae9b00d794 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -72,7 +72,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
}
boolean redeploy = false;
List<String> nodesToDeactivate = new ArrayList<>();
- try (var lock = nodeRepository().applications().lock(application)) { // TODO: take recusrive lock for right order
+ try (var lock = nodeRepository().applications().lock(application)) {
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
Map<Removal, NodeList> nodesByRemovalReason = activeNodes.owner(application)
.retired()
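
Both files keep the same overall shape: NodeFailer records its decisions while the lock is held (the redeploy flag, the list of active children to fail) and does the slow work, redeploying or recursively calling failActive() on children, which in a dynamically provisioned zone may provision new hosts, only after the try-with-resources block has released the lock; RetiredExpirer sets up its redeploy flag and nodesToDeactivate list ahead of its application lock in the same style. A rough, self-contained sketch of that decide-under-the-lock, act-afterwards pattern, with illustrative names only:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;

// Rough sketch (illustrative names only) of the decide-under-the-lock, act-afterwards
// structure used above: the lock protects reading state and recording decisions, while
// the slow work runs after the lock has been released.
class DecideThenActSketch {

    private final ReentrantLock lock = new ReentrantLock();
    private final List<String> activeChildren = List.of("host1-child1", "host1-child2");

    void maintain() {
        boolean redeploy = false;
        List<String> childrenToFail = new ArrayList<>();

        lock.lock();
        try {
            // Fast: inspect state and record what to do.
            childrenToFail.addAll(activeChildren);
            redeploy = childrenToFail.isEmpty(); // mirror NodeFailer: redeploy only when no children must be failed first
        } finally {
            lock.unlock();
        }

        // Slow: runs without holding the lock, so other maintainers are not blocked.
        childrenToFail.forEach(c -> System.out.println("failing child " + c));
        if (redeploy) System.out.println("redeploying to replace the failed node");
    }

    public static void main(String[] args) {
        new DecideThenActSketch().maintain();
    }
}
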