From dac838f1fa390d59c000fdc91b54fc7218599c08 Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Mon, 11 Jul 2022 11:29:09 +0200 Subject: Revert "Avoid the host lock while failing the children" This reverts commit 2cdaef56e18ace2ee2269d28f959f5a534bd68ee. --- .../hosted/provision/maintenance/NodeFailer.java | 100 ++++++++++----------- .../provision/maintenance/NodeFailerTest.java | 22 ++--- 2 files changed, 51 insertions(+), 71 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index f9b3dae72c5..3c5b20da4d0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -12,7 +12,6 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; -import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; @@ -20,11 +19,9 @@ import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.time.Instant; -import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; -import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.logging.Level; @@ -109,6 +106,26 @@ public class NodeFailer extends NodeRepositoryMaintainer { failActive(failing); } + // Active hosts + NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); + for (Node host : activeNodes.hosts().failing()) { + if ( ! 
activeNodes.childrenOf(host).isEmpty()) continue; + Optional locked = Optional.empty(); + try { + attempts++; + locked = nodeRepository().nodes().lockAndGet(host); + if (locked.isEmpty()) continue; + nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer, + "Host should be failed and have no tenant nodes"); + } + catch (Exception e) { + failures++; + } + finally { + locked.ifPresent(NodeMutex::close); + } + } + int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures); metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); @@ -136,9 +153,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { Set failingNodes = new HashSet<>(); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); - for (Node host : activeNodes.hosts().failing()) - failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes")); - for (Node node : activeNodes) { Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { @@ -227,63 +241,42 @@ public class NodeFailer extends NodeRepositoryMaintainer { deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(30)); if (deployment.isEmpty()) return false; - // If the active node that we are trying to fail is of type host, we need to successfully fail all - // the children nodes running on it before we fail the host. Failing a child node in a dynamically - // provisioned zone may require provisioning new hosts that require the host application lock to be held, - // so we must release ours before failing the children. 
- List activeChildrenToFail = new ArrayList<>(); - try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) { - // Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail - if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner))) - return false; - if (lock.node().state() == Node.State.failed) - return true; - if (!Objects.equals(failing.node().state(), lock.node().state())) - return false; - failing = new FailingNode(lock.node(), failing.reason); - + try (Mutex lock = nodeRepository().nodes().lock(failing.node().allocation().get().owner())) { + // If the active node that we are trying to fail is of type host, we need to successfully fail all + // the children nodes running on it before we fail the host + boolean allTenantNodesFailedOutSuccessfully = true; String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason(); for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(failing.node())) { if (failingTenantNode.state() == Node.State.active) { - activeChildrenToFail.add(new FailingNode(failingTenantNode, reasonForChildFailure)); - } else if (failingTenantNode.state() != Node.State.failed) { + allTenantNodesFailedOutSuccessfully &= failActive(new FailingNode(failingTenantNode, reasonForChildFailure)); + } else { nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure); } } - // A parent with children gets wantToFail to avoid getting more nodes allocated to it. + if (! 
allTenantNodesFailedOutSuccessfully) return false; wantToFail(failing.node(), true, lock); - - if (activeChildrenToFail.isEmpty()) { - try { - deployment.get().activate(); - return true; - } catch (TransientException e) { - log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + - " with a transient error, will be retried by application maintainer: " + - Exceptions.toMessageString(e)); - return true; - } catch (RuntimeException e) { - // Reset want to fail: We'll retry failing unless it heals in the meantime - nodeRepository().nodes().node(failing.node().hostname()) - .ifPresent(n -> wantToFail(n, false, lock)); - log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + - " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); - return false; - } + try { + deployment.get().activate(); + return true; + } catch (TransientException e) { + log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + + " with a transient error, will be retried by application maintainer: " + + Exceptions.toMessageString(e)); + return true; + } catch (RuntimeException e) { + // Reset want to fail: We'll retry failing unless it heals in the meantime + nodeRepository().nodes().node(failing.node().hostname()) + .ifPresent(n -> wantToFail(n, false, lock)); + log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + + " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); + return false; } } - - // In a dynamically provisioned zone the failing of an active child node takes ~10 minutes, - // so perhaps this should be done in parallel. 
- activeChildrenToFail.forEach(this::failActive); - - return false; } private void wantToFail(Node node, boolean wantToFail, Mutex lock) { - if (!node.status().wantToFail()) - nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); + nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); } /** Returns true if node failing should be throttled */ @@ -291,10 +284,9 @@ public class NodeFailer extends NodeRepositoryMaintainer { if (throttlePolicy == ThrottlePolicy.disabled) return false; Instant startOfThrottleWindow = clock().instant().minus(throttlePolicy.throttleWindow); NodeList allNodes = nodeRepository().nodes().list(); - NodeList recentlyFailedNodes = allNodes - .matching(n -> n.status().wantToFail() || - (n.state() == Node.State.failed && - n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow))); + NodeList recentlyFailedNodes = allNodes.state(Node.State.failed) + .matching(n -> n.history().hasEventAfter(History.Event.Type.failed, + startOfThrottleWindow)); // Allow failing any node within policy if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size())) return false; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 3ba536ee4d7..f7d29a116ed 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -457,16 +457,6 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(2, tester.deployer.redeployments); - assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals(8, 
tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); - assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); - assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size()); - - // The failing of the host is deferred to the next maintain - tester.runMaintainers(); - assertEquals(2 + 1, tester.deployer.redeployments); assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); @@ -505,7 +495,6 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(90)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - tester.runMaintainers(); // The host is failed in the 2. maintain() assertEquals(5 + 2, tester.deployer.redeployments); assertEquals(7, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); @@ -598,7 +587,7 @@ public class NodeFailerTest { @Test public void node_failing_throttle() { - // Throttles based on an absolute number in small zone + // Throttles based on a absolute number in small zone { // 10 hosts with 3 tenant nodes each, total 40 nodes NodeFailTester tester = NodeFailTester.withTwoApplications(10); @@ -606,12 +595,13 @@ public class NodeFailerTest { // 3 hosts fail. 2 of them and all of their children are allowed to fail List failedHosts = hosts.asList().subList(0, 3); - failedHosts.forEach(host -> tester.serviceMonitor.setHostDown(host.hostname())); + failedHosts.forEach(host -> { + tester.serviceMonitor.setHostDown(host.hostname()); + }); tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(61)); tester.runMaintainers(); - tester.runMaintainers(); // hosts are typically failed in the 2. 
maintain() assertEquals(2 + /* hosts */ (2 * 3) /* containers per host */, tester.nodeRepository.nodes().list(Node.State.failed).size()); @@ -630,7 +620,6 @@ public class NodeFailerTest { // The final host and its containers are failed out tester.clock.advance(Duration.ofMinutes(30)); tester.runMaintainers(); - tester.runMaintainers(); // hosts are failed in the 2. maintain() assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -735,8 +724,7 @@ public class NodeFailerTest { .map(Map.Entry::getKey) .flatMap(parentHost -> Stream.of(parentHost.get())) .filter(node -> ! exceptSet.contains(node)) - .findFirst() - .orElseThrow(); + .findFirst().get(); } } -- cgit v1.2.3