From dac838f1fa390d59c000fdc91b54fc7218599c08 Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Mon, 11 Jul 2022 11:29:09 +0200 Subject: Revert "Avoid the host lock while failing the children" This reverts commit 2cdaef56e18ace2ee2269d28f959f5a534bd68ee. --- .../hosted/provision/maintenance/NodeFailer.java | 100 ++++++++++----------- .../provision/maintenance/NodeFailerTest.java | 22 ++--- 2 files changed, 51 insertions(+), 71 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index f9b3dae72c5..3c5b20da4d0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -12,7 +12,6 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; -import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; @@ -20,11 +19,9 @@ import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.time.Instant; -import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; -import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.logging.Level; @@ -109,6 +106,26 @@ public class NodeFailer extends NodeRepositoryMaintainer { failActive(failing); } + // Active hosts + NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); + for (Node host : activeNodes.hosts().failing()) { + if ( ! 
activeNodes.childrenOf(host).isEmpty()) continue; + Optional locked = Optional.empty(); + try { + attempts++; + locked = nodeRepository().nodes().lockAndGet(host); + if (locked.isEmpty()) continue; + nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer, + "Host should be failed and have no tenant nodes"); + } + catch (Exception e) { + failures++; + } + finally { + locked.ifPresent(NodeMutex::close); + } + } + int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures); metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); @@ -136,9 +153,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { Set failingNodes = new HashSet<>(); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); - for (Node host : activeNodes.hosts().failing()) - failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes")); - for (Node node : activeNodes) { Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { @@ -227,63 +241,42 @@ public class NodeFailer extends NodeRepositoryMaintainer { deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(30)); if (deployment.isEmpty()) return false; - // If the active node that we are trying to fail is of type host, we need to successfully fail all - // the children nodes running on it before we fail the host. Failing a child node in a dynamically - // provisioned zone may require provisioning new hosts that require the host application lock to be held, - // so we must release ours before failing the children. 
- List activeChildrenToFail = new ArrayList<>(); - try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) { - // Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail - if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner))) - return false; - if (lock.node().state() == Node.State.failed) - return true; - if (!Objects.equals(failing.node().state(), lock.node().state())) - return false; - failing = new FailingNode(lock.node(), failing.reason); - + try (Mutex lock = nodeRepository().nodes().lock(failing.node().allocation().get().owner())) { + // If the active node that we are trying to fail is of type host, we need to successfully fail all + // the children nodes running on it before we fail the host + boolean allTenantNodesFailedOutSuccessfully = true; String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason(); for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(failing.node())) { if (failingTenantNode.state() == Node.State.active) { - activeChildrenToFail.add(new FailingNode(failingTenantNode, reasonForChildFailure)); - } else if (failingTenantNode.state() != Node.State.failed) { + allTenantNodesFailedOutSuccessfully &= failActive(new FailingNode(failingTenantNode, reasonForChildFailure)); + } else { nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure); } } - // A parent with children gets wantToFail to avoid getting more nodes allocated to it. + if (! 
allTenantNodesFailedOutSuccessfully) return false; wantToFail(failing.node(), true, lock); - - if (activeChildrenToFail.isEmpty()) { - try { - deployment.get().activate(); - return true; - } catch (TransientException e) { - log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + - " with a transient error, will be retried by application maintainer: " + - Exceptions.toMessageString(e)); - return true; - } catch (RuntimeException e) { - // Reset want to fail: We'll retry failing unless it heals in the meantime - nodeRepository().nodes().node(failing.node().hostname()) - .ifPresent(n -> wantToFail(n, false, lock)); - log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + - " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); - return false; - } + try { + deployment.get().activate(); + return true; + } catch (TransientException e) { + log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + + " with a transient error, will be retried by application maintainer: " + + Exceptions.toMessageString(e)); + return true; + } catch (RuntimeException e) { + // Reset want to fail: We'll retry failing unless it heals in the meantime + nodeRepository().nodes().node(failing.node().hostname()) + .ifPresent(n -> wantToFail(n, false, lock)); + log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + + " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); + return false; } } - - // In a dynamically provisioned zone the failing of an active child node takes ~10 minutes, - // so perhaps this should be done in parallel. 
- activeChildrenToFail.forEach(this::failActive); - - return false; } private void wantToFail(Node node, boolean wantToFail, Mutex lock) { - if (!node.status().wantToFail()) - nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); + nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); } /** Returns true if node failing should be throttled */ @@ -291,10 +284,9 @@ public class NodeFailer extends NodeRepositoryMaintainer { if (throttlePolicy == ThrottlePolicy.disabled) return false; Instant startOfThrottleWindow = clock().instant().minus(throttlePolicy.throttleWindow); NodeList allNodes = nodeRepository().nodes().list(); - NodeList recentlyFailedNodes = allNodes - .matching(n -> n.status().wantToFail() || - (n.state() == Node.State.failed && - n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow))); + NodeList recentlyFailedNodes = allNodes.state(Node.State.failed) + .matching(n -> n.history().hasEventAfter(History.Event.Type.failed, + startOfThrottleWindow)); // Allow failing any node within policy if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size())) return false; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 3ba536ee4d7..f7d29a116ed 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -457,16 +457,6 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - assertEquals(2, tester.deployer.redeployments); - assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); - assertEquals(8, 
tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); - assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); - assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); - assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size()); - - // The failing of the host is deferred to the next maintain - tester.runMaintainers(); - assertEquals(2 + 1, tester.deployer.redeployments); assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); @@ -505,7 +495,6 @@ public class NodeFailerTest { tester.clock.advance(Duration.ofMinutes(90)); tester.allNodesMakeAConfigRequestExcept(); tester.runMaintainers(); - tester.runMaintainers(); // The host is failed in the 2. maintain() assertEquals(5 + 2, tester.deployer.redeployments); assertEquals(7, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size()); @@ -598,7 +587,7 @@ public class NodeFailerTest { @Test public void node_failing_throttle() { - // Throttles based on an absolute number in small zone + // Throttles based on a absolute number in small zone { // 10 hosts with 3 tenant nodes each, total 40 nodes NodeFailTester tester = NodeFailTester.withTwoApplications(10); @@ -606,12 +595,13 @@ public class NodeFailerTest { // 3 hosts fail. 2 of them and all of their children are allowed to fail List failedHosts = hosts.asList().subList(0, 3); - failedHosts.forEach(host -> tester.serviceMonitor.setHostDown(host.hostname())); + failedHosts.forEach(host -> { + tester.serviceMonitor.setHostDown(host.hostname()); + }); tester.runMaintainers(); tester.clock.advance(Duration.ofMinutes(61)); tester.runMaintainers(); - tester.runMaintainers(); // hosts are typically failed in the 2. 
maintain() assertEquals(2 + /* hosts */ (2 * 3) /* containers per host */, tester.nodeRepository.nodes().list(Node.State.failed).size()); @@ -630,7 +620,6 @@ public class NodeFailerTest { // The final host and its containers are failed out tester.clock.advance(Duration.ofMinutes(30)); tester.runMaintainers(); - tester.runMaintainers(); // hosts are failed in the 2. maintain() assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size()); assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric)); assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric)); @@ -735,8 +724,7 @@ public class NodeFailerTest { .map(Map.Entry::getKey) .flatMap(parentHost -> Stream.of(parentHost.get())) .filter(node -> ! exceptSet.contains(node)) - .findFirst() - .orElseThrow(); + .findFirst().get(); } } -- cgit v1.2.3