author     Håkon Hallingstad <hakon@yahooinc.com>  2022-07-05 15:03:05 +0200
committer  Håkon Hallingstad <hakon@yahooinc.com>  2022-07-05 15:03:05 +0200
commit     2cdaef56e18ace2ee2269d28f959f5a534bd68ee (patch)
tree       1ba71772e88d8a898efe15ae3676cee6d4391791 /node-repository/src
parent     3196922227ec99a3fc8d174de153bd12fec6697b (diff)
Avoid the host lock while failing the children
Diffstat (limited to 'node-repository/src')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java      | 100
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java  |  22
2 files changed, 71 insertions(+), 51 deletions(-)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 3c5b20da4d0..f9b3dae72c5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -12,6 +12,7 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
@@ -19,9 +20,11 @@ import com.yahoo.yolean.Exceptions;
import java.time.Duration;
import java.time.Instant;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
+import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Level;
@@ -106,26 +109,6 @@ public class NodeFailer extends NodeRepositoryMaintainer {
failActive(failing);
}
- // Active hosts
- NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
- for (Node host : activeNodes.hosts().failing()) {
- if ( ! activeNodes.childrenOf(host).isEmpty()) continue;
- Optional<NodeMutex> locked = Optional.empty();
- try {
- attempts++;
- locked = nodeRepository().nodes().lockAndGet(host);
- if (locked.isEmpty()) continue;
- nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer,
- "Host should be failed and have no tenant nodes");
- }
- catch (Exception e) {
- failures++;
- }
- finally {
- locked.ifPresent(NodeMutex::close);
- }
- }
-
int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures);
metric.set(throttlingActiveMetric, throttlingActive, null);
metric.set(throttledHostFailuresMetric, throttledHostFailures, null);
@@ -153,6 +136,9 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Set<FailingNode> failingNodes = new HashSet<>();
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
+ for (Node host : activeNodes.hosts().failing())
+ failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes"));
+
for (Node node : activeNodes) {
Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
@@ -241,42 +227,63 @@ public class NodeFailer extends NodeRepositoryMaintainer {
deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(30));
if (deployment.isEmpty()) return false;
- try (Mutex lock = nodeRepository().nodes().lock(failing.node().allocation().get().owner())) {
- // If the active node that we are trying to fail is of type host, we need to successfully fail all
- // the children nodes running on it before we fail the host
- boolean allTenantNodesFailedOutSuccessfully = true;
+ // If the active node that we are trying to fail is of type host, we need to successfully fail all
+ // the children nodes running on it before we fail the host. Failing a child node in a dynamically
+ // provisioned zone may require provisioning new hosts that require the host application lock to be held,
+ // so we must release ours before failing the children.
+ List<FailingNode> activeChildrenToFail = new ArrayList<>();
+ try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) {
+ // Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail
+ if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner)))
+ return false;
+ if (lock.node().state() == Node.State.failed)
+ return true;
+ if (!Objects.equals(failing.node().state(), lock.node().state()))
+ return false;
+ failing = new FailingNode(lock.node(), failing.reason);
+
String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(failing.node())) {
if (failingTenantNode.state() == Node.State.active) {
- allTenantNodesFailedOutSuccessfully &= failActive(new FailingNode(failingTenantNode, reasonForChildFailure));
- } else {
+ activeChildrenToFail.add(new FailingNode(failingTenantNode, reasonForChildFailure));
+ } else if (failingTenantNode.state() != Node.State.failed) {
nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure);
}
}
- if (! allTenantNodesFailedOutSuccessfully) return false;
+ // A parent with children gets wantToFail to avoid getting more nodes allocated to it.
wantToFail(failing.node(), true, lock);
- try {
- deployment.get().activate();
- return true;
- } catch (TransientException e) {
- log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
- " with a transient error, will be retried by application maintainer: " +
- Exceptions.toMessageString(e));
- return true;
- } catch (RuntimeException e) {
- // Reset want to fail: We'll retry failing unless it heals in the meantime
- nodeRepository().nodes().node(failing.node().hostname())
- .ifPresent(n -> wantToFail(n, false, lock));
- log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
- " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
- return false;
+
+ if (activeChildrenToFail.isEmpty()) {
+ try {
+ deployment.get().activate();
+ return true;
+ } catch (TransientException e) {
+ log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
+ " with a transient error, will be retried by application maintainer: " +
+ Exceptions.toMessageString(e));
+ return true;
+ } catch (RuntimeException e) {
+ // Reset want to fail: We'll retry failing unless it heals in the meantime
+ nodeRepository().nodes().node(failing.node().hostname())
+ .ifPresent(n -> wantToFail(n, false, lock));
+ log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
+ " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
+ return false;
+ }
}
}
+
+ // In a dynamically provisioned zone the failing of an active child node takes ~10 minutes,
+ // so perhaps this should be done in parallel.
+ activeChildrenToFail.forEach(this::failActive);
+
+ return false;
}
private void wantToFail(Node node, boolean wantToFail, Mutex lock) {
- nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
+ if (!node.status().wantToFail())
+ nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
}
/** Returns true if node failing should be throttled */
@@ -284,9 +291,10 @@ public class NodeFailer extends NodeRepositoryMaintainer {
if (throttlePolicy == ThrottlePolicy.disabled) return false;
Instant startOfThrottleWindow = clock().instant().minus(throttlePolicy.throttleWindow);
NodeList allNodes = nodeRepository().nodes().list();
- NodeList recentlyFailedNodes = allNodes.state(Node.State.failed)
- .matching(n -> n.history().hasEventAfter(History.Event.Type.failed,
- startOfThrottleWindow));
+ NodeList recentlyFailedNodes = allNodes
+ .matching(n -> n.status().wantToFail() ||
+ (n.state() == Node.State.failed &&
+ n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow)));
// Allow failing any node within policy
if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size())) return false;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index f7d29a116ed..3ba536ee4d7 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -457,6 +457,16 @@ public class NodeFailerTest {
tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
+ assertEquals(2, tester.deployer.redeployments);
+ assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
+ assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
+ assertEquals(7, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
+ assertEquals(0, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size());
+
+ // The failing of the host is deferred to the next maintain
+ tester.runMaintainers();
+
assertEquals(2 + 1, tester.deployer.redeployments);
assertEquals(3, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
@@ -495,6 +505,7 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(90));
tester.allNodesMakeAConfigRequestExcept();
tester.runMaintainers();
+ tester.runMaintainers(); // The host is failed in the 2. maintain()
assertEquals(5 + 2, tester.deployer.redeployments);
assertEquals(7, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).size());
@@ -587,7 +598,7 @@ public class NodeFailerTest {
@Test
public void node_failing_throttle() {
- // Throttles based on a absolute number in small zone
+ // Throttles based on an absolute number in small zone
{
// 10 hosts with 3 tenant nodes each, total 40 nodes
NodeFailTester tester = NodeFailTester.withTwoApplications(10);
@@ -595,13 +606,12 @@ public class NodeFailerTest {
// 3 hosts fail. 2 of them and all of their children are allowed to fail
List<Node> failedHosts = hosts.asList().subList(0, 3);
- failedHosts.forEach(host -> {
- tester.serviceMonitor.setHostDown(host.hostname());
- });
+ failedHosts.forEach(host -> tester.serviceMonitor.setHostDown(host.hostname()));
tester.runMaintainers();
tester.clock.advance(Duration.ofMinutes(61));
tester.runMaintainers();
+ tester.runMaintainers(); // hosts are typically failed in the 2. maintain()
assertEquals(2 + /* hosts */
(2 * 3) /* containers per host */,
tester.nodeRepository.nodes().list(Node.State.failed).size());
@@ -620,6 +630,7 @@ public class NodeFailerTest {
// The final host and its containers are failed out
tester.clock.advance(Duration.ofMinutes(30));
tester.runMaintainers();
+ tester.runMaintainers(); // hosts are failed in the 2. maintain()
assertEquals(12, tester.nodeRepository.nodes().list(Node.State.failed).size());
assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made", 0, tester.metric.values.get(NodeFailer.throttlingActiveMetric));
assertEquals("No throttled node failures", 0, tester.metric.values.get(NodeFailer.throttledNodeFailuresMetric));
@@ -724,7 +735,8 @@ public class NodeFailerTest {
.map(Map.Entry::getKey)
.flatMap(parentHost -> Stream.of(parentHost.get()))
.filter(node -> ! exceptSet.contains(node))
- .findFirst().get();
+ .findFirst()
+ .orElseThrow();
}
}