diff options
author | Håkon Hallingstad <hakon@yahooinc.com> | 2024-02-28 16:00:52 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@yahooinc.com> | 2024-02-28 16:00:52 +0100 |
commit | 9b516b5c0d8d0a679c321f1c061afadf6566e9d4 (patch) | |
tree | ac16325a21028b64456f5df4f50de0c841163c54 /node-repository | |
parent | 0414b12d53264aa0f9f84ea947fc32acdc506947 (diff) |
Avoid expiring failed host with children to dirty
Diffstat (limited to 'node-repository')
2 files changed, 56 insertions, 1 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index c3fea72fab9..ced1776bb62 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -108,7 +108,22 @@ public class FailedExpirer extends NodeRepositoryMaintainer { return Optional.empty(); } } else { - return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer")); + List<String> childrenBlockingDirtying = children + .stream() + // Examples: a failed child node may have an index we want to preserve. A dirty child node has + // log we want to sync. A parked child w/o wTD may have been parked by an operator for inspection. + .filter(child -> child.state() != Node.State.parked || !child.status().wantToDeprovision()) + .map(Node::hostname) + .toList(); + + if (childrenBlockingDirtying.isEmpty()) { + return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer")); + } else { + log.info(String.format("Expired failed host %s was not dirtied because it has children: %s", + node.hostname(), String.join(", ", childrenBlockingDirtying))); + return Optional.empty(); + } + } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java index 4af8756774d..abe789bc968 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java @@ -165,6 +165,37 @@ public class FailedExpirerTest { } @Test + public void ensure_failed_host_is_not_dirtied_unless_all_children_are_gone() { + FailureScenario scenario = new FailureScenario(SystemName.Public, Environment.prod) + .withNode(NodeType.host, FailureScenario.defaultFlavor, "host1") + .setReady("host1") + .allocate(NodeType.host) + .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "host1") + .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "host1") + .setReady("node1", "node2") + .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2") + .failNode(1, "host1", "node1", "node2"); + + scenario.clock.advance(Duration.ofHours(2)); + scenario.expirer().run(); + // host1 still failed because children are too + scenario.assertNodesIn(Node.State.failed, "host1", "node1", "node2"); + + scenario.clock.advance(Duration.ofDays(5)); + scenario.expirer().run(); + scenario.assertNodesIn(Node.State.failed, "host1"); + scenario.assertNodesIn(Node.State.dirty, "node1", "node2"); + + scenario.setChildrenReady("node1", "node2"); + scenario.assertNodesIn(Node.State.ready); + scenario.assertNodesIn(Node.State.failed, "host1"); + + scenario.clock.advance(Duration.ofHours(1)); + scenario.expirer().run(); + scenario.assertNodesIn(Node.State.dirty, "host1"); + } + + @Test public void ensure_parked_docker_host() { FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod) .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1") @@ -307,6 +338,15 @@ public class FailedExpirerTest { return this; } + public FailureScenario setChildrenReady(String... hostnames) { + List<Node> nodes = Stream.of(hostnames) + .map(this::get) + .toList(); + nodeRepository.nodes().deallocate(nodes, Agent.system, getClass().getSimpleName()); + Stream.of(hostnames).forEach(hostname -> nodeRepository.nodes().markNodeAvailableForNewAllocation(hostname, Agent.nodeAdmin, getClass().getSimpleName())); + return this; + } + public FailureScenario allocate(ClusterSpec.Type clusterType, String... hostname) { return allocate(clusterType, defaultFlavor, hostname); } |