summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2024-02-28 16:21:25 +0100
committerGitHub <noreply@github.com>2024-02-28 16:21:25 +0100
commitd31bc234beb39489fc23c19eccd7dddc86aec03b (patch)
treea289ed5d65e85f00db8b6b300bbc8716437ac1f8 /node-repository
parentc1baaf0e0a9a40cf00a2e520f98c1b80a0e4a0be (diff)
parent9b516b5c0d8d0a679c321f1c061afadf6566e9d4 (diff)
Merge pull request #30431 from vespa-engine/hakonhall/avoid-expiring-failed-host-with-children-to-dirty
Avoid expiring failed host with children to dirty
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java17
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java40
2 files changed, 56 insertions, 1 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index c3fea72fab9..ced1776bb62 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -108,7 +108,22 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
return Optional.empty();
}
} else {
- return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer"));
+ List<String> childrenBlockingDirtying = children
+ .stream()
+ // Examples: a failed child node may have an index we want to preserve. A dirty child node has
+ // log we want to sync. A parked child w/o wTD may have been parked by an operator for inspection.
+ .filter(child -> child.state() != Node.State.parked || !child.status().wantToDeprovision())
+ .map(Node::hostname)
+ .toList();
+
+ if (childrenBlockingDirtying.isEmpty()) {
+ return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer"));
+ } else {
+ log.info(String.format("Expired failed host %s was not dirtied because it has children: %s",
+ node.hostname(), String.join(", ", childrenBlockingDirtying)));
+ return Optional.empty();
+ }
+
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 4af8756774d..abe789bc968 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -165,6 +165,37 @@ public class FailedExpirerTest {
}
@Test
+ public void ensure_failed_host_is_not_dirtied_unless_all_children_are_gone() {
+ FailureScenario scenario = new FailureScenario(SystemName.Public, Environment.prod)
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "host1")
+ .setReady("host1")
+ .allocate(NodeType.host)
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "host1")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "host1")
+ .setReady("node1", "node2")
+ .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2")
+ .failNode(1, "host1", "node1", "node2");
+
+ scenario.clock.advance(Duration.ofHours(2));
+ scenario.expirer().run();
+ // host1 still failed because children are too
+ scenario.assertNodesIn(Node.State.failed, "host1", "node1", "node2");
+
+ scenario.clock.advance(Duration.ofDays(5));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.failed, "host1");
+ scenario.assertNodesIn(Node.State.dirty, "node1", "node2");
+
+ scenario.setChildrenReady("node1", "node2");
+ scenario.assertNodesIn(Node.State.ready);
+ scenario.assertNodesIn(Node.State.failed, "host1");
+
+ scenario.clock.advance(Duration.ofHours(1));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.dirty, "host1");
+ }
+
+ @Test
public void ensure_parked_docker_host() {
FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
.withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
@@ -307,6 +338,15 @@ public class FailedExpirerTest {
return this;
}
+ public FailureScenario setChildrenReady(String... hostnames) {
+ List<Node> nodes = Stream.of(hostnames)
+ .map(this::get)
+ .toList();
+ nodeRepository.nodes().deallocate(nodes, Agent.system, getClass().getSimpleName());
+ Stream.of(hostnames).forEach(hostname -> nodeRepository.nodes().markNodeAvailableForNewAllocation(hostname, Agent.nodeAdmin, getClass().getSimpleName()));
+ return this;
+ }
+
public FailureScenario allocate(ClusterSpec.Type clusterType, String... hostname) {
return allocate(clusterType, defaultFlavor, hostname);
}