diff options
-rw-r--r-- | annotations/OWNERS | 3 | ||||
-rw-r--r-- | application/OWNERS | 3 | ||||
-rw-r--r-- | clustercontroller-core/OWNERS | 2 | ||||
-rw-r--r-- | config-model/OWNERS | 2 | ||||
-rw-r--r-- | container-core/OWNERS | 1 | ||||
-rw-r--r-- | container-search/OWNERS | 1 | ||||
-rw-r--r-- | defaults/OWNERS | 1 | ||||
-rw-r--r-- | docproc/OWNERS | 1 | ||||
-rw-r--r-- | fileacquirer/OWNERS | 3 | ||||
-rw-r--r-- | model-evaluation/OWNERS | 2 | ||||
-rw-r--r-- | node-repository/OWNERS | 4 | ||||
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java | 17 | ||||
-rw-r--r-- | node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java | 40 | ||||
-rw-r--r-- | searchlib/src/main/OWNERS | 3 |
14 files changed, 72 insertions, 11 deletions
diff --git a/annotations/OWNERS b/annotations/OWNERS index 31af040f698..78b92e411b4 100644 --- a/annotations/OWNERS +++ b/annotations/OWNERS @@ -1 +1,2 @@ -bratseth +gjoranv +bjorncs diff --git a/application/OWNERS b/application/OWNERS index 31af040f698..78b92e411b4 100644 --- a/application/OWNERS +++ b/application/OWNERS @@ -1 +1,2 @@ -bratseth +gjoranv +bjorncs diff --git a/clustercontroller-core/OWNERS b/clustercontroller-core/OWNERS index abe8e49c8d6..390322e4912 100644 --- a/clustercontroller-core/OWNERS +++ b/clustercontroller-core/OWNERS @@ -1,3 +1,3 @@ vekterli hakonhall -bratseth +hmusum diff --git a/config-model/OWNERS b/config-model/OWNERS index 8223ccfb64e..fd4a2e9d996 100644 --- a/config-model/OWNERS +++ b/config-model/OWNERS @@ -1,2 +1,2 @@ hmusum -bratseth +gjoranv diff --git a/container-core/OWNERS b/container-core/OWNERS index c16e87d4c9e..98b59fccc99 100644 --- a/container-core/OWNERS +++ b/container-core/OWNERS @@ -1,4 +1,3 @@ arnej27959 bjorncs -bratseth gjoranv diff --git a/container-search/OWNERS b/container-search/OWNERS index cd50f7a263a..37b794fd559 100644 --- a/container-search/OWNERS +++ b/container-search/OWNERS @@ -1,2 +1,3 @@ bratseth arnej27959 +baldersheim diff --git a/defaults/OWNERS b/defaults/OWNERS index 6c96073cde8..67cd2820bb8 100644 --- a/defaults/OWNERS +++ b/defaults/OWNERS @@ -1,2 +1 @@ arnej27959 -bratseth diff --git a/docproc/OWNERS b/docproc/OWNERS index 31af040f698..58e37c72e4c 100644 --- a/docproc/OWNERS +++ b/docproc/OWNERS @@ -1 +1,2 @@ bratseth +baldersheim diff --git a/fileacquirer/OWNERS b/fileacquirer/OWNERS index 31af040f698..2faf7df0593 100644 --- a/fileacquirer/OWNERS +++ b/fileacquirer/OWNERS @@ -1 +1,2 @@ -bratseth +baldersheim +hmusum diff --git a/model-evaluation/OWNERS b/model-evaluation/OWNERS index 2bd865cff34..dcd0d81ac63 100644 --- a/model-evaluation/OWNERS +++ b/model-evaluation/OWNERS @@ -1,2 +1,4 @@ bratseth lesters +arnej27959 +bjorncs diff --git a/node-repository/OWNERS b/node-repository/OWNERS index 2a808fa8ccc..3c01a091757 100644 --- a/node-repository/OWNERS +++ b/node-repository/OWNERS @@ -1,3 +1,3 @@ -bratseth -hmusum mpolden +hakonhall +freva diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index c3fea72fab9..ced1776bb62 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -108,7 +108,22 @@ public class FailedExpirer extends NodeRepositoryMaintainer { return Optional.empty(); } } else { - return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer")); + List<String> childrenBlockingDirtying = children + .stream() + // Examples: a failed child node may have an index we want to preserve. A dirty child node has + // log we want to sync. A parked child w/o wTD may have been parked by an operator for inspection. + .filter(child -> child.state() != Node.State.parked || !child.status().wantToDeprovision()) + .map(Node::hostname) + .toList(); + + if (childrenBlockingDirtying.isEmpty()) { + return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer")); + } else { + log.info(String.format("Expired failed host %s was not dirtied because it has children: %s", + node.hostname(), String.join(", ", childrenBlockingDirtying))); + return Optional.empty(); + } + } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java index 4af8756774d..abe789bc968 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java @@ -165,6 +165,37 @@ public class FailedExpirerTest { } @Test + public void ensure_failed_host_is_not_dirtied_unless_all_children_are_gone() { + FailureScenario scenario = new FailureScenario(SystemName.Public, Environment.prod) + .withNode(NodeType.host, FailureScenario.defaultFlavor, "host1") + .setReady("host1") + .allocate(NodeType.host) + .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "host1") + .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "host1") + .setReady("node1", "node2") + .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2") + .failNode(1, "host1", "node1", "node2"); + + scenario.clock.advance(Duration.ofHours(2)); + scenario.expirer().run(); + // host1 still failed because children are too + scenario.assertNodesIn(Node.State.failed, "host1", "node1", "node2"); + + scenario.clock.advance(Duration.ofDays(5)); + scenario.expirer().run(); + scenario.assertNodesIn(Node.State.failed, "host1"); + scenario.assertNodesIn(Node.State.dirty, "node1", "node2"); + + scenario.setChildrenReady("node1", "node2"); + scenario.assertNodesIn(Node.State.ready); + scenario.assertNodesIn(Node.State.failed, "host1"); + + scenario.clock.advance(Duration.ofHours(1)); + scenario.expirer().run(); + scenario.assertNodesIn(Node.State.dirty, "host1"); + } + + @Test public void ensure_parked_docker_host() { FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod) .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1") @@ -307,6 +338,15 @@ public class FailedExpirerTest { return this; } + public FailureScenario setChildrenReady(String... hostnames) { + List<Node> nodes = Stream.of(hostnames) + .map(this::get) + .toList(); + nodeRepository.nodes().deallocate(nodes, Agent.system, getClass().getSimpleName()); + Stream.of(hostnames).forEach(hostname -> nodeRepository.nodes().markNodeAvailableForNewAllocation(hostname, Agent.nodeAdmin, getClass().getSimpleName())); + return this; + } + public FailureScenario allocate(ClusterSpec.Type clusterType, String... hostname) { return allocate(clusterType, defaultFlavor, hostname); } diff --git a/searchlib/src/main/OWNERS b/searchlib/src/main/OWNERS index dd9b7991fad..28161f29373 100644 --- a/searchlib/src/main/OWNERS +++ b/searchlib/src/main/OWNERS @@ -1,2 +1,3 @@ bratseth -lesters
\ No newline at end of file +lesters +baldersheim |