aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- annotations/OWNERS 3
-rw-r--r-- application/OWNERS 3
-rw-r--r-- clustercontroller-core/OWNERS 2
-rw-r--r-- config-model/OWNERS 2
-rw-r--r-- container-core/OWNERS 1
-rw-r--r-- container-search/OWNERS 1
-rw-r--r-- defaults/OWNERS 1
-rw-r--r-- docproc/OWNERS 1
-rw-r--r-- fileacquirer/OWNERS 3
-rw-r--r-- model-evaluation/OWNERS 2
-rw-r--r-- node-repository/OWNERS 4
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java 17
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java 40
-rw-r--r-- searchlib/src/main/OWNERS 3
14 files changed, 72 insertions, 11 deletions
diff --git a/annotations/OWNERS b/annotations/OWNERS
index 31af040f698..78b92e411b4 100644
--- a/annotations/OWNERS
+++ b/annotations/OWNERS
@@ -1 +1,2 @@
-bratseth
+gjoranv
+bjorncs
diff --git a/application/OWNERS b/application/OWNERS
index 31af040f698..78b92e411b4 100644
--- a/application/OWNERS
+++ b/application/OWNERS
@@ -1 +1,2 @@
-bratseth
+gjoranv
+bjorncs
diff --git a/clustercontroller-core/OWNERS b/clustercontroller-core/OWNERS
index abe8e49c8d6..390322e4912 100644
--- a/clustercontroller-core/OWNERS
+++ b/clustercontroller-core/OWNERS
@@ -1,3 +1,3 @@
vekterli
hakonhall
-bratseth
+hmusum
diff --git a/config-model/OWNERS b/config-model/OWNERS
index 8223ccfb64e..fd4a2e9d996 100644
--- a/config-model/OWNERS
+++ b/config-model/OWNERS
@@ -1,2 +1,2 @@
hmusum
-bratseth
+gjoranv
diff --git a/container-core/OWNERS b/container-core/OWNERS
index c16e87d4c9e..98b59fccc99 100644
--- a/container-core/OWNERS
+++ b/container-core/OWNERS
@@ -1,4 +1,3 @@
arnej27959
bjorncs
-bratseth
gjoranv
diff --git a/container-search/OWNERS b/container-search/OWNERS
index cd50f7a263a..37b794fd559 100644
--- a/container-search/OWNERS
+++ b/container-search/OWNERS
@@ -1,2 +1,3 @@
bratseth
arnej27959
+baldersheim
diff --git a/defaults/OWNERS b/defaults/OWNERS
index 6c96073cde8..67cd2820bb8 100644
--- a/defaults/OWNERS
+++ b/defaults/OWNERS
@@ -1,2 +1 @@
arnej27959
-bratseth
diff --git a/docproc/OWNERS b/docproc/OWNERS
index 31af040f698..58e37c72e4c 100644
--- a/docproc/OWNERS
+++ b/docproc/OWNERS
@@ -1 +1,2 @@
bratseth
+baldersheim
diff --git a/fileacquirer/OWNERS b/fileacquirer/OWNERS
index 31af040f698..2faf7df0593 100644
--- a/fileacquirer/OWNERS
+++ b/fileacquirer/OWNERS
@@ -1 +1,2 @@
-bratseth
+baldersheim
+hmusum
diff --git a/model-evaluation/OWNERS b/model-evaluation/OWNERS
index 2bd865cff34..dcd0d81ac63 100644
--- a/model-evaluation/OWNERS
+++ b/model-evaluation/OWNERS
@@ -1,2 +1,4 @@
bratseth
lesters
+arnej27959
+bjorncs
diff --git a/node-repository/OWNERS b/node-repository/OWNERS
index 2a808fa8ccc..3c01a091757 100644
--- a/node-repository/OWNERS
+++ b/node-repository/OWNERS
@@ -1,3 +1,3 @@
-bratseth
-hmusum
mpolden
+hakonhall
+freva
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index c3fea72fab9..ced1776bb62 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -108,7 +108,22 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
return Optional.empty();
}
} else {
- return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer"));
+ List<String> childrenBlockingDirtying = children
+ .stream()
+ // Examples: a failed child node may have an index we want to preserve. A dirty child node has logs
+ // we want to sync. A parked child without wantToDeprovision may have been parked by an operator for inspection.
+ .filter(child -> child.state() != Node.State.parked || !child.status().wantToDeprovision())
+ .map(Node::hostname)
+ .toList();
+
+ if (childrenBlockingDirtying.isEmpty()) {
+ return Optional.of(nodeRepository.nodes().deallocate(node, Agent.FailedExpirer, "Expired by FailedExpirer"));
+ } else {
+ log.info(String.format("Expired failed host %s was not dirtied because it has children: %s",
+ node.hostname(), String.join(", ", childrenBlockingDirtying)));
+ return Optional.empty();
+ }
+
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 4af8756774d..abe789bc968 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -165,6 +165,37 @@ public class FailedExpirerTest {
}
@Test
+ public void ensure_failed_host_is_not_dirtied_unless_all_children_are_gone() {
+ FailureScenario scenario = new FailureScenario(SystemName.Public, Environment.prod)
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "host1")
+ .setReady("host1")
+ .allocate(NodeType.host)
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "host1")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "host1")
+ .setReady("node1", "node2")
+ .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2")
+ .failNode(1, "host1", "node1", "node2");
+
+ scenario.clock.advance(Duration.ofHours(2));
+ scenario.expirer().run();
+ // host1 is still failed because its children are failed too
+ scenario.assertNodesIn(Node.State.failed, "host1", "node1", "node2");
+
+ scenario.clock.advance(Duration.ofDays(5));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.failed, "host1");
+ scenario.assertNodesIn(Node.State.dirty, "node1", "node2");
+
+ scenario.setChildrenReady("node1", "node2");
+ scenario.assertNodesIn(Node.State.ready);
+ scenario.assertNodesIn(Node.State.failed, "host1");
+
+ scenario.clock.advance(Duration.ofHours(1));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.dirty, "host1");
+ }
+
+ @Test
public void ensure_parked_docker_host() {
FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
.withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
@@ -307,6 +338,15 @@ public class FailedExpirerTest {
return this;
}
+ public FailureScenario setChildrenReady(String... hostnames) {
+ List<Node> nodes = Stream.of(hostnames)
+ .map(this::get)
+ .toList();
+ nodeRepository.nodes().deallocate(nodes, Agent.system, getClass().getSimpleName());
+ Stream.of(hostnames).forEach(hostname -> nodeRepository.nodes().markNodeAvailableForNewAllocation(hostname, Agent.nodeAdmin, getClass().getSimpleName()));
+ return this;
+ }
+
public FailureScenario allocate(ClusterSpec.Type clusterType, String... hostname) {
return allocate(clusterType, defaultFlavor, hostname);
}
diff --git a/searchlib/src/main/OWNERS b/searchlib/src/main/OWNERS
index dd9b7991fad..28161f29373 100644
--- a/searchlib/src/main/OWNERS
+++ b/searchlib/src/main/OWNERS
@@ -1,2 +1,3 @@
bratseth
-lesters
\ No newline at end of file
+lesters
+baldersheim