aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerij Fredriksen <valerijf@vespa.ai>2023-10-25 23:33:17 +0200
committerValerij Fredriksen <valerijf@vespa.ai>2023-10-25 23:33:17 +0200
commit17d7a900c9273fe41d1d5d4d5f112adeb49bb590 (patch)
treefd01389ce8ed34b233ad70250f67ff22d01ff0af
parentc6a0e3f87a67c2d8836630589c0c9d6ae5a5cbb1 (diff)
Clean up nodes faster on provisioning failure
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java8
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java6
6 files changed, 21 insertions, 13 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java
index 1d459f7dd14..2e3c6d1755a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java
@@ -61,10 +61,13 @@ public class HostResumeProvisioner extends NodeRepositoryMaintainer {
log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " +
interval() + ": " + Exceptions.toMessageString(e));
} catch (FatalProvisioningException e) {
+ // FatalProvisioningException is thrown if node is not found in the cloud, allow for
+ // some time for the state to propagate
+ if (host.history().age(clock().instant()).getSeconds() < 30) continue;
failures++;
log.log(Level.SEVERE, "Failed to provision " + host.hostname() + ", failing out the host recursively", e);
- nodeRepository().nodes().failOrMarkRecursively(
- host.hostname(), Agent.HostResumeProvisioner, "Failed by HostResumeProvisioner due to provisioning failure");
+ nodeRepository().nodes().parkRecursively(
+ host.hostname(), Agent.HostResumeProvisioner, true, "Failed by HostResumeProvisioner due to provisioning failure");
} catch (RuntimeException e) {
if (e.getCause() instanceof NamingException)
log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
index 2d513390cf5..24901cb10a9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
@@ -38,10 +38,9 @@ public class ProvisionedExpirer extends Expirer {
for (Node expiredNode : expired) {
if (expiredNode.type() != NodeType.host)
continue;
- nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, "Node is stuck in provisioned");
- if (MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired) {
- nodeRepository.nodes().deprovision(expiredNode.hostname(), Agent.ProvisionedExpirer, nodeRepository.clock().instant());
- }
+ boolean forceDeprovision = MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired;
+ nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer,
+ forceDeprovision, "Node is stuck in provisioned");
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index c352dca1656..5c0614dd8d6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -403,8 +403,8 @@ public class Nodes {
*
* @return List of all the parked nodes in their new state
*/
- public List<Node> parkRecursively(String hostname, Agent agent, String reason) {
- return moveRecursively(hostname, Node.State.parked, agent, Optional.of(reason));
+ public List<Node> parkRecursively(String hostname, Agent agent, boolean forceDeprovision, String reason) {
+ return moveRecursively(hostname, Node.State.parked, agent, forceDeprovision, Optional.of(reason));
}
/**
@@ -433,12 +433,12 @@ public class Nodes {
}
}
- private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) {
+ private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, boolean forceDeprovision, Optional<String> reason) {
try (RecursiveNodeMutexes locked = lockAndGetRecursively(hostname, Optional.empty())) {
List<Node> moved = new ArrayList<>();
NestedTransaction transaction = new NestedTransaction();
for (NodeMutex node : locked.nodes().nodes())
- moved.add(move(node.node().hostname(), toState, agent, false, reason, transaction));
+ moved.add(move(node.node().hostname(), toState, agent, forceDeprovision, reason, transaction));
transaction.commit();
return moved;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
index cf99adc647f..17ff3b99e0c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
@@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.IP;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing;
+import com.yahoo.yolean.Exceptions;
import java.util.LinkedHashSet;
import java.util.List;
@@ -156,7 +157,8 @@ public class Preparer {
// Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do
// not exist, we cannot remove them from ZK here because other nodes may already have been
// allocated on them, so let HostDeprovisioner deal with it
- hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant()));
+ hosts.forEach(host -> nodeRepository.nodes().parkRecursively(host.hostname(), Agent.system, true,
+ "Failed to provision: " + Exceptions.toMessageString(e)));
throw e;
}
} else if (allocation.hostDeficit().isPresent() && requested.canFail() &&
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 0d9666e4f26..9080030f026 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -153,7 +153,7 @@ public class NodesV2ApiHandler extends ThreadedHttpRequestHandler {
" and marked " + hostnamesAsString(failedOrMarkedNodes.failing().asList()) + " as wantToFail");
}
else if (path.matches("/nodes/v2/state/parked/{hostname}")) {
- List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), "Parked through the nodes/v2 API");
+ List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), false, "Parked through the nodes/v2 API");
return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to " + Node.State.parked);
}
else if (path.matches("/nodes/v2/state/dirty/{hostname}")) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java
index 77986d03da2..74db071b060 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java
@@ -98,7 +98,11 @@ public class HostResumeProvisionerTest {
Stream.of(host, node).map(n -> n.ipConfig().primary()).allMatch(List::isEmpty));
hostResumeProvisioner.maintain();
- assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.failed).hostnames());
+ assertEquals(Set.of(), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames());
+ tester.clock().advance(Duration.ofSeconds(60));
+
+ hostResumeProvisioner.maintain();
+ assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames());
}
private void deployApplication() {