diff options
author | Valerij Fredriksen <valerijf@vespa.ai> | 2023-10-25 23:33:17 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerijf@vespa.ai> | 2023-10-25 23:33:17 +0200 |
commit | 17d7a900c9273fe41d1d5d4d5f112adeb49bb590 (patch) | |
tree | fd01389ce8ed34b233ad70250f67ff22d01ff0af | |
parent | c6a0e3f87a67c2d8836630589c0c9d6ae5a5cbb1 (diff) |
Clean up nodes faster on provisioning failure
6 files changed, 21 insertions, 13 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java index 1d459f7dd14..2e3c6d1755a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java @@ -61,10 +61,13 @@ public class HostResumeProvisioner extends NodeRepositoryMaintainer { log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); } catch (FatalProvisioningException e) { + // FatalProvisioningException is thrown if node is not found in the cloud, allow for + // some time for the state to propagate + if (host.history().age(clock().instant()).getSeconds() < 30) continue; failures++; log.log(Level.SEVERE, "Failed to provision " + host.hostname() + ", failing out the host recursively", e); - nodeRepository().nodes().failOrMarkRecursively( - host.hostname(), Agent.HostResumeProvisioner, "Failed by HostResumeProvisioner due to provisioning failure"); + nodeRepository().nodes().parkRecursively( + host.hostname(), Agent.HostResumeProvisioner, true, "Failed by HostResumeProvisioner due to provisioning failure"); } catch (RuntimeException e) { if (e.getCause() instanceof NamingException) log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java index 2d513390cf5..24901cb10a9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java @@ -38,10 +38,9 @@ public class ProvisionedExpirer extends Expirer { for (Node expiredNode : expired) { if (expiredNode.type() != NodeType.host) continue; - nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, "Node is stuck in provisioned"); - if (MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired) { - nodeRepository.nodes().deprovision(expiredNode.hostname(), Agent.ProvisionedExpirer, nodeRepository.clock().instant()); - } + boolean forceDeprovision = MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired; + nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, + forceDeprovision, "Node is stuck in provisioned"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index c352dca1656..5c0614dd8d6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -403,8 +403,8 @@ public class Nodes { * * @return List of all the parked nodes in their new state */ - public List<Node> parkRecursively(String hostname, Agent agent, String reason) { - return moveRecursively(hostname, Node.State.parked, agent, Optional.of(reason)); + public List<Node> parkRecursively(String hostname, Agent agent, boolean forceDeprovision, String reason) { + return moveRecursively(hostname, Node.State.parked, agent, forceDeprovision, Optional.of(reason)); } /** @@ -433,12 +433,12 @@ public class Nodes { } } - private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) { + private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, boolean forceDeprovision, Optional<String> reason) { try (RecursiveNodeMutexes locked = lockAndGetRecursively(hostname, Optional.empty())) { List<Node> moved = new ArrayList<>(); NestedTransaction transaction = new NestedTransaction(); for (NodeMutex node : locked.nodes().nodes()) - moved.add(move(node.node().hostname(), toState, agent, false, reason, transaction)); + moved.add(move(node.node().hostname(), toState, agent, forceDeprovision, reason, transaction)); transaction.commit(); return moved; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java index cf99adc647f..17ff3b99e0c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java @@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing; +import com.yahoo.yolean.Exceptions; import java.util.LinkedHashSet; import java.util.List; @@ -156,7 +157,8 @@ public class Preparer { // Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do // not exist, we cannot remove them from ZK here because other nodes may already have been // allocated on them, so let HostDeprovisioner deal with it - hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant())); + hosts.forEach(host -> nodeRepository.nodes().parkRecursively(host.hostname(), Agent.system, true, + "Failed to provision: " + Exceptions.toMessageString(e))); throw e; } } else if (allocation.hostDeficit().isPresent() && requested.canFail() && diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 0d9666e4f26..9080030f026 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -153,7 +153,7 @@ public class NodesV2ApiHandler extends ThreadedHttpRequestHandler { " and marked " + hostnamesAsString(failedOrMarkedNodes.failing().asList()) + " as wantToFail"); } else if (path.matches("/nodes/v2/state/parked/{hostname}")) { - List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), "Parked through the nodes/v2 API"); + List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), false, "Parked through the nodes/v2 API"); return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to " + Node.State.parked); } else if (path.matches("/nodes/v2/state/dirty/{hostname}")) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java index 77986d03da2..74db071b060 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java @@ -98,7 +98,11 @@ public class HostResumeProvisionerTest { Stream.of(host, node).map(n -> n.ipConfig().primary()).allMatch(List::isEmpty)); hostResumeProvisioner.maintain(); - assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.failed).hostnames()); + assertEquals(Set.of(), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames()); + tester.clock().advance(Duration.ofSeconds(60)); + + hostResumeProvisioner.maintain(); + assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames()); } private void deployApplication() { |