aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHÃ¥kon Hallingstad <hakon.hallingstad@gmail.com>2023-10-26 09:55:49 +0200
committerGitHub <noreply@github.com>2023-10-26 09:55:49 +0200
commit8079a8769915d740474493a8c23623adfecdfdc9 (patch)
tree337c0f310e800c2a166d4a5a39f2800166383ce6
parent01431e4953faa89955978113c67887fa19ce8a4e (diff)
parent17d7a900c9273fe41d1d5d4d5f112adeb49bb590 (diff)
Merge pull request #29105 from vespa-engine/freva/optimize
Dynamic provisioning improvements
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java22
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java7
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java8
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java14
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java3
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java6
9 files changed, 46 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
index 7155a49ef58..be6c420c63b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
@@ -229,15 +229,19 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer {
sharingMode, clusterType.map(ClusterSpec.Type::valueOf), Optional.empty(),
nodeRepository().zone().cloud().account(), false);
List<Node> hosts = new ArrayList<>();
- hostProvisioner.provisionHosts(request,
- resources -> true,
- provisionedHosts -> {
- hosts.addAll(provisionedHosts.stream()
- .map(host -> host.generateHost(Duration.ZERO))
- .map(host -> host.withExclusiveToApplicationId(null))
- .toList());
- nodeRepository().nodes().addNodes(hosts, Agent.HostCapacityMaintainer);
- });
+ Runnable waiter;
+ try (var lock = nodeRepository().nodes().lockUnallocated()) {
+ waiter = hostProvisioner.provisionHosts(request,
+ resources -> true,
+ provisionedHosts -> {
+ hosts.addAll(provisionedHosts.stream()
+ .map(host -> host.generateHost(Duration.ZERO))
+ .map(host -> host.withExclusiveToApplicationId(null))
+ .toList());
+ nodeRepository().nodes().addNodes(hosts, Agent.HostCapacityMaintainer);
+ });
+ }
+ waiter.run();
return hosts;
} catch (NodeAllocationException | IllegalArgumentException | IllegalStateException e) {
throw new NodeAllocationException("Failed to provision " + count + " " + nodeResources + ": " + e.getMessage(),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java
index 1d459f7dd14..2e3c6d1755a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java
@@ -61,10 +61,13 @@ public class HostResumeProvisioner extends NodeRepositoryMaintainer {
log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " +
interval() + ": " + Exceptions.toMessageString(e));
} catch (FatalProvisioningException e) {
+ // FatalProvisioningException is thrown if node is not found in the cloud, allow for
+ // some time for the state to propagate
+ if (host.history().age(clock().instant()).getSeconds() < 30) continue;
failures++;
log.log(Level.SEVERE, "Failed to provision " + host.hostname() + ", failing out the host recursively", e);
- nodeRepository().nodes().failOrMarkRecursively(
- host.hostname(), Agent.HostResumeProvisioner, "Failed by HostResumeProvisioner due to provisioning failure");
+ nodeRepository().nodes().parkRecursively(
+ host.hostname(), Agent.HostResumeProvisioner, true, "Failed by HostResumeProvisioner due to provisioning failure");
} catch (RuntimeException e) {
if (e.getCause() instanceof NamingException)
log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
index 2d513390cf5..24901cb10a9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java
@@ -38,10 +38,9 @@ public class ProvisionedExpirer extends Expirer {
for (Node expiredNode : expired) {
if (expiredNode.type() != NodeType.host)
continue;
- nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, "Node is stuck in provisioned");
- if (MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired) {
- nodeRepository.nodes().deprovision(expiredNode.hostname(), Agent.ProvisionedExpirer, nodeRepository.clock().instant());
- }
+ boolean forceDeprovision = MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired;
+ nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer,
+ forceDeprovision, "Node is stuck in provisioned");
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index c352dca1656..5c0614dd8d6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -403,8 +403,8 @@ public class Nodes {
*
* @return List of all the parked nodes in their new state
*/
- public List<Node> parkRecursively(String hostname, Agent agent, String reason) {
- return moveRecursively(hostname, Node.State.parked, agent, Optional.of(reason));
+ public List<Node> parkRecursively(String hostname, Agent agent, boolean forceDeprovision, String reason) {
+ return moveRecursively(hostname, Node.State.parked, agent, forceDeprovision, Optional.of(reason));
}
/**
@@ -433,12 +433,12 @@ public class Nodes {
}
}
- private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) {
+ private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, boolean forceDeprovision, Optional<String> reason) {
try (RecursiveNodeMutexes locked = lockAndGetRecursively(hostname, Optional.empty())) {
List<Node> moved = new ArrayList<>();
NestedTransaction transaction = new NestedTransaction();
for (NodeMutex node : locked.nodes().nodes())
- moved.add(move(node.node().hostname(), toState, agent, false, reason, transaction));
+ moved.add(move(node.node().hostname(), toState, agent, forceDeprovision, reason, transaction));
transaction.commit();
return moved;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
index a876999e80b..38cbfa7fe5f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
@@ -50,8 +50,10 @@ public interface HostProvisioner {
* written to ZK immediately in case the config server goes down while waiting
* for the provisioning to finish.
* @throws NodeAllocationException if the cloud provider cannot satisfy the request
+ * @return a runnable that waits for the provisioning request to finish. It can be run without holding any locks,
+ * but may fail with an exception that should be propagated to the user initiating prepare()
*/
- void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException;
+ Runnable provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException;
/**
* Continue provisioning of given list of Nodes.
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
index fad42449285..17ff3b99e0c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
@@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.IP;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing;
+import com.yahoo.yolean.Exceptions;
import java.util.LinkedHashSet;
import java.util.List;
@@ -108,6 +109,8 @@ public class Preparer {
/// Note that this will write to the node repo.
private List<Node> prepareWithLocks(ApplicationId application, ClusterSpec cluster, NodeSpec requested, NodeIndices indices, boolean makeExclusive) {
+ Runnable waiter = null;
+ List<Node> acceptedNodes;
try (Mutex lock = nodeRepository.applications().lock(application);
ApplicationMutex parentLockOrNull = parentLockOrNull(makeExclusive, requested.type());
Mutex allocationLock = nodeRepository.nodes().lockUnallocated()) {
@@ -149,12 +152,13 @@ public class Preparer {
requested.cloudAccount(),
deficit.dueToFlavorUpgrade());
Predicate<NodeResources> realHostResourcesWithinLimits = resources -> nodeRepository.nodeResourceLimits().isWithinRealLimits(resources, application, cluster);
- hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned);
+ waiter = hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned);
} catch (NodeAllocationException e) {
// Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do
// not exist, we cannot remove them from ZK here because other nodes may already have been
// allocated on them, so let HostDeprovisioner deal with it
- hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant()));
+ hosts.forEach(host -> nodeRepository.nodes().parkRecursively(host.hostname(), Agent.system, true,
+ "Failed to provision: " + Exceptions.toMessageString(e)));
throw e;
}
} else if (allocation.hostDeficit().isPresent() && requested.canFail() &&
@@ -177,7 +181,7 @@ public class Preparer {
nodeRepository.nodes().setExclusiveToApplicationId(exclusiveParents, parentLockOrNull);
// TODO: also update tags
}
- List<Node> acceptedNodes = allocation.finalNodes();
+ acceptedNodes = allocation.finalNodes();
nodeRepository.nodes().reserve(allocation.reservableNodes());
nodeRepository.nodes().addReservedNodes(new LockedNodeList(allocation.newNodes(), allocationLock));
@@ -187,8 +191,10 @@ public class Preparer {
.filter(node -> node.parentHostname().isEmpty() || activeHosts.parentOf(node).isPresent())
.toList();
}
- return acceptedNodes;
}
+
+ if (waiter != null) waiter.run();
+ return acceptedNodes;
}
private NodeAllocation prepareAllocation(ApplicationId application, ClusterSpec cluster, NodeSpec requested,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 0d9666e4f26..9080030f026 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -153,7 +153,7 @@ public class NodesV2ApiHandler extends ThreadedHttpRequestHandler {
" and marked " + hostnamesAsString(failedOrMarkedNodes.failing().asList()) + " as wantToFail");
}
else if (path.matches("/nodes/v2/state/parked/{hostname}")) {
- List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), "Parked through the nodes/v2 API");
+ List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), false, "Parked through the nodes/v2 API");
return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to " + Node.State.parked);
}
else if (path.matches("/nodes/v2/state/dirty/{hostname}")) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
index f7710ca7019..b5bb91af71a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
@@ -73,7 +73,7 @@ public class MockHostProvisioner implements HostProvisioner {
}
@Override
- public void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException {
+ public Runnable provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException {
if (behaviour(Behaviour.failProvisionRequest)) throw new NodeAllocationException("No capacity for provision request", true);
Flavor hostFlavor = hostFlavors.get(request.clusterType().orElse(ClusterSpec.Type.content));
if (hostFlavor == null)
@@ -101,6 +101,7 @@ public class MockHostProvisioner implements HostProvisioner {
}
provisionedHosts.addAll(hosts);
whenProvisioned.accept(hosts);
+ return () -> {};
}
@Override
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java
index 77986d03da2..74db071b060 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java
@@ -98,7 +98,11 @@ public class HostResumeProvisionerTest {
Stream.of(host, node).map(n -> n.ipConfig().primary()).allMatch(List::isEmpty));
hostResumeProvisioner.maintain();
- assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.failed).hostnames());
+ assertEquals(Set.of(), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames());
+ tester.clock().advance(Duration.ofSeconds(60));
+
+ hostResumeProvisioner.maintain();
+ assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames());
}
private void deployApplication() {