diff options
11 files changed, 72 insertions, 33 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/metricsproxy/MetricsProxyContainerCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/metricsproxy/MetricsProxyContainerCluster.java index 9fab6a2b17b..348b84367d5 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/metricsproxy/MetricsProxyContainerCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/metricsproxy/MetricsProxyContainerCluster.java @@ -48,6 +48,7 @@ import java.util.stream.Stream; import static ai.vespa.metrics.set.DefaultMetrics.defaultMetricSet; import static ai.vespa.metrics.set.MetricSet.empty; import static ai.vespa.metrics.set.SystemMetrics.systemMetricSet; +import static ai.vespa.metrics.set.Vespa9DefaultMetricSet.vespa9defaultMetricSet; import static com.yahoo.vespa.model.admin.metricsproxy.ConsumersConfigGenerator.addMetrics; import static com.yahoo.vespa.model.admin.metricsproxy.ConsumersConfigGenerator.generateConsumers; import static com.yahoo.vespa.model.admin.metricsproxy.ConsumersConfigGenerator.toConsumerBuilder; @@ -169,8 +170,7 @@ public class MetricsProxyContainerCluster extends ContainerCluster<MetricsProxyC public MetricsConsumer newDefaultConsumer() { if (isHostedVespa()) { - // TODO: use different metric set for hosted vespa. - return MetricsConsumer.consumer(NEW_DEFAULT_CONSUMER_ID, defaultMetricSet, systemMetricSet); + return MetricsConsumer.consumer(NEW_DEFAULT_CONSUMER_ID, vespa9defaultMetricSet, systemMetricSet); } return MetricsConsumer.consumer(NEW_DEFAULT_CONSUMER_ID, defaultMetricSet, systemMetricSet); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java index 7155a49ef58..be6c420c63b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java @@ -229,15 +229,19 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer { sharingMode, clusterType.map(ClusterSpec.Type::valueOf), Optional.empty(), nodeRepository().zone().cloud().account(), false); List<Node> hosts = new ArrayList<>(); - hostProvisioner.provisionHosts(request, - resources -> true, - provisionedHosts -> { - hosts.addAll(provisionedHosts.stream() - .map(host -> host.generateHost(Duration.ZERO)) - .map(host -> host.withExclusiveToApplicationId(null)) - .toList()); - nodeRepository().nodes().addNodes(hosts, Agent.HostCapacityMaintainer); - }); + Runnable waiter; + try (var lock = nodeRepository().nodes().lockUnallocated()) { + waiter = hostProvisioner.provisionHosts(request, + resources -> true, + provisionedHosts -> { + hosts.addAll(provisionedHosts.stream() + .map(host -> host.generateHost(Duration.ZERO)) + .map(host -> host.withExclusiveToApplicationId(null)) + .toList()); + nodeRepository().nodes().addNodes(hosts, Agent.HostCapacityMaintainer); + }); + } + waiter.run(); return hosts; } catch (NodeAllocationException | IllegalArgumentException | IllegalStateException e) { throw new NodeAllocationException("Failed to provision " + count + " " + nodeResources + ": " + e.getMessage(), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java index 1d459f7dd14..2e3c6d1755a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java @@ -61,10 +61,13 @@ public class HostResumeProvisioner extends NodeRepositoryMaintainer { log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); } catch (FatalProvisioningException e) { + // FatalProvisioningException is thrown if node is not found in the cloud, allow for + // some time for the state to propagate + if (host.history().age(clock().instant()).getSeconds() < 30) continue; failures++; log.log(Level.SEVERE, "Failed to provision " + host.hostname() + ", failing out the host recursively", e); - nodeRepository().nodes().failOrMarkRecursively( - host.hostname(), Agent.HostResumeProvisioner, "Failed by HostResumeProvisioner due to provisioning failure"); + nodeRepository().nodes().parkRecursively( + host.hostname(), Agent.HostResumeProvisioner, true, "Failed by HostResumeProvisioner due to provisioning failure"); } catch (RuntimeException e) { if (e.getCause() instanceof NamingException) log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java index 2d513390cf5..24901cb10a9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java @@ -38,10 +38,9 @@ public class ProvisionedExpirer extends Expirer { for (Node expiredNode : expired) { if (expiredNode.type() != NodeType.host) continue; - nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, "Node is stuck in provisioned"); - if (MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired) { - nodeRepository.nodes().deprovision(expiredNode.hostname(), Agent.ProvisionedExpirer, nodeRepository.clock().instant()); - } + boolean forceDeprovision = MAXIMUM_ALLOWED_EXPIRED_HOSTS < ++previouslyExpired; + nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, + forceDeprovision, "Node is stuck in provisioned"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index c352dca1656..5c0614dd8d6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -403,8 +403,8 @@ public class Nodes { * * @return List of all the parked nodes in their new state */ - public List<Node> parkRecursively(String hostname, Agent agent, String reason) { - return moveRecursively(hostname, Node.State.parked, agent, Optional.of(reason)); + public List<Node> parkRecursively(String hostname, Agent agent, boolean forceDeprovision, String reason) { + return moveRecursively(hostname, Node.State.parked, agent, forceDeprovision, Optional.of(reason)); } /** @@ -433,12 +433,12 @@ public class Nodes { } } - private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) { + private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, boolean forceDeprovision, Optional<String> reason) { try (RecursiveNodeMutexes locked = lockAndGetRecursively(hostname, Optional.empty())) { List<Node> moved = new ArrayList<>(); NestedTransaction transaction = new NestedTransaction(); for (NodeMutex node : locked.nodes().nodes()) - moved.add(move(node.node().hostname(), toState, agent, false, reason, transaction)); + moved.add(move(node.node().hostname(), toState, agent, forceDeprovision, reason, transaction)); transaction.commit(); return moved; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java index a876999e80b..38cbfa7fe5f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java @@ -50,8 +50,10 @@ public interface HostProvisioner { * written to ZK immediately in case the config server goes down while waiting * for the provisioning to finish. * @throws NodeAllocationException if the cloud provider cannot satisfy the request + * @return a runnable that waits for the provisioning request to finish. It can be run without holding any locks, + * but may fail with an exception that should be propagated to the user initiating prepare() */ - void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException; + Runnable provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException; /** * Continue provisioning of given list of Nodes. diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java index fad42449285..17ff3b99e0c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java @@ -22,6 +22,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing; +import com.yahoo.yolean.Exceptions; import java.util.LinkedHashSet; import java.util.List; @@ -108,6 +109,8 @@ public class Preparer { /// Note that this will write to the node repo. private List<Node> prepareWithLocks(ApplicationId application, ClusterSpec cluster, NodeSpec requested, NodeIndices indices, boolean makeExclusive) { + Runnable waiter = null; + List<Node> acceptedNodes; try (Mutex lock = nodeRepository.applications().lock(application); ApplicationMutex parentLockOrNull = parentLockOrNull(makeExclusive, requested.type()); Mutex allocationLock = nodeRepository.nodes().lockUnallocated()) { @@ -149,12 +152,13 @@ public class Preparer { requested.cloudAccount(), deficit.dueToFlavorUpgrade()); Predicate<NodeResources> realHostResourcesWithinLimits = resources -> nodeRepository.nodeResourceLimits().isWithinRealLimits(resources, application, cluster); - hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned); + waiter = hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned); } catch (NodeAllocationException e) { // Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do // not exist, we cannot remove them from ZK here because other nodes may already have been // allocated on them, so let HostDeprovisioner deal with it - hosts.forEach(host -> nodeRepository.nodes().deprovision(host.hostname(), Agent.system, nodeRepository.clock().instant())); + hosts.forEach(host -> nodeRepository.nodes().parkRecursively(host.hostname(), Agent.system, true, + "Failed to provision: " + Exceptions.toMessageString(e))); throw e; } } else if (allocation.hostDeficit().isPresent() && requested.canFail() && @@ -177,7 +181,7 @@ public class Preparer { nodeRepository.nodes().setExclusiveToApplicationId(exclusiveParents, parentLockOrNull); // TODO: also update tags } - List<Node> acceptedNodes = allocation.finalNodes(); + acceptedNodes = allocation.finalNodes(); nodeRepository.nodes().reserve(allocation.reservableNodes()); nodeRepository.nodes().addReservedNodes(new LockedNodeList(allocation.newNodes(), allocationLock)); @@ -187,8 +191,10 @@ public class Preparer { .filter(node -> node.parentHostname().isEmpty() || activeHosts.parentOf(node).isPresent()) .toList(); } - return acceptedNodes; } + + if (waiter != null) waiter.run(); + return acceptedNodes; } private NodeAllocation prepareAllocation(ApplicationId application, ClusterSpec cluster, NodeSpec requested, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 0d9666e4f26..9080030f026 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -153,7 +153,7 @@ public class NodesV2ApiHandler extends ThreadedHttpRequestHandler { " and marked " + hostnamesAsString(failedOrMarkedNodes.failing().asList()) + " as wantToFail"); } else if (path.matches("/nodes/v2/state/parked/{hostname}")) { - List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), "Parked through the nodes/v2 API"); + List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), agent(request), false, "Parked through the nodes/v2 API"); return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to " + Node.State.parked); } else if (path.matches("/nodes/v2/state/dirty/{hostname}")) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java index f7710ca7019..b5bb91af71a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java @@ -73,7 +73,7 @@ public class MockHostProvisioner implements HostProvisioner { } @Override - public void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException { + public Runnable provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException { if (behaviour(Behaviour.failProvisionRequest)) throw new NodeAllocationException("No capacity for provision request", true); Flavor hostFlavor = hostFlavors.get(request.clusterType().orElse(ClusterSpec.Type.content)); if (hostFlavor == null) @@ -101,6 +101,7 @@ public class MockHostProvisioner implements HostProvisioner { } provisionedHosts.addAll(hosts); whenProvisioned.accept(hosts); + return () -> {}; } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java index 77986d03da2..74db071b060 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java @@ -98,7 +98,11 @@ public class HostResumeProvisionerTest { Stream.of(host, node).map(n -> n.ipConfig().primary()).allMatch(List::isEmpty)); hostResumeProvisioner.maintain(); - assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.failed).hostnames()); + assertEquals(Set.of(), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames()); + tester.clock().advance(Duration.ofSeconds(60)); + + hostResumeProvisioner.maintain(); + assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.parked).deprovisioning().hostnames()); } private void deployApplication() { diff --git a/screwdriver.yaml b/screwdriver.yaml index db38d3b1a09..ccab9bc4bd0 100644 --- a/screwdriver.yaml +++ b/screwdriver.yaml @@ -367,12 +367,13 @@ jobs: screwdriver.cd/buildPeriodically: H 6 1 * * environment: - IMAGE_NAME: "vespaengine/vespa-el9-preview" + BASE_IMAGE: "el9" + IMAGE_NAME: "vespa-el9-preview" secrets: - DOCKER_HUB_DEPLOY_KEY - steps: + steps: &publish-el9-preview-steps - get-vespa-version: | set -x VESPA_VERSION=$(meta get vespa.version --external publish-release) @@ -399,9 +400,10 @@ jobs: --progress plain \ --load \ --platform linux/amd64 \ - --build-arg VESPA_BASE_IMAGE=el9 \ + --build-arg VESPA_BASE_IMAGE=$BASE_IMAGE \ --build-arg VESPA_VERSION=$VESPA_VERSION \ --file Dockerfile \ + --tag vespaengine/vespa:latest \ --tag vespaengine/$IMAGE_NAME:latest \ . - verify-container-image: | @@ -420,7 +422,7 @@ jobs: --progress plain \ --push \ --platform linux/amd64,linux/arm64 \ - --build-arg VESPA_BASE_IMAGE=el9 \ + --build-arg VESPA_BASE_IMAGE=$BASE_IMAGE \ --build-arg VESPA_VERSION=$VESPA_VERSION \ --file Dockerfile \ --tag docker.io/vespaengine/$IMAGE_NAME:$VESPA_VERSION \ @@ -430,6 +432,24 @@ jobs: fi fi + publish-el8-preview: + image: docker.io/vespaengine/vespa-build-centos-stream8:latest + annotations: + screwdriver.cd/cpu: 7 + screwdriver.cd/ram: 16 + screwdriver.cd/disk: HIGH + screwdriver.cd/timeout: 300 + screwdriver.cd/dockerEnabled: true + screwdriver.cd/dockerCpu: TURBO + screwdriver.cd/dockerRam: HIGH + screwdriver.cd/buildPeriodically: H 6 1 * * + environment: + BASE_IMAGE: "el8" + IMAGE_NAME: "vespa-el8-preview" + secrets: + - DOCKER_HUB_DEPLOY_KEY + steps: *publish-el9-preview-steps + publish-cli-release: requires: [publish-release] image: homebrew/brew:latest |