diff options
author | Martin Polden <mpolden@mpolden.no> | 2023-07-26 13:03:31 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-26 13:03:31 +0200 |
commit | 8527a87e966cc58cb071f52b40ca2d07a2f6c271 (patch) | |
tree | f968d1b379bfbf6039ed8ca83bdf6f0659b72b92 /node-repository | |
parent | efcaea20086dec39b35385e1a26cd97c5cd60be2 (diff) | |
parent | 697356065a407b10cfd69c58bad8fe2eed07e9e9 (diff) |
Merge pull request #27897 from vespa-engine/freva/fix
Verify selected flavor is within resource limits
Diffstat (limited to 'node-repository')
10 files changed, 43 insertions, 32 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 2d4e7142622..13a6c35e9a7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -17,6 +17,7 @@ import com.yahoo.vespa.flags.PermanentFlags; import com.yahoo.vespa.flags.custom.SharedHost; import com.yahoo.vespa.hosted.provision.Node.State; import com.yahoo.vespa.hosted.provision.applications.Applications; +import com.yahoo.vespa.hosted.provision.archive.ArchiveUriManager; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.lb.LoadBalancers; import com.yahoo.vespa.hosted.provision.maintenance.InfrastructureVersions; @@ -28,10 +29,10 @@ import com.yahoo.vespa.hosted.provision.persistence.CuratorDb; import com.yahoo.vespa.hosted.provision.persistence.DnsNameResolver; import com.yahoo.vespa.hosted.provision.persistence.JobControlFlags; import com.yahoo.vespa.hosted.provision.persistence.NameResolver; -import com.yahoo.vespa.hosted.provision.archive.ArchiveUriManager; import com.yahoo.vespa.hosted.provision.provisioning.ContainerImages; import com.yahoo.vespa.hosted.provision.provisioning.FirmwareChecks; import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; +import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceLimits; import com.yahoo.vespa.hosted.provision.provisioning.ProvisionServiceProvider; import com.yahoo.vespa.orchestrator.Orchestrator; @@ -52,6 +53,7 @@ public class NodeRepository extends AbstractComponent { private final Nodes nodes; private final NodeFlavors flavors; private final HostResourcesCalculator resourcesCalculator; + private final NodeResourceLimits nodeResourceLimits; private final NameResolver nameResolver; private final OsVersions osVersions; private final InfrastructureVersions infrastructureVersions; @@ -129,6 +131,7 @@ public class NodeRepository extends AbstractComponent { this.nodes = new Nodes(db, zone, clock, orchestrator, applications); this.flavors = flavors; this.resourcesCalculator = provisionServiceProvider.getHostResourcesCalculator(); + this.nodeResourceLimits = new NodeResourceLimits(this); this.nameResolver = nameResolver; this.osVersions = new OsVersions(this); this.infrastructureVersions = new InfrastructureVersions(db); @@ -181,6 +184,8 @@ public class NodeRepository extends AbstractComponent { public HostResourcesCalculator resourcesCalculator() { return resourcesCalculator; } + public NodeResourceLimits nodeResourceLimits() { return nodeResourceLimits; } + public FlagSource flagSource() { return flagSource; } public MetricsDb metricsDb() { return metricsDb; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java index 40d1d50e0e8..c19d76efb35 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java @@ -9,7 +9,6 @@ import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceLimits; import java.util.List; import java.util.Optional; @@ -161,7 +160,7 @@ public class AllocatableClusterResources { Limits applicationLimits, List<NodeResources> availableRealHostResources, NodeRepository nodeRepository) { - var systemLimits = new NodeResourceLimits(nodeRepository); + var systemLimits = nodeRepository.nodeResourceLimits(); boolean exclusive = nodeRepository.exclusiveAllocation(clusterSpec); if (! exclusive) { // We decide resources: Add overhead to what we'll request (advertised) to make sure real becomes (at least) cappedNodeResources @@ -245,7 +244,7 @@ public class AllocatableClusterResources { Limits applicationLimits, boolean exclusive, boolean bestCase) { - var systemLimits = new NodeResourceLimits(nodeRepository); + var systemLimits = nodeRepository.nodeResourceLimits(); var advertisedResources = nodeRepository.resourcesCalculator().realToRequest(wantedResources.nodeResources(), exclusive, bestCase); advertisedResources = systemLimits.enlargeToLegal(advertisedResources, applicationId, clusterSpec, exclusive, true); // Ask for something legal advertisedResources = applicationLimits.cap(advertisedResources); // Overrides other conditions, even if it will then fail diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java index 8a9a29f58c6..2fa4fc82867 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java @@ -224,6 +224,7 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer { nodeRepository().zone().cloud().account(), false); List<Node> hosts = new ArrayList<>(); hostProvisioner.provisionHosts(request, + resources -> true, provisionedHosts -> { hosts.addAll(provisionedHosts.stream().map(host -> host.generateHost(Duration.ZERO)).toList()); nodeRepository().nodes().addNodes(hosts, Agent.HostCapacityMaintainer); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java index b16f2c5c17e..83a710cb5a9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java @@ -3,18 +3,21 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.NodeAllocationException; +import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; import java.time.Duration; import java.util.Objects; import java.util.Optional; import java.util.Random; +import java.util.function.Predicate; import java.util.logging.Level; /** @@ -63,16 +66,18 @@ public class HostFlavorUpgrader extends NodeRepositoryMaintainer { for (var node : activeNodes) { Optional<Node> parent = allNodes.parentOf(node); if (parent.isEmpty()) continue; - if (!hostProvisioner.canUpgradeFlavor(parent.get(), node)) continue; + Allocation allocation = node.allocation().get(); + Predicate<NodeResources> realHostResourcesWithinLimits = resources -> nodeRepository().nodeResourceLimits().isWithinRealLimits(resources, allocation.owner(), allocation.membership().cluster()); + if (!hostProvisioner.canUpgradeFlavor(parent.get(), node, realHostResourcesWithinLimits)) continue; if (parent.get().status().wantToUpgradeFlavor()) continue; // Already upgrading boolean redeployed = false; boolean deploymentValid = false; - try (MaintenanceDeployment deployment = new MaintenanceDeployment(node.allocation().get().owner(), deployer, metric, nodeRepository(), true)) { + try (MaintenanceDeployment deployment = new MaintenanceDeployment(allocation.owner(), deployer, metric, nodeRepository(), true)) { deploymentValid = deployment.isValid(); if (!deploymentValid) continue; - log.log(Level.INFO, () -> "Redeploying " + node.allocation().get().owner() + " to upgrade flavor (" + + log.log(Level.INFO, () -> "Redeploying " + allocation.owner() + " to upgrade flavor (" + parent.get().flavor().name() + ") of " + parent.get()); upgradeFlavor(parent.get(), true); deployment.activate(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java index dd838375a59..66d1a4e8bc8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java @@ -4,10 +4,12 @@ package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.config.provision.CloudAccount; import com.yahoo.config.provision.HostEvent; import com.yahoo.config.provision.NodeAllocationException; +import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.Node; import java.util.List; import java.util.function.Consumer; +import java.util.function.Predicate; /** * A service which supports provisioning container hosts dynamically. @@ -33,13 +35,14 @@ public interface HostProvisioner { * Schedule provisioning of a given number of hosts. * * @param request details of the host provision request. + * @param realHostResourcesWithinLimits predicate that returns true if the given resources are within allowed limits * @param whenProvisioned consumer of {@link ProvisionedHost}s describing the provisioned nodes, * the {@link Node} returned from {@link ProvisionedHost#generateHost} must be * written to ZK immediately in case the config server goes down while waiting * for the provisioning to finish. * @throws NodeAllocationException if the cloud provider cannot satisfy the request */ - void provisionHosts(HostProvisionRequest request, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException; + void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException; /** * Continue provisioning of given list of Nodes. @@ -75,6 +78,6 @@ public interface HostProvisioner { List<HostEvent> hostEventsIn(List<CloudAccount> cloudAccounts); /** Returns whether flavor for given host can be upgraded to a newer generation */ - boolean canUpgradeFlavor(Node host, Node child); + boolean canUpgradeFlavor(Node host, Node child, Predicate<NodeResources> realHostResourcesWithinLimits); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index 28caf26353b..b289a965567 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -83,7 +83,6 @@ class NodeAllocation { private final Supplier<Integer> nextIndex; private final NodeRepository nodeRepository; - private final NodeResourceLimits nodeResourceLimits; private final Optional<String> requiredHostFlavor; NodeAllocation(NodeList allNodes, ApplicationId application, ClusterSpec cluster, NodeSpec requested, @@ -94,7 +93,6 @@ class NodeAllocation { this.requested = requested; this.nextIndex = nextIndex; this.nodeRepository = nodeRepository; - this.nodeResourceLimits = new NodeResourceLimits(nodeRepository); this.requiredHostFlavor = Optional.of(PermanentFlags.HOST_FLAVOR.bindTo(nodeRepository.flagSource()) .with(FetchVector.Dimension.APPLICATION_ID, application.serializedForm()) .with(FetchVector.Dimension.CLUSTER_TYPE, cluster.type().name()) @@ -133,7 +131,7 @@ class NodeAllocation { } } else if (! saturated() && hasCompatibleResources(candidate)) { - if (! nodeResourceLimits.isWithinRealLimits(candidate, application, cluster)) { + if (! nodeRepository.nodeResourceLimits().isWithinRealLimits(candidate, application, cluster)) { ++rejectedDueToInsufficientRealResources; continue; } @@ -165,7 +163,7 @@ class NodeAllocation { boolean alreadyRetired = candidate.allocation().map(a -> a.membership().retired()).orElse(false); return alreadyRetired ? Retirement.alreadyRetired : Retirement.none; } - if ( ! nodeResourceLimits.isWithinRealLimits(candidate, application, cluster)) return Retirement.outsideRealLimits; + if ( ! nodeRepository.nodeResourceLimits().isWithinRealLimits(candidate, application, cluster)) return Retirement.outsideRealLimits; if (violatesParentHostPolicy(candidate)) return Retirement.violatesParentHostPolicy; if ( ! hasCompatibleResources(candidate)) return Retirement.incompatibleResources; if (candidate.parent.map(node -> node.status().wantToUpgradeFlavor()).orElse(false)) return Retirement.violatesHostFlavorGeneration; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java index bcc63a6704a..3d0c1069584 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java @@ -57,7 +57,6 @@ public class NodeRepositoryProvisioner implements Provisioner { private final Preparer preparer; private final Activator activator; private final Optional<LoadBalancerProvisioner> loadBalancerProvisioner; - private final NodeResourceLimits nodeResourceLimits; @Inject public NodeRepositoryProvisioner(NodeRepository nodeRepository, @@ -70,7 +69,6 @@ public class NodeRepositoryProvisioner implements Provisioner { this.zone = zone; this.loadBalancerProvisioner = provisionServiceProvider.getLoadBalancerService() .map(lbService -> new LoadBalancerProvisioner(nodeRepository, lbService)); - this.nodeResourceLimits = new NodeResourceLimits(nodeRepository); this.preparer = new Preparer(nodeRepository, provisionServiceProvider.getHostProvisioner(), loadBalancerProvisioner, @@ -115,8 +113,8 @@ public class NodeRepositoryProvisioner implements Provisioner { private void validate(ApplicationId application, ClusterSpec cluster, Capacity requested, ProvisionLogger logger) { if (cluster.group().isPresent()) throw new IllegalArgumentException("Node requests cannot specify a group"); - nodeResourceLimits.ensureWithinAdvertisedLimits("Min", requested.minResources().nodeResources(), application, cluster); - nodeResourceLimits.ensureWithinAdvertisedLimits("Max", requested.maxResources().nodeResources(), application, cluster); + nodeRepository.nodeResourceLimits().ensureWithinAdvertisedLimits("Min", requested.minResources().nodeResources(), application, cluster); + nodeRepository.nodeResourceLimits().ensureWithinAdvertisedLimits("Max", requested.maxResources().nodeResources(), application, cluster); if ( ! requested.minResources().nodeResources().gpuResources().equals(requested.maxResources().nodeResources().gpuResources())) throw new IllegalArgumentException(requested + " is invalid: Gpu capacity cannot have ranges"); @@ -126,7 +124,7 @@ public class NodeRepositoryProvisioner implements Provisioner { private void logInsufficientDiskResources(ClusterSpec cluster, Capacity requested, ProvisionLogger logger) { var resources = requested.minResources().nodeResources(); - if ( ! nodeResourceLimits.isWithinAdvertisedDiskLimits(resources, cluster)) { + if ( ! nodeRepository.nodeResourceLimits().isWithinAdvertisedDiskLimits(resources, cluster)) { logger.logApplicationPackage(Level.WARNING, "Requested disk (" + resources.diskGb() + "Gb) in " + cluster.id() + " is not large enough to fit " + "core/heap dumps. Minimum recommended disk resources " + diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java index 8c5a7b6c61e..47c388f97a8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java @@ -2,17 +2,14 @@ package com.yahoo.vespa.hosted.provision.provisioning; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.config.provision.Capacity; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; -import com.yahoo.config.provision.ProvisionLogger; import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.provision.NodeRepository; import java.util.Locale; -import java.util.logging.Level; /** * Defines the resource limits for nodes in various zones @@ -91,7 +88,7 @@ public class NodeResourceLimits { } // TODO: Move this check into the above when we are ready to fail, not just warn on this. */ - private double minAdvertisedDiskGb(NodeResources requested, ClusterSpec cluster) { + private static double minAdvertisedDiskGb(NodeResources requested, ClusterSpec cluster) { return requested.memoryGb() * switch (cluster.type()) { case combined, content -> 3; case container -> 2; @@ -111,16 +108,16 @@ public class NodeResourceLimits { return minAdvertisedVcpu(applicationId, cluster); } - private double minRealMemoryGb(ClusterSpec cluster) { + private static double minRealMemoryGb(ClusterSpec cluster) { if (cluster.type() == ClusterSpec.Type.admin) return 0.95; // TODO: Increase to 1.05 after March 2023 return 2.3; } - private double minRealDiskGb() { return 6; } + private static double minRealDiskGb() { return 6; } private Zone zone() { return nodeRepository.zone(); } - private void illegal(String type, String resource, String unit, ClusterSpec cluster, double requested, double minAllowed) { + private static void illegal(String type, String resource, String unit, ClusterSpec cluster, double requested, double minAllowed) { if ( ! unit.isEmpty()) unit = " " + unit; String message = String.format(Locale.ENGLISH, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java index 79b1bccbbde..349be9e4b47 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Optional; import java.util.function.Consumer; +import java.util.function.Predicate; import java.util.function.Supplier; import java.util.logging.Level; import java.util.logging.Logger; @@ -103,6 +104,10 @@ public class Preparer { allocation.offer(candidates); }; try { + if (throttler.throttle(allNodes, Agent.system)) { + throw new NodeAllocationException("Host provisioning is being throttled", true); + } + HostProvisionRequest request = new HostProvisionRequest(allocation.provisionIndices(deficit.count()), hostType, deficit.resources(), @@ -113,10 +118,8 @@ public class Preparer { Optional.of(cluster.id()), requested.cloudAccount(), deficit.dueToFlavorUpgrade()); - if (throttler.throttle(allNodes, Agent.system)) { - throw new NodeAllocationException("Host provisioning is being throttled", true); - } - hostProvisioner.get().provisionHosts(request, whenProvisioned); + Predicate<NodeResources> realHostResourcesWithinLimits = resources -> nodeRepository.nodeResourceLimits().isWithinRealLimits(resources, application, cluster); + hostProvisioner.get().provisionHosts(request, realHostResourcesWithinLimits, whenProvisioned); } catch (NodeAllocationException e) { // Mark the nodes that were written to ZK in the consumer for deprovisioning. While these hosts do // not exist, we cannot remove them from ZK here because other nodes may already have been diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java index e6a064c7bf5..965611b9a6e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java @@ -29,6 +29,7 @@ import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.function.Consumer; +import java.util.function.Predicate; import java.util.stream.IntStream; /** @@ -68,13 +69,14 @@ public class MockHostProvisioner implements HostProvisioner { } @Override - public void provisionHosts(HostProvisionRequest request, Consumer<List<ProvisionedHost>> whenProvisioned) { + public void provisionHosts(HostProvisionRequest request, Predicate<NodeResources> realHostResourcesWithinLimits, Consumer<List<ProvisionedHost>> whenProvisioned) throws NodeAllocationException { if (behaviour(Behaviour.failProvisionRequest)) throw new NodeAllocationException("No capacity for provision request", true); Flavor hostFlavor = hostFlavors.get(request.clusterType().orElse(ClusterSpec.Type.content)); if (hostFlavor == null) hostFlavor = flavors.stream() .filter(f -> request.sharing() == HostSharing.exclusive ? compatible(f, request.resources()) : f.resources().satisfies(request.resources())) + .filter(f -> realHostResourcesWithinLimits.test(f.resources())) .findFirst() .orElseThrow(() -> new NodeAllocationException("No host flavor matches " + request.resources(), true)); @@ -130,7 +132,7 @@ public class MockHostProvisioner implements HostProvisioner { } @Override - public boolean canUpgradeFlavor(Node host, Node child) { + public boolean canUpgradeFlavor(Node host, Node child, Predicate<NodeResources> realHostResourcesWithinLimits) { return upgradableFlavors.contains(host.flavor().name()); } @@ -248,7 +250,7 @@ public class MockHostProvisioner implements HostProvisioner { /** Fail call to {@link MockHostProvisioner#provision(com.yahoo.vespa.hosted.provision.Node)} */ failProvisioning, - /** Fail call to {@link MockHostProvisioner#provisionHosts(HostProvisionRequest, Consumer)} */ + /** Fail call to {@link MockHostProvisioner#provisionHosts(HostProvisionRequest, Predicate, Consumer)} */ failProvisionRequest, /** Fail call to {@link MockHostProvisioner#deprovision(com.yahoo.vespa.hosted.provision.Node)} */ |