diff options
Diffstat (limited to 'node-repository')
24 files changed, 353 insertions, 185 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index a435814c21e..3f5255c6618 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -2,16 +2,11 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterSpec; -import com.yahoo.vespa.hosted.provision.NodeList; -import com.yahoo.vespa.hosted.provision.applications.Cluster; import java.time.Duration; -import java.time.Instant; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.function.Predicate; -import java.util.stream.Collectors; /** * A list of metric snapshots from a cluster, sorted by increasing time (newest last). @@ -85,8 +80,8 @@ public class ClusterTimeseries { else return 0.0; // ... because load is stable } - if (queryRateNow() == 0) return 0.1; // Growth not expressible as a fraction of the current rate - return maxGrowthRate / queryRateNow(); + if (currentQueryRate() == 0) return 0.1; // Growth not expressible as a fraction of the current rate + return maxGrowthRate / currentQueryRate(); } /** The current query rate as a fraction of the peak rate in this timeseries */ @@ -97,12 +92,22 @@ public class ClusterTimeseries { return snapshots.get(snapshots.size() - 1).queryRate() / max; } + public double currentQueryRate() { + return queryRateAt(snapshots.size() - 1); + } + + public double currentWriteRate() { + return writeRateAt(snapshots.size() - 1); + } + private double queryRateAt(int index) { + if (snapshots.isEmpty()) return 0.0; return snapshots.get(index).queryRate(); } - private double queryRateNow() { - return queryRateAt(snapshots.size() - 1); + private double writeRateAt(int index) { + if (snapshots.isEmpty()) return 0.0; + return snapshots.get(index).writeRate(); } private Duration durationBetween(int startIndex, int endIndex) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java index ab6a6d548e9..35717b97cf4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java @@ -75,6 +75,8 @@ public class ResourceTarget { public static double idealCpuLoad(Duration scalingDuration, ClusterTimeseries clusterTimeseries, Application application) { + double queryCpuFraction = queryCpuFraction(clusterTimeseries); + // What's needed to have headroom for growth during scale-up as a fraction of current resources? double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(); // in fraction per minute of the current traffic double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes(); @@ -84,18 +86,40 @@ public class ResourceTarget { growthRateHeadroom = Math.min(growthRateHeadroom, 1 / fractionOfMax + 0.1); // How much headroom is needed to handle sudden arrival of additional traffic due to another zone going down? + double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share double trafficShiftHeadroom; if (application.status().maxReadShare() == 0) // No traffic fraction data trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic + else if (application.status().currentReadShare() == 0) + trafficShiftHeadroom = maxTrafficShiftHeadroom; else trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare(); + trafficShiftHeadroom = Math.min(trafficShiftHeadroom, maxTrafficShiftHeadroom); - if (trafficShiftHeadroom > 2.0) // The expectation that we have almost no load with almost no queries is incorrect due - trafficShiftHeadroom = 2.0; // to write traffic; once that is separated we can increase this threshold + // Assumptions: 1) Write load is not organic so we should not grow to handle more. + // (TODO: But allow applications to set their target write rate and size for that) + // 2) Write load does not change in BCP scenarios. + return queryCpuFraction * 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * idealQueryCpuLoad() + + (1 - queryCpuFraction) * idealWriteCpuLoad(); + } + + private static double queryCpuFraction(ClusterTimeseries clusterTimeseries) { + double queryRate = clusterTimeseries.currentQueryRate(); + double writeRate = clusterTimeseries.currentWriteRate(); + if (queryRate == 0 && writeRate == 0) return queryCpuFraction(0.5); + return queryCpuFraction(queryRate / (queryRate + writeRate)); + } - return 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * Resource.cpu.idealAverageLoad(); + private static double queryCpuFraction(double queryFraction) { + double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure + double writeFraction = 1 - queryFraction; + return queryFraction * relativeQueryCost / (queryFraction * relativeQueryCost + writeFraction); } + public static double idealQueryCpuLoad() { return Resource.cpu.idealAverageLoad(); } + + public static double idealWriteCpuLoad() { return 0.95; } + public static double idealMemoryLoad() { return Resource.memory.idealAverageLoad(); } public static double idealDiskLoad() { return Resource.disk.idealAverageLoad(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index 9df6af4d02a..6ff4e1cc20d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -6,7 +6,6 @@ import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Deployer; import com.yahoo.jdisc.Metric; -import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; @@ -14,17 +13,15 @@ import com.yahoo.vespa.hosted.provision.applications.Applications; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler; -import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; +import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.NodeTimeseries; import com.yahoo.vespa.hosted.provision.node.History; import java.time.Duration; import java.time.Instant; -import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.stream.Collectors; /** * Maintainer making automatic scaling decisions @@ -57,12 +54,12 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { boolean success = true; if ( ! nodeRepository().zone().environment().isProduction()) return success; - activeNodesByApplication().forEach((applicationId, nodes) -> autoscale(applicationId, nodes)); + activeNodesByApplication().forEach(this::autoscale); return success; } - private void autoscale(ApplicationId application, List<Node> applicationNodes) { - nodesByCluster(applicationNodes).forEach((clusterId, clusterNodes) -> autoscale(application, clusterId, NodeList.copyOf(clusterNodes))); + private void autoscale(ApplicationId application, NodeList applicationNodes) { + nodesByCluster(applicationNodes).forEach((clusterId, clusterNodes) -> autoscale(application, clusterId, clusterNodes)); } private void autoscale(ApplicationId applicationId, ClusterSpec.Id clusterId, NodeList clusterNodes) { @@ -143,8 +140,8 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { return r + " (total: " + r.totalResources() + ")"; } - private Map<ClusterSpec.Id, List<Node>> nodesByCluster(List<Node> applicationNodes) { - return applicationNodes.stream().collect(Collectors.groupingBy(n -> n.allocation().get().membership().cluster().id())); + private Map<ClusterSpec.Id, NodeList> nodesByCluster(NodeList applicationNodes) { + return applicationNodes.groupingBy(n -> n.allocation().get().membership().cluster().id()); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index d0c02d7baaf..55548e70ddd 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -90,13 +90,14 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { /** Resume provisioning of already provisioned hosts and their children */ private void resumeProvisioning(NodeList nodes, Mutex lock) { - Map<String, Set<Node>> nodesByProvisionedParentHostname = nodes.nodeType(NodeType.tenant, NodeType.config).asList().stream() - .filter(node -> node.parentHostname().isPresent()) - .collect(Collectors.groupingBy( - node -> node.parentHostname().get(), - Collectors.toSet())); - - nodes.state(Node.State.provisioned).nodeType(NodeType.host, NodeType.confighost).forEach(host -> { + Map<String, Set<Node>> nodesByProvisionedParentHostname = + nodes.nodeType(NodeType.tenant, NodeType.config, NodeType.controller) + .asList() + .stream() + .filter(node -> node.parentHostname().isPresent()) + .collect(Collectors.groupingBy(node -> node.parentHostname().get(), Collectors.toSet())); + + nodes.state(Node.State.provisioned).nodeType(NodeType.host, NodeType.confighost, NodeType.controllerhost).forEach(host -> { Set<Node> children = nodesByProvisionedParentHostname.getOrDefault(host.hostname(), Set.of()); try { List<Node> updatedNodes = hostProvisioner.provision(host, children); @@ -189,6 +190,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { // TODO: Mark empty tenant hosts as wanttoretire & wanttodeprovision elsewhere, then handle as confighost here return node.state() != Node.State.parked || node.status().wantToDeprovision(); case confighost: + case controllerhost: return node.state() == Node.State.parked && node.status().wantToDeprovision(); default: return false; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index e6338d73a17..025c8be449c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -7,13 +7,12 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import java.time.Clock; import java.time.Duration; -import java.util.List; import java.util.Map; -import java.util.stream.Collectors; /** * A maintainer is some job which runs at a fixed rate to perform some maintenance task on the node repo. @@ -41,13 +40,12 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { protected Clock clock() { return nodeRepository.clock(); } /** A utility to group active tenant nodes by application */ - protected Map<ApplicationId, List<Node>> activeNodesByApplication() { - return nodeRepository().nodes().list(Node.State.active) + protected Map<ApplicationId, NodeList> activeNodesByApplication() { + return nodeRepository().nodes() + .list(Node.State.active) .nodeType(NodeType.tenant) - .asList() - .stream() - .filter(node -> ! node.allocation().get().owner().instance().isTester()) - .collect(Collectors.groupingBy(node -> node.allocation().get().owner())); + .matching(node -> ! node.allocation().get().owner().instance().isTester()) + .groupingBy(node -> node.allocation().get().owner()); } private static JobMetrics jobMetrics(Metric metric) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java index 49a33c4d120..f620a6d113d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java @@ -5,7 +5,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; -import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; @@ -13,7 +13,6 @@ import com.yahoo.vespa.hosted.provision.node.History; import java.time.Duration; import java.time.Instant; import java.util.LinkedHashSet; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -38,15 +37,14 @@ public class OperatorChangeApplicationMaintainer extends ApplicationMaintainer { @Override protected Set<ApplicationId> applicationsNeedingMaintenance() { - Map<ApplicationId, List<Node>> nodesByApplication = nodeRepository().nodes().list() - .nodeType(NodeType.tenant, NodeType.proxy).asList().stream() - .filter(node -> node.allocation().isPresent()) - .collect(Collectors.groupingBy(node -> node.allocation().get().owner(), Collectors.toList())); - + Map<ApplicationId, NodeList> nodesByApplication = nodeRepository().nodes().list() + .nodeType(NodeType.tenant, NodeType.proxy) + .matching(node -> node.allocation().isPresent()) + .groupingBy(node -> node.allocation().get().owner()); return nodesByApplication.entrySet().stream() - .filter(entry -> hasNodesWithChanges(entry.getKey(), entry.getValue())) - .map(Map.Entry::getKey) - .collect(Collectors.toCollection(LinkedHashSet::new)); + .filter(entry -> hasNodesWithChanges(entry.getKey(), entry.getValue())) + .map(Map.Entry::getKey) + .collect(Collectors.toCollection(LinkedHashSet::new)); } /** @@ -61,15 +59,15 @@ public class OperatorChangeApplicationMaintainer extends ApplicationMaintainer { " as a manual change was made to its nodes"); } - private boolean hasNodesWithChanges(ApplicationId applicationId, List<Node> nodes) { + private boolean hasNodesWithChanges(ApplicationId applicationId, NodeList nodes) { Optional<Instant> lastDeployTime = deployer().lastDeployTime(applicationId); if (lastDeployTime.isEmpty()) return false; return nodes.stream() - .flatMap(node -> node.history().events().stream()) - .filter(event -> event.agent() == Agent.operator) - .map(History.Event::at) - .anyMatch(e -> lastDeployTime.get().isBefore(e)); + .flatMap(node -> node.history().events().stream()) + .filter(event -> event.agent() == Agent.operator) + .map(History.Event::at) + .anyMatch(e -> lastDeployTime.get().isBefore(e)); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index 1274e83fb3a..f72daf1bc2b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -50,15 +50,10 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); - - Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream() - .filter(node -> node.allocation().isPresent()) - .filter(node -> node.allocation().get().membership().retired()) - .collect(Collectors.groupingBy(node -> node.allocation().get().owner())); - - for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) { + Map<ApplicationId, NodeList> retiredNodesByApplication = activeNodes.retired().groupingBy(node -> node.allocation().get().owner()); + for (Map.Entry<ApplicationId, NodeList> entry : retiredNodesByApplication.entrySet()) { ApplicationId application = entry.getKey(); - List<Node> retiredNodes = entry.getValue(); + NodeList retiredNodes = entry.getValue(); List<Node> nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList()); if (nodesToRemove.isEmpty()) continue; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java index 3d6130c4116..e2b89879141 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java @@ -7,7 +7,6 @@ import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; -import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; @@ -17,10 +16,8 @@ import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import java.time.Duration; -import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.stream.Collectors; /** * Maintainer computing scaling suggestions for all clusters @@ -49,10 +46,10 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer { return successes > 0; } - private int suggest(ApplicationId application, List<Node> applicationNodes) { + private int suggest(ApplicationId application, NodeList applicationNodes) { int successes = 0; for (var cluster : nodesByCluster(applicationNodes).entrySet()) - successes += suggest(application, cluster.getKey(), NodeList.copyOf(cluster.getValue())) ? 1 : 0; + successes += suggest(application, cluster.getKey(), cluster.getValue()) ? 1 : 0; return successes; } @@ -99,8 +96,8 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer { return r1.totalResources().cost() > r2.totalResources().cost(); } - private Map<ClusterSpec.Id, List<Node>> nodesByCluster(List<Node> applicationNodes) { - return applicationNodes.stream().collect(Collectors.groupingBy(n -> n.allocation().get().membership().cluster().id())); + private Map<ClusterSpec.Id, NodeList> nodesByCluster(NodeList applicationNodes) { + return applicationNodes.groupingBy(n -> n.allocation().get().membership().cluster().id()); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java index 3c936e4e6ba..49eb44a4ec0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -100,13 +101,12 @@ class Activator { Optional<Application> application = nodeRepository.applications().get(transaction.application()); if (application.isEmpty()) return; // infrastructure app, hopefully :-| - var currentNodesByCluster = newNodes.stream() - .collect(Collectors.groupingBy(node -> node.allocation().get().membership().cluster().id())); + Map<ClusterSpec.Id, NodeList> currentNodesByCluster = newNodes.groupingBy(node -> node.allocation().get().membership().cluster().id()); Application modified = application.get(); for (var clusterEntry : currentNodesByCluster.entrySet()) { var cluster = modified.cluster(clusterEntry.getKey()).get(); var previousResources = oldNodes.cluster(clusterEntry.getKey()).toResources(); - var currentResources = NodeList.copyOf(clusterEntry.getValue()).toResources(); + var currentResources = clusterEntry.getValue().toResources(); if ( ! previousResources.justNumbers().equals(currentResources.justNumbers())) { cluster = cluster.with(ScalingEvent.create(previousResources, currentResources, generation, at)); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index 597c4c1bd8c..0f725e6447a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -92,7 +92,7 @@ public class GroupPreparer { allocateOsRequirement); NodeType hostType = allocation.nodeType().hostType(); boolean hostTypeSupportsDynamicProvisioning = hostType == NodeType.host || - (hostType == NodeType.confighost && + (hostType.isConfigServerHostLike() && provisionConfigServerDynamically.value()); if (nodeRepository.zone().getCloud().dynamicProvisioning() && hostTypeSupportsDynamicProvisioning) { final Version osVersion; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java index b1bba656dc8..499eb3f23c0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java @@ -82,7 +82,7 @@ public class LoadBalancerProvisioner { if (application.instance().isTester()) return; // Do not provision for tester instances try (var lock = db.lock(application)) { ClusterSpec.Id clusterId = effectiveId(cluster); - List<Node> nodes = nodesOf(clusterId, application); + NodeList nodes = nodesOf(clusterId, application); LoadBalancerId loadBalancerId = requireNonClashing(new LoadBalancerId(application, clusterId)); ApplicationTransaction transaction = new ApplicationTransaction(new ProvisionLock(application, lock), new NestedTransaction()); provision(transaction, loadBalancerId, nodes, false); @@ -167,7 +167,7 @@ public class LoadBalancerProvisioner { } /** Idempotently provision a load balancer for given application and cluster */ - private void provision(ApplicationTransaction transaction, LoadBalancerId id, List<Node> nodes, boolean activate) { + private void provision(ApplicationTransaction transaction, LoadBalancerId id, NodeList nodes, boolean activate) { Instant now = nodeRepository.clock().instant(); Optional<LoadBalancer> loadBalancer = db.readLoadBalancer(id); if (loadBalancer.isEmpty() && activate) return; // Nothing to activate as this load balancer was never prepared @@ -185,7 +185,7 @@ public class LoadBalancerProvisioner { db.writeLoadBalancers(List.of(newLoadBalancer), transaction.nested()); } - private void provision(ApplicationTransaction transaction, ClusterSpec.Id clusterId, List<Node> nodes) { + private void provision(ApplicationTransaction transaction, ClusterSpec.Id clusterId, NodeList nodes) { provision(transaction, new LoadBalancerId(transaction.application(), clusterId), nodes, true); } @@ -204,12 +204,12 @@ public class LoadBalancerProvisioner { } /** Returns the nodes allocated to the given load balanced cluster */ - private List<Node> nodesOf(ClusterSpec.Id loadBalancedCluster, ApplicationId application) { - return loadBalancedClustersOf(application).getOrDefault(loadBalancedCluster, List.of()); + private NodeList nodesOf(ClusterSpec.Id loadBalancedCluster, ApplicationId application) { + return loadBalancedClustersOf(application).getOrDefault(loadBalancedCluster, NodeList.copyOf(List.of())); } /** Returns the load balanced clusters of given application and their nodes */ - private Map<ClusterSpec.Id, List<Node>> loadBalancedClustersOf(ApplicationId application) { + private Map<ClusterSpec.Id, NodeList> loadBalancedClustersOf(ApplicationId application) { NodeList nodes = nodeRepository.nodes().list(Node.State.reserved, Node.State.active).owner(application); if (nodes.stream().anyMatch(node -> node.type() == NodeType.config)) { nodes = nodes.nodeType(NodeType.config).type(ClusterSpec.Type.admin); @@ -218,11 +218,11 @@ public class LoadBalancerProvisioner { } else { nodes = nodes.nodeType(NodeType.tenant).container(); } - return nodes.stream().collect(Collectors.groupingBy(node -> effectiveId(node.allocation().get().membership().cluster()))); + return nodes.groupingBy(node -> effectiveId(node.allocation().get().membership().cluster())); } /** Returns real servers for given nodes */ - private Set<Real> realsOf(List<Node> nodes) { + private Set<Real> realsOf(NodeList nodes) { var reals = new LinkedHashSet<Real>(); for (var node : nodes) { for (var ip : reachableIpAddresses(node)) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index 19c8d68963a..cd5355befbe 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -296,8 +296,8 @@ class NodeAllocation { * flavor and host count required to cover the deficit. */ Optional<HostDeficit> hostDeficit() { - if (nodeType() != NodeType.config && nodeType() != NodeType.tenant) { - return Optional.empty(); // Requests for these node types never have a deficit + if (nodeType().isHost()) { + return Optional.empty(); // Hosts are provisioned as required by the child application } return Optional.of(new HostDeficit(requestedNodes.resources().orElseGet(NodeResources::unspecified), requestedNodes.fulfilledDeficitCount(accepted()))) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java index 3ff4765dd00..482f0f2e011 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java @@ -179,7 +179,9 @@ public interface NodeSpec { /** A node spec specifying a node type. This will accept all nodes of this type. */ class TypeNodeSpec implements NodeSpec { - private static final Map<NodeType, Integer> WANTED_NODE_COUNT = Map.of(NodeType.config, 3); + private static final Map<NodeType, Integer> WANTED_NODE_COUNT = Map.of( + NodeType.config, 3, + NodeType.controller, 3); private final NodeType type; @@ -207,10 +209,8 @@ public interface NodeSpec { @Override public int idealRetiredCount(int acceptedCount, int currentRetiredCount) { - // All nodes marked with wantToRetire get marked as retired just before this function is called, - // the job of this function is to throttle the retired count. If no nodes are marked as retired - // then continue this way, otherwise allow only 1 node to be retired - return Math.min(1, currentRetiredCount); + // All nodes marked with wantToRetire get marked as retired just before this function is called + return currentRetiredCount; } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index baf7d2dbe15..650bfe761b5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -15,7 +15,6 @@ import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.Nodelike; -import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; import org.junit.Test; @@ -54,6 +53,7 @@ public class AutoscalingTest { assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); tester.clock().advance(Duration.ofDays(1)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high", 14, 1, 1.4, 30.8, 30.8, @@ -93,6 +93,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, resources); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); ClusterResources scaledResources = tester.assertResources("Scaling up since cpu usage is too high", @@ -122,6 +123,7 @@ public class AutoscalingTest { .allMatch(n -> n.allocation().get().requestedResources().diskSpeed() == NodeResources.DiskSpeed.slow); tester.clock().advance(Duration.ofDays(2)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); // Changing min and max from slow to any ClusterResources min = new ClusterResources( 2, 1, @@ -181,6 +183,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, new NodeResources(1.9, 70, 70, 1)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMeasurements(0.25f, 0.95f, 0.95f, 0, 120, application1); tester.assertResources("Scaling up to limit since resource usage is too high", 6, 1, 2.4, 78.0, 79.0, @@ -217,6 +220,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 5, new NodeResources(3.0, 10, 10, 1)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements( 0.3f, 1f, 240, application1); tester.assertResources("Scaling up since resource usage is too high", 6, 6, 3.6, 8.0, 10.0, @@ -252,6 +256,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, resources); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); tester.assertResources("Scaling up since resource usage is too high", 7, 1, 2.5, 80.0, 80.0, @@ -304,6 +309,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 5, resources); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); tester.assertResources("Scaling up since resource usage is too high", 7, 7, 2.5, 80.0, 80.0, @@ -322,6 +328,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 2, resources); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); tester.assertResources("Scaling up since resource usage is too high, changing to 1 group is cheaper", 8, 1, 2.7, 83.3, 83.3, @@ -341,6 +348,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 2, new NodeResources(10, 100, 100, 1)); tester.clock().advance(Duration.ofDays(1)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMemMeasurements(1.0f, 1f, 1000, application1); tester.assertResources("Increase group size to reduce memory load", 8, 2, 13.6, 89.3, 62.5, @@ -360,6 +368,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2)); tester.clock().advance(Duration.ofDays(2)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMemMeasurements(0.02f, 0.95f, 120, application1); tester.assertResources("Scaling down", 6, 1, 2.9, 4.0, 95.0, @@ -377,6 +386,7 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1"); tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only // No autoscaling as it is too soon to scale down after initial deploy (counting as a scaling event) tester.addMemMeasurements(0.02f, 0.95f, 120, application1); @@ -391,7 +401,7 @@ public class AutoscalingTest { } @Test - public void real_resources_are_taken_into_account() { + public void test_autoscaling_considers_real_resources() { NodeResources hostResources = new NodeResources(60, 100, 1000, 10); ClusterResources min = new ClusterResources(2, 1, new NodeResources( 2, 20, 200, 1)); ClusterResources max = new ClusterResources(4, 1, new NodeResources(60, 100, 1000, 1)); @@ -403,6 +413,7 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1"); tester.deploy(application1, cluster1, min); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMeasurements(1.0f, 1.0f, 0.7f, 0, 1000, application1); tester.assertResources("Scaling up", 4, 1, 7.4, 20, 200, @@ -416,6 +427,7 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1"); tester.deploy(application1, cluster1, min); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMeasurements(1.0f, 1.0f, 0.7f, 0, 1000, application1); tester.assertResources("Scaling up", 4, 1, 7.4, 34, 200, @@ -455,6 +467,7 @@ public class AutoscalingTest { tester.deactivateRetired(application1, cluster1, scaledResources); tester.clock().advance(Duration.ofDays(2)); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMemMeasurements(0.3f, 0.6f, 1000, application1); tester.assertResources("Scaling down since resource usage has gone down", 6, 1, 3, 83, 28.8, @@ -472,6 +485,7 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); tester.deploy(application1, cluster1, 5, 1, resources); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); // (no read share stored) @@ -502,6 +516,7 @@ public class AutoscalingTest { ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); tester.deploy(application1, cluster1, 5, 1, resources); + tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); // (no query rate data) @@ -529,6 +544,50 @@ public class AutoscalingTest { } @Test + public void test_autoscaling_considers_query_vs_write_rate() { + NodeResources minResources = new NodeResources( 1, 100, 100, 1); + NodeResources midResources = new NodeResources( 5, 100, 100, 1); + NodeResources maxResources = new NodeResources(10, 100, 100, 1); + ClusterResources min = new ClusterResources(5, 1, minResources); + ClusterResources max = new ClusterResources(5, 1, maxResources); + AutoscalingTester tester = new AutoscalingTester(maxResources.withVcpu(maxResources.vcpu() * 2)); + + ApplicationId application1 = tester.applicationId("application1"); + ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); + + tester.deploy(application1, cluster1, 5, 1, midResources); + tester.addCpuMeasurements(0.4f, 1f, 120, application1); + + // Why twice the query rate at time = 0? + // This makes headroom for queries doubling, which we want to observe the effect of here + + tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t -> 10.0); + tester.assertResources("Query and write load is equal -> scale up somewhat", + 5, 1, 7.3, 100, 100, + tester.autoscale(application1, cluster1.id(), min, max).target()); + + tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 100.0 : 50.0, t -> 10.0); + tester.assertResources("Query load is 5x write load -> scale up more", + 5, 1, 9.7, 100, 100, + tester.autoscale(application1, cluster1.id(), min, max).target()); + + tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t -> 100.0); + tester.assertResources("Write load is 10x query load -> scale down", + 5, 1, 3.8, 100, 100, + tester.autoscale(application1, cluster1.id(), min, max).target()); + + tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t-> 0.0); + tester.assertResources("Query only -> largest possible", + 5, 1, 10.0, 100, 100, + tester.autoscale(application1, cluster1.id(), min, max).target()); + + tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> 0.0, t -> 10.0); + tester.assertResources("Write only -> smallest possible", + 5, 1, 2.1, 100, 100, + tester.autoscale(application1, cluster1.id(), min, max).target()); + } + + @Test public void test_cd_autoscaling_test() { NodeResources resources = new NodeResources(1, 4, 50, 1); ClusterResources min = new ClusterResources( 2, 1, resources); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java index 1949a6116d8..e24146d4752 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java @@ -236,6 +236,19 @@ class AutoscalingTester { } /** Creates the given number of measurements, spaced 5 minutes between, using the given function */ + public void addLoadMeasurements(ApplicationId application, + ClusterSpec.Id cluster, + int measurements, + IntFunction<Double> queryRate, + IntFunction<Double> writeRate) { + Instant time = clock().instant(); + for (int i = 0; i < measurements; i++) { + db.addClusterMetrics(application, Map.of(cluster, new ClusterMetricSnapshot(time, queryRate.apply(i), writeRate.apply(i)))); + time = time.plus(Duration.ofMinutes(5)); + } + } + + /** Creates the given number of measurements, spaced 5 minutes between, using the given function */ public void addQueryRateMeasurements(ApplicationId application, ClusterSpec.Id cluster, int measurements, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java new file mode 100644 index 00000000000..f616e3e8b9d --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java @@ -0,0 +1,75 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.autoscale; + +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ClusterResources; +import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.test.ManualClock; +import com.yahoo.vespa.hosted.provision.applications.Application; +import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.applications.Status; +import org.junit.Test; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.function.IntFunction; + +import static org.junit.Assert.assertEquals; + +/** + * @author bratseth + */ +public class ResourceTargetTest { + + private static final double delta = 0.001; + + @Test + public void test_traffic_headroom() { + Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); + Cluster cluster = new Cluster(ClusterSpec.Id.from("test"), + false, + new ClusterResources(5, 1, new NodeResources(1, 10, 100, 1)), + new ClusterResources(5, 1, new NodeResources(1, 10, 100, 1)), + Optional.empty(), + Optional.empty(), + List.of(), + ""); + application = application.with(cluster); + + // No current traffic: Ideal load is low but capped + application = application.with(new Status(0.0, 1.0)); + assertEquals(0.131, + ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), + new ClusterTimeseries(cluster.id(), + loadSnapshots(100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0)), + application), + delta); + + // Almost current traffic: Ideal load is low but capped + application = application.with(new Status(0.0001, 1.0)); + assertEquals(0.131, + ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), + new ClusterTimeseries(cluster.id(), + loadSnapshots(100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0)), + application), + delta); + } + + + /** Creates the given number of measurements, spaced 5 minutes between, using the given function */ + private List<ClusterMetricSnapshot> loadSnapshots(int measurements, + IntFunction<Double> queryRate, + IntFunction<Double> writeRate) { + List<ClusterMetricSnapshot> snapshots = new ArrayList<>(measurements); + ManualClock clock = new ManualClock(); + for (int i = 0; i < measurements; i++) { + snapshots.add(new ClusterMetricSnapshot(clock.instant(), queryRate.apply(i), writeRate.apply(i))); + clock.advance(Duration.ofMinutes(5)); + } + return snapshots; + } + +} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java index 0c1a59c883d..f292ab8ccf1 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java @@ -152,6 +152,7 @@ public class AutoscalingMaintainerTest { // deploy tester.deploy(app1, cluster1, app1Capacity); + tester.addQueryRateMeasurements(app1, cluster1.id(), 12, t -> t == 0 ? 20.0 : 10.0); for (int i = 0; i < 20; i++) { // Record completion to keep scaling window at minimum diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java index e8cfe6a2310..755f7608cd9 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java @@ -16,6 +16,7 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.autoscale.ClusterMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; @@ -27,6 +28,7 @@ import java.time.Instant; import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.function.IntFunction; import java.util.stream.Collectors; /** @@ -85,6 +87,18 @@ public class AutoscalingMaintainerTester { } } + /** Creates the given number of measurements, spaced 5 minutes between, using the given function */ + public void addQueryRateMeasurements(ApplicationId application, + ClusterSpec.Id cluster, + int measurements, + IntFunction<Double> queryRate) { + Instant time = clock().instant(); + for (int i = 0; i < measurements; i++) { + metricsDb.addClusterMetrics(application, Map.of(cluster, new ClusterMetricSnapshot(time, queryRate.apply(i), 0.0))); + time = time.plus(Duration.ofMinutes(5)); + } + } + public Cluster cluster(ApplicationId application, ClusterSpec cluster) { return nodeRepository().applications().get(application).get().cluster(cluster.id()).get(); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java index 076a0e24620..48a6e03f646 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java @@ -36,6 +36,8 @@ import com.yahoo.vespa.hosted.provision.testutils.MockHostProvisioner; import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; import com.yahoo.vespa.service.duper.ConfigServerApplication; import com.yahoo.vespa.service.duper.ConfigServerHostApplication; +import com.yahoo.vespa.service.duper.ControllerApplication; +import com.yahoo.vespa.service.duper.ControllerHostApplication; import org.junit.Test; import java.time.Duration; @@ -421,6 +423,30 @@ public class DynamicProvisioningMaintainerTest { @Test public void replace_config_server() { + replace_config_server_like(NodeType.confighost); + } + + @Test + public void replace_controller() { + replace_config_server_like(NodeType.controllerhost); + } + + public void replace_config_server_like(NodeType hostType) { + final ApplicationId hostApp; + final ApplicationId configSrvApp; + switch (hostType) { + case confighost: + hostApp = new ConfigServerHostApplication().getApplicationId(); + configSrvApp = new ConfigServerApplication().getApplicationId(); + break; + case controllerhost: + hostApp = new ControllerHostApplication().getApplicationId(); + configSrvApp = new ControllerApplication().getApplicationId(); + break; + default: + throw new IllegalArgumentException("Unexpected config server host like node type: " + hostType); + } + Cloud cloud = Cloud.builder().dynamicProvisioning(true).build(); DynamicProvisioningTester dynamicProvisioningTester = new DynamicProvisioningTester(cloud, new MockNameResolver().mockAnyLookup()); ProvisioningTester tester = dynamicProvisioningTester.provisioningTester; @@ -428,24 +454,22 @@ public class DynamicProvisioningMaintainerTest { dynamicProvisioningTester.flagSource.withBooleanFlag(Flags.DYNAMIC_CONFIG_SERVER_PROVISIONING.id(), true); // Initial config server hosts are provisioned manually - ApplicationId hostApp = ApplicationId.from("hosted-vespa", "configserver-host", "default"); - List<Node> provisionedHosts = tester.makeReadyNodes(3, "default", NodeType.confighost).stream() + List<Node> provisionedHosts = tester.makeReadyNodes(3, "default", hostType).stream() .sorted(Comparator.comparing(Node::hostname)) .collect(Collectors.toList()); - tester.prepareAndActivateInfraApplication(hostApp, NodeType.confighost); + tester.prepareAndActivateInfraApplication(hostApp, hostType); // Provision config servers - ApplicationId configSrvApp = ApplicationId.from("hosted-vespa", "zone-config-servers", "default"); for (int i = 0; i < provisionedHosts.size(); i++) { - tester.makeReadyChildren(1, i + 1, NodeResources.unspecified(), NodeType.config, - provisionedHosts.get(i).hostname(), (nodeIndex) -> "cfg" + nodeIndex); + tester.makeReadyChildren(1, i + 1, NodeResources.unspecified(), hostType.childNodeType(), + provisionedHosts.get(i).hostname(), (nodeIndex) -> "cfg" + nodeIndex); } - tester.prepareAndActivateInfraApplication(configSrvApp, NodeType.config); + tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); // Expected number of hosts and children are provisioned NodeList allNodes = tester.nodeRepository().nodes().list(); - NodeList configHosts = allNodes.nodeType(NodeType.confighost); - NodeList configNodes = allNodes.nodeType(NodeType.config); + NodeList configHosts = allNodes.nodeType(hostType); + NodeList configNodes = allNodes.nodeType(hostType.childNodeType()); assertEquals(3, configHosts.size()); assertEquals(3, configNodes.size()); String hostnameToRemove = provisionedHosts.get(1).hostname(); @@ -456,20 +480,20 @@ public class DynamicProvisioningMaintainerTest { tester.nodeRepository().nodes().deprovision(hostToRemove.get(), Agent.system, tester.clock().instant()); // Redeployment of config server application retires node - tester.prepareAndActivateInfraApplication(configSrvApp, NodeType.config); + tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); assertTrue("Redeployment retires node", nodeToRemove.get().allocation().get().membership().retired()); // Config server becomes removable (done by RetiredExpirer in a real system) and redeployment moves it // to inactive tester.nodeRepository().nodes().setRemovable(configSrvApp, List.of(nodeToRemove.get())); - tester.prepareAndActivateInfraApplication(configSrvApp, NodeType.config); + tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); assertEquals("Node moves to inactive", Node.State.inactive, nodeToRemove.get().state()); // Node is completely removed (done by InactiveExpirer and host-admin in a real system) Node inactiveConfigServer = nodeToRemove.get(); int removedIndex = inactiveConfigServer.allocation().get().membership().index(); tester.nodeRepository().nodes().removeRecursively(inactiveConfigServer, true); - assertEquals(2, tester.nodeRepository().nodes().list().nodeType(NodeType.config).size()); + assertEquals(2, tester.nodeRepository().nodes().list().nodeType(hostType.childNodeType()).size()); // ExpiredRetirer moves host to inactive after child has moved to parked tester.nodeRepository().nodes().deallocate(hostToRemove.get(), Agent.system, getClass().getSimpleName()); @@ -477,38 +501,38 @@ public class DynamicProvisioningMaintainerTest { // Host is removed dynamicProvisioningTester.maintainer.maintain(); - assertEquals(2, tester.nodeRepository().nodes().list().nodeType(NodeType.confighost).size()); + assertEquals(2, tester.nodeRepository().nodes().list().nodeType(hostType).size()); // Deployment by the removed host has no effect HostName.setHostNameForTestingOnly("cfg2.example.com"); - tester.prepareAndActivateInfraApplication(configSrvApp, NodeType.config); + tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); assertEquals(List.of(), dynamicProvisioningTester.hostProvisioner.provisionedHosts()); // Deployment on another config server starts provisioning a new host and child HostName.setHostNameForTestingOnly("cfg3.example.com"); - assertEquals(0, tester.nodeRepository().nodes().list(Node.State.reserved).nodeType(NodeType.config).size()); - assertEquals(2, tester.prepareAndActivateInfraApplication(configSrvApp, NodeType.config).size()); - assertEquals(1, tester.nodeRepository().nodes().list(Node.State.reserved).nodeType(NodeType.config).size()); - Node newNode = tester.nodeRepository().nodes().list(Node.State.reserved).nodeType(NodeType.config).first().get(); + assertEquals(0, tester.nodeRepository().nodes().list(Node.State.reserved).nodeType(hostType.childNodeType()).size()); + assertEquals(2, tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()).size()); + assertEquals(1, tester.nodeRepository().nodes().list(Node.State.reserved).nodeType(hostType.childNodeType()).size()); + Node newNode = tester.nodeRepository().nodes().list(Node.State.reserved).nodeType(hostType.childNodeType()).first().get(); // Resume provisioning and activate host dynamicProvisioningTester.maintainer.maintain(); List<ProvisionedHost> newHosts = dynamicProvisioningTester.hostProvisioner.provisionedHosts(); assertEquals(1, newHosts.size()); tester.nodeRepository().nodes().setReady(newHosts.get(0).hostHostname(), Agent.operator, getClass().getSimpleName()); - tester.prepareAndActivateInfraApplication(hostApp, NodeType.confighost); - assertEquals(3, tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.confighost).size()); + tester.prepareAndActivateInfraApplication(hostApp, hostType); + assertEquals(3, tester.nodeRepository().nodes().list(Node.State.active).nodeType(hostType).size()); // Redeployment of config server app actives new node - tester.prepareAndActivateInfraApplication(configSrvApp, NodeType.config); + tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); newNode = tester.nodeRepository().nodes().node(newNode.hostname()).get(); assertSame(Node.State.active, newNode.state()); assertEquals("Removed index is reused", removedIndex, newNode.allocation().get().membership().index()); // Next redeployment does nothing - NodeList nodesBefore = tester.nodeRepository().nodes().list().nodeType(NodeType.config); - tester.prepareAndActivateInfraApplication(configSrvApp, NodeType.config); - NodeList nodesAfter = tester.nodeRepository().nodes().list().nodeType(NodeType.config); + NodeList nodesBefore = tester.nodeRepository().nodes().list().nodeType(hostType.childNodeType()); + tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); + NodeList nodesAfter = tester.nodeRepository().nodes().list().nodeType(hostType.childNodeType()); assertEquals(nodesBefore, nodesAfter); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java index 718facd477c..924d38cc6c2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirerTest.java @@ -42,7 +42,6 @@ import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; @@ -191,25 +190,20 @@ public class RetiredExpirerTest { // Redeploy to retire all 3 config servers infraDeployer.activateAllSupportedInfraApplications(true); + List<Node> retiredNodes = tester.nodeRepository().nodes().list().retired().asList(); + assertEquals(3, retiredNodes.size()); - // Only 1 config server is allowed to retire at any given point in time - List<Node> retiredNodes = tester.nodeRepository().nodes().list(() -> {}).stream() - .filter(node -> node.allocation().map(allocation -> allocation.membership().retired()).orElse(false)) - .collect(Collectors.toList()); - assertEquals(1, retiredNodes.size()); - Node retiredNode = retiredNodes.get(0); - String retiredNodeHostname = retiredNode.hostname(); - - // Allow retiredNodeHostname to be removed + // The Orchestrator will allow only 1 to be removed, say cfg1 + Node retiredNode = tester.nodeRepository().nodes().node(cfg1.s()).orElseThrow(); doThrow(new OrchestrationException("denied")).when(orchestrator).acquirePermissionToRemove(any()); - doNothing().when(orchestrator).acquirePermissionToRemove(eq(new HostName(retiredNodeHostname))); + doNothing().when(orchestrator).acquirePermissionToRemove(eq(new HostName(retiredNode.hostname()))); // RetiredExpirer should remove cfg1 from application RetiredExpirer retiredExpirer = createRetiredExpirer(deployer); retiredExpirer.run(); var activeConfigServerHostnames = new HashSet<>(Set.of("cfg1", "cfg2", "cfg3")); - assertTrue(activeConfigServerHostnames.contains(retiredNodeHostname)); - activeConfigServerHostnames.remove(retiredNodeHostname); + assertTrue(activeConfigServerHostnames.contains(retiredNode.hostname())); + activeConfigServerHostnames.remove(retiredNode.hostname()); assertEquals(activeConfigServerHostnames, configServerHostnames(duperModel)); assertEquals(1, tester.nodeRepository().nodes().list(Node.State.inactive).nodeType(NodeType.config).size()); assertEquals(2, tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.config).size()); @@ -234,8 +228,8 @@ public class RetiredExpirerTest { // Provision and ready new config server MockNameResolver nameResolver = (MockNameResolver)tester.nodeRepository().nameResolver(); String ipv4 = "127.0.1.4"; - nameResolver.addRecord(retiredNodeHostname, ipv4); - Node node = Node.create(retiredNodeHostname, new IP.Config(Set.of(ipv4), Set.of()), retiredNodeHostname, + nameResolver.addRecord(retiredNode.hostname(), ipv4); + Node node = Node.create(retiredNode.hostname(), new IP.Config(Set.of(ipv4), Set.of()), retiredNode.hostname(), tester.asFlavor("default", NodeType.config), NodeType.config).build(); var nodes = List.of(node); nodes = nodeRepository.nodes().addNodes(nodes, Agent.system); @@ -252,14 +246,16 @@ public class RetiredExpirerTest { infraDeployer.activateAllSupportedInfraApplications(true); assertEquals(3, tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.config).size()); - // Another config server should now have retired + // There are now 2 retired config servers left retiredExpirer.run(); assertEquals(3, tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.config).size()); - var retiredNodes2 = tester.nodeRepository().nodes().list(() -> {}).stream() - .filter(n -> n.allocation().map(allocation -> allocation.membership().retired()).orElse(false)) - .collect(Collectors.toList()); - assertEquals(1, retiredNodes2.size()); - assertNotEquals(retiredNodeHostname, retiredNodes2.get(0)); + var retiredHostnames = tester.nodeRepository() + .nodes().list(() -> {}) + .stream() + .filter(n -> n.allocation().map(allocation -> allocation.membership().retired()).orElse(false)) + .map(Node::hostname) + .collect(Collectors.toSet()); + assertEquals(Set.of("cfg2", "cfg3"), retiredHostnames); } private Set<String> configServerHostnames(MockDuperModel duperModel) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index 88d39e887d3..9ae67cef235 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -72,9 +72,9 @@ public class ScalingSuggestionsMaintainerTest { new TestMetric()); maintainer.maintain(); - assertEquals("14 nodes with [vcpu: 6.9, memory: 5.1 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]", + assertEquals("12 nodes with [vcpu: 6.0, memory: 5.1 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]", suggestionOf(app1, cluster1, tester).get().resources().toString()); - assertEquals("9 nodes with [vcpu: 13.8, memory: 4.0 Gb, disk 10.3 Gb, bandwidth: 0.1 Gbps]", + assertEquals("8 nodes with [vcpu: 11.0, memory: 4.0 Gb, disk 11.8 Gb, bandwidth: 0.1 Gbps]", suggestionOf(app2, cluster2, tester).get().resources().toString()); // Utilization goes way down @@ -82,14 +82,14 @@ public class ScalingSuggestionsMaintainerTest { addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository(), metricsDb); maintainer.maintain(); assertEquals("Suggestion stays at the peak value observed", - "14 nodes with [vcpu: 6.9, memory: 5.1 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]", + "12 nodes with [vcpu: 6.0, memory: 5.1 Gb, disk 15.0 Gb, bandwidth: 0.1 Gbps]", suggestionOf(app1, cluster1, tester).get().resources().toString()); // Utilization is still way down and a week has passed tester.clock().advance(Duration.ofDays(7)); addMeasurements(0.10f, 0.10f, 0.10f, 0, 500, app1, tester.nodeRepository(), metricsDb); maintainer.maintain(); assertEquals("Peak suggestion has been outdated", - "6 nodes with [vcpu: 2.0, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]", + "5 nodes with [vcpu: 1.8, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 0.1 Gbps]", suggestionOf(app1, cluster1, tester).get().resources().toString()); assertTrue(shouldSuggest(app1, cluster1, tester)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java index 6e50c934047..e94d1c1230e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java @@ -19,10 +19,10 @@ import java.time.Duration; import java.util.Collections; import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; /** @@ -183,7 +183,6 @@ public class NodeTypeProvisioningTest { List<Node> nodesToRetire = tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.proxy).asList() .subList(3, 3 + numNodesToRetire); - String currentyRetiringHostname; { nodesToRetire.forEach(nodeToRetire -> tester.nodeRepository().nodes().write(nodeToRetire.withWantToRetire(true, Agent.system, tester.clock().instant()), () -> {})); @@ -198,14 +197,13 @@ public class NodeTypeProvisioningTest { List<Node> nodesCurrentlyRetiring = nodes.stream() .filter(node -> node.allocation().get().membership().retired()) .collect(Collectors.toList()); - assertEquals(1, nodesCurrentlyRetiring.size()); + assertEquals(5, nodesCurrentlyRetiring.size()); - // The retiring node should be one of the nodes we marked for retirement - currentyRetiringHostname = nodesCurrentlyRetiring.get(0).hostname(); - assertEquals(1, nodesToRetire.stream().map(Node::hostname).filter(hostname -> hostname.equals(currentyRetiringHostname)).count()); + // The retiring nodes should be the nodes we marked for retirement + assertTrue(Set.copyOf(nodesToRetire).containsAll(nodesCurrentlyRetiring)); } - { // Redeploying while the node is still retiring has no effect + { // Redeploying while the nodes are still retiring has no effect List<HostSpec> hosts = deployProxies(application, tester); assertEquals(11, hosts.size()); tester.activate(application, new HashSet<>(hosts)); @@ -216,57 +214,29 @@ public class NodeTypeProvisioningTest { List<Node> nodesCurrentlyRetiring = nodes.stream() .filter(node -> node.allocation().get().membership().retired()) .collect(Collectors.toList()); - assertEquals(1, nodesCurrentlyRetiring.size()); - - // The node that started retiring is still the only one retiring - assertEquals(currentyRetiringHostname, nodesCurrentlyRetiring.get(0).hostname()); + assertEquals(5, nodesCurrentlyRetiring.size()); } { + // Let all retired nodes expire tester.advanceTime(Duration.ofMinutes(11)); retiredExpirer.run(); List<HostSpec> hosts = deployProxies(application, tester); - assertEquals(10, hosts.size()); + assertEquals(6, hosts.size()); tester.activate(application, new HashSet<>(hosts)); - NodeList nodes = tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.proxy); - assertEquals(10, nodes.size()); - // Verify the node we previously set to retire has finished retiring - assertEquals(Node.State.dirty, tester.nodeRepository().nodes().node(currentyRetiringHostname) - .orElseThrow(RuntimeException::new).state()); - - // Verify that a node is currently retiring - List<Node> nodesCurrentlyRetiring = nodes.stream() - .filter(node -> node.allocation().get().membership().retired()) - .collect(Collectors.toList()); - assertEquals(1, nodesCurrentlyRetiring.size()); + // All currently active proxy nodes are not marked with wantToRetire or as retired + long numRetiredActiveProxyNodes = tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.proxy).stream() + .filter(node -> !node.status().wantToRetire()) + .filter(node -> !node.allocation().get().membership().retired()) + .count(); + assertEquals(6, numRetiredActiveProxyNodes); - // This node is different from the one that was retiring previously - String newRetiringHostname = nodesCurrentlyRetiring.get(0).hostname(); - assertNotEquals(currentyRetiringHostname, newRetiringHostname); - // ... but is one of the nodes that were put to wantToRetire earlier - assertTrue(nodesToRetire.stream().map(Node::hostname).filter(hostname -> hostname.equals(newRetiringHostname)).count() == 1); + // All the nodes that were marked with wantToRetire earlier are now dirty + assertEquals(nodesToRetire.stream().map(Node::hostname).collect(Collectors.toSet()), + tester.nodeRepository().nodes().list(Node.State.dirty).stream().map(Node::hostname).collect(Collectors.toSet())); } - - - for (int i = 0; i < 10; i++){ - tester.advanceTime(Duration.ofMinutes(11)); - retiredExpirer.run(); - List<HostSpec> hosts = deployProxies(application, tester); - tester.activate(application, new HashSet<>(hosts)); - } - - // After a long time, all currently active proxy nodes are not marked with wantToRetire or as retired - long numRetiredActiveProxyNodes = tester.nodeRepository().nodes().list(Node.State.active).nodeType(NodeType.proxy).stream() - .filter(node -> !node.status().wantToRetire()) - .filter(node -> !node.allocation().get().membership().retired()) - .count(); - assertEquals(11 - numNodesToRetire, numRetiredActiveProxyNodes); - - // All the nodes that were marked with wantToRetire earlier are now dirty - assertEquals(nodesToRetire.stream().map(Node::hostname).collect(Collectors.toSet()), - tester.nodeRepository().nodes().list(Node.State.dirty).stream().map(Node::hostname).collect(Collectors.toSet())); } private List<HostSpec> deployProxies(ApplicationId application, ProvisioningTester tester) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json index 1e9a2d60837..65e07c46242 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application1.json @@ -66,7 +66,7 @@ }, "utilization" : { "cpu" : 0.0, - "idealCpu": 0.2, + "idealCpu": 0.275, "memory" : 0.0, "idealMemory": 0.7, "disk" : 0.0, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json index 376b748ff8e..ecab55d19d4 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/application2.json @@ -42,7 +42,7 @@ }, "utilization" : { "cpu" : 0.0, - "idealCpu": 0.19047619047619047, + "idealCpu": 0.2664285714285714, "memory" : 0.0, "idealMemory": 0.7, "disk" : 0.0, |