diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-11-26 09:14:39 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-11-26 09:14:39 +0100 |
commit | ad0293d8b799fc9ac3fd91e0fbe11d23c7f973ed (patch) | |
tree | 666ed353198f17c6909ea2381ec862e87aaf01a4 /node-repository | |
parent | e1584673531bc771fa94731da337ce311b4ff7d1 (diff) |
More detailed autoscaling status
Diffstat (limited to 'node-repository')
2 files changed, 34 insertions, 29 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index c4f11ee76d0..d2c943794fe 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -60,6 +60,7 @@ public class Autoscaler { } private Advice autoscale(Cluster cluster, List<Node> clusterNodes, Limits limits, boolean exclusive) { + ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type(); if (unstable(clusterNodes, nodeRepository)) return Advice.none("Cluster change in progress"); @@ -68,13 +69,21 @@ public class Autoscaler { ClusterTimeseries clusterTimeseries = new ClusterTimeseries(cluster, clusterNodes, metricsDb, nodeRepository); - Optional<Double> cpuLoad = clusterTimeseries.averageLoad(Resource.cpu, cluster); - Optional<Double> memoryLoad = clusterTimeseries.averageLoad(Resource.memory, cluster); - Optional<Double> diskLoad = clusterTimeseries.averageLoad(Resource.disk, cluster); - if (cpuLoad.isEmpty() || memoryLoad.isEmpty() || diskLoad.isEmpty()) - return Advice.none("Collecting more data before making new scaling decisions"); + int measurementsPerNode = clusterTimeseries.measurementsPerNode(); + if (measurementsPerNode < minimumMeasurementsPerNode(clusterType)) + return Advice.none("Collecting more data before making new scaling decisions" + + ": Has " + measurementsPerNode + " data points per node"); - var target = ResourceTarget.idealLoad(cpuLoad.get(), memoryLoad.get(), diskLoad.get(), currentAllocation); + int nodesMeasured = clusterTimeseries.nodesMeasured(); + if (nodesMeasured != clusterNodes.size()) + return Advice.none("Collecting more data before making new scaling decisions" + + ": Has measurements from " + nodesMeasured + " but need from " + clusterNodes.size()); + + double cpuLoad = clusterTimeseries.averageLoad(Resource.cpu); + double memoryLoad = clusterTimeseries.averageLoad(Resource.memory); + double diskLoad = clusterTimeseries.averageLoad(Resource.disk); + + var target = ResourceTarget.idealLoad(cpuLoad, memoryLoad, diskLoad, currentAllocation); Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits, exclusive); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index 3c93e7ee7f6..e325e797ca5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -10,8 +10,6 @@ import java.time.Instant; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Optional; -import java.util.logging.Logger; import java.util.stream.Collectors; /** @@ -21,10 +19,7 @@ import java.util.stream.Collectors; */ public class ClusterTimeseries { - private static final Logger log = Logger.getLogger(ClusterTimeseries.class.getName()); - private final List<Node> clusterNodes; - private final Map<String, Instant> startTimePerNode; /** The measurements for all hosts in this snapshot */ private final List<NodeTimeseries> nodeTimeseries; @@ -32,9 +27,10 @@ public class ClusterTimeseries { public ClusterTimeseries(Cluster cluster, List<Node> clusterNodes, MetricsDb db, NodeRepository nodeRepository) { this.clusterNodes = clusterNodes; ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type(); - this.nodeTimeseries = db.getNodeTimeseries(nodeRepository.clock().instant().minus(Autoscaler.scalingWindow(clusterType)), - clusterNodes.stream().map(Node::hostname).collect(Collectors.toSet())); - this.startTimePerNode = metricStartTimes(cluster, clusterNodes, nodeRepository); + var allTimeseries = db.getNodeTimeseries(nodeRepository.clock().instant().minus(Autoscaler.scalingWindow(clusterType)), + clusterNodes.stream().map(Node::hostname).collect(Collectors.toSet())); + Map<String, Instant> startTimePerNode = metricStartTimes(cluster, clusterNodes, allTimeseries, nodeRepository); + nodeTimeseries = filterStale(allTimeseries, startTimePerNode); } /** @@ -43,6 +39,7 @@ public class ClusterTimeseries { */ private Map<String, Instant> metricStartTimes(Cluster cluster, List<Node> clusterNodes, + List<NodeTimeseries> nodeTimeseries, NodeRepository nodeRepository) { Map<String, Instant> startTimePerHost = new HashMap<>(); if ( ! cluster.scalingEvents().isEmpty()) { @@ -65,23 +62,22 @@ public class ClusterTimeseries { return startTimePerHost; } - /** - * Returns the average load of this resource in the measurement window, - * or empty if we do not have a reliable measurement across the cluster nodes. - */ - public Optional<Double> averageLoad(Resource resource, Cluster cluster) { - ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type(); - - List<NodeTimeseries> currentMeasurements = filterStale(nodeTimeseries, startTimePerNode); + /** Returns the average number of measurements per node */ + public int measurementsPerNode() { + int measurementCount = nodeTimeseries.stream().mapToInt(m -> m.size()).sum(); + return measurementCount / clusterNodes.size(); + } - // Require a total number of measurements scaling with the number of nodes, - // but don't require that we have at least that many from every node - int measurementCount = currentMeasurements.stream().mapToInt(m -> m.size()).sum(); - if (measurementCount / clusterNodes.size() < Autoscaler.minimumMeasurementsPerNode(clusterType)) return Optional.empty(); - if (currentMeasurements.size() != clusterNodes.size()) return Optional.empty(); + /** Returns the number of nodes measured in this */ + public int nodesMeasured() { + return nodeTimeseries.size(); + } - double measurementSum = currentMeasurements.stream().flatMap(m -> m.asList().stream()).mapToDouble(m -> value(resource, m)).sum(); - return Optional.of(measurementSum / measurementCount); + /** Returns the average load of this resource in this */ + public double averageLoad(Resource resource) { + int measurementCount = nodeTimeseries.stream().mapToInt(m -> m.size()).sum(); + double measurementSum = nodeTimeseries.stream().flatMap(m -> m.asList().stream()).mapToDouble(m -> value(resource, m)).sum(); + return measurementSum / measurementCount; } private double value(Resource resource, MetricSnapshot snapshot) { |