diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-03-06 15:36:28 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-03-06 15:36:28 +0100 |
commit | cc502d7d4b627b30f90fe0ba514a0cd6886da731 (patch) | |
tree | 9f12fcb693618a3126b60bb113859e9acb0729e7 | |
parent | 151c36ac8d003f22e43ced12678a251a61d62a27 (diff) |
Aggregate cluster metrics at write time
23 files changed, 522 insertions, 235 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index e15deb5ea8e..801532562d8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -2,14 +2,12 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterResources; -import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; -import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import java.time.Duration; import java.time.Instant; @@ -68,23 +66,23 @@ public class Autoscaler { if (scaledIn(scalingWindow, cluster)) return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last rescaling"); - ClusterTimeseries clusterTimeseries = - new ClusterTimeseries(scalingWindow, cluster, clusterNodes, metricsDb); - AllocatableClusterResources currentAllocation = - new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive()); + var clusterNodesTimeseries = new ClusterNodesTimeseries(scalingWindow, cluster, clusterNodes, metricsDb); + var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive()); - int measurementsPerNode = clusterTimeseries.measurementsPerNode(); + int measurementsPerNode = clusterNodesTimeseries.measurementsPerNode(); if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow)) return Advice.none("Collecting more data before making new scaling decisions: " + "Have " + measurementsPerNode + " measurements per node but require " + minimumMeasurementsPerNode(scalingWindow)); - int nodesMeasured = clusterTimeseries.nodesMeasured(); + int nodesMeasured = clusterNodesTimeseries.nodesMeasured(); if (nodesMeasured != clusterNodes.size()) return Advice.none("Collecting more data before making new scaling decisions: " + "Have measurements from " + nodesMeasured + " but require from " + clusterNodes.size()); - var target = ResourceTarget.idealLoad(clusterTimeseries, currentAllocation, application); + + var clusterTimeseries = metricsDb.getClusterTimeseries(cluster.id()); + var target = ResourceTarget.idealLoad(clusterTimeseries, clusterNodesTimeseries, currentAllocation, application); Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterMetricSnapshot.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterMetricSnapshot.java new file mode 100644 index 00000000000..fd8e91584c4 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterMetricSnapshot.java @@ -0,0 +1,42 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.autoscale; + +import java.time.Instant; + +/** + * Cluster level metrics. + * These are aggregated at fetch time over the nodes in the cluster at that point in time. + * + * @author bratseth + */ +public class ClusterMetricSnapshot implements Comparable<ClusterMetricSnapshot> { + + private final Instant at; + + private final double queryRate; + + public ClusterMetricSnapshot(Instant at, double queryRate) { + this.at = at; + this.queryRate = queryRate; + } + + public Instant at() { return at; } + + /** Queries per second */ + public double queryRate() { return queryRate; } + + public ClusterMetricSnapshot withQueryRate(double queryRate) { + return new ClusterMetricSnapshot(at, queryRate); + } + + @Override + public int compareTo(ClusterMetricSnapshot other) { + return at.compareTo(other.at); + } + + @Override + public String toString() { return "metrics at " + at + ":" + + " queryRate: " + queryRate; + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java new file mode 100644 index 00000000000..173d76e4c26 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -0,0 +1,76 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.autoscale; + +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.applications.Cluster; + +import java.time.Duration; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +/** + * A series of metric snapshots for the nodes of a cluster used to compute load + * + * @author bratseth + */ +public class ClusterNodesTimeseries { + + private final Cluster cluster; + private final NodeList clusterNodes; + + /** The measurements for all nodes in this snapshot */ + private final List<NodeTimeseries> timeseries; + + public ClusterNodesTimeseries(Duration period, Cluster cluster, NodeList clusterNodes, MetricsDb db) { + this.cluster = cluster; + this.clusterNodes = clusterNodes; + var timeseries = db.getNodeTimeseries(period, clusterNodes); + + if (cluster.lastScalingEvent().isPresent()) + timeseries = filter(timeseries, snapshot -> snapshot.generation() < 0 || // Content nodes do not yet send generation + snapshot.generation() >= cluster.lastScalingEvent().get().generation()); + timeseries = filter(timeseries, snapshot -> snapshot.inService() && snapshot.stable()); + + this.timeseries = timeseries; + } + + /** The cluster this is a timeseries for */ + public Cluster cluster() { return cluster; } + + /** The nodes of the cluster this is a timeseries for */ + public NodeList clusterNodes() { return clusterNodes; } + + /** Returns the average number of measurements per node */ + public int measurementsPerNode() { + int measurementCount = timeseries.stream().mapToInt(m -> m.size()).sum(); + return measurementCount / clusterNodes.size(); + } + + /** Returns the number of nodes measured in this */ + public int nodesMeasured() { + return timeseries.size(); + } + + /** Returns the average load of this resource in this */ + public double averageLoad(Resource resource) { + int measurementCount = timeseries.stream().mapToInt(m -> m.size()).sum(); + if (measurementCount == 0) return 0; + double measurementSum = timeseries.stream().flatMap(m -> m.asList().stream()).mapToDouble(m -> value(resource, m)).sum(); + return measurementSum / measurementCount; + } + + private double value(Resource resource, NodeMetricSnapshot snapshot) { + switch (resource) { + case cpu: return snapshot.cpu(); + case memory: return snapshot.memory(); + case disk: return snapshot.disk(); + default: throw new IllegalArgumentException("Got an unknown resource " + resource); + } + } + + private List<NodeTimeseries> filter(List<NodeTimeseries> timeseries, Predicate<NodeMetricSnapshot> filter) { + return timeseries.stream().map(nodeTimeseries -> nodeTimeseries.filter(filter)).collect(Collectors.toList()); + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index a16277585a2..6cfb0cdbd88 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -1,63 +1,48 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.autoscale; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.applications.Cluster; -import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.function.Predicate; import java.util.stream.Collectors; /** - * A series of metric snapshots for all nodes in a cluster + * A list of metric snapshots from a cluster, sorted by increasing time (newest last). * * @author bratseth */ public class ClusterTimeseries { - private final Cluster cluster; - private final NodeList clusterNodes; + private final ClusterSpec.Id cluster; + private final List<ClusterMetricSnapshot> snapshots; - /** The measurements for all nodes in this snapshot */ - private final List<NodeTimeseries> timeseries; - - public ClusterTimeseries(Duration period, Cluster cluster, NodeList clusterNodes, MetricsDb db) { + ClusterTimeseries(ClusterSpec.Id cluster, List<ClusterMetricSnapshot> snapshots) { this.cluster = cluster; - this.clusterNodes = clusterNodes; - var timeseries = db.getNodeTimeseries(period, clusterNodes); - - if (cluster.lastScalingEvent().isPresent()) - timeseries = filter(timeseries, snapshot -> snapshot.generation() < 0 || // Content nodes do not yet send generation - snapshot.generation() >= cluster.lastScalingEvent().get().generation()); - timeseries = filter(timeseries, snapshot -> snapshot.inService() && snapshot.stable()); - - this.timeseries = timeseries; + List<ClusterMetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots); + Collections.sort(sortedSnapshots); + this.snapshots = Collections.unmodifiableList(sortedSnapshots); } - /** The cluster this is a timeseries for */ - public Cluster cluster() { return cluster; } + public boolean isEmpty() { return snapshots.isEmpty(); } - /** The nodes of the cluster this is a timeseries for */ - public NodeList clusterNodes() { return clusterNodes; } + public int size() { return snapshots.size(); } - /** Returns the average number of measurements per node */ - public int measurementsPerNode() { - int measurementCount = timeseries.stream().mapToInt(m -> m.size()).sum(); - return measurementCount / clusterNodes.size(); - } + public ClusterMetricSnapshot get(int index) { return snapshots.get(index); } - /** Returns the number of nodes measured in this */ - public int nodesMeasured() { - return timeseries.size(); - } + public List<ClusterMetricSnapshot> asList() { return snapshots; } + + public ClusterSpec.Id cluster() { return cluster; } - /** Returns the average load of this resource in this */ - public double averageLoad(Resource resource) { - int measurementCount = timeseries.stream().mapToInt(m -> m.size()).sum(); - if (measurementCount == 0) return 0; - double measurementSum = timeseries.stream().flatMap(m -> m.asList().stream()).mapToDouble(m -> value(resource, m)).sum(); - return measurementSum / measurementCount; + public ClusterTimeseries add(ClusterMetricSnapshot snapshot) { + List<ClusterMetricSnapshot> list = new ArrayList<>(snapshots); + list.add(snapshot); + return new ClusterTimeseries(cluster, list); } /** The max query growth rate we can predict from this time-series as a fraction of the current traffic per minute */ @@ -67,21 +52,7 @@ public class ClusterTimeseries { /** The current query rate as a fraction of the peak rate in this timeseries */ public double currentQueryFractionOfMax() { - return 0.5; // default } - private double value(Resource resource, MetricSnapshot snapshot) { - switch (resource) { - case cpu: return snapshot.cpu(); - case memory: return snapshot.memory(); - case disk: return snapshot.disk(); - default: throw new IllegalArgumentException("Got an unknown resource " + resource); - } - } - - private List<NodeTimeseries> filter(List<NodeTimeseries> timeseries, Predicate<MetricSnapshot> filter) { - return timeseries.stream().map(nodeTimeseries -> nodeTimeseries.filter(filter)).collect(Collectors.toList()); - } - } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java index 1b1e5933604..c20b5c64d64 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java @@ -2,9 +2,11 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.collections.Pair; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; @@ -26,8 +28,10 @@ public class MemoryMetricsDb implements MetricsDb { private final NodeRepository nodeRepository; - /** Metric time seriest by node (hostname). Each list of metric snapshots is sorted by increasing timestamp */ - private final Map<String, NodeTimeseries> db = new HashMap<>(); + /** Metric time series by node (hostname). Each list of metric snapshots is sorted by increasing timestamp */ + private final Map<String, NodeTimeseries> nodeTimeseries = new HashMap<>(); + + private final Map<ClusterSpec.Id, ClusterTimeseries> clusterTimeseries = new HashMap<>(); /** Lock all access for now since we modify lists inside a map */ private final Object lock = new Object(); @@ -37,7 +41,10 @@ public class MemoryMetricsDb implements MetricsDb { } @Override - public void add(Collection<Pair<String, MetricSnapshot>> nodeMetrics) { + public Clock clock() { return nodeRepository.clock(); } + + @Override + public void addNodeMetrics(Collection<Pair<String, NodeMetricSnapshot>> nodeMetrics) { synchronized (lock) { for (var value : nodeMetrics) { add(value.getFirst(), value.getSecond()); @@ -46,27 +53,41 @@ public class MemoryMetricsDb implements MetricsDb { } @Override + public void addClusterMetrics(Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics) { + synchronized (lock) { + for (var value : clusterMetrics.entrySet()) { + add(value.getKey(), value.getValue()); + } + } + } + + @Override public List<NodeTimeseries> getNodeTimeseries(Duration period, Set<String> hostnames) { Instant startTime = nodeRepository.clock().instant().minus(period); synchronized (lock) { return hostnames.stream() - .map(hostname -> db.getOrDefault(hostname, new NodeTimeseries(hostname, List.of())).justAfter(startTime)) + .map(hostname -> nodeTimeseries.getOrDefault(hostname, new NodeTimeseries(hostname, List.of())).justAfter(startTime)) .collect(Collectors.toList()); } } @Override + public ClusterTimeseries getClusterTimeseries(ClusterSpec.Id cluster) { + return clusterTimeseries.computeIfAbsent(cluster, __ -> new ClusterTimeseries(cluster, new ArrayList<>())); + } + + @Override public void gc() { synchronized (lock) { // Each measurement is Object + long + float = 16 + 8 + 4 = 28 bytes // 12 hours with 1k nodes and 3 resources and 1 measurement/sec is about 5Gb - for (String hostname : db.keySet()) { - var timeseries = db.get(hostname); + for (String hostname : nodeTimeseries.keySet()) { + var timeseries = nodeTimeseries.get(hostname); timeseries = timeseries.justAfter(nodeRepository.clock().instant().minus(Autoscaler.maxScalingWindow())); if (timeseries.isEmpty()) - db.remove(hostname); + nodeTimeseries.remove(hostname); else - db.put(hostname, timeseries); + nodeTimeseries.put(hostname, timeseries); } } } @@ -74,16 +95,21 @@ public class MemoryMetricsDb implements MetricsDb { @Override public void close() {} - private void add(String hostname, MetricSnapshot snapshot) { - NodeTimeseries timeseries = db.get(hostname); + private void add(String hostname, NodeMetricSnapshot snapshot) { + NodeTimeseries timeseries = nodeTimeseries.get(hostname); if (timeseries == null) { // new node Optional<Node> node = nodeRepository.nodes().node(hostname); if (node.isEmpty()) return; if (node.get().allocation().isEmpty()) return; timeseries = new NodeTimeseries(hostname, new ArrayList<>()); - db.put(hostname, timeseries); + nodeTimeseries.put(hostname, timeseries); } - db.put(hostname, timeseries.add(snapshot)); + nodeTimeseries.put(hostname, timeseries.add(snapshot)); + } + + private void add(ClusterSpec.Id cluster, ClusterMetricSnapshot snapshot) { + var existing = clusterTimeseries.computeIfAbsent(cluster, __ -> new ClusterTimeseries(cluster, new ArrayList<>())); + clusterTimeseries.put(cluster, existing.add(snapshot)); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java index 6fdc87f2448..4becf5ca88b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java @@ -2,15 +2,16 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.collections.Pair; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import java.time.Clock; import java.time.Duration; -import java.time.Instant; import java.util.Collection; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -21,8 +22,12 @@ import java.util.stream.Collectors; */ public interface MetricsDb { - /** Adds snapshots to this. */ - void add(Collection<Pair<String, MetricSnapshot>> nodeMetrics); + Clock clock(); + + /** Adds node snapshots to this. */ + void addNodeMetrics(Collection<Pair<String, NodeMetricSnapshot>> nodeMetrics); + + void addClusterMetrics(Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics); /** * Returns a list with one entry for each hostname containing @@ -36,6 +41,9 @@ public interface MetricsDb { return getNodeTimeseries(period, nodes.stream().map(Node::hostname).collect(Collectors.toSet())); } + /** Returns all cluster level metric snapshots for a given cluster */ + ClusterTimeseries getClusterTimeseries(ClusterSpec.Id clusterId); + /** Must be called intermittently (as long as add is called) to gc old data */ void gc(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java index d6661b89536..4a8eeba2134 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java @@ -11,7 +11,6 @@ import com.yahoo.slime.SlimeUtils; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.vespa.hosted.provision.applications.Application; import java.time.Instant; import java.util.ArrayList; @@ -28,14 +27,21 @@ import java.util.Optional; */ public class MetricsResponse { - private final Collection<Pair<String, MetricSnapshot>> nodeMetrics; + /** Node level metrics */ + private final Collection<Pair<String, NodeMetricSnapshot>> nodeMetrics; + + /** + * Cluster level metrics (collect in + * Must be aggregated at fetch time to avoid issues with nodes and nodes joining/leaving the cluster over time. + */ + private final Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics = new HashMap<>(); /** Creates this from a metrics/V2 response */ public MetricsResponse(String response, NodeList applicationNodes, NodeRepository nodeRepository) { this(SlimeUtils.jsonToSlime(response), applicationNodes, nodeRepository); } - public MetricsResponse(Collection<Pair<String, MetricSnapshot>> metrics) { + public MetricsResponse(Collection<Pair<String, NodeMetricSnapshot>> metrics) { this.nodeMetrics = metrics; } @@ -46,7 +52,9 @@ public class MetricsResponse { nodes.traverse((ArrayTraverser)(__, node) -> consumeNode(node, applicationNodes, nodeRepository)); } - public Collection<Pair<String, MetricSnapshot>> metrics() { return nodeMetrics; } + public Collection<Pair<String, NodeMetricSnapshot>> nodeMetrics() { return nodeMetrics; } + + public Map<ClusterSpec.Id, ClusterMetricSnapshot> clusterMetrics() { return clusterMetrics; } private void consumeNode(Inspector node, NodeList applicationNodes, NodeRepository nodeRepository) { String hostname = node.field("hostname").asString(); @@ -59,14 +67,21 @@ public class MetricsResponse { if (node.isEmpty()) return; // Node is not part of this cluster any more long timestampSecond = nodeData.field("timestamp").asLong(); Map<String, Double> values = consumeMetrics(nodeData.field("metrics")); - nodeMetrics.add(new Pair<>(hostname, new MetricSnapshot(Instant.ofEpochMilli(timestampSecond * 1000), - Metric.cpu.from(values), - Metric.memory.from(values), - Metric.disk.from(values), - (long)Metric.generation.from(values), + Instant at = Instant.ofEpochMilli(timestampSecond * 1000); + + nodeMetrics.add(new Pair<>(hostname, new NodeMetricSnapshot(at, + Metric.cpu.from(values), + Metric.memory.from(values), + Metric.disk.from(values), + (long)Metric.generation.from(values), Metric.inService.from(values) > 0, - clusterIsStable(node.get(), applicationNodes, nodeRepository), - Metric.queryRate.from(values)))); + clusterIsStable(node.get(), applicationNodes, nodeRepository), + Metric.queryRate.from(values)))); + + var cluster = node.get().allocation().get().membership().cluster().id(); + var metrics = clusterMetrics.getOrDefault(cluster, new ClusterMetricSnapshot(at, 0.0)); + metrics = metrics.withQueryRate(metrics.queryRate() + Metric.queryRate.from(values)); + clusterMetrics.put(cluster, metrics); } private boolean clusterIsStable(Node node, NodeList applicationNodes, NodeRepository nodeRepository) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricSnapshot.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricSnapshot.java index 82812592809..be9f7bd4819 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricSnapshot.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricSnapshot.java @@ -8,7 +8,7 @@ import java.time.Instant; * * @author bratseth */ -public class MetricSnapshot implements Comparable<MetricSnapshot> { +public class NodeMetricSnapshot implements Comparable<NodeMetricSnapshot> { private final Instant at; @@ -20,9 +20,9 @@ public class MetricSnapshot implements Comparable<MetricSnapshot> { private final boolean stable; private final double queryRate; - public MetricSnapshot(Instant at, double cpu, double memory, double disk, - long generation, boolean inService, boolean stable, - double queryRate) { + public NodeMetricSnapshot(Instant at, double cpu, double memory, double disk, + long generation, boolean inService, boolean stable, + double queryRate) { this.at = at; this.cpu = cpu; this.memory = memory; @@ -48,7 +48,7 @@ public class MetricSnapshot implements Comparable<MetricSnapshot> { public boolean stable() { return stable; } @Override - public int compareTo(MetricSnapshot other) { + public int compareTo(NodeMetricSnapshot other) { return at.compareTo(other.at); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java index 24876609f58..cedc2edfe63 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java @@ -16,11 +16,11 @@ import java.util.stream.Collectors; public class NodeTimeseries { private final String hostname; - private final List<MetricSnapshot> snapshots; + private final List<NodeMetricSnapshot> snapshots; - NodeTimeseries(String hostname, List<MetricSnapshot> snapshots) { + NodeTimeseries(String hostname, List<NodeMetricSnapshot> snapshots) { this.hostname = hostname; - List<MetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots); + List<NodeMetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots); Collections.sort(sortedSnapshots); this.snapshots = Collections.unmodifiableList(sortedSnapshots); } @@ -29,19 +29,19 @@ public class NodeTimeseries { public int size() { return snapshots.size(); } - public MetricSnapshot get(int index) { return snapshots.get(index); } + public NodeMetricSnapshot get(int index) { return snapshots.get(index); } - public List<MetricSnapshot> asList() { return snapshots; } + public List<NodeMetricSnapshot> asList() { return snapshots; } public String hostname() { return hostname; } - public NodeTimeseries add(MetricSnapshot snapshot) { - List<MetricSnapshot> list = new ArrayList<>(snapshots); + public NodeTimeseries add(NodeMetricSnapshot snapshot) { + List<NodeMetricSnapshot> list = new ArrayList<>(snapshots); list.add(snapshot); return new NodeTimeseries(hostname(), list); } - public NodeTimeseries filter(Predicate<MetricSnapshot> filter) { + public NodeTimeseries filter(Predicate<NodeMetricSnapshot> filter) { return new NodeTimeseries(hostname, snapshots.stream().filter(filter).collect(Collectors.toList())); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java index f6a096c199a..189fe815387 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java @@ -5,6 +5,7 @@ import com.google.inject.Inject; import com.yahoo.collections.ListMap; import com.yahoo.collections.Pair; import com.yahoo.component.AbstractComponent; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.io.IOUtils; import com.yahoo.vespa.defaults.Defaults; import io.questdb.cairo.CairoConfiguration; @@ -30,6 +31,7 @@ import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; @@ -45,7 +47,8 @@ import java.util.stream.Collectors; public class QuestMetricsDb extends AbstractComponent implements MetricsDb { private static final Logger log = Logger.getLogger(QuestMetricsDb.class.getName()); - private static final String table = "metrics"; + private static final String nodeTable = "metrics"; + private static final String clusterTable = "clusterMetrics"; private final Clock clock; private final String dataDir; @@ -69,7 +72,8 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { } private void initializeDb() { - IOUtils.createDirectory(dataDir + "/" + table); + IOUtils.createDirectory(dataDir + "/" + nodeTable); + IOUtils.createDirectory(dataDir + "/" + clusterTable); // silence Questdb's custom logging system IOUtils.writeFile(new File(dataDir, "quest-log.conf"), new byte[0]); @@ -78,32 +82,36 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { CairoConfiguration configuration = new DefaultCairoConfiguration(dataDir); engine = new CairoEngine(configuration); - ensureExists(table); + ensureTablesExist(); } @Override - public void add(Collection<Pair<String, MetricSnapshot>> snapshots) { - try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), table)) { - add(snapshots, writer); + public Clock clock() { return clock; } + + @Override + public void addNodeMetrics(Collection<Pair<String, NodeMetricSnapshot>> snapshots) { + try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), nodeTable)) { + addNodeMetrics(snapshots, writer); } catch (CairoException e) { if (e.getMessage().contains("Cannot read offset")) { // This error seems non-recoverable repair(e); - try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), table)) { - add(snapshots, writer); + try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), nodeTable)) { + addNodeMetrics(snapshots, writer); } } } } - private void add(Collection<Pair<String, MetricSnapshot>> snapshots, TableWriter writer) { + private void addNodeMetrics(Collection<Pair<String, NodeMetricSnapshot>> snapshots, TableWriter writer) { for (var snapshot : snapshots) { long atMillis = adjustIfRecent(snapshot.getSecond().at().toEpochMilli(), highestTimestampAdded); if (atMillis < highestTimestampAdded) continue; // Ignore old data highestTimestampAdded = atMillis; TableWriter.Row row = writer.newRow(atMillis * 1000); // in microseconds row.putStr(0, snapshot.getFirst()); + // (1 is timestamp) row.putFloat(2, (float)snapshot.getSecond().cpu()); row.putFloat(3, (float)snapshot.getSecond().memory()); row.putFloat(4, (float)snapshot.getSecond().disk()); @@ -117,23 +125,68 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { } @Override + public void addClusterMetrics(Map<ClusterSpec.Id, ClusterMetricSnapshot> snapshots) { + try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), clusterTable)) { + addClusterMetrics(snapshots, writer); + } + catch (CairoException e) { + if (e.getMessage().contains("Cannot read offset")) { + // This error seems non-recoverable + repair(e); + try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), clusterTable)) { + addClusterMetrics(snapshots, writer); + } + } + } + } + + private void addClusterMetrics(Map<ClusterSpec.Id, ClusterMetricSnapshot> snapshots, TableWriter writer) { + for (var snapshot : snapshots.entrySet()) { + long atMillis = adjustIfRecent(snapshot.getValue().at().toEpochMilli(), highestTimestampAdded); + if (atMillis < highestTimestampAdded) continue; // Ignore old data + highestTimestampAdded = atMillis; + TableWriter.Row row = writer.newRow(atMillis * 1000); // in microseconds + row.putStr(0, snapshot.getKey().value()); + // (1 is timestamp) + row.putFloat(2, (float)snapshot.getValue().queryRate()); + row.append(); + } + writer.commit(); + } + + @Override public List<NodeTimeseries> getNodeTimeseries(Duration period, Set<String> hostnames) { try (SqlCompiler compiler = new SqlCompiler(engine)) { SqlExecutionContext context = newContext(); - var snapshots = getSnapshots(clock.instant().minus(period), hostnames, compiler, context); + var snapshots = getNodeSnapshots(clock.instant().minus(period), hostnames, compiler, context); return snapshots.entrySet().stream() .map(entry -> new NodeTimeseries(entry.getKey(), entry.getValue())) .collect(Collectors.toList()); } catch (SqlException e) { - throw new IllegalStateException("Could not read timeseries data in Quest stored in " + dataDir, e); + throw new IllegalStateException("Could not read node timeseries data in Quest stored in " + dataDir, e); + } + } + + @Override + public ClusterTimeseries getClusterTimeseries(ClusterSpec.Id clusterId) { + try (SqlCompiler compiler = new SqlCompiler(engine)) { + SqlExecutionContext context = newContext(); + return getClusterSnapshots(clusterId, compiler, context); + } + catch (SqlException e) { + throw new IllegalStateException("Could not read cluster timeseries data in Quest stored in " + dataDir, e); } } @Override public void gc() { - // We remove full days at once and we want to see at least three days to not every only see - // query rates from weekends + gc(nodeTable); + gc(clusterTable); + } + + private void gc(String table) { + // We remove full days at once and we want to see at least three days to not every only see weekend data Instant oldestToKeep = clock.instant().minus(Duration.ofDays(4)); SqlExecutionContext context = newContext(); int partitions = 0; @@ -158,7 +211,7 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { context); } catch (SqlException e) { - log.log(Level.WARNING, "Failed to gc old metrics data in " + dataDir, e); + log.log(Level.WARNING, "Failed to gc old metrics data in " + dataDir + " table " + table, e); } } @@ -182,18 +235,26 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { initializeDb(); } - private void ensureExists(String table) { + private boolean exists(String table, SqlExecutionContext context) { + return 0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table); + } + + private void ensureTablesExist() { SqlExecutionContext context = newContext(); - if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { // table exists - ensureTableIsUpdated(table, context); - } else { - createTable(table, context); - } + if (exists(nodeTable, context)) + ensureNodeTableIsUpdated(context); + else + createNodeTable(context); + + if (exists(clusterTable, context)) + ensureClusterTableIsUpdated(context); + else + createClusterTable(context); } - private void createTable(String table, SqlExecutionContext context) { + private void createNodeTable(SqlExecutionContext context) { try (SqlCompiler compiler = new SqlCompiler(engine)) { - compiler.compile("create table " + table + + compiler.compile("create table " + nodeTable + " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," + " application_generation long, inService boolean, stable boolean, queries_rate float)" + " timestamp(at)" + @@ -203,20 +264,39 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { // compiler.compile("alter table " + tableName + " alter column hostname add index", context); } catch (SqlException e) { - throw new IllegalStateException("Could not create Quest db table '" + table + "'", e); + throw new IllegalStateException("Could not create Quest db table '" + nodeTable + "'", e); + } + } + + private void createClusterTable(SqlExecutionContext context) { + try (SqlCompiler compiler = new SqlCompiler(engine)) { + compiler.compile("create table " + clusterTable + + " (cluster string, at timestamp, queries_rate float)" + + " timestamp(at)" + + "PARTITION BY DAY;", + context); + // We should do this if we get a version where selecting on strings work embedded, see below + // compiler.compile("alter table " + tableName + " alter column cluster add index", context); + } + catch (SqlException e) { + throw new IllegalStateException("Could not create Quest db table '" + clusterTable + "'", e); } } - private void ensureTableIsUpdated(String table, SqlExecutionContext context) { + private void ensureNodeTableIsUpdated(SqlExecutionContext context) { try (SqlCompiler compiler = new SqlCompiler(engine)) { - if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { - ensureColumnExists("queries_rate", "float", table, compiler, context); // TODO: Remove after March 2021 + if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), nodeTable)) { + ensureColumnExists("queries_rate", "float", nodeTable, compiler, context); // TODO: Remove after March 2021 } } catch (SqlException e) { repair(e); } } + private void ensureClusterTableIsUpdated(SqlExecutionContext context) { + // Nothing to do for now + } + private void ensureColumnExists(String column, String columnType, String table, SqlCompiler compiler, SqlExecutionContext context) throws SqlException { if (columnNamesOf(table, compiler, context).contains(column)) return; @@ -247,34 +327,34 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { return timestamp; } - private ListMap<String, MetricSnapshot> getSnapshots(Instant startTime, - Set<String> hostnames, - SqlCompiler compiler, - SqlExecutionContext context) throws SqlException { + private ListMap<String, NodeMetricSnapshot> getNodeSnapshots(Instant startTime, + Set<String> hostnames, + SqlCompiler compiler, + SqlExecutionContext context) throws SqlException { DateTimeFormatter formatter = DateTimeFormatter.ISO_DATE_TIME.withZone(ZoneId.of("UTC")); String from = formatter.format(startTime).substring(0, 19) + ".000000Z"; String to = formatter.format(clock.instant()).substring(0, 19) + ".000000Z"; - String sql = "select * from " + table + " where at in('" + from + "', '" + to + "');"; + String sql = "select * from " + nodeTable + " where at in('" + from + "', '" + to + "');"; // WHERE clauses does not work: // String sql = "select * from " + tableName + " where hostname in('host1', 'host2', 'host3');"; try (RecordCursorFactory factory = compiler.compile(sql, context).getRecordCursorFactory()) { - ListMap<String, MetricSnapshot> snapshots = new ListMap<>(); + ListMap<String, NodeMetricSnapshot> snapshots = new ListMap<>(); try (RecordCursor cursor = factory.getCursor(context)) { Record record = cursor.getRecord(); while (cursor.hasNext()) { String hostname = record.getStr(0).toString(); if (hostnames.contains(hostname)) { snapshots.put(hostname, - new MetricSnapshot(Instant.ofEpochMilli(record.getTimestamp(1) / 1000), - record.getFloat(2), - record.getFloat(3), - record.getFloat(4), - record.getLong(5), - record.getBool(6), - record.getBool(7), - record.getFloat(8))); + new NodeMetricSnapshot(Instant.ofEpochMilli(record.getTimestamp(1) / 1000), + record.getFloat(2), + record.getFloat(3), + record.getFloat(4), + record.getLong(5), + record.getBool(6), + record.getBool(7), + record.getFloat(8))); } } } @@ -282,6 +362,26 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { } } + private ClusterTimeseries getClusterSnapshots(ClusterSpec.Id cluster, + SqlCompiler compiler, + SqlExecutionContext context) throws SqlException { + String sql = "select * from " + clusterTable; + try (RecordCursorFactory factory = compiler.compile(sql, context).getRecordCursorFactory()) { + List<ClusterMetricSnapshot> snapshots = new ArrayList<>(); + try (RecordCursor cursor = factory.getCursor(context)) { + Record record = cursor.getRecord(); + while (cursor.hasNext()) { + String clusterId = record.getStr(0).toString(); + if (cluster.value().equals(clusterId)) { + snapshots.add(new ClusterMetricSnapshot(Instant.ofEpochMilli(record.getTimestamp(1) / 1000), + record.getFloat(2))); + } + } + } + return new ClusterTimeseries(cluster, snapshots); + } + } + private SqlExecutionContext newContext() { return new SqlExecutionContextImpl(engine, 1); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java index 0594a0f0553..681ef54995f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java @@ -50,13 +50,14 @@ public class ResourceTarget { /** Create a target of achieving ideal load given a current load */ public static ResourceTarget idealLoad(ClusterTimeseries clusterTimeseries, + ClusterNodesTimeseries clusterNodesTimeseries, AllocatableClusterResources current, Application application) { - return new ResourceTarget(nodeUsage(Resource.cpu, clusterTimeseries.averageLoad(Resource.cpu), current) - / idealCpuLoad(clusterTimeseries, application), - nodeUsage(Resource.memory, clusterTimeseries.averageLoad(Resource.memory), current) + return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current) + / idealCpuLoad(clusterTimeseries, clusterNodesTimeseries, application), + nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current) / Resource.memory.idealAverageLoad(), - nodeUsage(Resource.disk, clusterTimeseries.averageLoad(Resource.disk), current) + nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current) / Resource.disk.idealAverageLoad(), true); } @@ -70,10 +71,12 @@ public class ResourceTarget { } /** Ideal cpu load must take the application traffic fraction into account */ - private static double idealCpuLoad(ClusterTimeseries clusterTimeseries, Application application) { + private static double idealCpuLoad(ClusterTimeseries clusterTimeseries, + ClusterNodesTimeseries clusterNodesTimeseries, + Application application) { // What's needed to have headroom for growth during scale-up as a fraction of current resources? double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(); // in fraction per minute of the current traffic - Duration scalingDuration = clusterTimeseries.cluster().scalingDuration(clusterTimeseries.clusterNodes().clusterSpec()); + Duration scalingDuration = clusterNodesTimeseries.cluster().scalingDuration(clusterNodesTimeseries.clusterNodes().clusterSpec()); double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes(); // Cap headroom at 10% above the historical observed peak growthRateHeadroom = Math.min(growthRateHeadroom, 1 / clusterTimeseries.currentQueryFractionOfMax() + 0.1); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index bcfdaefb305..9d910df01d9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -14,7 +14,7 @@ import com.yahoo.vespa.hosted.provision.applications.Applications; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler; -import com.yahoo.vespa.hosted.provision.autoscale.MetricSnapshot; +import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.autoscale.NodeTimeseries; import com.yahoo.vespa.hosted.provision.node.History; @@ -110,7 +110,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { // - 2. all nodes have switched to the right config generation for (NodeTimeseries nodeTimeseries : metricsDb.getNodeTimeseries(Duration.between(event.at(), clock().instant()), clusterNodes)) { - Optional<MetricSnapshot> firstOnNewGeneration = + Optional<NodeMetricSnapshot> firstOnNewGeneration = nodeTimeseries.asList().stream() .filter(snapshot -> snapshot.generation() >= event.generation()).findFirst(); if (firstOnNewGeneration.isEmpty()) return cluster; // Not completed diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java index b8548c4c3f4..4aae14b0d2d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java @@ -8,7 +8,6 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.autoscale.MetricsFetcher; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.autoscale.MetricsResponse; -import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.util.Set; @@ -74,7 +73,8 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { warnings.add(1); } else if (response != null) { - metricsDb.add(response.metrics()); + metricsDb.addNodeMetrics(response.nodeMetrics()); + metricsDb.addClusterMetrics(response.clusterMetrics()); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index e0a11aa5dac..10db9a08eeb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -59,7 +59,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { List<Node> retiredNodes = entry.getValue(); try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { - if ( ! deployment.isValid()) continue; // this will be done at another config server + if ( ! deployment.isValid()) continue; List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList()); if (nodesToRemove.isEmpty()) continue; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java index ceaf88dd7d9..4235bae6850 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java @@ -9,8 +9,7 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; -import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources; -import com.yahoo.vespa.hosted.provision.autoscale.ClusterTimeseries; +import com.yahoo.vespa.hosted.provision.autoscale.ClusterNodesTimeseries; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.autoscale.Resource; @@ -74,7 +73,7 @@ public class ApplicationSerializer { } private static void clusterUtilizationToSlime(Cluster cluster, NodeList nodes, MetricsDb metricsDb, Cursor utilizationObject) { - var timeseries = new ClusterTimeseries(Duration.ofHours(1), cluster, nodes, metricsDb); + var timeseries = new ClusterNodesTimeseries(Duration.ofHours(1), cluster, nodes, metricsDb); utilizationObject.setDouble("cpu", timeseries.averageLoad(Resource.cpu)); utilizationObject.setDouble("memory", timeseries.averageLoad(Resource.memory)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java index 87b8ccdc348..8c6c116a225 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java @@ -46,7 +46,7 @@ public class AutoscalingIntegrationTest { for (int i = 0; i < 1000; i++) { tester.clock().advance(Duration.ofSeconds(10)); - fetcher.fetchMetrics(application1).whenComplete((r, e) -> tester.nodeMetricsDb().add(r.metrics())); + fetcher.fetchMetrics(application1).whenComplete((r, e) -> tester.nodeMetricsDb().addNodeMetrics(r.nodeMetrics())); tester.clock().advance(Duration.ofSeconds(10)); tester.nodeMetricsDb().gc(); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java index 156542ef1d4..6688c0f9ce8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java @@ -137,14 +137,14 @@ class AutoscalingTester { float cpu = value * oneExtraNodeFactor; float memory = (float) Resource.memory.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor; float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor; - db.add(List.of(new Pair<>(node.hostname(), new MetricSnapshot(clock().instant(), - cpu, - memory, - disk, - 0, - true, - true, - 0.0)))); + db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), + cpu, + memory, + disk, + 0, + true, + true, + 0.0)))); } } } @@ -169,14 +169,14 @@ class AutoscalingTester { float cpu = (float) 0.2 * otherResourcesLoad * oneExtraNodeFactor; float memory = value * oneExtraNodeFactor; float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor; - db.add(List.of(new Pair<>(node.hostname(), new MetricSnapshot(clock().instant(), - cpu, - memory, - disk, - 0, - true, - true, - 0.0)))); + db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), + cpu, + memory, + disk, + 0, + true, + true, + 0.0)))); } } } @@ -191,14 +191,14 @@ class AutoscalingTester { for (int i = 0; i < count; i++) { clock().advance(Duration.ofMinutes(1)); for (Node node : nodes) { - db.add(List.of(new Pair<>(node.hostname(), new MetricSnapshot(clock().instant(), - cpu, - memory, - disk, - generation, - inService, - stable, - 0.0)))); + db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), + cpu, + memory, + disk, + generation, + inService, + stable, + 0.0)))); } } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java index 384e8dd8439..14a9a596e78 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java @@ -45,7 +45,7 @@ public class MetricsV2MetricsFetcherTest { { httpClient.cannedResponse = cannedResponseForApplication1; - List<Pair<String, MetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application1).get().metrics()); + List<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application1).get().nodeMetrics()); assertEquals("http://host-1.yahoo.com:4080/metrics/v2/values?consumer=autoscaling", httpClient.requestsReceived.get(0)); assertEquals(2, values.size()); @@ -63,7 +63,7 @@ public class MetricsV2MetricsFetcherTest { { httpClient.cannedResponse = cannedResponseForApplication2; - List<Pair<String, MetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().metrics()); + List<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().nodeMetrics()); assertEquals("http://host-3.yahoo.com:4080/metrics/v2/values?consumer=autoscaling", httpClient.requestsReceived.get(1)); assertEquals(1, values.size()); @@ -81,7 +81,7 @@ public class MetricsV2MetricsFetcherTest { tester.nodeRepository().nodes().write(tester.nodeRepository().nodes().list(Node.State.active).owner(application2) .first().get().retire(tester.clock().instant()), lock); } - List<Pair<String, MetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().metrics()); + List<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().nodeMetrics()); assertFalse(values.get(0).getSecond().stable()); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java index c1c94c7dd24..76e56004871 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java @@ -38,19 +38,19 @@ public class NodeMetricsDbTest { ManualClock clock = tester.clock(); MetricsDb db = MetricsDb.createTestInstance(tester.nodeRepository()); - Collection<Pair<String, MetricSnapshot>> values = new ArrayList<>(); + Collection<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(); for (int i = 0; i < 40; i++) { - values.add(new Pair<>(node0, new MetricSnapshot(clock.instant(), - 0.9f, - 0.6f, - 0.6f, - 0, - true, - false, - 0.0))); + values.add(new Pair<>(node0, new NodeMetricSnapshot(clock.instant(), + 0.9f, + 0.6f, + 0.6f, + 0, + true, + false, + 0.0))); clock.advance(Duration.ofMinutes(120)); } - db.add(values); + db.addNodeMetrics(values); // Avoid off-by-one bug when the below windows starts exactly on one of the above getEpochSecond() timestamps. clock.advance(Duration.ofMinutes(1)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java index 8c110559112..8fed8e02873 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.collections.Pair; +import com.yahoo.config.provision.ClusterSpec; import com.yahoo.io.IOUtils; import com.yahoo.test.ManualClock; import org.junit.Ignore; @@ -12,7 +13,9 @@ import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -29,7 +32,7 @@ public class QuestMetricsDbTest { private static final double delta = 0.0000001; @Test - public void testReadWrite() { + public void testNodeMetricsReadWrite() { String dataDir = "data/QuestMetricsDbReadWrite"; IOUtils.recursiveDeleteDir(new File(dataDir)); IOUtils.createDirectory(dataDir + "/metrics"); @@ -38,7 +41,7 @@ public class QuestMetricsDbTest { Instant startTime = clock.instant(); clock.advance(Duration.ofSeconds(1)); - db.add(timeseries(1000, Duration.ofSeconds(1), clock, "host1", "host2", "host3")); + db.addNodeMetrics(nodeTimeseries(1000, Duration.ofSeconds(1), clock, "host1", "host2", "host3")); clock.advance(Duration.ofSeconds(1)); @@ -48,7 +51,7 @@ public class QuestMetricsDbTest { assertEquals(1, nodeTimeSeries1.size()); assertEquals("host1", nodeTimeSeries1.get(0).hostname()); assertEquals(1000, nodeTimeSeries1.get(0).size()); - MetricSnapshot snapshot = nodeTimeSeries1.get(0).asList().get(0); + NodeMetricSnapshot snapshot = nodeTimeSeries1.get(0).asList().get(0); assertEquals(startTime.plus(Duration.ofSeconds(1)), snapshot.at()); assertEquals(0.1, snapshot.cpu(), delta); assertEquals(0.2, snapshot.memory(), delta); @@ -75,6 +78,43 @@ public class QuestMetricsDbTest { } @Test + public void testClusterMetricsReadWrite() { + String dataDir = "data/QuestMetricsDbReadWrite"; + IOUtils.recursiveDeleteDir(new File(dataDir)); + IOUtils.createDirectory(dataDir + "/clusterMetrics"); + ManualClock clock = new ManualClock("2020-10-01T00:00:00"); + QuestMetricsDb db = new QuestMetricsDb(dataDir, clock); + Instant startTime = clock.instant(); + + var cluster1 = new ClusterSpec.Id("cluster1"); + var cluster2 = new ClusterSpec.Id("cluster2"); + db.addClusterMetrics(Map.of(cluster1, new ClusterMetricSnapshot(clock.instant(), 30.0))); + db.addClusterMetrics(Map.of(cluster2, new ClusterMetricSnapshot(clock.instant(), 60.0))); + clock.advance(Duration.ofMinutes(1)); + db.addClusterMetrics(Map.of(cluster1, new ClusterMetricSnapshot(clock.instant(), 45.0))); + + ClusterTimeseries clusterTimeseries1 = db.getClusterTimeseries(cluster1); + assertEquals(cluster1, clusterTimeseries1.cluster()); + assertEquals(2, clusterTimeseries1.asList().size()); + + ClusterMetricSnapshot snapshot11 = clusterTimeseries1.get(0); + assertEquals(startTime, snapshot11.at()); + assertEquals(30, snapshot11.queryRate(), delta); + + ClusterMetricSnapshot snapshot12 = clusterTimeseries1.get(1); + assertEquals(startTime.plus(Duration.ofMinutes(1)), snapshot12.at()); + assertEquals(45, snapshot12.queryRate(), delta); + + ClusterTimeseries clusterTimeseries2 = db.getClusterTimeseries(cluster2); + assertEquals(cluster2, clusterTimeseries2.cluster()); + assertEquals(1, clusterTimeseries2.asList().size()); + + ClusterMetricSnapshot snapshot21 = clusterTimeseries2.get(0); + assertEquals(startTime, snapshot21.at()); + assertEquals(60, snapshot21.queryRate(), delta); + } + + @Test public void testWriteOldData() { String dataDir = "data/QuestMetricsDbWriteOldData"; IOUtils.recursiveDeleteDir(new File(dataDir)); @@ -83,19 +123,19 @@ public class QuestMetricsDbTest { QuestMetricsDb db = new QuestMetricsDb(dataDir, clock); Instant startTime = clock.instant(); clock.advance(Duration.ofSeconds(300)); - db.add(timeseriesAt(10, clock.instant(), "host1", "host2", "host3")); + db.addNodeMetrics(timeseriesAt(10, clock.instant(), "host1", "host2", "host3")); clock.advance(Duration.ofSeconds(1)); List<NodeTimeseries> nodeTimeSeries1 = db.getNodeTimeseries(Duration.between(startTime, clock.instant()), Set.of("host1")); assertEquals(10, nodeTimeSeries1.get(0).size()); - db.add(timeseriesAt(10, clock.instant().minus(Duration.ofSeconds(20)), "host1", "host2", "host3")); + db.addNodeMetrics(timeseriesAt(10, clock.instant().minus(Duration.ofSeconds(20)), "host1", "host2", "host3")); List<NodeTimeseries> nodeTimeSeries2 = db.getNodeTimeseries(Duration.between(startTime, clock.instant()), Set.of("host1")); assertEquals("Recent data is accepted", 20, nodeTimeSeries2.get(0).size()); - db.add(timeseriesAt(10, clock.instant().minus(Duration.ofSeconds(200)), "host1", "host2", "host3")); + db.addNodeMetrics(timeseriesAt(10, clock.instant().minus(Duration.ofSeconds(200)), "host1", "host2", "host3")); List<NodeTimeseries> nodeTimeSeries3 = db.getNodeTimeseries(Duration.between(startTime, clock.instant()), Set.of("host1")); assertEquals("Too old data is rejected", 20, nodeTimeSeries3.get(0).size()); @@ -111,7 +151,7 @@ public class QuestMetricsDbTest { Instant startTime = clock.instant(); int dayOffset = 3; clock.advance(Duration.ofHours(dayOffset)); - db.add(timeseries(24 * 10, Duration.ofHours(1), clock, "host1", "host2", "host3")); + db.addNodeMetrics(nodeTimeseries(24 * 10, Duration.ofHours(1), clock, "host1", "host2", "host3")); assertEquals(24 * 10, db.getNodeTimeseries(Duration.between(startTime, clock.instant()), Set.of("host1")).get(0).size()); @@ -146,7 +186,7 @@ public class QuestMetricsDbTest { System.out.println(" " + snapshot); clock.advance(Duration.ofSeconds(1)); - db.add(timeseries(2, Duration.ofSeconds(1), clock, "host1")); + db.addNodeMetrics(nodeTimeseries(2, Duration.ofSeconds(1), clock, "host1")); System.out.println("New data written and read:"); timeseries = db.getNodeTimeseries(Duration.ofSeconds(2), Set.of("host1")); for (var snapshot : timeseries.get(0).asList()) @@ -163,7 +203,7 @@ public class QuestMetricsDbTest { ManualClock clock = new ManualClock("2020-10-01T00:00:00"); QuestMetricsDb db = new QuestMetricsDb(dataDir, clock); Instant startTime = clock.instant(); - db.add(timeseries(10, Duration.ofSeconds(1), clock, "host1")); + db.addNodeMetrics(nodeTimeseries(10, Duration.ofSeconds(1), clock, "host1")); int added = db.getNodeTimeseries(Duration.between(startTime, clock.instant()), Set.of("host1")).get(0).asList().size(); @@ -171,36 +211,46 @@ public class QuestMetricsDbTest { db.close(); } - private Collection<Pair<String, MetricSnapshot>> timeseries(int countPerHost, Duration sampleRate, ManualClock clock, - String ... hosts) { - Collection<Pair<String, MetricSnapshot>> timeseries = new ArrayList<>(); + private Collection<Pair<String, NodeMetricSnapshot>> nodeTimeseries(int countPerHost, Duration sampleRate, ManualClock clock, + String ... hosts) { + Collection<Pair<String, NodeMetricSnapshot>> timeseries = new ArrayList<>(); for (int i = 1; i <= countPerHost; i++) { for (String host : hosts) - timeseries.add(new Pair<>(host, new MetricSnapshot(clock.instant(), + timeseries.add(new Pair<>(host, new NodeMetricSnapshot(clock.instant(), i * 0.1, i * 0.2, i * 0.4, i % 100, - true, - true, - 30.0))); + true, + true, + 30.0))); + clock.advance(sampleRate); + } + return timeseries; + } + + private List<ClusterMetricSnapshot> clusterTimeseries(int count, Duration sampleRate, ManualClock clock, + ClusterSpec.Id cluster) { + List<ClusterMetricSnapshot> timeseries = new ArrayList<>(); + for (int i = 1; i <= count; i++) { + timeseries.add(new ClusterMetricSnapshot(clock.instant(), 30.0)); clock.advance(sampleRate); } return timeseries; } - private Collection<Pair<String, MetricSnapshot>> timeseriesAt(int countPerHost, Instant at, String ... hosts) { - Collection<Pair<String, MetricSnapshot>> timeseries = new ArrayList<>(); + private Collection<Pair<String, NodeMetricSnapshot>> timeseriesAt(int countPerHost, Instant at, String ... hosts) { + Collection<Pair<String, NodeMetricSnapshot>> timeseries = new ArrayList<>(); for (int i = 1; i <= countPerHost; i++) { for (String host : hosts) - timeseries.add(new Pair<>(host, new MetricSnapshot(at, + timeseries.add(new Pair<>(host, new NodeMetricSnapshot(at, i * 0.1, i * 0.2, i * 0.4, i % 100, - true, - false, - 0.0))); + true, + false, + 0.0))); } return timeseries; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java index 1b531fd3237..e8cfe6a2310 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java @@ -16,8 +16,7 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; -import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; -import com.yahoo.vespa.hosted.provision.autoscale.MetricSnapshot; +import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; @@ -75,14 +74,14 @@ public class AutoscalingMaintainerTester { NodeList nodes = nodeRepository().nodes().list(Node.State.active).owner(applicationId); for (int i = 0; i < count; i++) { for (Node node : nodes) - metricsDb.add(List.of(new Pair<>(node.hostname(), new MetricSnapshot(clock().instant(), - cpu, - mem, - disk, - generation, - true, - true, - 0.0)))); + metricsDb.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), + cpu, + mem, + disk, + generation, + true, + true, + 0.0)))); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java index e99f7740c29..5af787092d5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java @@ -4,7 +4,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Capacity; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.NodeResources; -import com.yahoo.vespa.hosted.provision.autoscale.MetricSnapshot; +import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.autoscale.MetricsV2MetricsFetcher; import com.yahoo.vespa.hosted.provision.autoscale.NodeTimeseries; @@ -49,9 +49,9 @@ public class NodeMetricsDbMaintainerTest { List<NodeTimeseries> timeseriesList = db.getNodeTimeseries(Duration.ofDays(1), Set.of("host-1.yahoo.com", "host-2.yahoo.com")); assertEquals(2, timeseriesList.size()); - List<MetricSnapshot> allSnapshots = timeseriesList.stream() - .flatMap(timeseries -> timeseries.asList().stream()) - .collect(Collectors.toList()); + List<NodeMetricSnapshot> allSnapshots = timeseriesList.stream() + .flatMap(timeseries -> timeseries.asList().stream()) + .collect(Collectors.toList()); assertTrue(allSnapshots.stream().anyMatch(snapshot -> snapshot.inService())); assertTrue(allSnapshots.stream().anyMatch(snapshot -> ! snapshot.inService())); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index 6f76fc8d99c..88d39e887d3 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -17,7 +17,7 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; -import com.yahoo.vespa.hosted.provision.autoscale.MetricSnapshot; +import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.autoscale.Resource; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; @@ -125,14 +125,14 @@ public class ScalingSuggestionsMaintainerTest { NodeList nodes = nodeRepository.nodes().list(Node.State.active).owner(applicationId); for (int i = 0; i < count; i++) { for (Node node : nodes) - db.add(List.of(new Pair<>(node.hostname(), new MetricSnapshot(nodeRepository.clock().instant(), - cpu, - memory, - disk, - generation, - true, - true, - 0.0)))); + db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(nodeRepository.clock().instant(), + cpu, + memory, + disk, + generation, + true, + true, + 0.0)))); } } |