diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-09-22 22:21:13 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-09-22 22:21:13 +0200 |
commit | 3936b1f1e34b7f489822221b5cd3d917f1dc54f5 (patch) | |
tree | 3ed8306a0cd7e9c5dcb02b3fbfb50e7be43d5c8c /node-repository | |
parent | 8d6a1a768428cc7b4fa04cfef1eb6ce4f18485bc (diff) |
Remove window abstraction
Diffstat (limited to 'node-repository')
4 files changed, 131 insertions, 98 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index ba63376d61e..152ab4f5ca8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -1,6 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.autoscale; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.vespa.hosted.provision.Node; @@ -8,8 +9,10 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; import java.time.Duration; +import java.time.Instant; import java.util.List; import java.util.Optional; +import java.util.stream.Collector; import java.util.stream.Collectors; /** @@ -63,11 +66,10 @@ public class Autoscaler { private Optional<AllocatableClusterResources> autoscale(List<Node> clusterNodes, Limits limits, boolean exclusive) { if (unstable(clusterNodes)) return Optional.empty(); - ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type(); AllocatableClusterResources currentAllocation = new AllocatableClusterResources(clusterNodes, nodeRepository); - Optional<Double> cpuLoad = averageLoad(Resource.cpu, clusterNodes, clusterType); - Optional<Double> memoryLoad = averageLoad(Resource.memory, clusterNodes, clusterType); - Optional<Double> diskLoad = averageLoad(Resource.disk, clusterNodes, clusterType); + Optional<Double> cpuLoad = averageLoad(Resource.cpu, clusterNodes); + Optional<Double> memoryLoad = averageLoad(Resource.memory, clusterNodes); + Optional<Double> diskLoad = averageLoad(Resource.disk, clusterNodes); if (cpuLoad.isEmpty() || memoryLoad.isEmpty() || diskLoad.isEmpty()) return Optional.empty(); var target = ResourceTarget.idealLoad(cpuLoad.get(), memoryLoad.get(), diskLoad.get(), currentAllocation); @@ -97,17 +99,31 @@ public class Autoscaler { * Returns the average load of this resource in the measurement window, * or empty if we are not in a position to make decisions from these measurements at this time. */ - private Optional<Double> averageLoad(Resource resource, List<Node> clusterNodes, ClusterSpec.Type clusterType) { - NodeMetricsDb.Window window = metricsDb.getWindow(nodeRepository.clock().instant().minus(scalingWindow(clusterType)), - resource, - clusterNodes.stream().map(Node::hostname).collect(Collectors.toList())); + private Optional<Double> averageLoad(Resource resource, + List<Node> clusterNodes) { + ApplicationId application = clusterNodes.get(0).allocation().get().owner(); + ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type(); + Instant startTime = nodeRepository.clock().instant().minus(scalingWindow(clusterType)); + + List<NodeMetricsDb.DeploymentEvent> deployments = metricsDb.getEvents(application); + if (! deployments.isEmpty()) { + var deployment = deployments.get(deployments.size() - 1); + if (deployment.time().isAfter(startTime)) + startTime = deployment.time(); + } + + List<NodeMetricsDb.NodeMeasurements> measurements = metricsDb.getMeasurements(startTime, + resource, + clusterNodes.stream().map(Node::hostname).collect(Collectors.toList())); // Require a total number of measurements scaling with the number of nodes, // but don't require that we have at least that many from every node - if (window.measurementCount()/clusterNodes.size() < minimumMeasurementsPerNode(clusterType)) return Optional.empty(); - if (window.hostnames() != clusterNodes.size()) return Optional.empty(); + int measurementCount = measurements.stream().mapToInt(m -> m.size()).sum(); + if (measurementCount / clusterNodes.size() < minimumMeasurementsPerNode(clusterType)) return Optional.empty(); + if (measurements.size() != clusterNodes.size()) return Optional.empty(); - return Optional.of(window.average()); + double measurementSum = measurements.stream().flatMap(m -> m.asList().stream()).mapToDouble(m -> m.value()).sum(); + return Optional.of(measurementSum / measurementCount); } /** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDb.java index 316708732a7..74c75c3c0a1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDb.java @@ -1,6 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.autoscale; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -9,6 +10,7 @@ import java.time.Clock; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -30,6 +32,9 @@ public class NodeMetricsDb { /** Measurements by key. Each list of measurements is sorted by increasing timestamp */ private final Map<NodeMeasurementsKey, NodeMeasurements> db = new HashMap<>(); + /** Events */ + private final List<DeploymentEvent> events = new ArrayList<>(); + /** Lock all access for now since we modify lists inside a map */ private final Object lock = new Object(); @@ -37,7 +42,7 @@ public class NodeMetricsDb { this.nodeRepository = nodeRepository; } - /** Add measurements to this */ + /** Adds measurements to this. */ public void add(Collection<NodeMetrics.MetricValue> metricValues) { synchronized (lock) { for (var value : metricValues) { @@ -50,7 +55,8 @@ public class NodeMetricsDb { if (node.get().allocation().isEmpty()) continue; measurements = new NodeMeasurements(value.hostname(), resource, - node.get().allocation().get().membership().cluster().type()); + node.get().allocation().get().membership().cluster().type(), + new ArrayList<>()); db.put(key, measurements); } measurements.add(new Measurement(value.timestampSecond() * 1000, @@ -59,83 +65,51 @@ public class NodeMetricsDb { } } - /** Must be called intermittently (as long as add is called) to gc old measurements */ + /** Adds an event to this */ + public void add(DeploymentEvent event) { + synchronized (lock) { + events.add(event); + } + } + + /** Must be called intermittently (as long as any add methods are called) to gc old data */ public void gc(Clock clock) { synchronized (lock) { // Each measurement is Object + long + float = 16 + 8 + 4 = 28 bytes // 12 hours with 1k nodes and 3 resources and 1 measurement/sec is about 5Gb - for (Iterator<NodeMeasurements> i = db.values().iterator(); i.hasNext(); ) { var measurements = i.next(); measurements.removeOlderThan(clock.instant().minus(Autoscaler.scalingWindow(measurements.type)).toEpochMilli()); if (measurements.isEmpty()) i.remove(); } - } - } - - /** Returns a window within which we can ask for specific information from this db */ - public Window getWindow(Instant startTime, Resource resource, List<String> hostnames) { - return new Window(startTime, resource, hostnames); - } - - public class Window { - - private final long startTime; - private final List<NodeMeasurementsKey> keys; - private Window(Instant startTime, Resource resource, List<String> hostnames) { - this.startTime = startTime.toEpochMilli(); - keys = hostnames.stream().map(hostname -> new NodeMeasurementsKey(hostname, resource)).collect(Collectors.toList()); + // TODO: gc events } + } - public int measurementCount() { - synchronized (lock) { - int count = 0; - for (NodeMeasurementsKey key : keys) { - NodeMeasurements measurements = db.get(key); - if (measurements == null) continue; - count += measurements.after(startTime).size(); - } - return count; - } - } - - /** Returns the count of hostnames which have measurements in this window */ - public int hostnames() { - synchronized (lock) { - int count = 0; - for (NodeMeasurementsKey key : keys) { - NodeMeasurements measurements = db.get(key); - if (measurements == null || measurements.isEmpty()) continue; - - if (measurements.get(measurements.size() - 1).timestamp >= startTime) - count++; - } - return count; + /** + * Returns a list of measurements with one entry for each of the given host names + * which have any values after startTime, in the same order + */ + public List<NodeMeasurements> getMeasurements(Instant startTime, Resource resource, List<String> hostnames) { + synchronized (lock) { + List<NodeMeasurements> measurementsList = new ArrayList<>(hostnames.size()); + for (String hostname : hostnames) { + NodeMeasurements measurements = db.get(new NodeMeasurementsKey(hostname, resource)); + if (measurements == null) continue; + measurements = measurements.copyAfter(startTime); + if (measurements.isEmpty()) continue; + measurementsList.add(measurements); } + return measurementsList; } + } - public double average() { - synchronized (lock) { - double sum = 0; - int count = 0; - for (NodeMeasurementsKey key : keys) { - NodeMeasurements measurements = db.get(key); - if (measurements == null) continue; - - int index = measurements.size() - 1; - while (index >= 0 && measurements.get(index).timestamp >= startTime) { - sum += measurements.get(index).value; - count++; - - index--; - } - } - return sum / count; - } + public List<DeploymentEvent> getEvents(ApplicationId application) { + synchronized (lock) { + return events.stream().filter(event -> event.application().equals(application)).collect(Collectors.toList()); } - } private static class NodeMeasurementsKey { @@ -168,42 +142,52 @@ public class NodeMetricsDb { } - private static class NodeMeasurements { + public static class NodeMeasurements { private final String hostname; private final Resource resource; private final ClusterSpec.Type type; - private final List<Measurement> measurements = new ArrayList<>(); + private final List<Measurement> measurements; - public NodeMeasurements(String hostname, Resource resource, ClusterSpec.Type type) { + // Note: This transfers ownership of the measurement list to this + private NodeMeasurements(String hostname, Resource resource, ClusterSpec.Type type, List<Measurement> measurements) { this.hostname = hostname; this.resource = resource; this.type = type; + this.measurements = measurements; } - void removeOlderThan(long oldestTimestamp) { - while (!measurements.isEmpty() && measurements.get(0).timestamp < oldestTimestamp) - measurements.remove(0); - } + // Public access - boolean isEmpty() { return measurements.isEmpty(); } + public boolean isEmpty() { return measurements.isEmpty(); } - int size() { return measurements.size(); } + public int size() { return measurements.size(); } - Measurement get(int index) { return measurements.get(index); } + public Measurement get(int index) { return measurements.get(index); } - void add(Measurement measurement) { measurements.add(measurement); } + public List<Measurement> asList() { return Collections.unmodifiableList(measurements); } - public List<Measurement> after(long oldestTimestamp) { - return measurements.stream() - .filter(measurement -> measurement.timestamp >= oldestTimestamp) - .collect(Collectors.toList()); + public NodeMeasurements copyAfter(Instant oldestTime) { + long oldestTimestamp = oldestTime.toEpochMilli(); + return new NodeMeasurements(hostname, resource, type, + measurements.stream() + .filter(measurement -> measurement.timestamp >= oldestTimestamp) + .collect(Collectors.toList())); } - } + // Private mutation - private static class Measurement { + private void add(Measurement measurement) { measurements.add(measurement); } + + private void removeOlderThan(long oldestTimestamp) { + while (!measurements.isEmpty() && measurements.get(0).timestamp < oldestTimestamp) + measurements.remove(0); + } + + } + public static class Measurement { + // TODO: Order by timestamp /** The time of this measurement in epoch millis */ private final long timestamp; @@ -215,9 +199,35 @@ public class NodeMetricsDb { this.value = value; } + public float value() { return value; } + public Instant at() { return Instant.ofEpochMilli(timestamp); } + @Override public String toString() { return "measurement at " + timestamp + ": " + value; } } + public static class DeploymentEvent { + + private final ApplicationId application; + private final long generation; + private final long timestamp; + + public DeploymentEvent(ApplicationId application, long generation, Instant times) { + this.application = application; + this.generation = generation; + this.timestamp = times.toEpochMilli(); + } + + /** Returns the deployed application */ + public ApplicationId application() { return application; } + + /** Returns the application config generation resulting from this deployment */ + public long generation() { return generation; } + + /** Returns the time of this deployment */ + public Instant time() { return Instant.ofEpochMilli(timestamp); } + + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index ddee1afe21e..4f811f9198f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -29,6 +29,7 @@ import java.util.stream.Collectors; */ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { + private final NodeMetricsDb metricsDb; private final Autoscaler autoscaler; private final Deployer deployer; private final Metric metric; @@ -40,6 +41,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { Duration interval) { super(nodeRepository, interval, metric); this.autoscaler = new Autoscaler(metricsDb, nodeRepository); + this.metricsDb = metricsDb; this.metric = metric; this.deployer = deployer; } @@ -72,7 +74,11 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { applications().put(application.with(cluster.get().withTarget(target)), deployment.applicationLock().get()); if (target.isPresent()) { logAutoscaling(target.get(), applicationId, clusterId, clusterNodes); - deployment.activate(); + Optional<Long> resultingGeneration = deployment.activate(); + if (resultingGeneration.isEmpty()) return; // Failed to activate + metricsDb.add(new NodeMetricsDb.DeploymentEvent(applicationId, + resultingGeneration.get(), + nodeRepository().clock().instant())); } } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java index 976aeb2346a..753a26ea452 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java @@ -6,10 +6,7 @@ import com.yahoo.config.provision.Capacity; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; -import com.yahoo.config.provision.NodeType; import com.yahoo.test.ManualClock; -import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.vespa.hosted.provision.NodeRepositoryTester; import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; import org.junit.Test; @@ -45,11 +42,15 @@ public class NodeMetricsDbTest { // Avoid off-by-one bug when the below windows starts exactly on one of the above getEpochSecond() timestamps. clock.advance(Duration.ofMinutes(1)); - assertEquals(35, db.getWindow(clock.instant().minus(Duration.ofHours(6)), Resource.cpu, List.of(node0)).measurementCount()); - assertEquals( 0, db.getWindow(clock.instant().minus(Duration.ofHours(6)), Resource.memory, List.of(node0)).measurementCount()); + assertEquals(35, measurementCount(db.getMeasurements(clock.instant().minus(Duration.ofHours(6)), Resource.cpu, List.of(node0)))); + assertEquals( 0, measurementCount(db.getMeasurements(clock.instant().minus(Duration.ofHours(6)), Resource.memory, List.of(node0)))); db.gc(clock); - assertEquals( 5, db.getWindow(clock.instant().minus(Duration.ofHours(6)), Resource.cpu, List.of(node0)).measurementCount()); - assertEquals( 0, db.getWindow(clock.instant().minus(Duration.ofHours(6)), Resource.memory, List.of(node0)).measurementCount()); + assertEquals( 5, measurementCount(db.getMeasurements(clock.instant().minus(Duration.ofHours(6)), Resource.cpu, List.of(node0)))); + assertEquals( 0, measurementCount(db.getMeasurements(clock.instant().minus(Duration.ofHours(6)), Resource.memory, List.of(node0)))); + } + + private int measurementCount(List<NodeMetricsDb.NodeMeasurements> measurements) { + return measurements.stream().mapToInt(m -> m.size()).sum(); } } |