diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-03-17 16:57:53 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-03-17 16:57:53 +0100 |
commit | 7ba1f8a582181b5ace2eb775817b1142026430df (patch) | |
tree | ad3d5e516f18010b793bd2e6c74aa7250b74f8bd /node-repository | |
parent | 26380d3b13a9524dc7cf285aeb65aea1dad0fd3e (diff) |
Average query rate over measurement window
Diffstat (limited to 'node-repository')
6 files changed, 73 insertions, 39 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index c7549a5ddee..ac3430fecf9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -86,7 +86,8 @@ public class Autoscaler { clusterTimeseries, clusterNodesTimeseries, currentAllocation, - application); + application, + nodeRepository.clock()); Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index 3f5255c6618..75270f8afc6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -3,7 +3,9 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterSpec; +import java.time.Clock; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -84,12 +86,21 @@ public class ClusterTimeseries { return maxGrowthRate / currentQueryRate(); } - /** The current query rate as a fraction of the peak rate in this timeseries */ - public double currentQueryFractionOfMax() { + /** + * The current query rate, averaged over the same window we average utilization over, + * as a fraction of the peak rate in this timeseries + */ + public double queryFractionOfMax(Duration window, Clock clock) { if (snapshots.isEmpty()) return 0.5; var max = snapshots.stream().mapToDouble(ClusterMetricSnapshot::queryRate).max().getAsDouble(); if (max == 0) return 1.0; - return snapshots.get(snapshots.size() - 1).queryRate() / max; + Instant oldest = clock.instant().minus(window); + var average = snapshots.stream() + .filter(snapshot -> snapshot.at().isAfter(oldest)) + .mapToDouble(snapshot -> snapshot.queryRate()) + .average(); + if (average.isEmpty()) return 0.5; // No measurements in the relevant time period + return average.getAsDouble() / max; } public double currentQueryRate() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java index 35717b97cf4..d7dd0fc3197 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.vespa.hosted.provision.applications.Application; +import java.time.Clock; import java.time.Duration; /** @@ -53,9 +54,10 @@ public class ResourceTarget { ClusterTimeseries clusterTimeseries, ClusterNodesTimeseries clusterNodesTimeseries, AllocatableClusterResources current, - Application application) { + Application application, + Clock clock) { return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current) - / idealCpuLoad(scalingDuration, clusterTimeseries, application), + / idealCpuLoad(scalingDuration, clusterTimeseries, application, clock), nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current) / Resource.memory.idealAverageLoad(), nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current) @@ -74,14 +76,15 @@ public class ResourceTarget { /** Ideal cpu load must take the application traffic fraction into account */ public static double idealCpuLoad(Duration scalingDuration, ClusterTimeseries clusterTimeseries, - Application application) { + Application application, + Clock clock) { double queryCpuFraction = queryCpuFraction(clusterTimeseries); // What's needed to have headroom for growth during scale-up as a fraction of current resources? double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(); // in fraction per minute of the current traffic double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes(); // Cap headroom at 10% above the historical observed peak - double fractionOfMax = clusterTimeseries.currentQueryFractionOfMax(); + double fractionOfMax = clusterTimeseries.queryFractionOfMax(scalingDuration, clock); if (fractionOfMax != 0) growthRateHeadroom = Math.min(growthRateHeadroom, 1 / fractionOfMax + 0.1); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java index 4d1c963d8ea..90b898640cf 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java @@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.autoscale.Resource; import com.yahoo.vespa.hosted.provision.autoscale.ResourceTarget; import java.net.URI; +import java.time.Clock; import java.time.Duration; import java.util.Collection; import java.util.List; @@ -71,12 +72,12 @@ public class ApplicationSerializer { if (cluster.shouldSuggestResources(currentResources)) cluster.suggestedResources().ifPresent(suggested -> toSlime(suggested.resources(), clusterObject.setObject("suggested"))); cluster.targetResources().ifPresent(target -> toSlime(target, clusterObject.setObject("target"))); - clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, clusterObject.setObject("utilization")); + clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, metricsDb.clock(), clusterObject.setObject("utilization")); scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray("scalingEvents")); clusterObject.setString("autoscalingStatus", cluster.autoscalingStatus()); clusterObject.setLong("scalingDuration", scalingDuration.toMillis()); clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate()); - clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.currentQueryFractionOfMax()); + clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.queryFractionOfMax(scalingDuration, metricsDb.clock())); } private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) { @@ -89,9 +90,10 @@ public class ApplicationSerializer { Duration scalingDuration, ClusterTimeseries clusterTimeseries, ClusterNodesTimeseries clusterNodesTimeseries, + Clock clock, Cursor utilizationObject) { utilizationObject.setDouble("cpu", clusterNodesTimeseries.averageLoad(Resource.cpu)); - utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application)); + utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application, clock)); utilizationObject.setDouble("memory", clusterNodesTimeseries.averageLoad(Resource.memory)); utilizationObject.setDouble("idealMemory", ResourceTarget.idealMemoryLoad()); utilizationObject.setDouble("disk", clusterNodesTimeseries.averageLoad(Resource.disk)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 650bfe761b5..daf8b9b4f9f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -53,7 +53,7 @@ public class AutoscalingTest { assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); tester.clock().advance(Duration.ofDays(1)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high", 14, 1, 1.4, 30.8, 30.8, @@ -123,7 +123,7 @@ public class AutoscalingTest { .allMatch(n -> n.allocation().get().requestedResources().diskSpeed() == NodeResources.DiskSpeed.slow); tester.clock().advance(Duration.ofDays(2)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); // Changing min and max from slow to any ClusterResources min = new ClusterResources( 2, 1, @@ -368,7 +368,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2)); tester.clock().advance(Duration.ofDays(2)); - tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only + tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addMemMeasurements(0.02f, 0.95f, 120, application1); tester.assertResources("Scaling down", 6, 1, 2.9, 4.0, 95.0, @@ -507,6 +507,16 @@ public class AutoscalingTest { @Test public void test_autoscaling_considers_growth_rate() { + NodeResources minResources = new NodeResources( 1, 100, 100, 1); + NodeResources midResources = new NodeResources( 5, 100, 100, 1); + NodeResources maxResources = new NodeResources(10, 100, 100, 1); + ClusterResources min = new ClusterResources(5, 1, minResources); + ClusterResources max = new ClusterResources(5, 1, maxResources); + AutoscalingTester tester = new AutoscalingTester(maxResources.withVcpu(maxResources.vcpu() * 2)); + + ApplicationId application1 = tester.applicationId("application1"); + ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); +/* NodeResources resources = new NodeResources(3, 100, 100, 1); ClusterResources min = new ClusterResources( 1, 1, resources); ClusterResources max = new ClusterResources(10, 1, resources); @@ -514,22 +524,22 @@ public class AutoscalingTest { ApplicationId application1 = tester.applicationId("application1"); ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1"); - - tester.deploy(application1, cluster1, 5, 1, resources); +*/ + tester.deploy(application1, cluster1, 5, 1, midResources); tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only tester.addCpuMeasurements(0.25f, 1f, 120, application1); // (no query rate data) tester.assertResources("Advice to scale up since we assume we need 2x cpu for growth when no data", - 7, 1, 3, 100, 100, + 5, 1, 6.3, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); tester.setScalingDuration(application1, cluster1.id(), Duration.ofMinutes(5)); tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> 10.0 + (t < 50 ? t : 100 - t)); - tester.assertResources("Advice to scale down since observed growth is much slower than scaling time", - 4, 1, 3, 100, 100, + tester.assertResources("Advice to scale down since observed growth is slower than scaling time", + 5, 1, 3.4, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); tester.clearQueryRateMeasurements(application1, cluster1.id()); @@ -538,8 +548,8 @@ public class AutoscalingTest { tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> 10.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49))); - tester.assertResources("Advice to scale up since observed growth is much faster than scaling time", - 10, 1, 3, 100, 100, + tester.assertResources("Advice to scale up since observed growth is faster than scaling time", + 5, 1, 6.7, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); } @@ -561,27 +571,27 @@ public class AutoscalingTest { // Why twice the query rate at time = 0? // This makes headroom for queries doubling, which we want to observe the effect of here - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t -> 10.0); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t -> 10.0); tester.assertResources("Query and write load is equal -> scale up somewhat", 5, 1, 7.3, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 100.0 : 50.0, t -> 10.0); - tester.assertResources("Query load is 5x write load -> scale up more", - 5, 1, 9.7, 100, 100, + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 80.0 : 40.0, t -> 10.0); + tester.assertResources("Query load is 4x write load -> scale up more", + 5, 1, 9.1, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t -> 100.0); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t -> 100.0); tester.assertResources("Write load is 10x query load -> scale down", - 5, 1, 3.8, 100, 100, + 5, 1, 3.7, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t-> 0.0); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t-> 0.0); tester.assertResources("Query only -> largest possible", 5, 1, 10.0, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); - tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> 0.0, t -> 10.0); + tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> 0.0, t -> 10.0); tester.assertResources("Write only -> smallest possible", 5, 1, 2.1, 100, 100, tester.autoscale(application1, cluster1.id(), min, max).target()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java index 6b7c1790bc4..6f60de62f1f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java @@ -28,6 +28,7 @@ public class ResourceTargetTest { @Test public void test_traffic_headroom() { + ManualClock clock = new ManualClock(); Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); Cluster cluster = cluster(new NodeResources(1, 10, 100, 1)); application = application.with(cluster); @@ -36,21 +37,25 @@ public class ResourceTargetTest { application = application.with(new Status(0.0, 1.0)); assertEquals(0.131, ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0), - application), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), + application, + clock), delta); // Almost no current traffic share: Ideal load is low but capped application = application.with(new Status(0.0001, 1.0)); assertEquals(0.131, ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0), - application), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), + application, + clock), delta); } @Test public void test_growth_headroom() { + ManualClock clock = new ManualClock(); + Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); Cluster cluster = cluster(new NodeResources(1, 10, 100, 1)); application = application.with(cluster); @@ -58,16 +63,18 @@ public class ResourceTargetTest { // No current traffic: Ideal load is low but capped assertEquals(0.275, ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0), - application), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), + application, + clock), delta); // Almost current traffic: Ideal load is low but capped application = application.with(new Status(0.0001, 1.0)); assertEquals(0.04, ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0), - application), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock), + application, + clock), delta); } @@ -86,9 +93,9 @@ public class ResourceTargetTest { private ClusterTimeseries timeseries(Cluster cluster, int measurements, IntFunction<Double> queryRate, - IntFunction<Double> writeRate) { + IntFunction<Double> writeRate, + ManualClock clock) { List<ClusterMetricSnapshot> snapshots = new ArrayList<>(measurements); - ManualClock clock = new ManualClock(); for (int i = 0; i < measurements; i++) { snapshots.add(new ClusterMetricSnapshot(clock.instant(), queryRate.apply(i), writeRate.apply(i))); clock.advance(Duration.ofMinutes(5)); |