summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2021-03-17 16:57:53 +0100
committer Jon Bratseth <bratseth@gmail.com> 2021-03-17 16:57:53 +0100
commit 7ba1f8a582181b5ace2eb775817b1142026430df (patch)
tree ad3d5e516f18010b793bd2e6c74aa7250b74f8bd /node-repository
parent 26380d3b13a9524dc7cf285aeb65aea1dad0fd3e (diff)
Average query rate over measurement window
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java 3
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java 17
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java 11
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java 8
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java 46
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java 27
6 files changed, 73 insertions, 39 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index c7549a5ddee..ac3430fecf9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -86,7 +86,8 @@ public class Autoscaler {
clusterTimeseries,
clusterNodesTimeseries,
currentAllocation,
- application);
+ application,
+ nodeRepository.clock());
Optional<AllocatableClusterResources> bestAllocation =
allocationOptimizer.findBestAllocation(target, currentAllocation, limits);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
index 3f5255c6618..75270f8afc6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
@@ -3,7 +3,9 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.ClusterSpec;
+import java.time.Clock;
import java.time.Duration;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@@ -84,12 +86,21 @@ public class ClusterTimeseries {
return maxGrowthRate / currentQueryRate();
}
- /** The current query rate as a fraction of the peak rate in this timeseries */
- public double currentQueryFractionOfMax() {
+ /**
+ * The current query rate, averaged over the same window we average utilization over,
+ * as a fraction of the peak rate in this timeseries
+ */
+ public double queryFractionOfMax(Duration window, Clock clock) {
if (snapshots.isEmpty()) return 0.5;
var max = snapshots.stream().mapToDouble(ClusterMetricSnapshot::queryRate).max().getAsDouble();
if (max == 0) return 1.0;
- return snapshots.get(snapshots.size() - 1).queryRate() / max;
+ Instant oldest = clock.instant().minus(window);
+ var average = snapshots.stream()
+ .filter(snapshot -> snapshot.at().isAfter(oldest))
+ .mapToDouble(snapshot -> snapshot.queryRate())
+ .average();
+ if (average.isEmpty()) return 0.5; // No measurements in the relevant time period
+ return average.getAsDouble() / max;
}
public double currentQueryRate() {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
index 35717b97cf4..d7dd0fc3197 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.vespa.hosted.provision.applications.Application;
+import java.time.Clock;
import java.time.Duration;
/**
@@ -53,9 +54,10 @@ public class ResourceTarget {
ClusterTimeseries clusterTimeseries,
ClusterNodesTimeseries clusterNodesTimeseries,
AllocatableClusterResources current,
- Application application) {
+ Application application,
+ Clock clock) {
return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current)
- / idealCpuLoad(scalingDuration, clusterTimeseries, application),
+ / idealCpuLoad(scalingDuration, clusterTimeseries, application, clock),
nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current)
/ Resource.memory.idealAverageLoad(),
nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current)
@@ -74,14 +76,15 @@ public class ResourceTarget {
/** Ideal cpu load must take the application traffic fraction into account */
public static double idealCpuLoad(Duration scalingDuration,
ClusterTimeseries clusterTimeseries,
- Application application) {
+ Application application,
+ Clock clock) {
double queryCpuFraction = queryCpuFraction(clusterTimeseries);
// What's needed to have headroom for growth during scale-up as a fraction of current resources?
double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(); // in fraction per minute of the current traffic
double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes();
// Cap headroom at 10% above the historical observed peak
- double fractionOfMax = clusterTimeseries.currentQueryFractionOfMax();
+ double fractionOfMax = clusterTimeseries.queryFractionOfMax(scalingDuration, clock);
if (fractionOfMax != 0)
growthRateHeadroom = Math.min(growthRateHeadroom, 1 / fractionOfMax + 0.1);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
index 4d1c963d8ea..90b898640cf 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
@@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.autoscale.Resource;
import com.yahoo.vespa.hosted.provision.autoscale.ResourceTarget;
import java.net.URI;
+import java.time.Clock;
import java.time.Duration;
import java.util.Collection;
import java.util.List;
@@ -71,12 +72,12 @@ public class ApplicationSerializer {
if (cluster.shouldSuggestResources(currentResources))
cluster.suggestedResources().ifPresent(suggested -> toSlime(suggested.resources(), clusterObject.setObject("suggested")));
cluster.targetResources().ifPresent(target -> toSlime(target, clusterObject.setObject("target")));
- clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, clusterObject.setObject("utilization"));
+ clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, metricsDb.clock(), clusterObject.setObject("utilization"));
scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray("scalingEvents"));
clusterObject.setString("autoscalingStatus", cluster.autoscalingStatus());
clusterObject.setLong("scalingDuration", scalingDuration.toMillis());
clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate());
- clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.currentQueryFractionOfMax());
+ clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.queryFractionOfMax(scalingDuration, metricsDb.clock()));
}
private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) {
@@ -89,9 +90,10 @@ public class ApplicationSerializer {
Duration scalingDuration,
ClusterTimeseries clusterTimeseries,
ClusterNodesTimeseries clusterNodesTimeseries,
+ Clock clock,
Cursor utilizationObject) {
utilizationObject.setDouble("cpu", clusterNodesTimeseries.averageLoad(Resource.cpu));
- utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application));
+ utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application, clock));
utilizationObject.setDouble("memory", clusterNodesTimeseries.averageLoad(Resource.memory));
utilizationObject.setDouble("idealMemory", ResourceTarget.idealMemoryLoad());
utilizationObject.setDouble("disk", clusterNodesTimeseries.averageLoad(Resource.disk));
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 650bfe761b5..daf8b9b4f9f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -53,7 +53,7 @@ public class AutoscalingTest {
assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
tester.clock().advance(Duration.ofDays(1));
- tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
+ tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.addCpuMeasurements(0.25f, 1f, 120, application1);
ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high",
14, 1, 1.4, 30.8, 30.8,
@@ -123,7 +123,7 @@ public class AutoscalingTest {
.allMatch(n -> n.allocation().get().requestedResources().diskSpeed() == NodeResources.DiskSpeed.slow);
tester.clock().advance(Duration.ofDays(2));
- tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
+ tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.addCpuMeasurements(0.25f, 1f, 120, application1);
// Changing min and max from slow to any
ClusterResources min = new ClusterResources( 2, 1,
@@ -368,7 +368,7 @@ public class AutoscalingTest {
// deploy
tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2));
tester.clock().advance(Duration.ofDays(2));
- tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
+ tester.addQueryRateMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.addMemMeasurements(0.02f, 0.95f, 120, application1);
tester.assertResources("Scaling down",
6, 1, 2.9, 4.0, 95.0,
@@ -507,6 +507,16 @@ public class AutoscalingTest {
@Test
public void test_autoscaling_considers_growth_rate() {
+ NodeResources minResources = new NodeResources( 1, 100, 100, 1);
+ NodeResources midResources = new NodeResources( 5, 100, 100, 1);
+ NodeResources maxResources = new NodeResources(10, 100, 100, 1);
+ ClusterResources min = new ClusterResources(5, 1, minResources);
+ ClusterResources max = new ClusterResources(5, 1, maxResources);
+ AutoscalingTester tester = new AutoscalingTester(maxResources.withVcpu(maxResources.vcpu() * 2));
+
+ ApplicationId application1 = tester.applicationId("application1");
+ ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1");
+/*
NodeResources resources = new NodeResources(3, 100, 100, 1);
ClusterResources min = new ClusterResources( 1, 1, resources);
ClusterResources max = new ClusterResources(10, 1, resources);
@@ -514,22 +524,22 @@ public class AutoscalingTest {
ApplicationId application1 = tester.applicationId("application1");
ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1");
-
- tester.deploy(application1, cluster1, 5, 1, resources);
+*/
+ tester.deploy(application1, cluster1, 5, 1, midResources);
tester.addQueryRateMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0); // Query traffic only
tester.addCpuMeasurements(0.25f, 1f, 120, application1);
// (no query rate data)
tester.assertResources("Advice to scale up since we assume we need 2x cpu for growth when no data",
- 7, 1, 3, 100, 100,
+ 5, 1, 6.3, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
tester.setScalingDuration(application1, cluster1.id(), Duration.ofMinutes(5));
tester.addQueryRateMeasurements(application1, cluster1.id(),
100,
t -> 10.0 + (t < 50 ? t : 100 - t));
- tester.assertResources("Advice to scale down since observed growth is much slower than scaling time",
- 4, 1, 3, 100, 100,
+ tester.assertResources("Advice to scale down since observed growth is slower than scaling time",
+ 5, 1, 3.4, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
tester.clearQueryRateMeasurements(application1, cluster1.id());
@@ -538,8 +548,8 @@ public class AutoscalingTest {
tester.addQueryRateMeasurements(application1, cluster1.id(),
100,
t -> 10.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49)));
- tester.assertResources("Advice to scale up since observed growth is much faster than scaling time",
- 10, 1, 3, 100, 100,
+ tester.assertResources("Advice to scale up since observed growth is faster than scaling time",
+ 5, 1, 6.7, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
}
@@ -561,27 +571,27 @@ public class AutoscalingTest {
// Why twice the query rate at time = 0?
// This makes headroom for queries doubling, which we want to observe the effect of here
- tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t -> 10.0);
+ tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t -> 10.0);
tester.assertResources("Query and write load is equal -> scale up somewhat",
5, 1, 7.3, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
- tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 100.0 : 50.0, t -> 10.0);
- tester.assertResources("Query load is 5x write load -> scale up more",
- 5, 1, 9.7, 100, 100,
+ tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 80.0 : 40.0, t -> 10.0);
+ tester.assertResources("Query load is 4x write load -> scale up more",
+ 5, 1, 9.1, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
- tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t -> 100.0);
+ tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t -> 100.0);
tester.assertResources("Write load is 10x query load -> scale down",
- 5, 1, 3.8, 100, 100,
+ 5, 1, 3.7, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
- tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> t == 0 ? 20.0 : 10.0, t-> 0.0);
+ tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> t == 0 ? 20.0 : 10.0, t-> 0.0);
tester.assertResources("Query only -> largest possible",
5, 1, 10.0, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
- tester.addLoadMeasurements(application1, cluster1.id(), 10, t -> 0.0, t -> 10.0);
+ tester.addLoadMeasurements(application1, cluster1.id(), 100, t -> 0.0, t -> 10.0);
tester.assertResources("Write only -> smallest possible",
5, 1, 2.1, 100, 100,
tester.autoscale(application1, cluster1.id(), min, max).target());
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java
index 6b7c1790bc4..6f60de62f1f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java
@@ -28,6 +28,7 @@ public class ResourceTargetTest {
@Test
public void test_traffic_headroom() {
+ ManualClock clock = new ManualClock();
Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
Cluster cluster = cluster(new NodeResources(1, 10, 100, 1));
application = application.with(cluster);
@@ -36,21 +37,25 @@ public class ResourceTargetTest {
application = application.with(new Status(0.0, 1.0));
assertEquals(0.131,
ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0),
- application),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
+ application,
+ clock),
delta);
// Almost no current traffic share: Ideal load is low but capped
application = application.with(new Status(0.0001, 1.0));
assertEquals(0.131,
ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0),
- application),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
+ application,
+ clock),
delta);
}
@Test
public void test_growth_headroom() {
+ ManualClock clock = new ManualClock();
+
Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
Cluster cluster = cluster(new NodeResources(1, 10, 100, 1));
application = application.with(cluster);
@@ -58,16 +63,18 @@ public class ResourceTargetTest {
// No current traffic: Ideal load is low but capped
assertEquals(0.275,
ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0),
- application),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
+ application,
+ clock),
delta);
// Almost current traffic: Ideal load is low but capped
application = application.with(new Status(0.0001, 1.0));
assertEquals(0.04,
ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0),
- application),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock),
+ application,
+ clock),
delta);
}
@@ -86,9 +93,9 @@ public class ResourceTargetTest {
private ClusterTimeseries timeseries(Cluster cluster,
int measurements,
IntFunction<Double> queryRate,
- IntFunction<Double> writeRate) {
+ IntFunction<Double> writeRate,
+ ManualClock clock) {
List<ClusterMetricSnapshot> snapshots = new ArrayList<>(measurements);
- ManualClock clock = new ManualClock();
for (int i = 0; i < measurements; i++) {
snapshots.add(new ClusterMetricSnapshot(clock.instant(), queryRate.apply(i), writeRate.apply(i)));
clock.advance(Duration.ofMinutes(5));