aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2020-11-20 18:55:59 +0100
committer: Jon Bratseth <bratseth@gmail.com> 2020-11-20 18:55:59 +0100
commit: 238d0f56d0f232d8b5f12324f57938b159e27afa (patch)
tree: d71b9e208f94ba7111427cb5796d18e5f1afbe55
parent: ce9ccaf1a95050f1df8b0c9be3c1daab7dc416fe (diff)
Wait a while before scaling down
-rw-r--r-- config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java | 10
-rw-r--r-- container-search/src/main/java/com/yahoo/search/Query.java | 14
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java | 5
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java | 39
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java | 4
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java | 6
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java | 35
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java | 7
8 files changed, 96 insertions, 24 deletions
diff --git a/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java b/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java
index f1c86485a64..8f4c9f81d7f 100644
--- a/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java
+++ b/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java
@@ -1,6 +1,8 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.config.provision;
+import com.yahoo.config.Node;
+
import java.util.List;
import java.util.Objects;
@@ -53,6 +55,14 @@ public class ClusterResources {
return true;
}
+ /**
+ * Returns the total resources of this, that is the number of nodes times the node resources.
+ * Vcpu, memory, disk and bandwidth are each multiplied by the node count.
+ * NOTE(review): any other properties presumably carry over from nodeResources unchanged -
+ * confirm against the NodeResources with-methods.
+ *
+ * @return a NodeResources holding the cluster-wide sums of the four numeric dimensions
+ */
+ public NodeResources totalResources() {
+ return nodeResources.withVcpu(nodeResources.vcpu() * nodes)
+ .withMemoryGb(nodeResources.memoryGb() * nodes)
+ .withDiskGb(nodeResources.diskGb() * nodes)
+ .withBandwidthGbps(nodeResources.bandwidthGbps() * nodes);
+ }
+
@Override
public boolean equals(Object o) {
if (o == this) return true;
diff --git a/container-search/src/main/java/com/yahoo/search/Query.java b/container-search/src/main/java/com/yahoo/search/Query.java
index 4995927f7a2..ce31b9a3ba3 100644
--- a/container-search/src/main/java/com/yahoo/search/Query.java
+++ b/container-search/src/main/java/com/yahoo/search/Query.java
@@ -181,7 +181,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable {
//---------------- Tracing ----------------------------------------------------
- private static Logger log = Logger.getLogger(Query.class.getName());
+ private static final Logger log = Logger.getLogger(Query.class.getName());
/** The time this query was created */
private long startTime;
@@ -200,7 +200,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable {
public static final CompoundName TIMEOUT = new CompoundName("timeout");
- private static QueryProfileType argumentType;
+ private static final QueryProfileType argumentType;
static {
argumentType = new QueryProfileType("native");
argumentType.setBuiltin(true);
@@ -226,7 +226,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable {
public static QueryProfileType getArgumentType() { return argumentType; }
/** The aliases of query properties */
- private static Map<String, CompoundName> propertyAliases;
+ private static final Map<String, CompoundName> propertyAliases;
static {
Map<String,CompoundName> propertyAliasesBuilder = new HashMap<>();
addAliases(Query.getArgumentType(), propertyAliasesBuilder);
@@ -316,7 +316,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable {
* Creates a query from a request
*
* @param request the HTTP request from which this is created
- * @param queryProfile the query profile to use for this query, or null if none.
+ * @param queryProfile the query profile to use for this query, or null if none
*/
public Query(HttpRequest request, CompiledQueryProfile queryProfile) {
this(request, request.propertyMap(), queryProfile);
@@ -325,9 +325,9 @@ public class Query extends com.yahoo.processing.Request implements Cloneable {
/**
* Creates a query from a request
*
- * @param request the HTTP request from which this is created.
- * @param requestMap the property map of the query.
- * @param queryProfile the query profile to use for this query, or null if none.
+ * @param request the HTTP request from which this is created
+ * @param requestMap the property map of the query
+ * @param queryProfile the query profile to use for this query, or null if none
*/
public Query(HttpRequest request, Map<String, String> requestMap, CompiledQueryProfile queryProfile) {
super(new QueryPropertyAliases(propertyAliases));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index a17ee081447..a3fb2f3c327 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -73,6 +73,11 @@ public class Cluster {
/** Returns the recent scaling events in this cluster */
public List<ScalingEvent> scalingEvents() { return scalingEvents; }
+ /**
+ * Returns the last entry of the scaling events list, or empty if there are no scaling events.
+ * NOTE(review): this is the most recent event only if the list is kept in chronological
+ * order - confirm where events are appended.
+ */
+ public Optional<ScalingEvent> lastScalingEvent() {
+ if (scalingEvents.isEmpty()) return Optional.empty();
+ return Optional.of(scalingEvents.get(scalingEvents.size() - 1));
+ }
+
public Cluster withConfiguration(boolean exclusive, ClusterResources min, ClusterResources max) {
return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 1a8c4c8a6c2..b1f711e1e34 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -3,11 +3,13 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import java.time.Duration;
+import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.logging.Logger;
@@ -60,10 +62,10 @@ public class Autoscaler {
}
private Advice autoscale(Cluster cluster, List<Node> clusterNodes, Limits limits, boolean exclusive) {
- log.fine(() -> "Autoscale " + cluster.toString());
+ log.fine(() -> "Autoscaling " + cluster);
if (unstable(clusterNodes, nodeRepository)) {
- log.fine(() -> "Unstable - Advice.none " + cluster.toString());
+ log.fine(() -> "Unstable: Advice.none for " + cluster);
return Advice.none();
}
@@ -81,13 +83,18 @@ public class Autoscaler {
Optional<AllocatableClusterResources> bestAllocation =
allocationOptimizer.findBestAllocation(target, currentAllocation, limits, exclusive);
if (bestAllocation.isEmpty()) {
- log.fine(() -> "bestAllocation.isEmpty: Advice.dontScale for " + cluster.toString());
+ log.fine(() -> "bestAllocation.isEmpty: Advice.dontScale for " + cluster);
return Advice.dontScale();
}
if (similar(bestAllocation.get(), currentAllocation)) {
- log.fine(() -> "Current allocation similar: Advice.dontScale for " + cluster.toString());
+ log.fine(() -> "Current allocation similar: Advice.dontScale for " + cluster);
return Advice.dontScale();
}
+ if (isDownscaling(bestAllocation.get(), currentAllocation) && recentlyScaled(cluster, clusterNodes)) {
+ log.fine(() -> "Too soon to downscale: Advice.dontScale for " + cluster);
+ return Advice.dontScale();
+ }
+
return Advice.scaleTo(bestAllocation.get().toAdvertisedClusterResources());
}
@@ -106,10 +113,23 @@ public class Autoscaler {
return Math.abs(r1 - r2) / (( r1 + r2) / 2) < threshold;
}
+ /**
+ * Returns true if this reduces total resources in any dimension.
+ * The comparison is made on cluster totals (see ClusterResources.totalResources), so fewer
+ * but larger nodes are not counted as downscaling as long as the totals still satisfy the current ones.
+ *
+ * @param target the allocation we consider moving to
+ * @param current the allocation the cluster has now
+ * @return true if the target totals do not satisfy the current totals
+ */
+ private boolean isDownscaling(AllocatableClusterResources target, AllocatableClusterResources current) {
+ NodeResources targetTotal = target.toAdvertisedClusterResources().totalResources();
+ NodeResources currentTotal = current.toAdvertisedClusterResources().totalResources();
+ // NOTE(review): justNumbers presumably drops the non-numeric properties before comparing - confirm
+ return ! targetTotal.justNumbers().satisfies(currentTotal.justNumbers());
+ }
+
+ /**
+ * Returns true if the last scaling event of this cluster happened after
+ * now minus the downscaling delay for the cluster's type.
+ * A cluster with no scaling events falls back to Instant.MIN and so yields false.
+ *
+ * @param cluster the cluster whose last scaling event is inspected
+ * @param clusterNodes the nodes of the cluster; the cluster type is read from the first one,
+ * so this assumes the list is non-empty and that node is allocated
+ */
+ private boolean recentlyScaled(Cluster cluster, List<Node> clusterNodes) {
+ Duration downscalingDelay = downscalingDelay(clusterNodes.get(0).allocation().get().membership().cluster().type());
+ return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN)
+ .isAfter(nodeRepository.clock().instant().minus(downscalingDelay));
+ }
+
/** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */
static Duration scalingWindow(ClusterSpec.Type clusterType) {
if (clusterType.isContent()) return Duration.ofHours(12);
- return Duration.ofHours(1);
+ // Non-content clusters use a shorter window than content clusters
+ return Duration.ofMinutes(30);
}
static Duration maxScalingWindow() {
@@ -122,6 +142,15 @@ public class Autoscaler {
return 20;
}
+ /**
+ * We should wait a while before scaling down after a scaling event as a peak in usage
+ * indicates more peaks may arrive in the near future.
+ *
+ * @param clusterType the type of the cluster being considered
+ * @return 12 hours for content clusters, 1 hour for all other cluster types
+ */
+ static Duration downscalingDelay(ClusterSpec.Type clusterType) {
+ if (clusterType.isContent()) return Duration.ofHours(12);
+ return Duration.ofHours(1);
+ }
+
public static boolean unstable(List<Node> nodes, NodeRepository nodeRepository) {
// The cluster is processing recent changes
if (nodes.stream().anyMatch(node -> node.status().wantToRetire() ||
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
index bb91b77dce5..f4cfc5799dc 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
@@ -78,12 +78,14 @@ public class ClusterTimeseries {
// but don't require that we have at least that many from every node
int measurementCount = currentMeasurements.stream().mapToInt(m -> m.size()).sum();
if (measurementCount / clusterNodes.size() < Autoscaler.minimumMeasurementsPerNode(clusterType)) {
- log.fine(() -> "Too few measurements per node for " + cluster.toString() + ": measurementCount " + measurementCount +
- " (" + nodeTimeseries.stream().mapToInt(m -> m.size()).sum() + " before filtering");
+ // Include the count before filtering so the log shows how many measurements were filtered out
+ log.fine(() -> "Too few measurements per node for " + cluster + ": measurementCount " + measurementCount +
+ " (" + nodeTimeseries.stream().mapToInt(m -> m.size()).sum() + " before filtering)");
return Optional.empty();
}
if (currentMeasurements.size() != clusterNodes.size()) {
- log.fine(() -> "Mssing measurements from some nodes for " + cluster.toString() + ": Has from " + currentMeasurements.size() +
- "but need " + clusterNodes.size() + "(before filtering: " + nodeTimeseries.size() + ")");
+ // The number of measurement series must equal the number of cluster nodes
+ log.fine(() -> "Missing measurements from some nodes for " + cluster + ": Has from " + currentMeasurements.size() +
+ " but need " + clusterNodes.size() + " (before filtering: " + nodeTimeseries.size() + ")");
return Optional.empty();
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index b53f56e4743..4c2c0caa0b4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -100,11 +100,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
}
+ /** Returns the string form of the given cluster resources followed by its total resources in parentheses */
static String toString(ClusterResources r) {
- return String.format(Locale.US, "%d%s * [vcpu: %.1f, memory: %.1f Gb, disk %.1f Gb]" +
- " (total: [vcpu: %.1f, memory: %.1f Gb, disk: %.1f Gb])",
- r.nodes(), r.groups() > 1 ? " (in " + r.groups() + " groups)" : "",
- r.nodeResources().vcpu(), r.nodeResources().memoryGb(), r.nodeResources().diskGb(),
- r.nodes() * r.nodeResources().vcpu(), r.nodes() * r.nodeResources().memoryGb(), r.nodes() * r.nodeResources().diskGb());
+ return r + " (total: " + r.totalResources() + ")";
}
private Map<ClusterSpec.Id, List<Node>> nodesByCluster(List<Node> applicationNodes) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 5813a7067cd..5393aa7cfb8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.Nodelike;
import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;
import org.junit.Test;
+import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
@@ -44,11 +45,13 @@ public class AutoscalingTest {
// deploy
tester.deploy(application1, cluster1, 5, 1, hostResources);
+ tester.clock().advance(Duration.ofDays(1));
assertTrue("No measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
tester.addCpuMeasurements(0.25f, 1f, 59, application1);
assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
+ tester.clock().advance(Duration.ofDays(1));
tester.addCpuMeasurements(0.25f, 1f, 60, application1);
ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high",
15, 1, 1.3, 28.6, 28.6,
@@ -58,6 +61,8 @@ public class AutoscalingTest {
assertTrue("Cluster in flux -> No further change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
tester.deactivateRetired(application1, cluster1, scaledResources);
+
+ tester.clock().advance(Duration.ofDays(1));
tester.addCpuMeasurements(0.8f, 1f, 3, application1);
assertTrue("Load change is large, but insufficient measurements for new config -> No change",
tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
@@ -112,6 +117,7 @@ public class AutoscalingTest {
tester.nodeRepository().getNodes(application1).stream()
.allMatch(n -> n.allocation().get().requestedResources().diskSpeed() == NodeResources.DiskSpeed.slow);
+ tester.clock().advance(Duration.ofDays(1));
tester.addCpuMeasurements(0.25f, 1f, 120, application1);
// Changing min and max from slow to any
ClusterResources min = new ClusterResources( 2, 1,
@@ -184,7 +190,7 @@ public class AutoscalingTest {
}
@Test
- public void test_autoscaling_limits_when_min_equals_xax() {
+ public void test_autoscaling_limits_when_min_equals_max() {
NodeResources resources = new NodeResources(3, 100, 100, 1);
ClusterResources min = new ClusterResources( 2, 1, new NodeResources(1, 1, 1, 1));
ClusterResources max = min;
@@ -195,6 +201,7 @@ public class AutoscalingTest {
// deploy
tester.deploy(application1, cluster1, 5, 1, resources);
+ tester.clock().advance(Duration.ofDays(1));
tester.addCpuMeasurements(0.25f, 1f, 120, application1);
assertTrue(tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
}
@@ -283,6 +290,31 @@ public class AutoscalingTest {
// deploy
tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2));
+ tester.clock().advance(Duration.ofDays(1));
+ tester.addMemMeasurements(0.02f, 0.95f, 120, application1);
+ tester.assertResources("Scaling down",
+ 6, 1, 2.8, 4.0, 95.0,
+ tester.autoscale(application1, cluster1.id(), min, max).target());
+ }
+
+ @Test
+ public void scaling_down_only_after_delay() {
+ NodeResources hostResources = new NodeResources(6, 100, 100, 1);
+ ClusterResources min = new ClusterResources( 2, 1, new NodeResources(1, 1, 1, 1));
+ ClusterResources max = new ClusterResources(20, 1, new NodeResources(100, 1000, 1000, 1));
+ AutoscalingTester tester = new AutoscalingTester(hostResources);
+
+ ApplicationId application1 = tester.applicationId("application1");
+ ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1");
+
+ tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2));
+
+ // No autoscaling as it is too soon to scale down after initial deploy (counting as a scaling event)
+ tester.addMemMeasurements(0.02f, 0.95f, 120, application1);
+ assertTrue(tester.autoscale(application1, cluster1.id(), min, max).target().isEmpty());
+
+ // Trying the same a day later causes autoscaling
+ tester.clock().advance(Duration.ofDays(1));
tester.addMemMeasurements(0.02f, 0.95f, 120, application1);
tester.assertResources("Scaling down",
6, 1, 2.8, 4.0, 95.0,
@@ -344,6 +376,7 @@ public class AutoscalingTest {
// deploy (Why 103 Gb memory? See AutoscalingTester.MockHostResourcesCalculator
tester.deploy(application1, cluster1, 5, 1, new NodeResources(3, 103, 100, 1));
+ tester.clock().advance(Duration.ofDays(1));
tester.addMemMeasurements(0.9f, 0.6f, 120, application1);
ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high.",
8, 1, 3, 83, 34.3,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
index 5e318e00288..4b14174488e 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
@@ -110,9 +110,8 @@ public class AutoscalingMaintainerTest {
assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
// Add measurement of the expected generation, leading to rescaling
- tester.clock().advance(Duration.ofSeconds(1));
+ tester.clock().advance(Duration.ofHours(2));
tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 500, app1);
- //tester.clock().advance(Duration.ofSeconds(1));
Instant lastMaintenanceTime = tester.clock().instant();
tester.maintainer().maintain();
assertEquals(lastMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
@@ -122,10 +121,10 @@ public class AutoscalingMaintainerTest {
@Test
public void test_toString() {
+ // Covers a single-group and a two-group cluster; the totals are 4 nodes times the per-node values
- assertEquals("4 * [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb] (total: [vcpu: 4.0, memory: 8.0 Gb, disk: 16.0 Gb])",
+ assertEquals("4 nodes with [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb, bandwidth: 1.0 Gbps] (total: [vcpu: 4.0, memory: 8.0 Gb, disk 16.0 Gb, bandwidth: 4.0 Gbps])",
AutoscalingMaintainer.toString(new ClusterResources(4, 1, new NodeResources(1, 2, 4, 1))));
- assertEquals("4 (in 2 groups) * [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb] (total: [vcpu: 4.0, memory: 8.0 Gb, disk: 16.0 Gb])",
+ assertEquals("4 nodes (in 2 groups) with [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb, bandwidth: 1.0 Gbps] (total: [vcpu: 4.0, memory: 8.0 Gb, disk 16.0 Gb, bandwidth: 4.0 Gbps])",
AutoscalingMaintainer.toString(new ClusterResources(4, 2, new NodeResources(1, 2, 4, 1))));
}