diff options
8 files changed, 96 insertions, 24 deletions
diff --git a/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java b/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java index f1c86485a64..8f4c9f81d7f 100644 --- a/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java +++ b/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterResources.java @@ -1,6 +1,8 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.config.provision; +import com.yahoo.config.Node; + import java.util.List; import java.util.Objects; @@ -53,6 +55,14 @@ public class ClusterResources { return true; } + /** Returns the total resources of this, that is the number of nodes times the node resources */ + public NodeResources totalResources() { + return nodeResources.withVcpu(nodeResources.vcpu() * nodes) + .withMemoryGb(nodeResources.memoryGb() * nodes) + .withDiskGb(nodeResources.diskGb() * nodes) + .withBandwidthGbps(nodeResources.bandwidthGbps() * nodes); + } + @Override public boolean equals(Object o) { if (o == this) return true; diff --git a/container-search/src/main/java/com/yahoo/search/Query.java b/container-search/src/main/java/com/yahoo/search/Query.java index 4995927f7a2..ce31b9a3ba3 100644 --- a/container-search/src/main/java/com/yahoo/search/Query.java +++ b/container-search/src/main/java/com/yahoo/search/Query.java @@ -181,7 +181,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { //---------------- Tracing ---------------------------------------------------- - private static Logger log = Logger.getLogger(Query.class.getName()); + private static final Logger log = Logger.getLogger(Query.class.getName()); /** The time this query was created */ private long startTime; @@ -200,7 +200,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { public static final CompoundName TIMEOUT = new CompoundName("timeout"); - private static QueryProfileType argumentType; + private static final QueryProfileType argumentType; static { argumentType = new QueryProfileType("native"); argumentType.setBuiltin(true); @@ -226,7 +226,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { public static QueryProfileType getArgumentType() { return argumentType; } /** The aliases of query properties */ - private static Map<String, CompoundName> propertyAliases; + private static final Map<String, CompoundName> propertyAliases; static { Map<String,CompoundName> propertyAliasesBuilder = new HashMap<>(); addAliases(Query.getArgumentType(), propertyAliasesBuilder); @@ -316,7 +316,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { * Creates a query from a request * * @param request the HTTP request from which this is created - * @param queryProfile the query profile to use for this query, or null if none. + * @param queryProfile the query profile to use for this query, or null if none */ public Query(HttpRequest request, CompiledQueryProfile queryProfile) { this(request, request.propertyMap(), queryProfile); @@ -325,9 +325,9 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { /** * Creates a query from a request * - * @param request the HTTP request from which this is created. - * @param requestMap the property map of the query. - * @param queryProfile the query profile to use for this query, or null if none. + * @param request the HTTP request from which this is created + * @param requestMap the property map of the query + * @param queryProfile the query profile to use for this query, or null if none */ public Query(HttpRequest request, Map<String, String> requestMap, CompiledQueryProfile queryProfile) { super(new QueryPropertyAliases(propertyAliases)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java index a17ee081447..a3fb2f3c327 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java @@ -73,6 +73,11 @@ public class Cluster { /** Returns the recent scaling events in this cluster */ public List<ScalingEvent> scalingEvents() { return scalingEvents; } + public Optional<ScalingEvent> lastScalingEvent() { + if (scalingEvents.isEmpty()) return Optional.empty(); + return Optional.of(scalingEvents.get(scalingEvents.size() - 1)); + } + public Cluster withConfiguration(boolean exclusive, ClusterResources min, ClusterResources max) { return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 1a8c4c8a6c2..b1f711e1e34 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -3,11 +3,13 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; import java.time.Duration; +import java.time.Instant; import java.util.List; import java.util.Optional; import java.util.logging.Logger; @@ -60,10 +62,10 @@ public class Autoscaler { } private Advice autoscale(Cluster cluster, List<Node> clusterNodes, Limits limits, boolean exclusive) { - log.fine(() -> "Autoscale " + cluster.toString()); + log.fine(() -> "Autoscaling " + cluster); if (unstable(clusterNodes, nodeRepository)) { - log.fine(() -> "Unstable - Advice.none " + cluster.toString()); + log.fine(() -> "Unstable: Advice.none for " + cluster); return Advice.none(); } @@ -81,13 +83,18 @@ public class Autoscaler { Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits, exclusive); if (bestAllocation.isEmpty()) { - log.fine(() -> "bestAllocation.isEmpty: Advice.dontScale for " + cluster.toString()); + log.fine(() -> "bestAllocation.isEmpty: Advice.dontScale for " + cluster); return Advice.dontScale(); } if (similar(bestAllocation.get(), currentAllocation)) { - log.fine(() -> "Current allocation similar: Advice.dontScale for " + cluster.toString()); + log.fine(() -> "Current allocation similar: Advice.dontScale for " + cluster); return Advice.dontScale(); } + if (isDownscaling(bestAllocation.get(), currentAllocation) && recentlyScaled(cluster, clusterNodes)) { + log.fine(() -> "Too soon to downscale: Advice.dontScale for " + cluster); + return Advice.dontScale(); + } + return Advice.scaleTo(bestAllocation.get().toAdvertisedClusterResources()); } @@ -106,10 +113,23 @@ public class Autoscaler { return Math.abs(r1 - r2) / (( r1 + r2) / 2) < threshold; } + /** Returns true if this reduces total resources in any dimension */ + private boolean isDownscaling(AllocatableClusterResources target, AllocatableClusterResources current) { + NodeResources targetTotal = target.toAdvertisedClusterResources().totalResources(); + NodeResources currentTotal = current.toAdvertisedClusterResources().totalResources(); + return ! targetTotal.justNumbers().satisfies(currentTotal.justNumbers()); + } + + private boolean recentlyScaled(Cluster cluster, List<Node> clusterNodes) { + Duration downscalingDelay = downscalingDelay(clusterNodes.get(0).allocation().get().membership().cluster().type()); + return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN) + .isAfter(nodeRepository.clock().instant().minus(downscalingDelay)); + } + /** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */ static Duration scalingWindow(ClusterSpec.Type clusterType) { if (clusterType.isContent()) return Duration.ofHours(12); - return Duration.ofHours(1); + return Duration.ofMinutes(30); } static Duration maxScalingWindow() { @@ -122,6 +142,15 @@ public class Autoscaler { return 20; } + /** + * We should wait a while before scaling down after a scaling event as a peak in usage + * indicates more peaks may arrive in the near future. + */ + static Duration downscalingDelay(ClusterSpec.Type clusterType) { + if (clusterType.isContent()) return Duration.ofHours(12); + return Duration.ofHours(1); + } + public static boolean unstable(List<Node> nodes, NodeRepository nodeRepository) { // The cluster is processing recent changes if (nodes.stream().anyMatch(node -> node.status().wantToRetire() || diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index bb91b77dce5..f4cfc5799dc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -78,12 +78,12 @@ public class ClusterTimeseries { // but don't require that we have at least that many from every node int measurementCount = currentMeasurements.stream().mapToInt(m -> m.size()).sum(); if (measurementCount / clusterNodes.size() < Autoscaler.minimumMeasurementsPerNode(clusterType)) { - log.fine(() -> "Too few measurements per node for " + cluster.toString() + ": measurementCount " + measurementCount + + log.fine(() -> "Too few measurements per node for " + cluster + ": measurementCount " + measurementCount + " (" + nodeTimeseries.stream().mapToInt(m -> m.size()).sum() + " before filtering"); return Optional.empty(); } if (currentMeasurements.size() != clusterNodes.size()) { - log.fine(() -> "Mssing measurements from some nodes for " + cluster.toString() + ": Has from " + currentMeasurements.size() + + log.fine(() -> "Missing measurements from some nodes for " + cluster + ": Has from " + currentMeasurements.size() + "but need " + clusterNodes.size() + "(before filtering: " + nodeTimeseries.size() + ")"); return Optional.empty(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index b53f56e4743..4c2c0caa0b4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -100,11 +100,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { } static String toString(ClusterResources r) { - return String.format(Locale.US, "%d%s * [vcpu: %.1f, memory: %.1f Gb, disk %.1f Gb]" + - " (total: [vcpu: %.1f, memory: %.1f Gb, disk: %.1f Gb])", - r.nodes(), r.groups() > 1 ? " (in " + r.groups() + " groups)" : "", - r.nodeResources().vcpu(), r.nodeResources().memoryGb(), r.nodeResources().diskGb(), - r.nodes() * r.nodeResources().vcpu(), r.nodes() * r.nodeResources().memoryGb(), r.nodes() * r.nodeResources().diskGb()); + return r + " (total: " + r.totalResources() + ")"; } private Map<ClusterSpec.Id, List<Node>> nodesByCluster(List<Node> applicationNodes) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 5813a7067cd..5393aa7cfb8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.Nodelike; import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; import org.junit.Test; +import java.time.Duration; import java.util.ArrayList; import java.util.List; import java.util.Optional; @@ -44,11 +45,13 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, hostResources); + tester.clock().advance(Duration.ofDays(1)); assertTrue("No measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); tester.addCpuMeasurements(0.25f, 1f, 59, application1); assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); + tester.clock().advance(Duration.ofDays(1)); tester.addCpuMeasurements(0.25f, 1f, 60, application1); ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high", 15, 1, 1.3, 28.6, 28.6, @@ -58,6 +61,8 @@ public class AutoscalingTest { assertTrue("Cluster in flux -> No further change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); tester.deactivateRetired(application1, cluster1, scaledResources); + + tester.clock().advance(Duration.ofDays(1)); tester.addCpuMeasurements(0.8f, 1f, 3, application1); assertTrue("Load change is large, but insufficient measurements for new config -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); @@ -112,6 +117,7 @@ public class AutoscalingTest { tester.nodeRepository().getNodes(application1).stream() .allMatch(n -> n.allocation().get().requestedResources().diskSpeed() == NodeResources.DiskSpeed.slow); + tester.clock().advance(Duration.ofDays(1)); tester.addCpuMeasurements(0.25f, 1f, 120, application1); // Changing min and max from slow to any ClusterResources min = new ClusterResources( 2, 1, @@ -184,7 +190,7 @@ public class AutoscalingTest { } @Test - public void test_autoscaling_limits_when_min_equals_xax() { + public void test_autoscaling_limits_when_min_equals_max() { NodeResources resources = new NodeResources(3, 100, 100, 1); ClusterResources min = new ClusterResources( 2, 1, new NodeResources(1, 1, 1, 1)); ClusterResources max = min; @@ -195,6 +201,7 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 5, 1, resources); + tester.clock().advance(Duration.ofDays(1)); tester.addCpuMeasurements(0.25f, 1f, 120, application1); assertTrue(tester.autoscale(application1, cluster1.id(), min, max).isEmpty()); } @@ -283,6 +290,31 @@ public class AutoscalingTest { // deploy tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2)); + tester.clock().advance(Duration.ofDays(1)); + tester.addMemMeasurements(0.02f, 0.95f, 120, application1); + tester.assertResources("Scaling down", + 6, 1, 2.8, 4.0, 95.0, + tester.autoscale(application1, cluster1.id(), min, max).target()); + } + + @Test + public void scaling_down_only_after_delay() { + NodeResources hostResources = new NodeResources(6, 100, 100, 1); + ClusterResources min = new ClusterResources( 2, 1, new NodeResources(1, 1, 1, 1)); + ClusterResources max = new ClusterResources(20, 1, new NodeResources(100, 1000, 1000, 1)); + AutoscalingTester tester = new AutoscalingTester(hostResources); + + ApplicationId application1 = tester.applicationId("application1"); + ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.content, "cluster1"); + + tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2)); + + // No autoscaling as it is too soon to scale down after initial deploy (counting as a scaling event) + tester.addMemMeasurements(0.02f, 0.95f, 120, application1); + assertTrue(tester.autoscale(application1, cluster1.id(), min, max).target().isEmpty()); + + // Trying the same a day later causes autoscaling + tester.clock().advance(Duration.ofDays(1)); tester.addMemMeasurements(0.02f, 0.95f, 120, application1); tester.assertResources("Scaling down", 6, 1, 2.8, 4.0, 95.0, @@ -344,6 +376,7 @@ public class AutoscalingTest { // deploy (Why 103 Gb memory? See AutoscalingTester.MockHostResourcesCalculator tester.deploy(application1, cluster1, 5, 1, new NodeResources(3, 103, 100, 1)); + tester.clock().advance(Duration.ofDays(1)); tester.addMemMeasurements(0.9f, 0.6f, 120, application1); ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high.", 8, 1, 3, 83, 34.3, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java index 5e318e00288..4b14174488e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java @@ -110,9 +110,8 @@ public class AutoscalingMaintainerTest { assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli()); // Add measurement of the expected generation, leading to rescaling - tester.clock().advance(Duration.ofSeconds(1)); + tester.clock().advance(Duration.ofHours(2)); tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 500, app1); - //tester.clock().advance(Duration.ofSeconds(1)); Instant lastMaintenanceTime = tester.clock().instant(); tester.maintainer().maintain(); assertEquals(lastMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli()); @@ -122,10 +121,10 @@ public class AutoscalingMaintainerTest { @Test public void test_toString() { - assertEquals("4 * [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb] (total: [vcpu: 4.0, memory: 8.0 Gb, disk: 16.0 Gb])", + assertEquals("4 nodes with [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb, bandwidth: 1.0 Gbps] (total: [vcpu: 4.0, memory: 8.0 Gb, disk 16.0 Gb, bandwidth: 4.0 Gbps])", AutoscalingMaintainer.toString(new ClusterResources(4, 1, new NodeResources(1, 2, 4, 1)))); - assertEquals("4 (in 2 groups) * [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb] (total: [vcpu: 4.0, memory: 8.0 Gb, disk: 16.0 Gb])", + assertEquals("4 nodes (in 2 groups) with [vcpu: 1.0, memory: 2.0 Gb, disk 4.0 Gb, bandwidth: 1.0 Gbps] (total: [vcpu: 4.0, memory: 8.0 Gb, disk 16.0 Gb, bandwidth: 4.0 Gbps])", AutoscalingMaintainer.toString(new ClusterResources(4, 2, new NodeResources(1, 2, 4, 1)))); } |