diff options
12 files changed, 128 insertions, 87 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/search/NodeResourcesTuning.java b/config-model/src/main/java/com/yahoo/vespa/model/search/NodeResourcesTuning.java index 5b747b93268..b444de5fb14 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/search/NodeResourcesTuning.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/search/NodeResourcesTuning.java @@ -27,8 +27,9 @@ public class NodeResourcesTuning implements ProtonConfig.Producer { private final int threadsPerSearch; private final double fractionOfMemoryReserved; - // "Reserve" 0.5GB of memory for other processes running on the content node (config-proxy, metrics-proxy). - public static final double reservedMemoryGb = 0.7; + // Memory for other processes running on the node (config-proxy, metrics-proxy). + // Keep in sync with node-repository/ClusterModel + public static final double nodeMemoryOverheadGb = 0.7; public NodeResourcesTuning(NodeResources resources, int threadsPerSearch, @@ -128,7 +129,7 @@ public class NodeResourcesTuning implements ProtonConfig.Producer { /** Returns the memory we can expect will be available for the content node processes */ private double usableMemoryGb() { - double usableMemoryGb = resources.memoryGb() - reservedMemoryGb; + double usableMemoryGb = resources.memoryGb() - nodeMemoryOverheadGb; return usableMemoryGb * (1 - fractionOfMemoryReserved); } diff --git a/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java b/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java index 5472ea2ca82..2c6a0fb7826 100644 --- a/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java +++ b/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java @@ -57,7 +57,7 @@ import static com.yahoo.config.provision.NodeResources.DiskSpeed; import static com.yahoo.config.provision.NodeResources.StorageType; import static com.yahoo.vespa.defaults.Defaults.getDefaults; import static com.yahoo.vespa.model.search.NodeResourcesTuning.GB; -import static com.yahoo.vespa.model.search.NodeResourcesTuning.reservedMemoryGb; +import static com.yahoo.vespa.model.search.NodeResourcesTuning.nodeMemoryOverheadGb; import static com.yahoo.vespa.model.test.utils.ApplicationPackageUtils.generateSchemas; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -277,8 +277,8 @@ public class ModelProvisioningTest { assertEquals(2, model.getContentClusters().get("content1").getRootGroup().getNodes().size(), "Nodes in content1"); assertEquals(2, model.getContainerClusters().get("container1").getContainers().size(), "Nodes in container1"); assertEquals(18, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size is lowered with combined clusters"); - assertEquals((long) ((3 - reservedMemoryGb) * (Math.pow(1024, 3)) * (1 - 0.18)), protonMemorySize(model.getContentClusters() - .get("content1")), "Memory for proton is lowered to account for the jvm heap"); + assertEquals((long) ((3 - nodeMemoryOverheadGb) * (Math.pow(1024, 3)) * (1 - 0.18)), protonMemorySize(model.getContentClusters() + .get("content1")), "Memory for proton is lowered to account for the jvm heap"); assertProvisioned(0, ClusterSpec.Id.from("container1"), ClusterSpec.Type.container, model); assertProvisioned(2, ClusterSpec.Id.from("content1"), ClusterSpec.Id.from("container1"), ClusterSpec.Type.combined, model); assertEquals(1, logger.msgs().size()); @@ -314,8 +314,8 @@ public class ModelProvisioningTest { assertEquals(2, model.getContentClusters().get("content1").getRootGroup().getNodes().size(), "Nodes in content1"); assertEquals(2, model.getContainerClusters().get("container1").getContainers().size(), "Nodes in container1"); assertEquals(30, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size is lowered with combined clusters"); - assertEquals((long) ((3 - reservedMemoryGb) * (Math.pow(1024, 3)) * (1 - 0.30)), protonMemorySize(model.getContentClusters() - .get("content1")), "Memory for proton is lowered to account for the jvm heap"); + assertEquals((long) ((3 - nodeMemoryOverheadGb) * (Math.pow(1024, 3)) * (1 - 0.30)), protonMemorySize(model.getContentClusters() + .get("content1")), "Memory for proton is lowered to account for the jvm heap"); assertProvisioned(0, ClusterSpec.Id.from("container1"), ClusterSpec.Type.container, model); assertProvisioned(2, ClusterSpec.Id.from("content1"), ClusterSpec.Id.from("container1"), ClusterSpec.Type.combined, model); } @@ -346,7 +346,7 @@ public class ModelProvisioningTest { assertEquals(2, model.getContentClusters().get("content1").getRootGroup().getNodes().size(), "Nodes in content1"); assertEquals(2, model.getContainerClusters().get("container1").getContainers().size(), "Nodes in container1"); assertEquals(ApplicationContainerCluster.defaultHeapSizePercentageOfTotalNodeMemory, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size is normal"); - assertEquals((long) ((3 - reservedMemoryGb) * (Math.pow(1024, 3))), protonMemorySize(model.getContentClusters().get("content1")), "Memory for proton is normal"); + assertEquals((long) ((3 - nodeMemoryOverheadGb) * (Math.pow(1024, 3))), protonMemorySize(model.getContentClusters().get("content1")), "Memory for proton is normal"); } @Test @@ -2569,7 +2569,7 @@ public class ModelProvisioningTest { ProtonConfig cfg = getProtonConfig(model, cluster.getSearchNodes().get(0).getConfigId()); assertEquals(2000, cfg.flush().memory().maxtlssize()); // from config override assertEquals(1000, cfg.flush().memory().maxmemory()); // from explicit tuning - assertEquals((long) ((128 - reservedMemoryGb) * GB * 0.08), cfg.flush().memory().each().maxmemory()); // from default node flavor tuning + assertEquals((long) ((128 - nodeMemoryOverheadGb) * GB * 0.08), cfg.flush().memory().each().maxmemory()); // from default node flavor tuning } private static ProtonConfig getProtonConfig(VespaModel model, String configId) { diff --git a/config-model/src/test/java/com/yahoo/vespa/model/search/NodeResourcesTuningTest.java b/config-model/src/test/java/com/yahoo/vespa/model/search/NodeResourcesTuningTest.java index 5831090c261..ab138cb2e34 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/search/NodeResourcesTuningTest.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/search/NodeResourcesTuningTest.java @@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.List; -import static com.yahoo.vespa.model.search.NodeResourcesTuning.reservedMemoryGb; +import static com.yahoo.vespa.model.search.NodeResourcesTuning.nodeMemoryOverheadGb; import static org.junit.jupiter.api.Assertions.assertEquals; import static com.yahoo.vespa.model.search.NodeResourcesTuning.MB; import static com.yahoo.vespa.model.search.NodeResourcesTuning.GB; @@ -33,13 +33,13 @@ public class NodeResourcesTuningTest { @Test void require_that_hwinfo_memory_size_is_set() { - assertEquals(24 * GB, configFromMemorySetting(24 + reservedMemoryGb, 0).hwinfo().memory().size()); - assertEquals(combinedFactor * 24 * GB, configFromMemorySetting(24 + reservedMemoryGb, ApplicationContainerCluster.heapSizePercentageOfTotalNodeMemoryWhenCombinedCluster * 0.01).hwinfo().memory().size(), 1000); + assertEquals(24 * GB, configFromMemorySetting(24 + nodeMemoryOverheadGb, 0).hwinfo().memory().size()); + assertEquals(combinedFactor * 24 * GB, configFromMemorySetting(24 + nodeMemoryOverheadGb, ApplicationContainerCluster.heapSizePercentageOfTotalNodeMemoryWhenCombinedCluster * 0.01).hwinfo().memory().size(), 1000); } @Test void reserved_memory_on_content_node() { - assertEquals(0.7, reservedMemoryGb, delta); + assertEquals(0.7, nodeMemoryOverheadGb, delta); } private ProtonConfig getProtonMemoryConfig(List<Pair<String, String>> sdAndMode, double gb) { @@ -54,7 +54,7 @@ public class NodeResourcesTuningTest { } private void verify_that_initial_numdocs_is_dependent_of_mode() { - ProtonConfig cfg = getProtonMemoryConfig(Arrays.asList(new Pair<>("a", "INDEX"), new Pair<>("b", "STREAMING"), new Pair<>("c", "STORE_ONLY")), 24 + reservedMemoryGb); + ProtonConfig cfg = getProtonMemoryConfig(Arrays.asList(new Pair<>("a", "INDEX"), new Pair<>("b", "STREAMING"), new Pair<>("c", "STORE_ONLY")), 24 + nodeMemoryOverheadGb); assertEquals(3, cfg.documentdb().size()); assertEquals(1024, cfg.documentdb(0).allocation().initialnumdocs()); assertEquals("a", cfg.documentdb(0).inputdoctypename()); @@ -162,14 +162,14 @@ public class NodeResourcesTuningTest { @Test void require_that_summary_cache_max_bytes_is_set_based_on_memory() { - assertEquals(1 * GB / 25, configFromMemorySetting(1 + reservedMemoryGb, 0).summary().cache().maxbytes()); - assertEquals(256 * GB / 25, configFromMemorySetting(256 + reservedMemoryGb, 0).summary().cache().maxbytes()); + assertEquals(1 * GB / 25, configFromMemorySetting(1 + nodeMemoryOverheadGb, 0).summary().cache().maxbytes()); + assertEquals(256 * GB / 25, configFromMemorySetting(256 + nodeMemoryOverheadGb, 0).summary().cache().maxbytes()); } @Test void require_that_summary_cache_memory_is_reduced_with_combined_cluster() { - assertEquals(combinedFactor * 1 * GB / 25, configFromMemorySetting(1 + reservedMemoryGb, ApplicationContainerCluster.heapSizePercentageOfTotalNodeMemoryWhenCombinedCluster * 0.01).summary().cache().maxbytes(), 1000); - assertEquals(combinedFactor * 256 * GB / 25, configFromMemorySetting(256 + reservedMemoryGb, ApplicationContainerCluster.heapSizePercentageOfTotalNodeMemoryWhenCombinedCluster * 0.01).summary().cache().maxbytes(), 1000); + assertEquals(combinedFactor * 1 * GB / 25, configFromMemorySetting(1 + nodeMemoryOverheadGb, ApplicationContainerCluster.heapSizePercentageOfTotalNodeMemoryWhenCombinedCluster * 0.01).summary().cache().maxbytes(), 1000); + assertEquals(combinedFactor * 256 * GB / 25, configFromMemorySetting(256 + nodeMemoryOverheadGb, ApplicationContainerCluster.heapSizePercentageOfTotalNodeMemoryWhenCombinedCluster * 0.01).summary().cache().maxbytes(), 1000); } @Test @@ -191,12 +191,12 @@ public class NodeResourcesTuningTest { } private static void assertDocumentStoreMaxFileSize(long expFileSizeBytes, int wantedMemoryGb) { - assertEquals(expFileSizeBytes, configFromMemorySetting(wantedMemoryGb + reservedMemoryGb, 0).summary().log().maxfilesize()); + assertEquals(expFileSizeBytes, configFromMemorySetting(wantedMemoryGb + nodeMemoryOverheadGb, 0).summary().log().maxfilesize()); } private static void assertFlushStrategyMemory(long expMemoryBytes, int wantedMemoryGb) { - assertEquals(expMemoryBytes, configFromMemorySetting(wantedMemoryGb + reservedMemoryGb, 0).flush().memory().maxmemory()); - assertEquals(expMemoryBytes, configFromMemorySetting(wantedMemoryGb + reservedMemoryGb, 0).flush().memory().each().maxmemory()); + assertEquals(expMemoryBytes, configFromMemorySetting(wantedMemoryGb + nodeMemoryOverheadGb, 0).flush().memory().maxmemory()); + assertEquals(expMemoryBytes, configFromMemorySetting(wantedMemoryGb + nodeMemoryOverheadGb, 0).flush().memory().each().maxmemory()); } private static void assertFlushStrategyTlsSize(long expTlsSizeBytes, int diskGb) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 4020166a132..a7d5cc50828 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -55,7 +55,7 @@ public class Autoscaler { } private Autoscaling autoscale(Application application, Cluster cluster, NodeList clusterNodes, Limits limits) { - ClusterModel clusterModel = new ClusterModel(nodeRepository.zone(), + ClusterModel clusterModel = new ClusterModel(nodeRepository, application, clusterNodes.not().retired().clusterSpec(), cluster, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 2f9ad28a072..bb599b69398 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.autoscale; +import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.provision.Node; @@ -8,6 +9,7 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.provisioning.CapacityPolicies; import java.time.Clock; import java.time.Duration; @@ -42,12 +44,16 @@ public class ClusterModel { static final double idealContainerDiskLoad = 0.95; static final double idealContentDiskLoad = 0.6; + // Memory for other processes running on the node (config-proxy, metrics-proxy). + // Keep in sync with config-model/NodeResourcesTuning. + static final double nodeMemoryOverheadGb = 0.7; + // When a query is issued on a node the cost is the sum of a fixed cost component and a cost component // proportional to document count. We must account for this when comparing configurations with more or fewer nodes. // TODO: Measure this, and only take it into account with queries private static final double fixedCpuCostFraction = 0.1; - private final Zone zone; + private final NodeRepository nodeRepository; private final Application application; private final ClusterSpec clusterSpec; private final Cluster cluster; @@ -69,14 +75,14 @@ public class ClusterModel { private Double maxQueryGrowthRate = null; private OptionalDouble averageQueryRate = null; - public ClusterModel(Zone zone, + public ClusterModel(NodeRepository nodeRepository, Application application, ClusterSpec clusterSpec, Cluster cluster, NodeList clusterNodes, MetricsDb metricsDb, Clock clock) { - this.zone = zone; + this.nodeRepository = nodeRepository; this.application = application; this.clusterSpec = clusterSpec; this.cluster = cluster; @@ -88,7 +94,7 @@ public class ClusterModel { this.at = clock.instant(); } - ClusterModel(Zone zone, + ClusterModel(NodeRepository nodeRepository, Application application, ClusterSpec clusterSpec, Cluster cluster, @@ -96,7 +102,7 @@ public class ClusterModel { Duration scalingDuration, ClusterTimeseries clusterTimeseries, ClusterNodesTimeseries nodeTimeseries) { - this.zone = zone; + this.nodeRepository = nodeRepository; this.application = application; this.clusterSpec = clusterSpec; this.cluster = cluster; @@ -179,7 +185,7 @@ public class ClusterModel { double queryCpu = queryCpuPerGroup * groupCount() / groups; double writeCpu = (double)groupSize() / groupSize; return new Load(queryCpuFraction() * queryCpu + (1 - queryCpuFraction()) * writeCpu, - (double)groupSize() / groupSize, + (1 - fixedMemoryFraction()) * (double)groupSize() / groupSize + fixedMemoryFraction() * 1, (double)groupSize() / groupSize); } else { @@ -315,7 +321,7 @@ public class ClusterModel { /** Returns the headroom for growth during organic traffic growth as a multiple of current resources. */ private double growthRateHeadroom() { - if ( ! zone.environment().isProduction()) return 1; + if ( ! nodeRepository.zone().environment().isProduction()) return 1; double growthRateHeadroom = 1 + maxQueryGrowthRate() * scalingDuration().toMinutes(); // Cap headroom at 10% above the historical observed peak if (queryFractionOfMax() != 0) @@ -329,7 +335,7 @@ public class ClusterModel { * as a multiple of current resources. */ private double trafficShiftHeadroom() { - if ( ! zone.environment().isProduction()) return 1; + if ( ! nodeRepository.zone().environment().isProduction()) return 1; if (canRescaleWithinBcpDeadline()) return 1; double trafficShiftHeadroom; if (application.status().maxReadShare() == 0) // No traffic fraction data @@ -369,6 +375,34 @@ public class ClusterModel { return idealContentMemoryLoad; } + /** + * Returns the fraction of memory of the current allocation which is currently consumed by + * fixed data structures which take the same amount of space regardless of document volume. + */ + private double fixedMemoryFraction() { + if (clusterSpec().type().isContainer()) return 1.0; + double fixedMemory = nodeMemoryOverheadGb + + (averageRealMemory() - nodeMemoryOverheadGb) * 0.05; // TODO: Measure actual content node usage + return fixedMemory / averageRealMemory(); + } + + private double averageRealMemory() { + if (nodes.isEmpty()) { // we're estimating + var initialResources = new CapacityPolicies(nodeRepository).specifyFully(cluster.minResources().nodeResources(), + clusterSpec, + application.id()); + return nodeRepository.resourcesCalculator().requestToReal(initialResources, + nodeRepository.exclusiveAllocation(clusterSpec), + false).memoryGb(); + } + else { + return nodes.stream() + .mapToDouble(node -> nodeRepository.resourcesCalculator().realResourcesOf(node, nodeRepository).memoryGb()) + .average() + .getAsDouble(); + } + } + private double idealDiskLoad() { // Stateless clusters are not expected to consume more disk over time - // if they do it is due to logs which will be rotated away right before the disk is full @@ -380,7 +414,7 @@ public class ClusterModel { * This is useful in cases where it's possible to continue without the cluster model, * as QuestDb is known to temporarily fail during reading of data. */ - public static Optional<ClusterModel> create(Zone zone, + public static Optional<ClusterModel> create(NodeRepository nodeRepository, Application application, ClusterSpec clusterSpec, Cluster cluster, @@ -388,7 +422,7 @@ public class ClusterModel { MetricsDb metricsDb, Clock clock) { try { - return Optional.of(new ClusterModel(zone, application, clusterSpec, cluster, clusterNodes, metricsDb, clock)); + return Optional.of(new ClusterModel(nodeRepository, application, clusterSpec, cluster, clusterNodes, metricsDb, clock)); } catch (Exception e) { log.log(Level.WARNING, "Failed creating a cluster model for " + application + " " + cluster, e); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java index b0c7a25095e..c8d20d89dfa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java @@ -177,7 +177,7 @@ public class NodeRepositoryProvisioner implements Provisioner { firstDeployment // start at min, preserve current resources otherwise ? new AllocatableClusterResources(initialResourcesFrom(requested, clusterSpec, application.id()), clusterSpec, nodeRepository) : new AllocatableClusterResources(nodes, nodeRepository); - var clusterModel = new ClusterModel(zone, application, clusterSpec, cluster, nodes, nodeRepository.metricsDb(), nodeRepository.clock()); + var clusterModel = new ClusterModel(nodeRepository, application, clusterSpec, cluster, nodes, nodeRepository.metricsDb(), nodeRepository.clock()); return within(Limits.of(requested), currentResources, firstDeployment, clusterModel); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 36d0e464b3d..b7d96dbe3d2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -61,7 +61,7 @@ public class AutoscalingTest { fixture.loader().applyCpuLoad(0.7f, 10); var scaledResources = fixture.tester().assertResources("Scaling up since resource usage is too high", - 9, 1, 3.6, 7.7, 31.7, + 8, 1, 4.0, 9.3, 36.2, fixture.autoscale()); fixture.deploy(Capacity.from(scaledResources)); @@ -83,7 +83,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(7)); fixture.loader().applyCpuLoad(0.1f, 10); fixture.tester().assertResources("Scaling cpu down since usage has gone down significantly", - 8, 1, 1.0, 7.3, 22.1, + 7, 1, 1.1, 8.7, 25.4, fixture.autoscale()); } @@ -107,7 +107,7 @@ public class AutoscalingTest { fixture.loader().applyLoad(new Load(0.1, 0.1, 0.1), 3); fixture.loader().applyLoad(new Load(1.0, 1.0, 1.0), 1); fixture.tester().assertResources("Scaling up since resource usage is too high", - 9, 1, 4.7, 14.8, 66.0, + 8, 1, 5.3, 17.5, 75.4, fixture.autoscale()); } @@ -167,7 +167,7 @@ public class AutoscalingTest { var fixture = DynamicProvisioningTester.fixture().awsProdSetup(true).build(); fixture.loader().applyLoad(new Load(1.0, 0.1, 1.0), 10); fixture.tester().assertResources("Scaling up (only) since resource usage is too high", - 8, 1, 7.1, 8.8, 75.4, + 8, 1, 7.1, 9.3, 75.4, fixture.autoscale()); } @@ -199,7 +199,7 @@ public class AutoscalingTest { fixture.loader().applyCpuLoad(0.70, 1); fixture.loader().applyCpuLoad(0.01, 100); fixture.tester().assertResources("Scaling up since peak resource usage is too high", - 9, 1, 3.8, 7.7, 31.7, + 8, 1, 4.3, 9.3, 36.2, fixture.autoscale()); } @@ -210,7 +210,7 @@ public class AutoscalingTest { fixture.loader().applyCpuLoad(0.70, 1); fixture.loader().applyCpuLoad(0.01, 100); fixture.tester().assertResources("Scaling up since peak resource usage is too high", - 10, 1, 4, 8.0, 22.7, + 9, 1, 4, 16.0, 25.5, fixture.autoscale()); } @@ -221,7 +221,7 @@ public class AutoscalingTest { fixture.loader().applyCpuLoad(0.70, 1); fixture.loader().applyCpuLoad(0.01, 100); fixture.tester().assertResources("Scaling up since peak resource usage is too high", - 9, 1, 3.8, 8.0, 37.5, + 8, 1, 4.3, 9.7, 42.9, fixture.autoscale()); } @@ -283,7 +283,7 @@ public class AutoscalingTest { new NodeResources(100, 1000, 1000, 1, DiskSpeed.any)); var capacity = Capacity.from(min, max); ClusterResources scaledResources = fixture.tester().assertResources("Scaling up", - 13, 1, 1.5, 26.7, 26.7, + 13, 1, 1.5, 29.1, 26.7, fixture.autoscale(capacity)); assertEquals("Disk speed from new capacity is used", DiskSpeed.any, scaledResources.nodeResources().diskSpeed()); @@ -401,7 +401,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(2)); fixture.loader().applyCpuLoad(0.4, 240); fixture.tester().assertResources("Scaling cpu up", - 8, 4, 4.6, 4.0, 10.0, + 8, 4, 4.6, 4.2, 10.0, fixture.autoscale()); } @@ -446,7 +446,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(2)); fixture.loader().applyCpuLoad(1.0, 120); fixture.tester().assertResources("Suggesting above capacity limit", - 8, 1, 6.2, 7.0, 29.0, + 8, 1, 6.2, 7.4, 29.0, fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min)); } @@ -492,7 +492,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(2)); fixture.loader().applyCpuLoad(0.9, 120); fixture.tester().assertResources("Scaling up to 2 nodes, scaling memory and disk down at the same time", - 10, 5, 7.7, 39.3, 38.5, + 10, 5, 7.7, 41.5, 38.5, fixture.autoscale()); } @@ -528,7 +528,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(timePassed.negated()); fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 200.0 : 100.0, t -> 10.0); fixture.tester().assertResources("Scaling up cpu, others down, changing to 1 group is cheaper", - 9, 1, 2.5, 30.7, 30.1, + 7, 1, 3.2, 43.3, 40.1, fixture.autoscale()); } @@ -548,7 +548,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(timePassed.negated()); fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 20.0 : 10.0, t -> 100.0); fixture.tester().assertResources("Scaling down since resource usage is too high, changing to 1 group is cheaper", - 6, 1, 1.0, 49.1, 48.1, + 5, 1, 1.0, 62.6, 60.1, fixture.autoscale()); } @@ -565,7 +565,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(1)); fixture.loader().applyMemLoad(1.0, 1000); fixture.tester().assertResources("Increase group size to reduce memory load", - 8, 2, 13.9, 94.5, 60.1, + 8, 2, 13.9, 96.3, 60.1, fixture.autoscale()); } @@ -594,7 +594,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofHours(12 * 3 + 1)); fixture.loader().applyCpuLoad(0.02, 120); fixture.tester().assertResources("Scaling down since enough time has passed", - 3, 1, 1.0, 24.6, 101.4, + 3, 1, 1.0, 23.6, 101.4, fixture.autoscale()); } @@ -638,7 +638,7 @@ public class AutoscalingTest { fixture.loader().addCpuMeasurements(0.25, 200); fixture.tester().assertResources("Scale up since we assume we need 2x cpu for growth when no data scaling time data", - 10, 1, 1.2, 5.5, 22.5, + 8, 1, 1.5, 7.4, 29.0, fixture.autoscale()); fixture.setScalingDuration(Duration.ofMinutes(5)); @@ -647,7 +647,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.25, 200); fixture.tester().assertResources("Scale down since observed growth is slower than scaling time", - 10, 1, 1.0, 5.5, 22.5, + 8, 1, 1.3, 7.4, 29.0, fixture.autoscale()); fixture.setScalingDuration(Duration.ofMinutes(60)); @@ -658,7 +658,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.25, 200); fixture.tester().assertResources("Scale up since observed growth is faster than scaling time", - 9, 1, 1.4, 6.1, 25.3, + 8, 1, 1.6, 7.4, 29.0, fixture.autoscale()); } @@ -670,12 +670,12 @@ public class AutoscalingTest { fixture.setScalingDuration(Duration.ofMinutes(60)); fixture.tester().clock().advance(Duration.ofDays(2)); Duration timeAdded = fixture.loader().addLoadMeasurements(100, - t -> scalingFactor * (100.0 + (t < 50 ? t * t * t : 125000 - (t - 49) * (t - 49) * (t - 49))), + t -> scalingFactor * (100.0 + (t < 50 ? t * t * t : 155000 - (t - 49) * (t - 49) * (t - 49))), t -> 0.0); fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.7, 200); fixture.tester().assertResources("Scale up slightly since observed growth is faster than scaling time, but we are not confident", - 10, 1, 1.0, 5.5, 22.5, + 8, 1, 1.3, 7.4, 29.0, fixture.autoscale()); } @@ -693,7 +693,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester.assertResources("Query and write load is equal -> scale up somewhat", - 10, 1, 1.4, 5.5, 22.5, + 8, 1, 1.8, 7.4, 29.0, fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); @@ -702,7 +702,7 @@ public class AutoscalingTest { fixture.loader().addCpuMeasurements(0.4, 200); // TODO: Ackhually, we scale down here - why? fixture.tester().assertResources("Query load is 4x write load -> scale up more", - 10, 1, 1.3, 5.5, 22.5, + 8, 1, 1.6, 7.4, 29.0, fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); @@ -710,7 +710,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Write load is 10x query load -> scale down", - 6, 1, 1.1, 9.8, 40.5, + 6, 1, 1.1, 10.0, 40.5, fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); @@ -718,7 +718,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Query only -> largest possible", - 9, 1, 2.7, 6.1, 25.3, + 8, 1, 3.1, 7.4, 29.0, fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); @@ -726,7 +726,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Write only -> smallest possible", - 4, 1, 1.1, 16.4, 67.6, + 4, 1, 1.1, 16.1, 67.6, fixture.autoscale()); } @@ -781,7 +781,7 @@ public class AutoscalingTest { fixture.tester().clock().advance(Duration.ofDays(2)); fixture.loader().applyLoad(new Load(1.0, 1.0, 1.0), 200); fixture.tester().assertResources("We scale even in dev because resources are 'required'", - 3, 1, 1.0, 12.3, 62.5, + 3, 1, 1.0, 13.4, 62.5, fixture.autoscale()); } @@ -851,7 +851,7 @@ public class AutoscalingTest { fixture.loader().applyLoad(new Load(0.06, 0.52, 0.27), 100); var autoscaling = fixture.autoscale(); fixture.tester().assertResources("Scaling down", - 7, 1, 2, 14.7, 384.0, + 7, 1, 2, 14.5, 384.0, autoscaling); fixture.deploy(Capacity.from(autoscaling.resources().get())); assertEquals("Initial nodes are kept", initialNodes, fixture.nodes().asList()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java index 704491ed44f..d748280cba2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java @@ -32,7 +32,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 3.6, 6.1, 25.3, + 8, 1, 4.0, 7.4, 29.0, fixture.autoscale()); // Higher query rate @@ -40,7 +40,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 7.1, 6.1, 25.3, + 8, 1, 8.0, 7.4, 29.0, fixture.autoscale()); // Higher headroom @@ -48,7 +48,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 4.2, 6.1, 25.3, + 8, 1, 4.8, 7.4, 29.0, fixture.autoscale()); // Higher per query cost @@ -56,7 +56,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 5.4, 6.1, 25.3, + 8, 1, 6.0, 7.4, 29.0, fixture.autoscale()); // Bcp elsewhere is 0 - use local only @@ -64,7 +64,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(0, 1.1, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling using local info", - 8, 1, 1, 7.0, 29.0, + 8, 1, 1, 7.4, 29.0, fixture.autoscale()); } @@ -85,7 +85,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 10.5, 41.0, 168.9, + 3, 3, 10.5, 38.4, 168.9, fixture.autoscale()); // Higher query rate @@ -93,7 +93,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 20.9, 41.0, 168.9, + 3, 3, 20.9, 38.4, 168.9, fixture.autoscale()); // Higher headroom @@ -101,7 +101,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 12.4, 41.0, 168.9, + 3, 3, 12.4, 38.4, 168.9, fixture.autoscale()); // Higher per query cost @@ -109,7 +109,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 15.7, 41.0, 168.9, + 3, 3, 15.7, 38.4, 168.9, fixture.autoscale()); } @@ -186,7 +186,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(200, 1.3, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 14.2, 7.0, 29.0, + 8, 1, 14.2, 7.4, 29.0, fixture.autoscale()); // Some local traffic @@ -196,7 +196,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration1.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 10.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 6.9, 7.0, 29.0, + 8, 1, 6.9, 7.4, 29.0, fixture.autoscale()); // Enough local traffic to get half the votes @@ -206,7 +206,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration2.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 50.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 2.7, 6.1, 25.3, + 8, 1, 3.0, 7.4, 29.0, fixture.autoscale()); // Mostly local @@ -216,7 +216,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration3.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 90.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 2.1, 6.1, 25.3, + 8, 1, 2.4, 7.4, 29.0, fixture.autoscale()); // Local only @@ -226,7 +226,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration4.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 100.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 2.0, 6.1, 25.3, + 8, 1, 2.3, 7.4, 29.0, fixture.autoscale()); // No group info, should be the same as the above @@ -236,7 +236,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration5.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 100.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 2.0, 6.1, 25.3, + 8, 1, 2.3, 7.4, 29.0, fixture.autoscale()); // 40 query rate, no group info (for reference to the below) @@ -246,7 +246,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration6.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 40.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 1.4, 6.1, 25.3, + 8, 1, 1.6, 7.4, 29.0, fixture.autoscale()); // Local query rate is too low but global is even lower so disregard it, giving the same as above @@ -256,7 +256,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration7.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 40.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 1.4, 6.1, 25.3, + 8, 1, 1.6, 7.4, 29.0, fixture.autoscale()); // Local query rate is too low to be fully confident, and so is global but as it is slightly larger, incorporate it slightly @@ -266,7 +266,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration8.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 40.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 9, 1, 1.8, 6.1, 25.3, + 8, 1, 2.0, 7.4, 29.0, fixture.autoscale()); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java index ed00134af55..ec084014a6a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java @@ -5,12 +5,17 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Capacity; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.Zone; import com.yahoo.test.ManualClock; +import com.yahoo.vespa.curator.mock.MockCurator; +import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.applications.Status; +import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; +import com.yahoo.vespa.hosted.provision.testutils.MockNodeRepository; import org.junit.Test; import java.time.Duration; @@ -84,12 +89,11 @@ public class ClusterModelTest { private ClusterModel clusterModel(Status status, IntFunction<Double> queryRate, IntFunction<Double> writeRate) { ManualClock clock = new ManualClock(); - Zone zone = Zone.defaultZone(); Application application = Application.empty(ApplicationId.from("t1", "a1", "i1")); ClusterSpec clusterSpec = clusterSpec(); Cluster cluster = cluster(resources()); application = application.with(cluster); - return new ClusterModel(zone, + return new ClusterModel(new ProvisioningTester.Builder().build().nodeRepository(), application.with(status), clusterSpec, cluster, clock, Duration.ofMinutes(10), timeseries(cluster,100, queryRate, writeRate, clock), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java index 5d1fd58489b..b150b372fe8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java @@ -10,10 +10,12 @@ import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Flavor; +import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.Zone; +import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.flags.PermanentFlags; import com.yahoo.vespa.flags.custom.HostResources; @@ -27,6 +29,8 @@ import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsHostResourcesCalcu import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsNodeTypes; import com.yahoo.vespa.hosted.provision.provisioning.DynamicProvisioningTester; import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; +import com.yahoo.vespa.hosted.provision.testutils.MockNodeRepository; + import java.time.Duration; import java.util.Arrays; import java.util.List; @@ -40,14 +44,12 @@ import java.util.Optional; public class Fixture { final DynamicProvisioningTester tester; - final Zone zone; final ApplicationId applicationId; final ClusterSpec clusterSpec; final Capacity capacity; final Loader loader; public Fixture(Fixture.Builder builder, Optional<ClusterResources> initialResources, int hostCount) { - zone = builder.zone; applicationId = builder.application; clusterSpec = builder.cluster; capacity = builder.capacity; @@ -80,7 +82,7 @@ public class Fixture { public Capacity capacity() { return capacity; } public ClusterModel clusterModel() { - return new ClusterModel(zone, + return new ClusterModel(tester.nodeRepository(), application(), clusterSpec, cluster(), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index bc54e552270..1b677224295 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -75,7 +75,7 @@ public class ScalingSuggestionsMaintainerTest { assertEquals("8 nodes with [vcpu: 3.2, memory: 4.5 Gb, disk: 10.0 Gb, bandwidth: 0.1 Gbps, architecture: any]", suggestionOf(app1, cluster1, tester).resources().get().toString()); - assertEquals("8 nodes with [vcpu: 3.6, memory: 4.4 Gb, disk: 11.8 Gb, bandwidth: 0.1 Gbps, architecture: any]", + assertEquals("8 nodes with [vcpu: 3.6, memory: 4.7 Gb, disk: 11.8 Gb, bandwidth: 0.1 Gbps, architecture: any]", suggestionOf(app2, cluster2, tester).resources().get().toString()); // Utilization goes way down diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java index 28cd3067155..3107d9738a9 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java @@ -525,14 +525,14 @@ public class ProvisioningTest { tester.activate(app1, cluster1, Capacity.from(resources(4, 2, 2, 10, 20), resources(6, 3, 3, 15, 25))); tester.assertNodes("Allocation preserving resources within new limits", - 6, 2, 3, 8.0/4*21 / (6.0/2), 25, + 6, 2, 3, 14.57, 25, app1, cluster1); // Widening window does not change allocation tester.activate(app1, cluster1, Capacity.from(resources(4, 2, 1, 5, 15), resources(8, 4, 4, 21, 30))); tester.assertNodes("Same allocation", - 6, 2, 3, 8.0/4*21 / (6.0/2), 25, + 6, 2, 3, 14.57, 25, app1, cluster1); // Changing limits in opposite directions cause a mixture of min and max |