From e25cd69faff357151fd5efcf5dfba8790e13437f Mon Sep 17 00:00:00 2001 From: jonmv Date: Wed, 29 May 2024 22:17:59 +0200 Subject: Correctly account for node memory overhead when computing args for startup script --- .../src/main/java/com/yahoo/vespa/model/Host.java | 2 +- .../validation/JvmHeapSizeValidator.java | 11 +++---- ...estartOnDeployForOnnxModelChangesValidator.java | 2 +- .../container/ApplicationContainerCluster.java | 35 ++++++++++++---------- .../vespa/model/container/ContainerCluster.java | 9 +++--- .../model/container/xml/ContainerModelBuilder.java | 2 +- .../model/content/cluster/ContentCluster.java | 2 +- .../model/provision/ModelProvisioningTest.java | 12 ++++---- .../validation/JvmHeapSizeValidatorTest.java | 4 +-- 9 files changed, 43 insertions(+), 36 deletions(-) (limited to 'config-model/src') diff --git a/config-model/src/main/java/com/yahoo/vespa/model/Host.java b/config-model/src/main/java/com/yahoo/vespa/model/Host.java index a8085919a98..f87f1382ffb 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/Host.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/Host.java @@ -16,7 +16,7 @@ import java.util.Objects; public final class Host extends TreeConfigProducer implements SentinelConfig.Producer, Comparable { // Memory needed for auxiliary processes always running on the node (config-proxy, metrics-proxy). - // Keep in sync with node-repository/ClusterModel. + // Keep in sync with node-repository/ClusterModel and startup scripts (go and bash). public static final double memoryOverheadGb = 0.7; private ConfigSentinel configSentinel = null; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidator.java b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidator.java index 9cf5fe84c21..4900b56801c 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidator.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidator.java @@ -23,21 +23,22 @@ public class JvmHeapSizeValidator implements Validator { context.model().getContainerClusters().forEach((clusterId, appCluster) -> { var mp = appCluster.getMemoryPercentage().orElse(null); if (mp == null) return; - if (mp.availableMemoryGb().isEmpty()) { + if (mp.asAbsoluteGb().isEmpty()) { context.deployState().getDeployLogger().log(Level.FINE, "Host resources unknown or percentage overridden with 'allocated-memory'"); return; } long jvmModelCost = appCluster.onnxModelCostCalculator().aggregatedModelCostInBytes(); if (jvmModelCost > 0) { - double availableMemoryGb = mp.availableMemoryGb().getAsDouble(); + double availableMemoryGb = mp.asAbsoluteGb().getAsDouble(); + int percentageOfTotal = mp.ofContainerTotal().getAsInt(); double modelCostGb = jvmModelCost / (1024D * 1024 * 1024); context.deployState().getDeployLogger().log(Level.FINE, () -> Text.format("JVM: %d%% (limit: %d%%), %.2fGB (limit: %.2fGB), ONNX: %.2fGB", - mp.percentage(), percentLimit, availableMemoryGb, gbLimit, modelCostGb)); - if (mp.percentage() < percentLimit) { + percentageOfTotal, percentLimit, availableMemoryGb, gbLimit, modelCostGb)); + if (percentageOfTotal < percentLimit) { context.illegal(Text.format("Allocated percentage of memory of JVM in cluster '%s' is too low (%d%% < %d%%). " + "Estimated cost of ONNX models is %.2fGB. Either use a node flavor with more memory or use less expensive models. " + "You may override this validation by specifying 'allocated-memory' (https://docs.vespa.ai/en/performance/container-tuning.html#jvm-heap-size).", - clusterId, mp.percentage(), percentLimit, modelCostGb)); + clusterId, percentageOfTotal, percentLimit, modelCostGb)); } if (availableMemoryGb < gbLimit) { context.illegal( diff --git a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/change/RestartOnDeployForOnnxModelChangesValidator.java b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/change/RestartOnDeployForOnnxModelChangesValidator.java index 008a3fc5547..e57110e44e5 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/change/RestartOnDeployForOnnxModelChangesValidator.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/change/RestartOnDeployForOnnxModelChangesValidator.java @@ -115,7 +115,7 @@ public class RestartOnDeployForOnnxModelChangesValidator implements ChangeValida double memoryUsedByModels = currentModelCostInGb + nextModelCostInGb; double availableMemory = Math.max(0, totalMemory - Host.memoryOverheadGb - memoryUsedByModels); - var availableMemoryPercentage = cluster.availableMemoryPercentage(); + var availableMemoryPercentage = cluster.heapSizePercentageOfAvailable(); int memoryPercentage = (int) (availableMemory / totalMemory * availableMemoryPercentage); var prefix = "Validating Onnx models memory usage for %s".formatted(cluster); diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java index ed7646b3066..5a86ebfd7eb 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java @@ -209,22 +209,27 @@ public final class ApplicationContainerCluster extends ContainerCluster c.getHostResource().realResources().memoryGb()).min().orElseThrow(); - double jvmHeapDeductionGb = onnxModelCostCalculator.aggregatedModelCostInBytes() / (1024D * 1024 * 1024); - double availableMemory = Math.max(0, totalMemory - Host.memoryOverheadGb - jvmHeapDeductionGb); - int memoryPercentage = (int) (availableMemory / totalMemory * availableMemoryPercentage); - logger.log(FINE, () -> "cluster id '%s': memoryPercentage=%d, availableMemory=%f, totalMemory=%f, availableMemoryPercentage=%d, jvmHeapDeductionGb=%f" - .formatted(id(), memoryPercentage, availableMemory, totalMemory, availableMemoryPercentage, jvmHeapDeductionGb)); - return Optional.of(JvmMemoryPercentage.of(memoryPercentage, availableMemory)); + int heapSizePercentageOfAvailable = heapSizePercentageOfAvailable(); + if (getContainers().isEmpty()) return Optional.of(JvmMemoryPercentage.of(heapSizePercentageOfAvailable)); // Node memory is not known + + // Node memory is known so convert available memory ofContainerAvailable to node memory ofContainerAvailable + double totalMemoryGb = getContainers().stream().mapToDouble(c -> c.getHostResource().realResources().memoryGb()).min().orElseThrow(); + double totalMemoryMinusOverhead = Math.max(0, totalMemoryGb - Host.memoryOverheadGb); + double onnxModelCostGb = onnxModelCostCalculator.aggregatedModelCostInBytes() / (1024D * 1024 * 1024); + double availableMemoryGb = Math.max(0, totalMemoryMinusOverhead - onnxModelCostGb); + int memoryPercentageOfAvailable = (int) (heapSizePercentageOfAvailable * availableMemoryGb / totalMemoryMinusOverhead); + int memoryPercentageOfTotal = (int) (heapSizePercentageOfAvailable * availableMemoryGb / totalMemoryGb); + logger.log(FINE, () -> ("cluster id '%s': memoryPercentageOfAvailable=%d, memoryPercentageOfTotal=%d, " + + "availableMemoryGb=%f, totalMemoryGb=%f, heapSizePercentageOfAvailable=%d, onnxModelCostGb=%f") + .formatted(id(), memoryPercentageOfAvailable, memoryPercentageOfTotal, + availableMemoryGb, totalMemoryGb, heapSizePercentageOfAvailable, onnxModelCostGb)); + return Optional.of(JvmMemoryPercentage.of(memoryPercentageOfAvailable, memoryPercentageOfTotal, + availableMemoryGb * heapSizePercentageOfAvailable * 1e-2)); } return Optional.empty(); } - public int availableMemoryPercentage() { + public int heapSizePercentageOfAvailable() { return getHostClusterId().isPresent() ? heapSizePercentageOfTotalAvailableMemoryWhenCombinedCluster : heapSizePercentageOfAvailableMemory; @@ -310,14 +315,14 @@ public final class ApplicationContainerCluster extends ContainerCluster * Returns the percentage of host physical memory this application has specified for nodes in this cluster, * or empty if this is not specified by the application. */ - public record JvmMemoryPercentage(int percentage, OptionalDouble availableMemoryGb) { - static JvmMemoryPercentage of(int percentage) { return new JvmMemoryPercentage(percentage, OptionalDouble.empty()); } - static JvmMemoryPercentage of(int percentage, double availableMemoryGb) { - return new JvmMemoryPercentage(percentage, OptionalDouble.of(availableMemoryGb)); + public record JvmMemoryPercentage(int ofContainerAvailable, OptionalInt ofContainerTotal, OptionalDouble asAbsoluteGb) { // optionalInt pctOfTotal < int pctOfAvailable + static JvmMemoryPercentage of(int percentageOfAvailable) { return new JvmMemoryPercentage(percentageOfAvailable, OptionalInt.empty(), OptionalDouble.empty()); } + static JvmMemoryPercentage of(int percentageOfAvailable, int percentageOfTotal, double absoluteMemoryGb) { + return new JvmMemoryPercentage(percentageOfAvailable, OptionalInt.of(percentageOfTotal), OptionalDouble.of(absoluteMemoryGb)); } } public Optional getMemoryPercentage() { return Optional.empty(); } diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java index 5bcd21a5b9b..9841ffe760c 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java @@ -1040,7 +1040,7 @@ public class ContainerModelBuilder extends ConfigModelBuilder { } catch (NumberFormatException e) { throw new IllegalArgumentException("The memory percentage given for nodes in " + cluster + - " must be an integer percentage ending by the '%' sign", e); + " must be given as an integer followe by '%'", e); } } diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java index bac86e37e8f..1ebf27734bd 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/ContentCluster.java @@ -247,7 +247,7 @@ public class ContentCluster extends TreeConfigProducer implem for (ContainerModel containerModel : containers) { Optional hostClusterId = containerModel.getCluster().getHostClusterId(); if (hostClusterId.isPresent() && hostClusterId.get().equals(clusterId) && containerModel.getCluster().getMemoryPercentage().isPresent()) { - return containerModel.getCluster().getMemoryPercentage().get().percentage() * 0.01; + return containerModel.getCluster().getMemoryPercentage().get().ofContainerAvailable() * 0.01; } } return 0.0; diff --git a/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java b/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java index 5ead9812b56..2846925b9c3 100644 --- a/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java +++ b/config-model/src/test/java/com/yahoo/config/model/provision/ModelProvisioningTest.java @@ -151,7 +151,7 @@ public class ModelProvisioningTest { assertEquals("-Xlog:gc", mydisc2.getContainers().get(1).getJvmOptions()); assertEquals("lib/blablamalloc.so", mydisc2.getContainers().get(0).getPreLoad()); assertEquals("lib/blablamalloc.so", mydisc2.getContainers().get(1).getPreLoad()); - assertEquals(45, mydisc2.getMemoryPercentage().get().percentage()); + assertEquals(45, mydisc2.getMemoryPercentage().get().ofContainerAvailable()); assertEquals(Optional.of("-XX:+UseParNewGC"), mydisc2.getJvmGCOptions()); QrStartConfig.Builder qrStartBuilder = new QrStartConfig.Builder(); mydisc2.getConfig(qrStartBuilder); @@ -234,8 +234,8 @@ public class ModelProvisioningTest { assertEquals(2, model.getContentClusters().get("content1").getRootGroup().getNodes().size(), "Nodes in content1"); assertEquals(1, model.getContainerClusters().get("container1").getContainers().size(), "Nodes in container1"); assertEquals(2, model.getContentClusters().get("content").getRootGroup().getNodes().size(), "Nodes in cluster without ID"); - assertEquals(65, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size for container1"); - assertEquals(84, physicalMemoryPercentage(model.getContainerClusters().get("container2")), "Heap size for container2"); + assertEquals(85, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size for container1"); + assertEquals(85, physicalMemoryPercentage(model.getContainerClusters().get("container2")), "Heap size for container2"); assertProvisioned(2, ClusterSpec.Id.from("content1"), ClusterSpec.Type.content, model); assertProvisioned(1, ClusterSpec.Id.from("container1"), ClusterSpec.Type.container, model); assertProvisioned(2, ClusterSpec.Id.from("content"), ClusterSpec.Type.content, model); @@ -287,8 +287,8 @@ public class ModelProvisioningTest { VespaModel model = tester.createModel(xmlWithNodes, true, deployStateWithClusterEndpoints("container1").deployLogger(logger)); assertEquals(2, model.getContentClusters().get("content1").getRootGroup().getNodes().size(), "Nodes in content1"); assertEquals(2, model.getContainerClusters().get("container1").getContainers().size(), "Nodes in container1"); - assertEquals(18, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size is lowered with combined clusters"); - assertEquals(2025077080L, protonMemorySize(model.getContentClusters().get("content1")), "Memory for proton is lowered to account for the jvm heap"); + assertEquals(24, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size is lowered with combined clusters"); + assertEquals(1876900708, protonMemorySize(model.getContentClusters().get("content1")), "Memory for proton is lowered to account for the jvm heap"); assertProvisioned(0, ClusterSpec.Id.from("container1"), ClusterSpec.Type.container, model); assertProvisioned(2, ClusterSpec.Id.from("content1"), ClusterSpec.Id.from("container1"), ClusterSpec.Type.combined, model); var msgs = logger.msgs().stream().filter(m -> m.level().equals(Level.WARNING)).toList(); @@ -356,7 +356,7 @@ public class ModelProvisioningTest { VespaModel model = tester.createModel(xmlWithNodes, true, deployStateWithClusterEndpoints("container1")); assertEquals(2, model.getContentClusters().get("content1").getRootGroup().getNodes().size(), "Nodes in content1"); assertEquals(2, model.getContainerClusters().get("container1").getContainers().size(), "Nodes in container1"); - assertEquals(65, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size is normal"); + assertEquals(85, physicalMemoryPercentage(model.getContainerClusters().get("container1")), "Heap size is normal"); assertEquals((long) ((3 - memoryOverheadGb) * (Math.pow(1024, 3))), protonMemorySize(model.getContentClusters().get("content1")), "Memory for proton is normal"); } diff --git a/config-model/src/test/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidatorTest.java b/config-model/src/test/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidatorTest.java index 45125c8eb68..340968f89d1 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidatorTest.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/application/validation/JvmHeapSizeValidatorTest.java @@ -45,10 +45,10 @@ class JvmHeapSizeValidatorTest { @Test void fails_on_too_low_heap_size() throws IOException, SAXException { - var deployState = createDeployState(2.2, 1024L * 1024 * 1024); + var deployState = createDeployState(2.3, 1024L * 1024 * 1024); var model = new VespaModel(new NullConfigModelRegistry(), deployState); ValidationTester.expect(new JvmHeapSizeValidator(), model, deployState, - "Allocated memory to JVM in cluster 'container' is too low (0.50GB < 0.60GB). Estimated cost of ONNX models is 1.00GB."); + "Allocated memory to JVM in cluster 'container' is too low (0.51GB < 0.60GB). Estimated cost of ONNX models is 1.00GB."); } @Test -- cgit v1.2.3