diff options
author | Jon Bratseth <bratseth@gmail.com> | 2023-03-13 15:16:10 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2023-03-13 15:16:10 +0100 |
commit | 756a5944e6455f9c0379f37a53e18221ac9f0343 (patch) | |
tree | 1f813f52cf396b43e48ccae43171c68bd74dc925 /node-repository | |
parent | 3afdc7b7bb0f4535c80c7b9ea802bd4d10ac5774 (diff) |
Autoscale by gpu signals too
Just take max of cpu and gpu load.
Diffstat (limited to 'node-repository')
2 files changed, 65 insertions, 4 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java index 8b7a2bafc40..fa9cfff0d68 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java @@ -120,11 +120,13 @@ public class MetricsResponse { cpu { // a node resource @Override - public List<String> metricResponseNames() { return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName()); } + public List<String> metricResponseNames() { + return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName(), HostedNodeAdminMetrics.GPU_UTIL.baseName()); + } @Override double computeFinal(ListMap<String, Double> values) { - return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).average().orElse(0) / 100; // % to ratio + return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).max().orElse(0) / 100; // % to ratio } }, @@ -133,11 +135,17 @@ public class MetricsResponse { @Override public List<String> metricResponseNames() { return List.of(HostedNodeAdminMetrics.MEM_UTIL.baseName(), - SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average()); + SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average(), + HostedNodeAdminMetrics.GPU_MEM_USED.baseName(), + HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()); } @Override double computeFinal(ListMap<String, Double> values) { + return Math.max(gpuMemUtil(values), cpuMemUtil(values)); + } + + private double cpuMemUtil(ListMap<String, Double> values) { var valueList = values.get(SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average()); // prefer over mem.util if ( ! valueList.isEmpty()) return valueList.get(0); @@ -147,6 +155,12 @@ public class MetricsResponse { return 0; } + private double gpuMemUtil(ListMap<String, Double> values) { + var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum(); + var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum(); + return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0; + } + }, disk { // a node resource diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java index 24697d02681..01a4e96a195 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java @@ -80,7 +80,7 @@ public class MetricsV2MetricsFetcherTest { assertTrue(values.get(0).getSecond().stable()); } - { + { // read response 2 when unstable httpClient.cannedResponse = cannedResponseForApplication2; try (Mutex lock = tester.nodeRepository().applications().lock(application1)) { tester.nodeRepository().nodes().write(tester.nodeRepository().nodes().list(Node.State.active).owner(application2) @@ -89,6 +89,18 @@ public class MetricsV2MetricsFetcherTest { List<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().nodeMetrics()); assertFalse(values.get(0).getSecond().stable()); } + + { + httpClient.cannedResponse = cannedResponseForApplication3; + List<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().nodeMetrics()); + assertEquals("http://host-3.yahoo.com:4080/metrics/v2/values?consumer=autoscaling", + httpClient.requestsReceived.get(1)); + assertEquals(1, values.size()); + assertEquals("host-3.yahoo.com", values.get(0).getFirst()); + assertEquals(0.13, values.get(0).getSecond().load().cpu(), delta); + assertEquals(0.9375, values.get(0).getSecond().load().memory(), delta); + } + } private static class MockHttpClient implements MetricsV2MetricsFetcher.AsyncHttpClient { @@ -208,7 +220,42 @@ public class MetricsV2MetricsFetcherTest { { "values": { "cpu.util": 10, + "gpu.util": 8, + "mem.util": 15, + "gpu.memory.used": 0, + "gpu.memory.total": 8, + "disk.util": 20, + "application_generation.last": 3, + "in_service.last": 0 + }, + "dimensions": { + "state": "active" + } + } + ] + } + } + ] + } + """; + + final String cannedResponseForApplication3 = + """ + { + "nodes": [ + { + "hostname": "host-3.yahoo.com", + "role": "role0", + "node": { + "timestamp": 1300, + "metrics": [ + { + "values": { + "cpu.util": 10, + "gpu.util": 13, "mem.util": 15, + "gpu.memory.used": 7.5, + "gpu.memory.total": 8, "disk.util": 20, "application_generation.last": 3, "in_service.last": 0 |