summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2023-03-13 15:16:10 +0100
committer: Jon Bratseth <bratseth@gmail.com> 2023-03-13 15:16:10 +0100
commit: 756a5944e6455f9c0379f37a53e18221ac9f0343 (patch)
tree: 1f813f52cf396b43e48ccae43171c68bd74dc925 /node-repository
parent: 3afdc7b7bb0f4535c80c7b9ea802bd4d10ac5774 (diff)
Autoscale by gpu signals too
Just take max of cpu and gpu load.
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java | 20
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java | 49
2 files changed, 65 insertions, 4 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index 8b7a2bafc40..fa9cfff0d68 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -120,11 +120,13 @@ public class MetricsResponse {
cpu { // a node resource
@Override
- public List<String> metricResponseNames() { return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName()); }
+ public List<String> metricResponseNames() {
+ return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName(), HostedNodeAdminMetrics.GPU_UTIL.baseName());
+ }
@Override
double computeFinal(ListMap<String, Double> values) {
- return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).average().orElse(0) / 100; // % to ratio
+ return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).max().orElse(0) / 100; // % to ratio
}
},
@@ -133,11 +135,17 @@ public class MetricsResponse {
@Override
public List<String> metricResponseNames() {
return List.of(HostedNodeAdminMetrics.MEM_UTIL.baseName(),
- SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average());
+ SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average(),
+ HostedNodeAdminMetrics.GPU_MEM_USED.baseName(),
+ HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName());
}
@Override
double computeFinal(ListMap<String, Double> values) {
+ return Math.max(gpuMemUtil(values), cpuMemUtil(values));
+ }
+
+ private double cpuMemUtil(ListMap<String, Double> values) {
var valueList = values.get(SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average()); // prefer over mem.util
if ( ! valueList.isEmpty()) return valueList.get(0);
@@ -147,6 +155,12 @@ public class MetricsResponse {
return 0;
}
+ private double gpuMemUtil(ListMap<String, Double> values) {
+ var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum();
+ var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum();
+ return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0;
+ }
+
},
disk { // a node resource
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
index 24697d02681..01a4e96a195 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
@@ -80,7 +80,7 @@ public class MetricsV2MetricsFetcherTest {
assertTrue(values.get(0).getSecond().stable());
}
- {
+ { // read response 2 when unstable
httpClient.cannedResponse = cannedResponseForApplication2;
try (Mutex lock = tester.nodeRepository().applications().lock(application1)) {
tester.nodeRepository().nodes().write(tester.nodeRepository().nodes().list(Node.State.active).owner(application2)
@@ -89,6 +89,18 @@ public class MetricsV2MetricsFetcherTest {
List<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().nodeMetrics());
assertFalse(values.get(0).getSecond().stable());
}
+
+ {
+ httpClient.cannedResponse = cannedResponseForApplication3;
+ List<Pair<String, NodeMetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2).get().nodeMetrics());
+ assertEquals("http://host-3.yahoo.com:4080/metrics/v2/values?consumer=autoscaling",
+ httpClient.requestsReceived.get(1));
+ assertEquals(1, values.size());
+ assertEquals("host-3.yahoo.com", values.get(0).getFirst());
+ assertEquals(0.13, values.get(0).getSecond().load().cpu(), delta);
+ assertEquals(0.9375, values.get(0).getSecond().load().memory(), delta);
+ }
+
}
private static class MockHttpClient implements MetricsV2MetricsFetcher.AsyncHttpClient {
@@ -208,7 +220,42 @@ public class MetricsV2MetricsFetcherTest {
{
"values": {
"cpu.util": 10,
+ "gpu.util": 8,
+ "mem.util": 15,
+ "gpu.memory.used": 0,
+ "gpu.memory.total": 8,
+ "disk.util": 20,
+ "application_generation.last": 3,
+ "in_service.last": 0
+ },
+ "dimensions": {
+ "state": "active"
+ }
+ }
+ ]
+ }
+ }
+ ]
+ }
+ """;
+
+ final String cannedResponseForApplication3 =
+ """
+ {
+ "nodes": [
+ {
+ "hostname": "host-3.yahoo.com",
+ "role": "role0",
+ "node": {
+ "timestamp": 1300,
+ "metrics": [
+ {
+ "values": {
+ "cpu.util": 10,
+ "gpu.util": 13,
"mem.util": 15,
+ "gpu.memory.used": 7.5,
+ "gpu.memory.total": 8,
"disk.util": 20,
"application_generation.last": 3,
"in_service.last": 0