summary | refs | log | tree | commit | diff | stats
path: root/node-repository/src/main
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2023-03-13 15:16:10 +0100
committer: Jon Bratseth <bratseth@gmail.com> 2023-03-13 15:16:10 +0100
commit: 756a5944e6455f9c0379f37a53e18221ac9f0343 (patch)
tree: 1f813f52cf396b43e48ccae43171c68bd74dc925 /node-repository/src/main
parent: 3afdc7b7bb0f4535c80c7b9ea802bd4d10ac5774 (diff)
Autoscale by gpu signals too
Just take max of cpu and gpu load.
Diffstat (limited to 'node-repository/src/main')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java | 20
1 files changed, 17 insertions, 3 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index 8b7a2bafc40..fa9cfff0d68 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -120,11 +120,13 @@ public class MetricsResponse {
cpu { // a node resource
@Override
- public List<String> metricResponseNames() { return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName()); }
+ public List<String> metricResponseNames() {
+ return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName(), HostedNodeAdminMetrics.GPU_UTIL.baseName());
+ }
@Override
double computeFinal(ListMap<String, Double> values) {
- return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).average().orElse(0) / 100; // % to ratio
+ return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).max().orElse(0) / 100; // % to ratio
}
},
@@ -133,11 +135,17 @@ public class MetricsResponse {
@Override
public List<String> metricResponseNames() {
return List.of(HostedNodeAdminMetrics.MEM_UTIL.baseName(),
- SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average());
+ SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average(),
+ HostedNodeAdminMetrics.GPU_MEM_USED.baseName(),
+ HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName());
}
@Override
double computeFinal(ListMap<String, Double> values) {
+ return Math.max(gpuMemUtil(values), cpuMemUtil(values));
+ }
+
+ private double cpuMemUtil(ListMap<String, Double> values) {
var valueList = values.get(SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average()); // prefer over mem.util
if ( ! valueList.isEmpty()) return valueList.get(0);
@@ -147,6 +155,12 @@ public class MetricsResponse {
return 0;
}
+ private double gpuMemUtil(ListMap<String, Double> values) {
+ var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum();
+ var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum();
+ return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0;
+ }
+
},
disk { // a node resource