summary | refs | log | tree | commit | diff | stats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java | 45
1 files changed, 9 insertions, 36 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index f35879d0b24..a6882e49efa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -76,10 +76,8 @@ public class MetricsResponse {
nodeMetrics.add(new Pair<>(hostname, new NodeMetricSnapshot(at,
new Load(Metric.cpu.from(nodeValues),
Metric.memory.from(nodeValues),
- Metric.disk.from(nodeValues),
- Metric.gpu.from(nodeValues),
- Metric.gpuMemory.from(nodeValues)),
- (long) Metric.generation.from(nodeValues),
+ Metric.disk.from(nodeValues)),
+ (long)Metric.generation.from(nodeValues),
Metric.inService.from(nodeValues) > 0,
clusterIsStable(node.get(), applicationNodes, nodeValues),
Metric.queryRate.from(nodeValues))));
@@ -128,7 +126,6 @@ public class MetricsResponse {
@Override
public List<String> metricResponseNames() {
- // TODO(mpolden): Track only CPU util once we support proper GPU scaling
return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName(), HostedNodeAdminMetrics.GPU_UTIL.baseName());
}
@@ -142,7 +139,6 @@ public class MetricsResponse {
@Override
public List<String> metricResponseNames() {
- // TODO(mpolden): Track only CPU memory once we support proper GPU scaling
return List.of(HostedNodeAdminMetrics.MEM_UTIL.baseName(),
SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average(),
HostedNodeAdminMetrics.GPU_MEM_USED.baseName(),
@@ -151,7 +147,7 @@ public class MetricsResponse {
@Override
double computeFinal(ListMap<String, Double> values) {
- return Math.max(cpuMemUtil(values), gpuMemory.computeFinal(values));
+ return Math.max(gpuMemUtil(values), cpuMemUtil(values));
}
private double cpuMemUtil(ListMap<String, Double> values) {
@@ -164,6 +160,12 @@ public class MetricsResponse {
return 0;
}
+ private double gpuMemUtil(ListMap<String, Double> values) {
+ var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum();
+ var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum();
+ return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0;
+ }
+
},
disk { // a node resource
@@ -185,35 +187,6 @@ public class MetricsResponse {
}
},
- gpu { // a node resource
-
- @Override
- public List<String> metricResponseNames() {
- return List.of(HostedNodeAdminMetrics.GPU_UTIL.baseName());
- }
-
- @Override
- double computeFinal(ListMap<String, Double> values) {
- return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).max().orElse(0) / 100; // % to ratio
- }
-
- },
- gpuMemory { // a node resource
-
- @Override
- public List<String> metricResponseNames() {
- return List.of(HostedNodeAdminMetrics.GPU_MEM_USED.baseName(),
- HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName());
- }
-
- @Override
- double computeFinal(ListMap<String, Double> values) {
- var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum();
- var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum();
- return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0;
- }
-
- },
generation { // application config generation active on the node
@Override