diff options
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java | 45 |
1 files changed, 9 insertions, 36 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java index f35879d0b24..a6882e49efa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java @@ -76,10 +76,8 @@ public class MetricsResponse { nodeMetrics.add(new Pair<>(hostname, new NodeMetricSnapshot(at, new Load(Metric.cpu.from(nodeValues), Metric.memory.from(nodeValues), - Metric.disk.from(nodeValues), - Metric.gpu.from(nodeValues), - Metric.gpuMemory.from(nodeValues)), - (long) Metric.generation.from(nodeValues), + Metric.disk.from(nodeValues)), + (long)Metric.generation.from(nodeValues), Metric.inService.from(nodeValues) > 0, clusterIsStable(node.get(), applicationNodes, nodeValues), Metric.queryRate.from(nodeValues)))); @@ -128,7 +126,6 @@ public class MetricsResponse { @Override public List<String> metricResponseNames() { - // TODO(mpolden): Track only CPU util once we support proper GPU scaling return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName(), HostedNodeAdminMetrics.GPU_UTIL.baseName()); } @@ -142,7 +139,6 @@ public class MetricsResponse { @Override public List<String> metricResponseNames() { - // TODO(mpolden): Track only CPU memory once we support proper GPU scaling return List.of(HostedNodeAdminMetrics.MEM_UTIL.baseName(), SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average(), HostedNodeAdminMetrics.GPU_MEM_USED.baseName(), @@ -151,7 +147,7 @@ public class MetricsResponse { @Override double computeFinal(ListMap<String, Double> values) { - return Math.max(cpuMemUtil(values), gpuMemory.computeFinal(values)); + return Math.max(gpuMemUtil(values), cpuMemUtil(values)); } private double cpuMemUtil(ListMap<String, Double> values) { @@ -164,6 +160,12 @@ public class MetricsResponse { return 0; } + private double gpuMemUtil(ListMap<String, Double> values) { + var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum(); + var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum(); + return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0; + } + }, disk { // a node resource @@ -185,35 +187,6 @@ public class MetricsResponse { } }, - gpu { // a node resource - - @Override - public List<String> metricResponseNames() { - return List.of(HostedNodeAdminMetrics.GPU_UTIL.baseName()); - } - - @Override - double computeFinal(ListMap<String, Double> values) { - return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).max().orElse(0) / 100; // % to ratio - } - - }, - gpuMemory { // a node resource - - @Override - public List<String> metricResponseNames() { - return List.of(HostedNodeAdminMetrics.GPU_MEM_USED.baseName(), - HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()); - } - - @Override - double computeFinal(ListMap<String, Double> values) { - var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum(); - var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum(); - return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0; - } - - }, generation { // application config generation active on the node @Override |