aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/main
diff options
context:
space:
mode:
author: Martin Polden <mpolden@mpolden.no>, 2023-12-14 14:14:16 +0100
committer: Martin Polden <mpolden@mpolden.no>, 2023-12-20 09:52:49 +0100
commit: a3c938f789d8b2d8474708eff091174b5f210672 (patch)
tree: 8f0b6c390f8268a8bab271703ff0448a53bfecd1 /node-repository/src/main
parent: 4e36216e36a0e57fe8da52840e4e940927edeb2c (diff)
Store GPU metrics and load separately
Diffstat (limited to 'node-repository/src/main')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java | 12
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java | 3
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java | 63
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java | 45
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java | 13
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java | 8
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java | 5
7 files changed, 96 insertions, 53 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 1e4a11fdea2..986ab830283 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -233,11 +233,13 @@ public class ClusterModel {
double queryCpu = queryCpuPerGroup * groupCount() / groups;
double writeCpu = (double)groupSize() / groupSize;
return new Load(cpu.queryFraction() * queryCpu + (1 - cpu.queryFraction()) * writeCpu,
- (1 - memory.fixedFraction()) * (double)groupSize() / groupSize + memory.fixedFraction() * 1,
- (double)groupSize() / groupSize);
+ (1 - memory.fixedFraction()) * (double) groupSize() / groupSize + memory.fixedFraction() * 1,
+ (double)groupSize() / groupSize,
+ 1,
+ 1);
}
else {
- return new Load((double)nodeCount() / nodes, 1, 1);
+ return new Load((double) nodeCount() / nodes, 1, 1, 1, 1);
}
}
@@ -246,7 +248,7 @@ public class ClusterModel {
* if one of the nodes go down.
*/
public Load idealLoad() {
- var ideal = new Load(cpu.idealLoad(), memory.idealLoad(), disk.idealLoad()).divide(redundancyAdjustment());
+ var ideal = new Load(cpu.idealLoad(), memory.idealLoad(), disk.idealLoad(), cpu.idealLoad(), memory.idealLoad()).divide(redundancyAdjustment());
if ( !cluster.bcpGroupInfo().isEmpty() && cluster.bcpGroupInfo().queryRate() > 0) {
// Since we have little local information, use information about query cost in other groups
Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal);
@@ -392,7 +394,7 @@ public class ClusterModel {
if (averageQueryRate().isEmpty() || averageQueryRate().getAsDouble() == 0.0) return OptionalDouble.empty();
// TODO: Query rate should generally be sampled at the time where we see the peak resource usage
int fanOut = clusterSpec.type().isContainer() ? 1 : groupSize();
- return OptionalDouble.of(peakLoad().cpu() * cpu.queryFraction() * fanOut * nodes.not().retired().first().get().resources().vcpu()
+ return OptionalDouble.of(peakLoad().cpu() * cpu.queryFraction() * fanOut * nodes.not().retired().first().get().resources().vcpu()
/ averageQueryRate().getAsDouble() / groupCount());
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
index e1ef21ebd13..6978e269c3d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
@@ -67,7 +67,8 @@ public class ClusterNodesTimeseries {
* the average of the highest reading for that dimension on each node.
*/
public Load peakLoad() {
- return new Load(peakLoad(Load.Dimension.cpu), peakLoad(Load.Dimension.memory), peakLoad(Load.Dimension.disk));
+ return new Load(peakLoad(Load.Dimension.cpu), peakLoad(Load.Dimension.memory), peakLoad(Load.Dimension.disk),
+ peakLoad(Load.Dimension.gpu), peakLoad(Load.Dimension.gpuMemory));
}
private double peakLoad(Load.Dimension dimension) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
index 799ed621807..22c13795d18 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
@@ -3,9 +3,7 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.NodeResources;
-import java.util.Objects;
import java.util.function.DoubleBinaryOperator;
-import java.util.function.DoubleFunction;
import java.util.function.DoubleUnaryOperator;
import java.util.function.Predicate;
@@ -14,32 +12,36 @@ import java.util.function.Predicate;
*
* @author bratseth
*/
-public class Load {
+public record Load(double cpu, double memory, double disk, double gpu, double gpuMemory) {
- public enum Dimension { cpu, memory, disk }
+ public enum Dimension { cpu, memory, disk, gpu, gpuMemory }
- private final double cpu, memory, disk;
-
- public Load(double cpu, double memory, double disk) {
+ public Load(double cpu, double memory, double disk, double gpu, double gpuMemory) {
this.cpu = requireNormalized(cpu, "cpu");
this.memory = requireNormalized(memory, "memory");
this.disk = requireNormalized(disk, "disk");
+ this.gpu = requireNormalized(gpu, "gpu");
+ this.gpuMemory = requireNormalized(gpuMemory, "gpuMemory");
}
public double cpu() { return cpu; }
public double memory() { return memory; }
public double disk() { return disk; }
+ public double gpu() { return gpu; }
+ public double gpuMemory() { return gpuMemory; }
- public Load withCpu(double cpu) { return new Load(cpu, memory, disk); }
- public Load withMemory(double memory) { return new Load(cpu, memory, disk); }
- public Load withDisk(double disk) { return new Load(cpu, memory, disk); }
+ public Load withCpu(double cpu) { return new Load(cpu, memory, disk, gpu, gpuMemory); }
+ public Load withMemory(double memory) { return new Load(cpu, memory, disk, gpu, gpuMemory); }
+ public Load withDisk(double disk) { return new Load(cpu, memory, disk, gpu, gpuMemory); }
+ public Load withGpu(double gpu) { return new Load(cpu, memory, disk, gpu, gpuMemory); }
+ public Load withGpuMemory(double gpuMemory) { return new Load(cpu, memory, disk, gpu, gpuMemory); }
public Load add(Load other) {
return join(other, (a, b) -> a + b);
}
public Load multiply(NodeResources resources) {
- return new Load(cpu * resources.vcpu(), memory * resources.memoryGb(), disk * resources.diskGb());
+ // Scale each dimension by its own resource: gpu by the GPU count, gpuMemory by the GPU memory (mirrors divide(NodeResources))
+ return new Load(cpu * resources.vcpu(), memory * resources.memoryGb(), disk * resources.diskGb(), gpu * resources.gpuResources().count(), gpuMemory * resources.gpuResources().memoryGb());
}
public Load multiply(double factor) {
return map(v -> v * factor);
@@ -55,21 +57,25 @@ public class Load {
return map(v -> divide(v, divisor));
}
public Load divide(NodeResources resources) {
- return new Load(divide(cpu, resources.vcpu()), divide(memory, resources.memoryGb()), divide(disk, resources.diskGb()));
+ return new Load(divide(cpu, resources.vcpu()), divide(memory, resources.memoryGb()), divide(disk, resources.diskGb()), divide(gpu, resources.gpuResources().count()), divide(gpuMemory, resources.gpuResources().memoryGb()));
}
/** Returns the load where the given function is applied to each dimension of this. */
public Load map(DoubleUnaryOperator f) {
return new Load(f.applyAsDouble(cpu),
f.applyAsDouble(memory),
- f.applyAsDouble(disk));
+ f.applyAsDouble(disk),
+ f.applyAsDouble(gpu),
+ f.applyAsDouble(gpuMemory));
}
/** Returns the load where the given function is applied to each dimension of this and the given load. */
public Load join(Load other, DoubleBinaryOperator f) {
return new Load(f.applyAsDouble(this.cpu(), other.cpu()),
f.applyAsDouble(this.memory(), other.memory()),
- f.applyAsDouble(this.disk(), other.disk()));
+ f.applyAsDouble(this.disk(), other.disk()),
+ f.applyAsDouble(this.gpu(), other.gpu()),
+ f.applyAsDouble(this.gpuMemory(), other.gpuMemory()));
}
/** Returns true if any dimension matches the predicate. */
@@ -88,6 +94,8 @@ public class Load {
case cpu -> cpu();
case memory -> memory();
case disk -> disk();
+ case gpu -> gpu();
+ case gpuMemory -> gpuMemory();
};
}
@@ -95,7 +103,7 @@ public class Load {
if (Double.isNaN(value))
throw new IllegalArgumentException(name + " must be a number but is NaN");
if (value < 0)
- throw new IllegalArgumentException(name + " must be zero or lager, but is " + value);
+ throw new IllegalArgumentException(name + " must be zero or larger, but is " + value);
return value;
}
@@ -105,28 +113,19 @@ public class Load {
}
@Override
- public boolean equals(Object o) {
- if (o == this) return true;
- if ( ! (o instanceof Load other)) return false;
- if (other.cpu() != this.cpu()) return false;
- if (other.memory() != this.memory()) return false;
- if (other.disk() != this.disk()) return false;
- return true;
- }
-
- @Override
- public int hashCode() { return Objects.hash(cpu, memory, disk); }
-
- @Override
public String toString() {
- return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk";
+ // Every dimension is separated by ", " so the gpu values do not run into the preceding comma
+ return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk, " + gpu + " gpu, " + gpuMemory + " gpuMemory";
}
- public static Load zero() { return new Load(0, 0, 0); }
- public static Load one() { return new Load(1, 1, 1); }
+ public static Load zero() { return new Load(0, 0, 0, 0, 0); }
+ public static Load one() { return new Load(1, 1, 1, 1, 1); }
public static Load byDividing(NodeResources a, NodeResources b) {
- return new Load(divide(a.vcpu(), b.vcpu()), divide(a.memoryGb(), b.memoryGb()), divide(a.diskGb(), b.diskGb()));
+ return new Load(divide(a.vcpu(), b.vcpu()),
+ divide(a.memoryGb(), b.memoryGb()),
+ divide(a.diskGb(), b.diskGb()),
+ divide(a.gpuResources().count(), b.gpuResources().count()),
+ divide(a.gpuResources().memoryGb(), b.gpuResources().memoryGb()));
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index a6882e49efa..f35879d0b24 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -76,8 +76,10 @@ public class MetricsResponse {
nodeMetrics.add(new Pair<>(hostname, new NodeMetricSnapshot(at,
new Load(Metric.cpu.from(nodeValues),
Metric.memory.from(nodeValues),
- Metric.disk.from(nodeValues)),
- (long)Metric.generation.from(nodeValues),
+ Metric.disk.from(nodeValues),
+ Metric.gpu.from(nodeValues),
+ Metric.gpuMemory.from(nodeValues)),
+ (long) Metric.generation.from(nodeValues),
Metric.inService.from(nodeValues) > 0,
clusterIsStable(node.get(), applicationNodes, nodeValues),
Metric.queryRate.from(nodeValues))));
@@ -126,6 +128,7 @@ public class MetricsResponse {
@Override
public List<String> metricResponseNames() {
+ // TODO(mpolden): Track only CPU util once we support proper GPU scaling
return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName(), HostedNodeAdminMetrics.GPU_UTIL.baseName());
}
@@ -139,6 +142,7 @@ public class MetricsResponse {
@Override
public List<String> metricResponseNames() {
+ // TODO(mpolden): Track only CPU memory once we support proper GPU scaling
return List.of(HostedNodeAdminMetrics.MEM_UTIL.baseName(),
SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average(),
HostedNodeAdminMetrics.GPU_MEM_USED.baseName(),
@@ -147,7 +151,7 @@ public class MetricsResponse {
@Override
double computeFinal(ListMap<String, Double> values) {
- return Math.max(gpuMemUtil(values), cpuMemUtil(values));
+ return Math.max(cpuMemUtil(values), gpuMemory.computeFinal(values));
}
private double cpuMemUtil(ListMap<String, Double> values) {
@@ -160,12 +164,6 @@ public class MetricsResponse {
return 0;
}
- private double gpuMemUtil(ListMap<String, Double> values) {
- var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum();
- var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum();
- return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0;
- }
-
},
disk { // a node resource
@@ -187,6 +185,35 @@ public class MetricsResponse {
}
},
+ gpu { // a node resource
+
+ @Override
+ public List<String> metricResponseNames() {
+ return List.of(HostedNodeAdminMetrics.GPU_UTIL.baseName());
+ }
+
+ @Override
+ double computeFinal(ListMap<String, Double> values) {
+ return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).max().orElse(0) / 100; // % to ratio
+ }
+
+ },
+ gpuMemory { // a node resource
+
+ @Override
+ public List<String> metricResponseNames() {
+ return List.of(HostedNodeAdminMetrics.GPU_MEM_USED.baseName(),
+ HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName());
+ }
+
+ @Override
+ double computeFinal(ListMap<String, Double> values) {
+ var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum();
+ var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum();
+ return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0;
+ }
+
+ },
generation { // application config generation active on the node
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
index 38127fa3093..c0de9a43f7f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
@@ -144,6 +144,8 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
row.putBool(6, snapshot.getSecond().inService());
row.putBool(7, snapshot.getSecond().stable());
row.putFloat(8, (float) snapshot.getSecond().queryRate());
+ row.putFloat(9, (float) snapshot.getSecond().load().gpu());
+ row.putFloat(10, (float) snapshot.getSecond().load().gpuMemory());
row.append();
}
writer.commit();
@@ -243,6 +245,9 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
private void ensureNodeTableIsUpdated() {
try {
// Example: nodeTable.ensureColumnExists("write_rate", "float");
+ // TODO(mpolden): Remove after January 2024
+ nodeTable.ensureColumnExists("gpu_util", "float");
+ nodeTable.ensureColumnExists("gpu_mem_total_util", "float");
} catch (Exception e) {
nodeTable.repair(e);
}
@@ -262,7 +267,9 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
try {
issue("create table " + nodeTable.name +
" (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," +
- " application_generation long, inService boolean, stable boolean, queries_rate float)" +
+ " application_generation long, inService boolean, stable boolean, queries_rate float," +
+ " gpu_util float, gpu_mem_total_util float" +
+ " )" +
" timestamp(at)" +
"PARTITION BY DAY;",
newContext());
@@ -311,7 +318,9 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
new NodeMetricSnapshot(Instant.ofEpochMilli(record.getTimestamp(1) / 1000),
new Load(record.getFloat(2),
record.getFloat(3),
- record.getFloat(4)),
+ record.getFloat(4),
+ record.getFloat(9),
+ record.getFloat(10)),
record.getLong(5),
record.getBool(6),
record.getBool(7),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
index c4e7d3b9acc..6f325700401 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
@@ -76,6 +76,8 @@ public class ApplicationSerializer {
private static final String cpuKey = "cpu";
private static final String memoryKey = "memory";
private static final String diskKey = "disk";
+ private static final String gpuKey = "gpu";
+ private static final String gpuMemoryKey = "gpuMemory";
private static final String fromKey = "from";
private static final String toKey = "to";
private static final String generationKey = "generation";
@@ -201,12 +203,16 @@ public class ApplicationSerializer {
loadObject.setDouble(cpuKey, load.cpu());
loadObject.setDouble(memoryKey, load.memory());
loadObject.setDouble(diskKey, load.disk());
+ loadObject.setDouble(gpuKey, load.gpu());
+ loadObject.setDouble(gpuMemoryKey, load.gpuMemory());
}
private static Load loadFromSlime(Inspector loadObject) {
return new Load(loadObject.field(cpuKey).asDouble(),
loadObject.field(memoryKey).asDouble(),
- loadObject.field(diskKey).asDouble());
+ loadObject.field(diskKey).asDouble(),
+ loadObject.field(gpuKey).asDouble(),
+ loadObject.field(gpuMemoryKey).asDouble());
}
private static void toSlime(Autoscaling.Metrics metrics, Cursor metricsObject) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
index fe6b204ed31..d3b88997059 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
@@ -40,7 +40,6 @@ import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling;
import com.yahoo.vespa.hosted.provision.autoscale.Load;
import com.yahoo.vespa.hosted.provision.autoscale.MemoryMetricsDb;
-import com.yahoo.vespa.hosted.provision.lb.LoadBalancerService;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.IP;
import com.yahoo.vespa.hosted.provision.node.Status;
@@ -239,8 +238,8 @@ public class MockNodeRepository extends NodeRepository {
Optional.of(new ClusterResources(4, 1,
new NodeResources(3, 16, 100, 1))),
clock().instant(),
- new Load(0.1, 0.2, 0.3),
- new Load(0.4, 0.5, 0.6),
+ new Load(0.1, 0.2, 0.3, 0, 0),
+ new Load(0.4, 0.5, 0.6, 0, 0),
new Autoscaling.Metrics(0.7, 0.8, 0.9)));
try (Mutex lock = applications().lock(app1Id)) {
applications().put(app1.with(cluster1), lock);