diff options
Diffstat (limited to 'node-repository/src/main/java/com')
7 files changed, 96 insertions, 53 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 1e4a11fdea2..986ab830283 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -233,11 +233,13 @@ public class ClusterModel { double queryCpu = queryCpuPerGroup * groupCount() / groups; double writeCpu = (double)groupSize() / groupSize; return new Load(cpu.queryFraction() * queryCpu + (1 - cpu.queryFraction()) * writeCpu, - (1 - memory.fixedFraction()) * (double)groupSize() / groupSize + memory.fixedFraction() * 1, - (double)groupSize() / groupSize); + (1 - memory.fixedFraction()) * (double) groupSize() / groupSize + memory.fixedFraction() * 1, + (double)groupSize() / groupSize, + 1, + 1); } else { - return new Load((double)nodeCount() / nodes, 1, 1); + return new Load((double) nodeCount() / nodes, 1, 1, 1, 1); } } @@ -246,7 +248,7 @@ public class ClusterModel { * if one of the nodes go down. */ public Load idealLoad() { - var ideal = new Load(cpu.idealLoad(), memory.idealLoad(), disk.idealLoad()).divide(redundancyAdjustment()); + var ideal = new Load(cpu.idealLoad(), memory.idealLoad(), disk.idealLoad(), cpu.idealLoad(), memory.idealLoad()).divide(redundancyAdjustment()); if ( !cluster.bcpGroupInfo().isEmpty() && cluster.bcpGroupInfo().queryRate() > 0) { // Since we have little local information, use information about query cost in other groups Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal); @@ -392,7 +394,7 @@ public class ClusterModel { if (averageQueryRate().isEmpty() || averageQueryRate().getAsDouble() == 0.0) return OptionalDouble.empty(); // TODO: Query rate should generally be sampled at the time where we see the peak resource usage int fanOut = clusterSpec.type().isContainer() ? 1 : groupSize(); - return OptionalDouble.of(peakLoad().cpu() * cpu.queryFraction() * fanOut * nodes.not().retired().first().get().resources().vcpu() + return OptionalDouble.of(peakLoad().cpu() * cpu.queryFraction() * fanOut * nodes.not().retired().first().get().resources().vcpu() / averageQueryRate().getAsDouble() / groupCount()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java index e1ef21ebd13..6978e269c3d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -67,7 +67,8 @@ public class ClusterNodesTimeseries { * the average of the highest reading for that dimension on each node. */ public Load peakLoad() { - return new Load(peakLoad(Load.Dimension.cpu), peakLoad(Load.Dimension.memory), peakLoad(Load.Dimension.disk)); + return new Load(peakLoad(Load.Dimension.cpu), peakLoad(Load.Dimension.memory), peakLoad(Load.Dimension.disk), + peakLoad(Load.Dimension.gpu), peakLoad(Load.Dimension.gpuMemory)); } private double peakLoad(Load.Dimension dimension) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java index 799ed621807..22c13795d18 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java @@ -3,9 +3,7 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.NodeResources; -import java.util.Objects; import java.util.function.DoubleBinaryOperator; -import java.util.function.DoubleFunction; import java.util.function.DoubleUnaryOperator; import java.util.function.Predicate; @@ -14,32 +12,36 @@ import java.util.function.Predicate; * * @author bratseth */ -public class Load { +public record Load(double cpu, double memory, double disk, double gpu, double gpuMemory) { - public enum Dimension { cpu, memory, disk } + public enum Dimension { cpu, memory, disk, gpu, gpuMemory } - private final double cpu, memory, disk; - - public Load(double cpu, double memory, double disk) { + public Load(double cpu, double memory, double disk, double gpu, double gpuMemory) { this.cpu = requireNormalized(cpu, "cpu"); this.memory = requireNormalized(memory, "memory"); this.disk = requireNormalized(disk, "disk"); + this.gpu = requireNormalized(gpu, "gpu"); + this.gpuMemory = requireNormalized(gpuMemory, "gpuMemory"); } public double cpu() { return cpu; } public double memory() { return memory; } public double disk() { return disk; } + public double gpu() { return gpu; } + public double gpuMemory() { return gpuMemory; } - public Load withCpu(double cpu) { return new Load(cpu, memory, disk); } - public Load withMemory(double memory) { return new Load(cpu, memory, disk); } - public Load withDisk(double disk) { return new Load(cpu, memory, disk); } + public Load withCpu(double cpu) { return new Load(cpu, memory, disk, gpu, gpuMemory); } + public Load withMemory(double memory) { return new Load(cpu, memory, disk, gpu, gpuMemory); } + public Load withDisk(double disk) { return new Load(cpu, memory, disk, gpu, gpuMemory); } + public Load withGpu(double gpu) { return new Load(cpu, memory, disk, gpu, gpuMemory); } + public Load withGpuMemory(double gpuMemory) { return new Load(cpu, memory, disk, gpu, gpuMemory); } public Load add(Load other) { return join(other, (a, b) -> a + b); } public Load multiply(NodeResources resources) { - return new Load(cpu * resources.vcpu(), memory * resources.memoryGb(), disk * resources.diskGb()); + return new Load(cpu * resources.vcpu(), memory * resources.memoryGb(), disk * resources.diskGb(), gpu * resources.gpuResources().count(), gpu * resources.gpuResources().memoryGb()); } public Load multiply(double factor) { return map(v -> v * factor); @@ -55,21 +57,25 @@ public class Load { return map(v -> divide(v, divisor)); } public Load divide(NodeResources resources) { - return new Load(divide(cpu, resources.vcpu()), divide(memory, resources.memoryGb()), divide(disk, resources.diskGb())); + return new Load(divide(cpu, resources.vcpu()), divide(memory, resources.memoryGb()), divide(disk, resources.diskGb()), divide(gpu, resources.gpuResources().count()), divide(gpuMemory, resources.gpuResources().memoryGb())); } /** Returns the load where the given function is applied to each dimension of this. */ public Load map(DoubleUnaryOperator f) { return new Load(f.applyAsDouble(cpu), f.applyAsDouble(memory), - f.applyAsDouble(disk)); + f.applyAsDouble(disk), + f.applyAsDouble(gpu), + f.applyAsDouble(gpuMemory)); } /** Returns the load where the given function is applied to each dimension of this and the given load. */ public Load join(Load other, DoubleBinaryOperator f) { return new Load(f.applyAsDouble(this.cpu(), other.cpu()), f.applyAsDouble(this.memory(), other.memory()), - f.applyAsDouble(this.disk(), other.disk())); + f.applyAsDouble(this.disk(), other.disk()), + f.applyAsDouble(this.gpu(), other.gpu()), + f.applyAsDouble(this.gpuMemory(), other.gpuMemory())); } /** Returns true if any dimension matches the predicate. */ @@ -88,6 +94,8 @@ public class Load { case cpu -> cpu(); case memory -> memory(); case disk -> disk(); + case gpu -> gpu(); + case gpuMemory -> gpuMemory(); }; } @@ -95,7 +103,7 @@ public class Load { if (Double.isNaN(value)) throw new IllegalArgumentException(name + " must be a number but is NaN"); if (value < 0) - throw new IllegalArgumentException(name + " must be zero or lager, but is " + value); + throw new IllegalArgumentException(name + " must be zero or larger, but is " + value); return value; } @@ -105,28 +113,19 @@ public class Load { } @Override - public boolean equals(Object o) { - if (o == this) return true; - if ( ! (o instanceof Load other)) return false; - if (other.cpu() != this.cpu()) return false; - if (other.memory() != this.memory()) return false; - if (other.disk() != this.disk()) return false; - return true; - } - - @Override - public int hashCode() { return Objects.hash(cpu, memory, disk); } - - @Override public String toString() { - return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk"; + return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk," + gpu + " gpu," + gpuMemory + " gpuMemory"; } - public static Load zero() { return new Load(0, 0, 0); } - public static Load one() { return new Load(1, 1, 1); } + public static Load zero() { return new Load(0, 0, 0, 0, 0); } + public static Load one() { return new Load(1, 1, 1, 1, 1); } public static Load byDividing(NodeResources a, NodeResources b) { - return new Load(divide(a.vcpu(), b.vcpu()), divide(a.memoryGb(), b.memoryGb()), divide(a.diskGb(), b.diskGb())); + return new Load(divide(a.vcpu(), b.vcpu()), + divide(a.memoryGb(), b.memoryGb()), + divide(a.diskGb(), b.diskGb()), + divide(a.gpuResources().count(), b.gpuResources().count()), + divide(a.gpuResources().memoryGb(), b.gpuResources().memoryGb())); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java index a6882e49efa..f35879d0b24 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java @@ -76,8 +76,10 @@ public class MetricsResponse { nodeMetrics.add(new Pair<>(hostname, new NodeMetricSnapshot(at, new Load(Metric.cpu.from(nodeValues), Metric.memory.from(nodeValues), - Metric.disk.from(nodeValues)), - (long)Metric.generation.from(nodeValues), + Metric.disk.from(nodeValues), + Metric.gpu.from(nodeValues), + Metric.gpuMemory.from(nodeValues)), + (long) Metric.generation.from(nodeValues), Metric.inService.from(nodeValues) > 0, clusterIsStable(node.get(), applicationNodes, nodeValues), Metric.queryRate.from(nodeValues)))); @@ -126,6 +128,7 @@ public class MetricsResponse { @Override public List<String> metricResponseNames() { + // TODO(mpolden): Track only CPU util once we support proper GPU scaling return List.of(HostedNodeAdminMetrics.CPU_UTIL.baseName(), HostedNodeAdminMetrics.GPU_UTIL.baseName()); } @@ -139,6 +142,7 @@ public class MetricsResponse { @Override public List<String> metricResponseNames() { + // TODO(mpolden): Track only CPU memory once we support proper GPU scaling return List.of(HostedNodeAdminMetrics.MEM_UTIL.baseName(), SearchNodeMetrics.CONTENT_PROTON_RESOURCE_USAGE_MEMORY.average(), HostedNodeAdminMetrics.GPU_MEM_USED.baseName(), @@ -147,7 +151,7 @@ public class MetricsResponse { @Override double computeFinal(ListMap<String, Double> values) { - return Math.max(gpuMemUtil(values), cpuMemUtil(values)); + return Math.max(cpuMemUtil(values), gpuMemory.computeFinal(values)); } private double cpuMemUtil(ListMap<String, Double> values) { @@ -160,12 +164,6 @@ public class MetricsResponse { return 0; } - private double gpuMemUtil(ListMap<String, Double> values) { - var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum(); - var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum(); - return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0; - } - }, disk { // a node resource @@ -187,6 +185,35 @@ public class MetricsResponse { } }, + gpu { // a node resource + + @Override + public List<String> metricResponseNames() { + return List.of(HostedNodeAdminMetrics.GPU_UTIL.baseName()); + } + + @Override + double computeFinal(ListMap<String, Double> values) { + return values.values().stream().flatMap(List::stream).mapToDouble(v -> v).max().orElse(0) / 100; // % to ratio + } + + }, + gpuMemory { // a node resource + + @Override + public List<String> metricResponseNames() { + return List.of(HostedNodeAdminMetrics.GPU_MEM_USED.baseName(), + HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()); + } + + @Override + double computeFinal(ListMap<String, Double> values) { + var usedGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_USED.baseName()).stream().mapToDouble(v -> v).sum(); + var totalGpuMemory = values.get(HostedNodeAdminMetrics.GPU_MEM_TOTAL.baseName()).stream().mapToDouble(v -> v).sum(); + return totalGpuMemory > 0 ? usedGpuMemory / totalGpuMemory : 0; + } + + }, generation { // application config generation active on the node @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java index 38127fa3093..c0de9a43f7f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java @@ -144,6 +144,8 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { row.putBool(6, snapshot.getSecond().inService()); row.putBool(7, snapshot.getSecond().stable()); row.putFloat(8, (float) snapshot.getSecond().queryRate()); + row.putFloat(9, (float) snapshot.getSecond().load().gpu()); + row.putFloat(10, (float) snapshot.getSecond().load().gpuMemory()); row.append(); } writer.commit(); @@ -243,6 +245,9 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { private void ensureNodeTableIsUpdated() { try { // Example: nodeTable.ensureColumnExists("write_rate", "float"); + // TODO(mpolden): Remove after January 2024 + nodeTable.ensureColumnExists("gpu_util", "float"); + nodeTable.ensureColumnExists("gpu_mem_total_util", "float"); } catch (Exception e) { nodeTable.repair(e); } @@ -262,7 +267,9 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { try { issue("create table " + nodeTable.name + " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," + - " application_generation long, inService boolean, stable boolean, queries_rate float)" + + " application_generation long, inService boolean, stable boolean, queries_rate float," + + " gpu_util float, gpu_mem_total_util float" + + " )" + " timestamp(at)" + "PARTITION BY DAY;", newContext()); @@ -311,7 +318,9 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { new NodeMetricSnapshot(Instant.ofEpochMilli(record.getTimestamp(1) / 1000), new Load(record.getFloat(2), record.getFloat(3), - record.getFloat(4)), + record.getFloat(4), + record.getFloat(9), + record.getFloat(10)), record.getLong(5), record.getBool(6), record.getBool(7), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java index c4e7d3b9acc..6f325700401 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java @@ -76,6 +76,8 @@ public class ApplicationSerializer { private static final String cpuKey = "cpu"; private static final String memoryKey = "memory"; private static final String diskKey = "disk"; + private static final String gpuKey = "gpu"; + private static final String gpuMemory = "gpuMemory"; private static final String fromKey = "from"; private static final String toKey = "to"; private static final String generationKey = "generation"; @@ -201,12 +203,16 @@ public class ApplicationSerializer { loadObject.setDouble(cpuKey, load.cpu()); loadObject.setDouble(memoryKey, load.memory()); loadObject.setDouble(diskKey, load.disk()); + loadObject.setDouble(gpuKey, load.gpu()); + loadObject.setDouble(gpuMemory, load.gpuMemory()); } private static Load loadFromSlime(Inspector loadObject) { return new Load(loadObject.field(cpuKey).asDouble(), loadObject.field(memoryKey).asDouble(), - loadObject.field(diskKey).asDouble()); + loadObject.field(diskKey).asDouble(), + loadObject.field(gpuKey).asDouble(), + loadObject.field(gpuMemory).asDouble()); } private static void toSlime(Autoscaling.Metrics metrics, Cursor metricsObject) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java index fe6b204ed31..d3b88997059 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java @@ -40,7 +40,6 @@ import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling; import com.yahoo.vespa.hosted.provision.autoscale.Load; import com.yahoo.vespa.hosted.provision.autoscale.MemoryMetricsDb; -import com.yahoo.vespa.hosted.provision.lb.LoadBalancerService; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.node.Status; @@ -239,8 +238,8 @@ public class MockNodeRepository extends NodeRepository { Optional.of(new ClusterResources(4, 1, new NodeResources(3, 16, 100, 1))), clock().instant(), - new Load(0.1, 0.2, 0.3), - new Load(0.4, 0.5, 0.6), + new Load(0.1, 0.2, 0.3, 0, 0), + new Load(0.4, 0.5, 0.6, 0, 0), new Autoscaling.Metrics(0.7, 0.8, 0.9))); try (Mutex lock = applications().lock(app1Id)) { applications().put(app1.with(cluster1), lock); |