aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Valerij Fredriksen <freva@users.noreply.github.com> 2023-02-03 14:50:12 +0100
committer GitHub <noreply@github.com> 2023-02-03 14:50:12 +0100
commit a08ae588d6035b69f0961dff596fc871fd1c4e58 (patch)
tree 7650cea03b6df0b7fe657b188f572ffb4f3050b2
parent facd90e5c4a4f17273f8b368b78baf0a1d5a294c (diff)
parent 7d3d6ee73bc31ea39ef49586cbcff61aa2b4b956 (diff)
Merge pull request #25849 from vespa-engine/mpolden/gpu-metrics
Collect GPU metrics
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java | 9
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java | 275
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java | 51
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java | 2
-rw-r--r-- node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java | 26
-rw-r--r-- node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java | 45
6 files changed, 176 insertions, 232 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java
index c1c08c15740..f131aca2db0 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java
@@ -18,7 +18,6 @@ import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
-import java.util.stream.Collectors;
/**
* High-level interface for container operations. Code managing containers should use this and not
@@ -38,7 +37,7 @@ public class ContainerOperations {
this.containerEngine = Objects.requireNonNull(containerEngine);
this.imageDownloader = new ContainerImageDownloader(containerEngine);
this.imagePruner = new ContainerImagePruner(containerEngine, Clock.systemUTC());
- this.containerStatsCollector = new ContainerStatsCollector(cgroup, fileSystem);
+ this.containerStatsCollector = new ContainerStatsCollector(containerEngine, cgroup, fileSystem);
}
public ContainerData createContainer(NodeAgentContext context, ContainerResources containerResources) {
@@ -112,13 +111,13 @@ public class ContainerOperations {
/** Get container statistics */
public Optional<ContainerStats> getContainerStats(NodeAgentContext context) {
String iface = containerEngine.networkInterface(context);
- return getContainer(context).flatMap(container -> containerStatsCollector.collect(container.id(), container.pid(), iface));
+ return getContainer(context).flatMap(container -> containerStatsCollector.collect(context, container.id(), container.pid(), iface));
}
/** Returns true if no containers managed by node-admin are running */
public boolean noManagedContainersRunning(TaskContext context) {
return containerEngine.listContainers(context).stream()
- .filter(c -> c.managed())
+ .filter(PartialContainer::managed)
.noneMatch(container -> container.state() == Container.State.running);
}
@@ -129,7 +128,7 @@ public class ContainerOperations {
*/
public boolean retainManagedContainers(TaskContext context, Set<ContainerName> containerNames) {
return containerEngine.listContainers(context).stream()
- .filter(c -> c.managed())
+ .filter(PartialContainer::managed)
.filter(container -> !containerNames.contains(container.name()))
.peek(container -> containerEngine.removeContainer(context, container))
.count() > 0;
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java
index 168f319febd..a5606784c12 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java
@@ -1,229 +1,86 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.container;
+import ai.vespa.validation.Validation;
+
import java.util.Collections;
import java.util.LinkedHashMap;
+import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
- * CPU, memory and network statistics collected from a container.
+ * CPU, GPU, memory and network statistics collected from a container.
*
* @author freva
*/
-public class ContainerStats {
-
- private final Map<String, NetworkStats> networkStatsByInterface;
- private final MemoryStats memoryStats;
- private final CpuStats cpuStats;
+public record ContainerStats(Map<String, NetworkStats> networks,
+ MemoryStats memoryStats,
+ CpuStats cpuStats,
+ List<GpuStats> gpuStats) {
- public ContainerStats(Map<String, NetworkStats> networkStatsByInterface, MemoryStats memoryStats, CpuStats cpuStats) {
- this.networkStatsByInterface = new LinkedHashMap<>(Objects.requireNonNull(networkStatsByInterface));
+ public ContainerStats(Map<String, NetworkStats> networks, MemoryStats memoryStats, CpuStats cpuStats, List<GpuStats> gpuStats) {
+ this.networks = Collections.unmodifiableMap(new LinkedHashMap<>(Objects.requireNonNull(networks)));
this.memoryStats = Objects.requireNonNull(memoryStats);
this.cpuStats = Objects.requireNonNull(cpuStats);
+ this.gpuStats = List.copyOf(Objects.requireNonNull(gpuStats));
}
- public Map<String, NetworkStats> getNetworks() {
- return Collections.unmodifiableMap(networkStatsByInterface);
- }
-
- public MemoryStats getMemoryStats() {
- return memoryStats;
- }
-
- public CpuStats getCpuStats() {
- return cpuStats;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
- ContainerStats that = (ContainerStats) o;
- return networkStatsByInterface.equals(that.networkStatsByInterface) && memoryStats.equals(that.memoryStats) && cpuStats.equals(that.cpuStats);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(networkStatsByInterface, memoryStats, cpuStats);
- }
-
- /** Statistics for network usage */
- public static class NetworkStats {
-
- private final long rxBytes;
- private final long rxDropped;
- private final long rxErrors;
- private final long txBytes;
- private final long txDropped;
- private final long txErrors;
-
- public NetworkStats(long rxBytes, long rxDropped, long rxErrors, long txBytes, long txDropped, long txErrors) {
- this.rxBytes = rxBytes;
- this.rxDropped = rxDropped;
- this.rxErrors = rxErrors;
- this.txBytes = txBytes;
- this.txDropped = txDropped;
- this.txErrors = txErrors;
- }
-
- /** Returns received bytes */
- public long getRxBytes() { return this.rxBytes; }
-
- /** Returns received bytes, which was dropped */
- public long getRxDropped() { return this.rxDropped; }
-
- /** Returns received errors */
- public long getRxErrors() { return this.rxErrors; }
-
- /** Returns transmitted bytes */
- public long getTxBytes() { return this.txBytes; }
-
- /** Returns transmitted bytes, which was dropped */
- public long getTxDropped() { return this.txDropped; }
-
- /** Returns transmission errors */
- public long getTxErrors() { return this.txErrors; }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
- NetworkStats that = (NetworkStats) o;
- return rxBytes == that.rxBytes && rxDropped == that.rxDropped && rxErrors == that.rxErrors && txBytes == that.txBytes && txDropped == that.txDropped && txErrors == that.txErrors;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(rxBytes, rxDropped, rxErrors, txBytes, txDropped, txErrors);
- }
-
- @Override
- public String toString() {
- return "NetworkStats{" +
- "rxBytes=" + rxBytes +
- ", rxDropped=" + rxDropped +
- ", rxErrors=" + rxErrors +
- ", txBytes=" + txBytes +
- ", txDropped=" + txDropped +
- ", txErrors=" + txErrors +
- '}';
- }
-
- }
-
- /** Statistics for memory usage */
- public static class MemoryStats {
-
- private final long cache;
- private final long usage;
- private final long limit;
-
- public MemoryStats(long cache, long usage, long limit) {
- this.cache = cache;
- this.usage = usage;
- this.limit = limit;
- }
-
- /** Returns memory used by cache in bytes */
- public long getCache() { return this.cache; }
-
- /** Returns memory usage in bytes */
- public long getUsage() { return this.usage; }
-
- /** Returns memory limit in bytes */
- public long getLimit() { return this.limit; }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
- MemoryStats that = (MemoryStats) o;
- return cache == that.cache && usage == that.usage && limit == that.limit;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(cache, usage, limit);
- }
-
- @Override
- public String toString() {
- return "MemoryStats{" +
- "cache=" + cache +
- ", usage=" + usage +
- ", limit=" + limit +
- '}';
- }
-
- }
-
- /** Statistics for CPU usage */
- public static class CpuStats {
-
- private final int onlineCpus;
- private final long systemCpuUsage;
- private final long totalUsage;
- private final long usageInKernelMode;
- private final long throttledTime;
- private final long throttlingActivePeriods;
- private final long throttledPeriods;
-
- public CpuStats(int onlineCpus, long systemCpuUsage, long totalUsage, long usageInKernelMode,
- long throttledTime, long throttlingActivePeriods, long throttledPeriods) {
- this.onlineCpus = onlineCpus;
- this.systemCpuUsage = systemCpuUsage;
- this.totalUsage = totalUsage;
- this.usageInKernelMode = usageInKernelMode;
- this.throttledTime = throttledTime;
- this.throttlingActivePeriods = throttlingActivePeriods;
- this.throttledPeriods = throttledPeriods;
- }
-
- public int getOnlineCpus() { return this.onlineCpus; }
-
- /** Total CPU time (in µs) spent executing all the processes on this host */
- public long getSystemCpuUsage() { return this.systemCpuUsage; }
-
- /** Total CPU time (in µs) spent running all the processes in this container */
- public long getTotalUsage() { return totalUsage; }
-
- /** Total CPU time (in µs) spent in kernel mode while executing processes in this container */
- public long getUsageInKernelMode() { return usageInKernelMode; }
-
- /** Total CPU time (in µs) processes in this container were throttled for */
- public long getThrottledTime() { return throttledTime; }
-
- /** Number of periods with throttling enabled for this container */
- public long getThrottlingActivePeriods() { return throttlingActivePeriods; }
-
- /** Number of periods this container hit the throttling limit */
- public long getThrottledPeriods() { return throttledPeriods; }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
- CpuStats cpuStats = (CpuStats) o;
- return onlineCpus == cpuStats.onlineCpus && systemCpuUsage == cpuStats.systemCpuUsage && totalUsage == cpuStats.totalUsage && usageInKernelMode == cpuStats.usageInKernelMode && throttledTime == cpuStats.throttledTime && throttlingActivePeriods == cpuStats.throttlingActivePeriods && throttledPeriods == cpuStats.throttledPeriods;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(onlineCpus, systemCpuUsage, totalUsage, usageInKernelMode, throttledTime, throttlingActivePeriods, throttledPeriods);
- }
-
- @Override
- public String toString() {
- return "CpuStats{" +
- "onlineCpus=" + onlineCpus +
- ", systemCpuUsage=" + systemCpuUsage +
- ", totalUsage=" + totalUsage +
- ", usageInKernelMode=" + usageInKernelMode +
- ", throttledTime=" + throttledTime +
- ", throttlingActivePeriods=" + throttlingActivePeriods +
- ", throttledPeriods=" + throttledPeriods +
- '}';
+ /**
+ * Statistics for network usage of a single network interface. {@code ContainerStats} holds these
+ * in a map keyed by interface name.
+ *
+ * NOTE(review): rxDropped/txDropped are documented as bytes below; interface counters of this kind
+ * commonly count packets. Confirm against the code that reads the counters.
+ *
+ * @param rxBytes received bytes
+ * @param rxDropped received bytes, which were dropped
+ * @param rxErrors received errors
+ * @param txBytes transmitted bytes
+ * @param txDropped transmitted bytes, which were dropped
+ * @param txErrors transmission errors
+ */
+ public record NetworkStats(long rxBytes, long rxDropped, long rxErrors, long txBytes, long txDropped, long txErrors) {}
+
+ /**
+ * Statistics for memory usage. All three components are byte counts; no validation is performed
+ * on them.
+ *
+ * @param cache memory used by cache in bytes
+ * @param usage memory usage in bytes
+ * @param limit memory limit in bytes
+ */
+ public record MemoryStats(long cache, long usage, long limit) {}
+
+ /**
+ * Statistics for CPU usage. No validation is performed on the components.
+ *
+ * @param onlineCpus Number of CPU cores. {@code ContainerStatsCollector} passes the value it was constructed
+ *                   with, which defaults to {@code Runtime.getRuntime().availableProcessors()}
+ * @param systemCpuUsage Total CPU time (in µs) spent executing all the processes on this host
+ * @param totalUsage Total CPU time (in µs) spent running all the processes in this container
+ * @param usageInKernelMode Total CPU time (in µs) spent in kernel mode while executing processes in this container
+ * @param throttledTime Total CPU time (in µs) processes in this container were throttled for
+ * @param throttlingActivePeriods Number of periods with throttling enabled for this container
+ * @param throttledPeriods Number of periods this container hit the throttling limit
+ */
+ public record CpuStats(int onlineCpus,
+ long systemCpuUsage,
+ long totalUsage,
+ long usageInKernelMode,
+ long throttledTime,
+ long throttlingActivePeriods,
+ long throttledPeriods) {}
+
+ /**
+ * GPU statistics for a single GPU device.
+ *
+ * Every component is checked by the compact constructor with {@code Validation.requireAtLeast(..., 0)},
+ * i.e. negative values are rejected. No upper bound is enforced: loadPercentage is not capped at 100 and
+ * memoryUsedBytes is not compared with memoryTotalBytes.
+ * NOTE(review): the exception type thrown on a violation is decided by {@code Validation} and is not visible here.
+ *
+ * @param deviceNumber GPU device number
+ * @param loadPercentage Load/utilization in %
+ * @param memoryTotalBytes Total memory, in bytes
+ * @param memoryUsedBytes Memory used, in bytes. {@code ContainerStatsCollector} derives this as total minus free
+ */
+ public record GpuStats(int deviceNumber, int loadPercentage, long memoryTotalBytes, long memoryUsedBytes) {
+
+ public GpuStats {
+ Validation.requireAtLeast(deviceNumber, "deviceNumber", 0);
+ Validation.requireAtLeast(loadPercentage, "loadPercentage", 0);
+ Validation.requireAtLeast(memoryTotalBytes, "memoryTotalBytes", 0L);
+ Validation.requireAtLeast(memoryUsedBytes, "memoryUsedBytes", 0L);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java
index 67956892898..c17f98b9c9d 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java
@@ -1,19 +1,24 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.container;
+import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
+import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixUser;
+
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
+import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
+import java.util.stream.Stream;
/**
- * Collects CPU, memory and network statistics for a container.
+ * Collects CPU, GPU, memory and network statistics for a container.
*
* Uses same approach as runc: https://github.com/opencontainers/runc/tree/master/libcontainer/cgroups/fs
*
@@ -21,27 +26,30 @@ import java.util.Optional;
*/
class ContainerStatsCollector {
+ private final ContainerEngine containerEngine;
private final CGroup cgroup;
private final FileSystem fileSystem;
private final int onlineCpus;
- ContainerStatsCollector(CGroup cgroup, FileSystem fileSystem) {
- this(cgroup, fileSystem, Runtime.getRuntime().availableProcessors());
+ ContainerStatsCollector(ContainerEngine containerEngine, CGroup cgroup, FileSystem fileSystem) {
+ this(containerEngine, cgroup, fileSystem, Runtime.getRuntime().availableProcessors());
}
- ContainerStatsCollector(CGroup cgroup, FileSystem fileSystem, int onlineCpus) {
+ ContainerStatsCollector(ContainerEngine containerEngine, CGroup cgroup, FileSystem fileSystem, int onlineCpus) {
+ this.containerEngine = Objects.requireNonNull(containerEngine);
this.cgroup = Objects.requireNonNull(cgroup);
this.fileSystem = Objects.requireNonNull(fileSystem);
this.onlineCpus = onlineCpus;
}
/** Collect statistics for given container ID and PID */
- public Optional<ContainerStats> collect(ContainerId containerId, int pid, String iface) {
+ public Optional<ContainerStats> collect(NodeAgentContext context, ContainerId containerId, int pid, String iface) {
try {
ContainerStats.CpuStats cpuStats = collectCpuStats(containerId);
ContainerStats.MemoryStats memoryStats = collectMemoryStats(containerId);
Map<String, ContainerStats.NetworkStats> networkStats = Map.of(iface, collectNetworkStats(iface, pid));
- return Optional.of(new ContainerStats(networkStats, memoryStats, cpuStats));
+ List<ContainerStats.GpuStats> gpuStats = collectGpuStats(context);
+ return Optional.of(new ContainerStats(networkStats, memoryStats, cpuStats, gpuStats));
} catch (NoSuchFileException ignored) {
return Optional.empty(); // Container disappeared while we collected stats
} catch (IOException e) {
@@ -49,6 +57,31 @@ class ContainerStatsCollector {
}
}
+ /**
+  * Collects GPU statistics by running {@code nvidia-smi} through the container engine.
+  *
+  * Returns an empty list, without executing any command, when {@code /dev/nvidia0} does not exist in
+  * this collector's file system. Otherwise every non-blank output line is parsed into one
+  * {@link ContainerStats.GpuStats}.
+  *
+  * @param context context passed on to the container engine when executing the command
+  * @return one entry per non-blank line of command output, in output order
+  * @throws IllegalArgumentException if a non-blank output line cannot be parsed
+  */
+ private List<ContainerStats.GpuStats> collectGpuStats(NodeAgentContext context) {
+     // Presence of the first NVIDIA device node is used as the "has GPU" probe
+     if (!Files.exists(fileSystem.getPath("/dev/nvidia0"))) {
+         return List.of();
+     }
+     Stream<String> lines = containerEngine.execute(context, UnixUser.ROOT, Duration.ofSeconds(5),
+                                                    "nvidia-smi",
+                                                    "--query-gpu=index,utilization.gpu,memory.total,memory.free",
+                                                    "--format=csv,noheader,nounits")
+                                           .getOutputLinesStream();
+     // A blank line (e.g. from a trailing newline) carries no data; skip it rather than fail the whole sample
+     return lines.filter(line -> !line.isBlank())
+                 .map(ContainerStatsCollector::parseGpuStats)
+                 .toList();
+ }
+
+ /**
+  * Parses one line of {@code nvidia-smi} CSV output holding, in order: device index, GPU utilization,
+  * total memory and free memory.
+  *
+  * The two memory fields are multiplied by 2^20 to obtain bytes, i.e. they are treated as MiB.
+  * Used memory is derived as total minus free.
+  *
+  * @param s a single output line, e.g. {@code "0, 35, 15360, 9500"}
+  * @return the statistics described by the line
+  * @throws IllegalArgumentException if the line has fewer than four fields or a field is not an integer
+  */
+ private static ContainerStats.GpuStats parseGpuStats(String s) {
+     String[] fields = fields(s, ",\\s*");
+     if (fields.length < 4) throw new IllegalArgumentException("Could not parse GPU stats from '" + s + "'");
+     long bytesPerMebibyte = 1L << 20;
+     try {
+         int deviceNumber = Integer.parseInt(fields[0]);
+         int loadPercentage = Integer.parseInt(fields[1]);
+         long memoryTotalBytes = Long.parseLong(fields[2]) * bytesPerMebibyte;
+         long memoryFreeBytes = Long.parseLong(fields[3]) * bytesPerMebibyte;
+         long memoryUsedBytes = memoryTotalBytes - memoryFreeBytes;
+         return new ContainerStats.GpuStats(deviceNumber, loadPercentage, memoryTotalBytes, memoryUsedBytes);
+     } catch (NumberFormatException e) {
+         // Non-numeric values would otherwise surface without the offending line; keep the cause
+         throw new IllegalArgumentException("Could not parse GPU stats from '" + s + "'", e);
+     }
+ }
+
private ContainerStats.CpuStats collectCpuStats(ContainerId containerId) throws IOException {
Map<CGroup.CpuStatField, Long> cpuStats = cgroup.cpuStats(containerId);
return new ContainerStats.CpuStats(onlineCpus,
@@ -114,7 +147,11 @@ class ContainerStatsCollector {
}
private static String[] fields(String s) {
- return s.split("\\s+");
+ return fields(s, "\\s+");
+ }
+
+ private static String[] fields(String s, String regex) { // trims s, then splits it on the given regex
+ return s.trim().split(regex);
}
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index 528383eb91e..f168523a1ef 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -123,7 +123,7 @@ public class NodeAdminImpl implements NodeAdmin {
Optional<ContainerStats> containerStats = nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended);
if (containerStats.isPresent()) {
++numContainers;
- totalContainerMemoryBytes += containerStats.get().getMemoryStats().getUsage();
+ totalContainerMemoryBytes += containerStats.get().memoryStats().usage();
}
}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java
index 09590be42f8..af869786504 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java
@@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixUser;
import com.yahoo.vespa.hosted.node.admin.task.util.fs.ContainerPath;
import com.yahoo.vespa.hosted.node.admin.task.util.process.CommandResult;
+import com.yahoo.vespa.hosted.node.admin.task.util.process.TestTerminal;
import java.nio.file.Path;
import java.time.Duration;
@@ -19,7 +20,6 @@ import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
-import java.util.stream.Collectors;
/**
* @author mpolden
@@ -30,6 +30,16 @@ public class ContainerEngineMock implements ContainerEngine {
private final Map<String, ImageDownload> images = new ConcurrentHashMap<>();
private boolean asyncImageDownload = false;
+ private final TestTerminal terminal;
+
+ public ContainerEngineMock() {
+ this(null); // no terminal: execute() and executeInNetworkNamespace() return new CommandResult(null, 0, "")
+ }
+
+ public ContainerEngineMock(TestTerminal terminal) {
+ this.terminal = terminal; // may be null; when set, executed commands go through terminal.newCommandLine(context)
+ }
+
public ContainerEngineMock asyncImageDownload(boolean enabled) {
this.asyncImageDownload = enabled;
return this;
@@ -139,12 +149,22 @@ public class ContainerEngineMock implements ContainerEngine {
@Override
public CommandResult execute(NodeAgentContext context, UnixUser user, Duration timeout, String... command) {
- return new CommandResult(null, 0, "");
+ if (terminal == null) {
+ return new CommandResult(null, 0, "");
+ }
+ return terminal.newCommandLine(context)
+ .add(command)
+ .executeSilently();
}
@Override
public CommandResult executeInNetworkNamespace(NodeAgentContext context, String... command) {
- return new CommandResult(null, 0, "");
+ if (terminal == null) {
+ return new CommandResult(null, 0, "");
+ }
+ return terminal.newCommandLine(context)
+ .add(command)
+ .executeSilently();
}
@Override
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java
index 79c7558ea9e..f852eb6235d 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java
@@ -1,12 +1,19 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.container;
+import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
+import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
+import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl;
import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath;
+import com.yahoo.vespa.hosted.node.admin.task.util.process.TestTerminal;
import com.yahoo.vespa.test.file.TestFileSystem;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.FileSystem;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -27,30 +34,54 @@ import static org.mockito.Mockito.when;
*/
public class ContainerStatsCollectorTest {
+ private final TestTerminal testTerminal = new TestTerminal();
+ private final ContainerEngineMock containerEngine = new ContainerEngineMock(testTerminal);
private final FileSystem fileSystem = TestFileSystem.create();
private final CGroup cgroup = mock(CGroup.class);
+ private final NodeAgentContext context = NodeAgentContextImpl.builder(NodeSpec.Builder.testSpec("c1").build())
+ .fileSystem(TestFileSystem.create())
+ .build();
@Test
- void collect() throws IOException {
- ContainerStatsCollector collector = new ContainerStatsCollector(cgroup, fileSystem, 24);
+ void collect() throws Exception {
+ ContainerStatsCollector collector = new ContainerStatsCollector(containerEngine, cgroup, fileSystem, 24);
ContainerId containerId = new ContainerId("id1");
int containerPid = 42;
- assertTrue(collector.collect(containerId, containerPid, "eth0").isEmpty(), "No stats found");
+ assertTrue(collector.collect(context, containerId, containerPid, "eth0").isEmpty(), "No stats found");
mockMemoryStats(containerId);
mockCpuStats(containerId);
mockNetworkStats(containerPid);
- Optional<ContainerStats> stats = collector.collect(containerId, containerPid, "eth0");
+ Optional<ContainerStats> stats = collector.collect(context, containerId, containerPid, "eth0");
assertTrue(stats.isPresent());
assertEquals(new ContainerStats.CpuStats(24, 6049374780000L, 691675615472L,
262190000000L, 3L, 1L, 2L),
- stats.get().getCpuStats());
+ stats.get().cpuStats());
assertEquals(new ContainerStats.MemoryStats(470790144L, 1228017664L, 2147483648L),
- stats.get().getMemoryStats());
+ stats.get().memoryStats());
assertEquals(Map.of("eth0", new ContainerStats.NetworkStats(22280813L, 4L, 3L,
19859383L, 6L, 5L)),
- stats.get().getNetworks());
+ stats.get().networks());
+ assertEquals(List.of(), stats.get().gpuStats());
+
+ mockGpuStats();
+ stats = collector.collect(context, containerId, containerPid, "eth0");
+ assertTrue(stats.isPresent());
+ assertEquals(List.of(new ContainerStats.GpuStats(0, 35, 16106127360L, 6144655360L),
+ new ContainerStats.GpuStats(1, 67, 32212254720L, 19314769920L)),
+ stats.get().gpuStats());
+ }
+
+ private void mockGpuStats() throws IOException { // fakes a GPU host: device node plus canned nvidia-smi output for two GPUs
+ Path devPath = fileSystem.getPath("/dev");
+ Files.createDirectories(devPath);
+ Files.createFile(devPath.resolve("nvidia0")); // the collector only runs nvidia-smi when /dev/nvidia0 exists
+ testTerminal.expectCommand("nvidia-smi --query-gpu=index,utilization.gpu,memory.total,memory.free --format=csv,noheader,nounits 2>&1", 0,
+ """
+ 0, 35, 15360, 9500
+ 1, 67, 30720, 12300
+ """);
}
private void mockNetworkStats(int pid) {