diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2023-02-03 14:50:12 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-02-03 14:50:12 +0100 |
commit | a08ae588d6035b69f0961dff596fc871fd1c4e58 (patch) | |
tree | 7650cea03b6df0b7fe657b188f572ffb4f3050b2 | |
parent | facd90e5c4a4f17273f8b368b78baf0a1d5a294c (diff) | |
parent | 7d3d6ee73bc31ea39ef49586cbcff61aa2b4b956 (diff) |
Merge pull request #25849 from vespa-engine/mpolden/gpu-metrics
Collect GPU metrics
6 files changed, 176 insertions, 232 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java index c1c08c15740..f131aca2db0 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java @@ -18,7 +18,6 @@ import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.Set; -import java.util.stream.Collectors; /** * High-level interface for container operations. Code managing containers should use this and not @@ -38,7 +37,7 @@ public class ContainerOperations { this.containerEngine = Objects.requireNonNull(containerEngine); this.imageDownloader = new ContainerImageDownloader(containerEngine); this.imagePruner = new ContainerImagePruner(containerEngine, Clock.systemUTC()); - this.containerStatsCollector = new ContainerStatsCollector(cgroup, fileSystem); + this.containerStatsCollector = new ContainerStatsCollector(containerEngine, cgroup, fileSystem); } public ContainerData createContainer(NodeAgentContext context, ContainerResources containerResources) { @@ -112,13 +111,13 @@ public class ContainerOperations { /** Get container statistics */ public Optional<ContainerStats> getContainerStats(NodeAgentContext context) { String iface = containerEngine.networkInterface(context); - return getContainer(context).flatMap(container -> containerStatsCollector.collect(container.id(), container.pid(), iface)); + return getContainer(context).flatMap(container -> containerStatsCollector.collect(context, container.id(), container.pid(), iface)); } /** Returns true if no containers managed by node-admin are running */ public boolean noManagedContainersRunning(TaskContext context) { return containerEngine.listContainers(context).stream() - .filter(c -> c.managed()) + .filter(PartialContainer::managed) .noneMatch(container -> container.state() == Container.State.running); } @@ -129,7 +128,7 @@ public class ContainerOperations { */ public boolean retainManagedContainers(TaskContext context, Set<ContainerName> containerNames) { return containerEngine.listContainers(context).stream() - .filter(c -> c.managed()) + .filter(PartialContainer::managed) .filter(container -> !containerNames.contains(container.name())) .peek(container -> containerEngine.removeContainer(context, container)) .count() > 0; diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java index 168f319febd..a5606784c12 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java @@ -1,229 +1,86 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.container; +import ai.vespa.validation.Validation; + import java.util.Collections; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; import java.util.Objects; /** - * CPU, memory and network statistics collected from a container. + * CPU, GPU, memory and network statistics collected from a container. * * @author freva */ -public class ContainerStats { - - private final Map<String, NetworkStats> networkStatsByInterface; - private final MemoryStats memoryStats; - private final CpuStats cpuStats; +public record ContainerStats(Map<String, NetworkStats> networks, + MemoryStats memoryStats, + CpuStats cpuStats, + List<GpuStats> gpuStats) { - public ContainerStats(Map<String, NetworkStats> networkStatsByInterface, MemoryStats memoryStats, CpuStats cpuStats) { - this.networkStatsByInterface = new LinkedHashMap<>(Objects.requireNonNull(networkStatsByInterface)); + public ContainerStats(Map<String, NetworkStats> networks, MemoryStats memoryStats, CpuStats cpuStats, List<GpuStats> gpuStats) { + this.networks = Collections.unmodifiableMap(new LinkedHashMap<>(Objects.requireNonNull(networks))); this.memoryStats = Objects.requireNonNull(memoryStats); this.cpuStats = Objects.requireNonNull(cpuStats); + this.gpuStats = List.copyOf(Objects.requireNonNull(gpuStats)); } - public Map<String, NetworkStats> getNetworks() { - return Collections.unmodifiableMap(networkStatsByInterface); - } - - public MemoryStats getMemoryStats() { - return memoryStats; - } - - public CpuStats getCpuStats() { - return cpuStats; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ContainerStats that = (ContainerStats) o; - return networkStatsByInterface.equals(that.networkStatsByInterface) && memoryStats.equals(that.memoryStats) && cpuStats.equals(that.cpuStats); - } - - @Override - public int hashCode() { - return Objects.hash(networkStatsByInterface, memoryStats, cpuStats); - } - - /** Statistics for network usage */ - public static class NetworkStats { - - private final long rxBytes; - private final long rxDropped; - private final long rxErrors; - private final long txBytes; - private final long txDropped; - private final long txErrors; - - public NetworkStats(long rxBytes, long rxDropped, long rxErrors, long txBytes, long txDropped, long txErrors) { - this.rxBytes = rxBytes; - this.rxDropped = rxDropped; - this.rxErrors = rxErrors; - this.txBytes = txBytes; - this.txDropped = txDropped; - this.txErrors = txErrors; - } - - /** Returns received bytes */ - public long getRxBytes() { return this.rxBytes; } - - /** Returns received bytes, which was dropped */ - public long getRxDropped() { return this.rxDropped; } - - /** Returns received errors */ - public long getRxErrors() { return this.rxErrors; } - - /** Returns transmitted bytes */ - public long getTxBytes() { return this.txBytes; } - - /** Returns transmitted bytes, which was dropped */ - public long getTxDropped() { return this.txDropped; } - - /** Returns transmission errors */ - public long getTxErrors() { return this.txErrors; } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - NetworkStats that = (NetworkStats) o; - return rxBytes == that.rxBytes && rxDropped == that.rxDropped && rxErrors == that.rxErrors && txBytes == that.txBytes && txDropped == that.txDropped && txErrors == that.txErrors; - } - - @Override - public int hashCode() { - return Objects.hash(rxBytes, rxDropped, rxErrors, txBytes, txDropped, txErrors); - } - - @Override - public String toString() { - return "NetworkStats{" + - "rxBytes=" + rxBytes + - ", rxDropped=" + rxDropped + - ", rxErrors=" + rxErrors + - ", txBytes=" + txBytes + - ", txDropped=" + txDropped + - ", txErrors=" + txErrors + - '}'; - } - - } - - /** Statistics for memory usage */ - public static class MemoryStats { - - private final long cache; - private final long usage; - private final long limit; - - public MemoryStats(long cache, long usage, long limit) { - this.cache = cache; - this.usage = usage; - this.limit = limit; - } - - /** Returns memory used by cache in bytes */ - public long getCache() { return this.cache; } - - /** Returns memory usage in bytes */ - public long getUsage() { return this.usage; } - - /** Returns memory limit in bytes */ - public long getLimit() { return this.limit; } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - MemoryStats that = (MemoryStats) o; - return cache == that.cache && usage == that.usage && limit == that.limit; - } - - @Override - public int hashCode() { - return Objects.hash(cache, usage, limit); - } - - @Override - public String toString() { - return "MemoryStats{" + - "cache=" + cache + - ", usage=" + usage + - ", limit=" + limit + - '}'; - } - - } - - /** Statistics for CPU usage */ - public static class CpuStats { - - private final int onlineCpus; - private final long systemCpuUsage; - private final long totalUsage; - private final long usageInKernelMode; - private final long throttledTime; - private final long throttlingActivePeriods; - private final long throttledPeriods; - - public CpuStats(int onlineCpus, long systemCpuUsage, long totalUsage, long usageInKernelMode, - long throttledTime, long throttlingActivePeriods, long throttledPeriods) { - this.onlineCpus = onlineCpus; - this.systemCpuUsage = systemCpuUsage; - this.totalUsage = totalUsage; - this.usageInKernelMode = usageInKernelMode; - this.throttledTime = throttledTime; - this.throttlingActivePeriods = throttlingActivePeriods; - this.throttledPeriods = throttledPeriods; - } - - public int getOnlineCpus() { return this.onlineCpus; } - - /** Total CPU time (in µs) spent executing all the processes on this host */ - public long getSystemCpuUsage() { return this.systemCpuUsage; } - - /** Total CPU time (in µs) spent running all the processes in this container */ - public long getTotalUsage() { return totalUsage; } - - /** Total CPU time (in µs) spent in kernel mode while executing processes in this container */ - public long getUsageInKernelMode() { return usageInKernelMode; } - - /** Total CPU time (in µs) processes in this container were throttled for */ - public long getThrottledTime() { return throttledTime; } - - /** Number of periods with throttling enabled for this container */ - public long getThrottlingActivePeriods() { return throttlingActivePeriods; } - - /** Number of periods this container hit the throttling limit */ - public long getThrottledPeriods() { return throttledPeriods; } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - CpuStats cpuStats = (CpuStats) o; - return onlineCpus == cpuStats.onlineCpus && systemCpuUsage == cpuStats.systemCpuUsage && totalUsage == cpuStats.totalUsage && usageInKernelMode == cpuStats.usageInKernelMode && throttledTime == cpuStats.throttledTime && throttlingActivePeriods == cpuStats.throttlingActivePeriods && throttledPeriods == cpuStats.throttledPeriods; - } - - @Override - public int hashCode() { - return Objects.hash(onlineCpus, systemCpuUsage, totalUsage, usageInKernelMode, throttledTime, throttlingActivePeriods, throttledPeriods); - } - - @Override - public String toString() { - return "CpuStats{" + - "onlineCpus=" + onlineCpus + - ", systemCpuUsage=" + systemCpuUsage + - ", totalUsage=" + totalUsage + - ", usageInKernelMode=" + usageInKernelMode + - ", throttledTime=" + throttledTime + - ", throttlingActivePeriods=" + throttlingActivePeriods + - ", throttledPeriods=" + throttledPeriods + - '}'; + /** + * Statistics for network usage + * + * @param rxBytes received bytes + * @param rxDropped received bytes, which were dropped + * @param rxErrors received errors + * @param txBytes transmitted bytes + * @param txDropped transmitted bytes, which were dropped + * @param txErrors transmission errors + */ + public record NetworkStats(long rxBytes, long rxDropped, long rxErrors, long txBytes, long txDropped, long txErrors) {} + + /** + * Statistics for memory usage + * + * @param cache memory used by cache in bytes + * @param usage memory usage in bytes + * @param limit memory limit in bytes + */ + public record MemoryStats(long cache, long usage, long limit) {} + + /** + * Statistics for CPU usage + * + * @param onlineCpus CPU cores + * @param systemCpuUsage Total CPU time (in µs) spent executing all the processes on this host + * @param totalUsage Total CPU time (in µs) spent running all the processes in this container + * @param usageInKernelMode Total CPU time (in µs) spent in kernel mode while executing processes in this container + * @param throttledTime Total CPU time (in µs) processes in this container were throttled for + * @param throttlingActivePeriods Number of periods with throttling enabled for this container + * @param throttledPeriods Number of periods this container hit the throttling limit + */ + public record CpuStats(int onlineCpus, + long systemCpuUsage, + long totalUsage, + long usageInKernelMode, + long throttledTime, + long throttlingActivePeriods, + long throttledPeriods) {} + + /** + * GPU statistics + * + * @param deviceNumber GPU device number + * @param loadPercentage Load/utilization in % + * @param memoryTotalBytes Total memory, in bytes + * @param memoryUsedBytes Memory used, in bytes + */ + public record GpuStats(int deviceNumber, int loadPercentage, long memoryTotalBytes, long memoryUsedBytes) { + + public GpuStats { + Validation.requireAtLeast(deviceNumber, "deviceNumber", 0); + Validation.requireAtLeast(loadPercentage, "loadPercentage", 0); + Validation.requireAtLeast(memoryTotalBytes, "memoryTotalBytes", 0L); + Validation.requireAtLeast(memoryUsedBytes, "memoryUsedBytes", 0L); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java index 67956892898..c17f98b9c9d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java @@ -1,19 +1,24 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.container; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; +import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixUser; + import java.io.IOException; import java.io.UncheckedIOException; import java.nio.file.FileSystem; import java.nio.file.Files; import java.nio.file.NoSuchFileException; import java.nio.file.Path; +import java.time.Duration; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.stream.Stream; /** - * Collects CPU, memory and network statistics for a container. + * Collects CPU, GPU, memory and network statistics for a container. * * Uses same approach as runc: https://github.com/opencontainers/runc/tree/master/libcontainer/cgroups/fs * @@ -21,27 +26,30 @@ import java.util.Optional; */ class ContainerStatsCollector { + private final ContainerEngine containerEngine; private final CGroup cgroup; private final FileSystem fileSystem; private final int onlineCpus; - ContainerStatsCollector(CGroup cgroup, FileSystem fileSystem) { - this(cgroup, fileSystem, Runtime.getRuntime().availableProcessors()); + ContainerStatsCollector(ContainerEngine containerEngine, CGroup cgroup, FileSystem fileSystem) { + this(containerEngine, cgroup, fileSystem, Runtime.getRuntime().availableProcessors()); } - ContainerStatsCollector(CGroup cgroup, FileSystem fileSystem, int onlineCpus) { + ContainerStatsCollector(ContainerEngine containerEngine, CGroup cgroup, FileSystem fileSystem, int onlineCpus) { + this.containerEngine = Objects.requireNonNull(containerEngine); this.cgroup = Objects.requireNonNull(cgroup); this.fileSystem = Objects.requireNonNull(fileSystem); this.onlineCpus = onlineCpus; } /** Collect statistics for given container ID and PID */ - public Optional<ContainerStats> collect(ContainerId containerId, int pid, String iface) { + public Optional<ContainerStats> collect(NodeAgentContext context, ContainerId containerId, int pid, String iface) { try { ContainerStats.CpuStats cpuStats = collectCpuStats(containerId); ContainerStats.MemoryStats memoryStats = collectMemoryStats(containerId); Map<String, ContainerStats.NetworkStats> networkStats = Map.of(iface, collectNetworkStats(iface, pid)); - return Optional.of(new ContainerStats(networkStats, memoryStats, cpuStats)); + List<ContainerStats.GpuStats> gpuStats = collectGpuStats(context); + return Optional.of(new ContainerStats(networkStats, memoryStats, cpuStats, gpuStats)); } catch (NoSuchFileException ignored) { return Optional.empty(); // Container disappeared while we collected stats } catch (IOException e) { @@ -49,6 +57,31 @@ class ContainerStatsCollector { } } + private List<ContainerStats.GpuStats> collectGpuStats(NodeAgentContext context) { + boolean hasGpu = Files.exists(fileSystem.getPath("/dev/nvidia0")); + if (!hasGpu) { + return List.of(); + } + Stream<String> lines = containerEngine.execute(context, UnixUser.ROOT, Duration.ofSeconds(5), + "nvidia-smi", + "--query-gpu=index,utilization.gpu,memory.total,memory.free", + "--format=csv,noheader,nounits") + .getOutputLinesStream(); + return lines.map(ContainerStatsCollector::parseGpuStats).toList(); + } + + private static ContainerStats.GpuStats parseGpuStats(String s) { + String[] fields = fields(s, ",\\s*"); + if (fields.length < 4) throw new IllegalArgumentException("Could not parse GPU stats from '" + s + "'"); + int deviceNumber = Integer.parseInt(fields[0]); + int loadPercentage = Integer.parseInt(fields[1]); + long mega = 2 << 19; + long memoryTotalBytes = Long.parseLong(fields[2]) * mega; + long memoryFreeBytes = Long.parseLong(fields[3]) * mega; + long memoryUsedBytes = memoryTotalBytes - memoryFreeBytes; + return new ContainerStats.GpuStats(deviceNumber, loadPercentage, memoryTotalBytes, memoryUsedBytes); + } + private ContainerStats.CpuStats collectCpuStats(ContainerId containerId) throws IOException { Map<CGroup.CpuStatField, Long> cpuStats = cgroup.cpuStats(containerId); return new ContainerStats.CpuStats(onlineCpus, @@ -114,7 +147,11 @@ class ContainerStatsCollector { } private static String[] fields(String s) { - return s.split("\\s+"); + return fields(s, "\\s+"); + } + + private static String[] fields(String s, String regex) { + return s.trim().split(regex); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index 528383eb91e..f168523a1ef 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -123,7 +123,7 @@ public class NodeAdminImpl implements NodeAdmin { Optional<ContainerStats> containerStats = nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended); if (containerStats.isPresent()) { ++numContainers; - totalContainerMemoryBytes += containerStats.get().getMemoryStats().getUsage(); + totalContainerMemoryBytes += containerStats.get().memoryStats().usage(); } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java index 09590be42f8..af869786504 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixUser; import com.yahoo.vespa.hosted.node.admin.task.util.fs.ContainerPath; import com.yahoo.vespa.hosted.node.admin.task.util.process.CommandResult; +import com.yahoo.vespa.hosted.node.admin.task.util.process.TestTerminal; import java.nio.file.Path; import java.time.Duration; @@ -19,7 +20,6 @@ import java.util.Objects; import java.util.Optional; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; -import java.util.stream.Collectors; /** * @author mpolden @@ -30,6 +30,16 @@ public class ContainerEngineMock implements ContainerEngine { private final Map<String, ImageDownload> images = new ConcurrentHashMap<>(); private boolean asyncImageDownload = false; + private final TestTerminal terminal; + + public ContainerEngineMock() { + this(null); + } + + public ContainerEngineMock(TestTerminal terminal) { + this.terminal = terminal; + } + public ContainerEngineMock asyncImageDownload(boolean enabled) { this.asyncImageDownload = enabled; return this; @@ -139,12 +149,22 @@ public class ContainerEngineMock implements ContainerEngine { @Override public CommandResult execute(NodeAgentContext context, UnixUser user, Duration timeout, String... command) { - return new CommandResult(null, 0, ""); + if (terminal == null) { + return new CommandResult(null, 0, ""); + } + return terminal.newCommandLine(context) + .add(command) + .executeSilently(); } @Override public CommandResult executeInNetworkNamespace(NodeAgentContext context, String... command) { - return new CommandResult(null, 0, ""); + if (terminal == null) { + return new CommandResult(null, 0, ""); + } + return terminal.newCommandLine(context) + .add(command) + .executeSilently(); } @Override diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java index 79c7558ea9e..f852eb6235d 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java @@ -1,12 +1,19 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.container; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; +import com.yahoo.vespa.hosted.node.admin.task.util.process.TestTerminal; import com.yahoo.vespa.test.file.TestFileSystem; import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.FileSystem; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; import java.util.Map; import java.util.Optional; @@ -27,30 +34,54 @@ import static org.mockito.Mockito.when; */ public class ContainerStatsCollectorTest { + private final TestTerminal testTerminal = new TestTerminal(); + private final ContainerEngineMock containerEngine = new ContainerEngineMock(testTerminal); private final FileSystem fileSystem = TestFileSystem.create(); private final CGroup cgroup = mock(CGroup.class); + private final NodeAgentContext context = NodeAgentContextImpl.builder(NodeSpec.Builder.testSpec("c1").build()) + .fileSystem(TestFileSystem.create()) + .build(); @Test - void collect() throws IOException { - ContainerStatsCollector collector = new ContainerStatsCollector(cgroup, fileSystem, 24); + void collect() throws Exception { + ContainerStatsCollector collector = new ContainerStatsCollector(containerEngine, cgroup, fileSystem, 24); ContainerId containerId = new ContainerId("id1"); int containerPid = 42; - assertTrue(collector.collect(containerId, containerPid, "eth0").isEmpty(), "No stats found"); + assertTrue(collector.collect(context, containerId, containerPid, "eth0").isEmpty(), "No stats found"); mockMemoryStats(containerId); mockCpuStats(containerId); mockNetworkStats(containerPid); - Optional<ContainerStats> stats = collector.collect(containerId, containerPid, "eth0"); + Optional<ContainerStats> stats = collector.collect(context, containerId, containerPid, "eth0"); assertTrue(stats.isPresent()); assertEquals(new ContainerStats.CpuStats(24, 6049374780000L, 691675615472L, 262190000000L, 3L, 1L, 2L), - stats.get().getCpuStats()); + stats.get().cpuStats()); assertEquals(new ContainerStats.MemoryStats(470790144L, 1228017664L, 2147483648L), - stats.get().getMemoryStats()); + stats.get().memoryStats()); assertEquals(Map.of("eth0", new ContainerStats.NetworkStats(22280813L, 4L, 3L, 19859383L, 6L, 5L)), - stats.get().getNetworks()); + stats.get().networks()); + assertEquals(List.of(), stats.get().gpuStats()); + + mockGpuStats(); + stats = collector.collect(context, containerId, containerPid, "eth0"); + assertTrue(stats.isPresent()); + assertEquals(List.of(new ContainerStats.GpuStats(0, 35, 16106127360L, 6144655360L), + new ContainerStats.GpuStats(1, 67, 32212254720L, 19314769920L)), + stats.get().gpuStats()); + } + + private void mockGpuStats() throws IOException { + Path devPath = fileSystem.getPath("/dev"); + Files.createDirectories(devPath); + Files.createFile(devPath.resolve("nvidia0")); + testTerminal.expectCommand("nvidia-smi --query-gpu=index,utilization.gpu,memory.total,memory.free --format=csv,noheader,nounits 2>&1", 0, + """ + 0, 35, 15360, 9500 + 1, 67, 30720, 12300 + """); } private void mockNetworkStats(int pid) { |