diff options
author | Martin Polden <mpolden@mpolden.no> | 2023-02-02 14:17:44 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2023-02-02 15:11:22 +0100 |
commit | 7d3d6ee73bc31ea39ef49586cbcff61aa2b4b956 (patch) | |
tree | 9873768cb3b5999a03ddfcb6422e95994fe78a76 /node-admin | |
parent | 38fe98240c0bf63b9fbe08af2428f0a5ba8e0731 (diff) |
Collect GPU metrics
Diffstat (limited to 'node-admin')
6 files changed, 140 insertions, 56 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java index c1c08c15740..f131aca2db0 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerOperations.java @@ -18,7 +18,6 @@ import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.Set; -import java.util.stream.Collectors; /** * High-level interface for container operations. Code managing containers should use this and not @@ -38,7 +37,7 @@ public class ContainerOperations { this.containerEngine = Objects.requireNonNull(containerEngine); this.imageDownloader = new ContainerImageDownloader(containerEngine); this.imagePruner = new ContainerImagePruner(containerEngine, Clock.systemUTC()); - this.containerStatsCollector = new ContainerStatsCollector(cgroup, fileSystem); + this.containerStatsCollector = new ContainerStatsCollector(containerEngine, cgroup, fileSystem); } public ContainerData createContainer(NodeAgentContext context, ContainerResources containerResources) { @@ -112,13 +111,13 @@ public class ContainerOperations { /** Get container statistics */ public Optional<ContainerStats> getContainerStats(NodeAgentContext context) { String iface = containerEngine.networkInterface(context); - return getContainer(context).flatMap(container -> containerStatsCollector.collect(container.id(), container.pid(), iface)); + return getContainer(context).flatMap(container -> containerStatsCollector.collect(context, container.id(), container.pid(), iface)); } /** Returns true if no containers managed by node-admin are running */ public boolean noManagedContainersRunning(TaskContext context) { return containerEngine.listContainers(context).stream() - .filter(c -> c.managed()) + .filter(PartialContainer::managed) .noneMatch(container -> container.state() == Container.State.running); } @@ -129,7 +128,7 @@ public class ContainerOperations { */ public boolean retainManagedContainers(TaskContext context, Set<ContainerName> containerNames) { return containerEngine.listContainers(context).stream() - .filter(c -> c.managed()) + .filter(PartialContainer::managed) .filter(container -> !containerNames.contains(container.name())) .peek(container -> containerEngine.removeContainer(context, container)) .count() > 0; diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java index bb54c65c3f4..a5606784c12 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStats.java @@ -1,51 +1,29 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.container; +import ai.vespa.validation.Validation; + import java.util.Collections; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; import java.util.Objects; /** - * CPU, memory and network statistics collected from a container. + * CPU, GPU, memory and network statistics collected from a container. * * @author freva */ -public class ContainerStats { - - private final Map<String, NetworkStats> networkStatsByInterface; - private final MemoryStats memoryStats; - private final CpuStats cpuStats; +public record ContainerStats(Map<String, NetworkStats> networks, + MemoryStats memoryStats, + CpuStats cpuStats, + List<GpuStats> gpuStats) { - public ContainerStats(Map<String, NetworkStats> networkStatsByInterface, MemoryStats memoryStats, CpuStats cpuStats) { - this.networkStatsByInterface = new LinkedHashMap<>(Objects.requireNonNull(networkStatsByInterface)); + public ContainerStats(Map<String, NetworkStats> networks, MemoryStats memoryStats, CpuStats cpuStats, List<GpuStats> gpuStats) { + this.networks = Collections.unmodifiableMap(new LinkedHashMap<>(Objects.requireNonNull(networks))); this.memoryStats = Objects.requireNonNull(memoryStats); this.cpuStats = Objects.requireNonNull(cpuStats); - } - - public Map<String, NetworkStats> getNetworks() { - return Collections.unmodifiableMap(networkStatsByInterface); - } - - public MemoryStats getMemoryStats() { - return memoryStats; - } - - public CpuStats getCpuStats() { - return cpuStats; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ContainerStats that = (ContainerStats) o; - return networkStatsByInterface.equals(that.networkStatsByInterface) && memoryStats.equals(that.memoryStats) && cpuStats.equals(that.cpuStats); - } - - @Override - public int hashCode() { - return Objects.hash(networkStatsByInterface, memoryStats, cpuStats); + this.gpuStats = List.copyOf(Objects.requireNonNull(gpuStats)); } /** @@ -88,4 +66,23 @@ public class ContainerStats { long throttlingActivePeriods, long throttledPeriods) {} + /** + * GPU statistics + * + * @param deviceNumber GPU device number + * @param loadPercentage Load/utilization in % + * @param memoryTotalBytes Total memory, in bytes + * @param memoryUsedBytes Memory used, in bytes + */ + public record GpuStats(int deviceNumber, int loadPercentage, long memoryTotalBytes, long memoryUsedBytes) { + + public GpuStats { + Validation.requireAtLeast(deviceNumber, "deviceNumber", 0); + Validation.requireAtLeast(loadPercentage, "loadPercentage", 0); + Validation.requireAtLeast(memoryTotalBytes, "memoryTotalBytes", 0L); + Validation.requireAtLeast(memoryUsedBytes, "memoryUsedBytes", 0L); + } + + } + } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java index 67956892898..c17f98b9c9d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java @@ -1,19 +1,24 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.container; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; +import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixUser; + import java.io.IOException; import java.io.UncheckedIOException; import java.nio.file.FileSystem; import java.nio.file.Files; import java.nio.file.NoSuchFileException; import java.nio.file.Path; +import java.time.Duration; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.stream.Stream; /** - * Collects CPU, memory and network statistics for a container. + * Collects CPU, GPU, memory and network statistics for a container. * * Uses same approach as runc: https://github.com/opencontainers/runc/tree/master/libcontainer/cgroups/fs * @@ -21,27 +26,30 @@ import java.util.Optional; */ class ContainerStatsCollector { + private final ContainerEngine containerEngine; private final CGroup cgroup; private final FileSystem fileSystem; private final int onlineCpus; - ContainerStatsCollector(CGroup cgroup, FileSystem fileSystem) { - this(cgroup, fileSystem, Runtime.getRuntime().availableProcessors()); + ContainerStatsCollector(ContainerEngine containerEngine, CGroup cgroup, FileSystem fileSystem) { + this(containerEngine, cgroup, fileSystem, Runtime.getRuntime().availableProcessors()); } - ContainerStatsCollector(CGroup cgroup, FileSystem fileSystem, int onlineCpus) { + ContainerStatsCollector(ContainerEngine containerEngine, CGroup cgroup, FileSystem fileSystem, int onlineCpus) { + this.containerEngine = Objects.requireNonNull(containerEngine); this.cgroup = Objects.requireNonNull(cgroup); this.fileSystem = Objects.requireNonNull(fileSystem); this.onlineCpus = onlineCpus; } /** Collect statistics for given container ID and PID */ - public Optional<ContainerStats> collect(ContainerId containerId, int pid, String iface) { + public Optional<ContainerStats> collect(NodeAgentContext context, ContainerId containerId, int pid, String iface) { try { ContainerStats.CpuStats cpuStats = collectCpuStats(containerId); ContainerStats.MemoryStats memoryStats = collectMemoryStats(containerId); Map<String, ContainerStats.NetworkStats> networkStats = Map.of(iface, collectNetworkStats(iface, pid)); - return Optional.of(new ContainerStats(networkStats, memoryStats, cpuStats)); + List<ContainerStats.GpuStats> gpuStats = collectGpuStats(context); + return Optional.of(new ContainerStats(networkStats, memoryStats, cpuStats, gpuStats)); } catch (NoSuchFileException ignored) { return Optional.empty(); // Container disappeared while we collected stats } catch (IOException e) { @@ -49,6 +57,31 @@ class ContainerStatsCollector { } } + private List<ContainerStats.GpuStats> collectGpuStats(NodeAgentContext context) { + boolean hasGpu = Files.exists(fileSystem.getPath("/dev/nvidia0")); + if (!hasGpu) { + return List.of(); + } + Stream<String> lines = containerEngine.execute(context, UnixUser.ROOT, Duration.ofSeconds(5), + "nvidia-smi", + "--query-gpu=index,utilization.gpu,memory.total,memory.free", + "--format=csv,noheader,nounits") + .getOutputLinesStream(); + return lines.map(ContainerStatsCollector::parseGpuStats).toList(); + } + + private static ContainerStats.GpuStats parseGpuStats(String s) { + String[] fields = fields(s, ",\\s*"); + if (fields.length < 4) throw new IllegalArgumentException("Could not parse GPU stats from '" + s + "'"); + int deviceNumber = Integer.parseInt(fields[0]); + int loadPercentage = Integer.parseInt(fields[1]); + long mega = 2 << 19; + long memoryTotalBytes = Long.parseLong(fields[2]) * mega; + long memoryFreeBytes = Long.parseLong(fields[3]) * mega; + long memoryUsedBytes = memoryTotalBytes - memoryFreeBytes; + return new ContainerStats.GpuStats(deviceNumber, loadPercentage, memoryTotalBytes, memoryUsedBytes); + } + private ContainerStats.CpuStats collectCpuStats(ContainerId containerId) throws IOException { Map<CGroup.CpuStatField, Long> cpuStats = cgroup.cpuStats(containerId); return new ContainerStats.CpuStats(onlineCpus, @@ -114,7 +147,11 @@ class ContainerStatsCollector { } private static String[] fields(String s) { - return s.split("\\s+"); + return fields(s, "\\s+"); + } + + private static String[] fields(String s, String regex) { + return s.trim().split(regex); } } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index b6b5e04c415..f168523a1ef 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -123,7 +123,7 @@ public class NodeAdminImpl implements NodeAdmin { Optional<ContainerStats> containerStats = nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended); if (containerStats.isPresent()) { ++numContainers; - totalContainerMemoryBytes += containerStats.get().getMemoryStats().usage(); + totalContainerMemoryBytes += containerStats.get().memoryStats().usage(); } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java index 09590be42f8..af869786504 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerEngineMock.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixUser; import com.yahoo.vespa.hosted.node.admin.task.util.fs.ContainerPath; import com.yahoo.vespa.hosted.node.admin.task.util.process.CommandResult; +import com.yahoo.vespa.hosted.node.admin.task.util.process.TestTerminal; import java.nio.file.Path; import java.time.Duration; @@ -19,7 +20,6 @@ import java.util.Objects; import java.util.Optional; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; -import java.util.stream.Collectors; /** * @author mpolden @@ -30,6 +30,16 @@ public class ContainerEngineMock implements ContainerEngine { private final Map<String, ImageDownload> images = new ConcurrentHashMap<>(); private boolean asyncImageDownload = false; + private final TestTerminal terminal; + + public ContainerEngineMock() { + this(null); + } + + public ContainerEngineMock(TestTerminal terminal) { + this.terminal = terminal; + } + public ContainerEngineMock asyncImageDownload(boolean enabled) { this.asyncImageDownload = enabled; return this; @@ -139,12 +149,22 @@ public class ContainerEngineMock implements ContainerEngine { @Override public CommandResult execute(NodeAgentContext context, UnixUser user, Duration timeout, String... command) { - return new CommandResult(null, 0, ""); + if (terminal == null) { + return new CommandResult(null, 0, ""); + } + return terminal.newCommandLine(context) + .add(command) + .executeSilently(); } @Override public CommandResult executeInNetworkNamespace(NodeAgentContext context, String... command) { - return new CommandResult(null, 0, ""); + if (terminal == null) { + return new CommandResult(null, 0, ""); + } + return terminal.newCommandLine(context) + .add(command) + .executeSilently(); } @Override diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java index 79c7558ea9e..f852eb6235d 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollectorTest.java @@ -1,12 +1,19 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.container; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; +import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; +import com.yahoo.vespa.hosted.node.admin.task.util.process.TestTerminal; import com.yahoo.vespa.test.file.TestFileSystem; import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.FileSystem; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; import java.util.Map; import java.util.Optional; @@ -27,30 +34,54 @@ import static org.mockito.Mockito.when; */ public class ContainerStatsCollectorTest { + private final TestTerminal testTerminal = new TestTerminal(); + private final ContainerEngineMock containerEngine = new ContainerEngineMock(testTerminal); private final FileSystem fileSystem = TestFileSystem.create(); private final CGroup cgroup = mock(CGroup.class); + private final NodeAgentContext context = NodeAgentContextImpl.builder(NodeSpec.Builder.testSpec("c1").build()) + .fileSystem(TestFileSystem.create()) + .build(); @Test - void collect() throws IOException { - ContainerStatsCollector collector = new ContainerStatsCollector(cgroup, fileSystem, 24); + void collect() throws Exception { + ContainerStatsCollector collector = new ContainerStatsCollector(containerEngine, cgroup, fileSystem, 24); ContainerId containerId = new ContainerId("id1"); int containerPid = 42; - assertTrue(collector.collect(containerId, containerPid, "eth0").isEmpty(), "No stats found"); + assertTrue(collector.collect(context, containerId, containerPid, "eth0").isEmpty(), "No stats found"); mockMemoryStats(containerId); mockCpuStats(containerId); mockNetworkStats(containerPid); - Optional<ContainerStats> stats = collector.collect(containerId, containerPid, "eth0"); + Optional<ContainerStats> stats = collector.collect(context, containerId, containerPid, "eth0"); assertTrue(stats.isPresent()); assertEquals(new ContainerStats.CpuStats(24, 6049374780000L, 691675615472L, 262190000000L, 3L, 1L, 2L), - stats.get().getCpuStats()); + stats.get().cpuStats()); assertEquals(new ContainerStats.MemoryStats(470790144L, 1228017664L, 2147483648L), - stats.get().getMemoryStats()); + stats.get().memoryStats()); assertEquals(Map.of("eth0", new ContainerStats.NetworkStats(22280813L, 4L, 3L, 19859383L, 6L, 5L)), - stats.get().getNetworks()); + stats.get().networks()); + assertEquals(List.of(), stats.get().gpuStats()); + + mockGpuStats(); + stats = collector.collect(context, containerId, containerPid, "eth0"); + assertTrue(stats.isPresent()); + assertEquals(List.of(new ContainerStats.GpuStats(0, 35, 16106127360L, 6144655360L), + new ContainerStats.GpuStats(1, 67, 32212254720L, 19314769920L)), + stats.get().gpuStats()); + } + + private void mockGpuStats() throws IOException { + Path devPath = fileSystem.getPath("/dev"); + Files.createDirectories(devPath); + Files.createFile(devPath.resolve("nvidia0")); + testTerminal.expectCommand("nvidia-smi --query-gpu=index,utilization.gpu,memory.total,memory.free --format=csv,noheader,nounits 2>&1", 0, + """ + 0, 35, 15360, 9500 + 1, 67, 30720, 12300 + """); } private void mockNetworkStats(int pid) { |