From afb107c1c1222548cfeb08ad4591a99a74724226 Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Fri, 5 Aug 2022 12:59:17 +0200 Subject: Memory overhead metric --- .../hosted/node/admin/nodeadmin/NodeAdminImpl.java | 57 ++++++++++++++++------ .../hosted/node/admin/nodeadmin/ProcMeminfo.java | 12 +++++ .../node/admin/nodeadmin/ProcMeminfoReader.java | 42 ++++++++++++++++ .../hosted/node/admin/nodeagent/NodeAgent.java | 8 ++- .../node/admin/integration/ContainerTester.java | 7 ++- .../node/admin/nodeadmin/NodeAdminImplTest.java | 4 +- 6 files changed, 108 insertions(+), 22 deletions(-) create mode 100644 node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfo.java create mode 100644 node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfoReader.java diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index 6de4f8aede1..cdf2dcd514d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.hosted.node.admin.nodeadmin; +import com.yahoo.vespa.hosted.node.admin.container.ContainerStats; import com.yahoo.vespa.hosted.node.admin.container.metrics.Counter; import com.yahoo.vespa.hosted.node.admin.container.metrics.Dimensions; import com.yahoo.vespa.hosted.node.admin.container.metrics.Gauge; @@ -11,12 +12,14 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextManager; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentFactory; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentScheduler; +import java.nio.file.FileSystem; import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; @@ -42,24 +45,28 @@ public class NodeAdminImpl implements NodeAdmin { private Instant startOfFreezeConvergence; private final Map nodeAgentWithSchedulerByHostname = new ConcurrentHashMap<>(); + private final ProcMeminfoReader procMeminfoReader; private final Gauge jvmHeapUsed; private final Gauge jvmHeapFree; private final Gauge jvmHeapTotal; + private final Gauge memoryOverhead; + private final Gauge containerCount; private final Counter numberOfUnhandledExceptions; - public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, Clock clock) { + public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, Clock clock, FileSystem fileSystem) { this(nodeAgentContext -> create(clock, nodeAgentFactory, nodeAgentContext), - metrics, clock, NODE_AGENT_FREEZE_TIMEOUT, NODE_AGENT_SPREAD); + metrics, clock, NODE_AGENT_FREEZE_TIMEOUT, NODE_AGENT_SPREAD, new ProcMeminfoReader(fileSystem)); } public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, - Clock clock, Duration freezeTimeout, Duration spread) { + Clock clock, Duration freezeTimeout, Duration spread, ProcMeminfoReader 
procMeminfoReader) { this(nodeAgentContext -> create(clock, nodeAgentFactory, nodeAgentContext), - metrics, clock, freezeTimeout, spread); + metrics, clock, freezeTimeout, spread, procMeminfoReader); } NodeAdminImpl(NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory, - Metrics metrics, Clock clock, Duration freezeTimeout, Duration spread) { + Metrics metrics, Clock clock, Duration freezeTimeout, Duration spread, + ProcMeminfoReader procMeminfoReader) { this.nodeAgentWithSchedulerFactory = nodeAgentWithSchedulerFactory; this.clock = clock; this.freezeTimeout = freezeTimeout; @@ -71,9 +78,12 @@ public class NodeAdminImpl implements NodeAdmin { this.numberOfUnhandledExceptions = metrics.declareCounter("unhandled_exceptions", new Dimensions(Map.of("src", "node-agents"))); + this.procMeminfoReader = procMeminfoReader; this.jvmHeapUsed = metrics.declareGauge("mem.heap.used"); this.jvmHeapFree = metrics.declareGauge("mem.heap.free"); this.jvmHeapTotal = metrics.declareGauge("mem.heap.total"); + this.memoryOverhead = metrics.declareGauge("mem.system.overhead"); + this.containerCount = metrics.declareGauge("container.count"); } @Override @@ -103,21 +113,36 @@ public class NodeAdminImpl implements NodeAdmin { @Override public void updateMetrics(boolean isSuspended) { + long numContainers = 0; + long totalContainerMemoryBytes = 0; + final long invalidTotalContainerMemoryBytes = -1; + for (NodeAgentWithScheduler nodeAgentWithScheduler : nodeAgentWithSchedulerByHostname.values()) { + ++numContainers; int count = nodeAgentWithScheduler.getAndResetNumberOfUnhandledExceptions(); if (!isSuspended) numberOfUnhandledExceptions.add(count); - nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended); + Optional containerStats = nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended); + if (totalContainerMemoryBytes != invalidTotalContainerMemoryBytes) { + if (containerStats.isPresent()) { + totalContainerMemoryBytes += 
containerStats.get().getMemoryStats().getUsage(); + } else { + totalContainerMemoryBytes = invalidTotalContainerMemoryBytes; + } + } } - if (!isSuspended) { - Runtime runtime = Runtime.getRuntime(); - runtime.gc(); - long freeMemory = runtime.freeMemory(); - long totalMemory = runtime.totalMemory(); - long usedMemory = totalMemory - freeMemory; - jvmHeapFree.sample(freeMemory); - jvmHeapUsed.sample(usedMemory); - jvmHeapTotal.sample(totalMemory); + Runtime runtime = Runtime.getRuntime(); + runtime.gc(); + long freeMemory = runtime.freeMemory(); + long totalMemory = runtime.totalMemory(); + long usedMemory = totalMemory - freeMemory; + jvmHeapFree.sample(freeMemory); + jvmHeapUsed.sample(usedMemory); + jvmHeapTotal.sample(totalMemory); + containerCount.sample(numContainers); + if (totalContainerMemoryBytes != invalidTotalContainerMemoryBytes) { + ProcMeminfo meminfo = procMeminfoReader.read(); + memoryOverhead.sample(meminfo.memTotalBytes() - meminfo.memAvailableBytes() - totalContainerMemoryBytes); } } @@ -206,7 +231,7 @@ public class NodeAdminImpl implements NodeAdmin { void start() { nodeAgent.start(currentContext()); } void stopForHostSuspension() { nodeAgent.stopForHostSuspension(currentContext()); } void stopForRemoval() { nodeAgent.stopForRemoval(currentContext()); } - void updateContainerNodeMetrics(boolean isSuspended) { nodeAgent.updateContainerNodeMetrics(currentContext(), isSuspended); } + Optional updateContainerNodeMetrics(boolean isSuspended) { return nodeAgent.updateContainerNodeMetrics(currentContext(), isSuspended); } int getAndResetNumberOfUnhandledExceptions() { return nodeAgent.getAndResetNumberOfUnhandledExceptions(); } @Override public void scheduleTickWith(NodeAgentContext context, Instant at) { nodeAgentScheduler.scheduleTickWith(context, at); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfo.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfo.java new file 
mode 100644 index 00000000000..a1f750a34e3 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfo.java @@ -0,0 +1,12 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeadmin; + +/** + * Represents /proc/meminfo, see proc(5). + * + * @param memTotalBytes Total usable RAM (i.e., physical RAM minus a few reserved bits and the kernel binary code). + * @param memAvailableBytes An estimate of how much memory is available for starting new applications, without swapping. + * + * @author hakon + */ +public record ProcMeminfo(long memTotalBytes, long memAvailableBytes) { } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfoReader.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfoReader.java new file mode 100644 index 00000000000..17abe6c7b46 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfoReader.java @@ -0,0 +1,42 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeadmin; + +import com.yahoo.yolean.Exceptions; + +import java.nio.file.FileSystem; +import java.nio.file.Files; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Reads /proc/meminfo, see proc(5). 
+ * + * @author hakon + */ +public class ProcMeminfoReader { + private static final String PROC_MEMINFO = "/proc/meminfo"; + private static final Pattern MEM_TOTAL_PATTERN = Pattern.compile("MemTotal: *([0-9]+) kB"); + private static final Pattern MEM_AVAILABLE_PATTERN = Pattern.compile("MemAvailable: *([0-9]+) kB"); + + private final FileSystem fileSystem; + + public ProcMeminfoReader(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + + public ProcMeminfo read() { + return read(Exceptions.uncheck(() -> Files.readString(fileSystem.getPath(PROC_MEMINFO)))); + } + + static ProcMeminfo read(String meminfoContent) { + return new ProcMeminfo(readKbGroup(meminfoContent, MEM_TOTAL_PATTERN), + readKbGroup(meminfoContent, MEM_AVAILABLE_PATTERN)); + } + + private static long readKbGroup(String string, Pattern pattern) { + Matcher matcher = pattern.matcher(string); + if (!matcher.find()) + throw new IllegalArgumentException(pattern + " did not match anything in " + PROC_MEMINFO); + return Long.parseLong(matcher.group(1)) * 1024; + } +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index 5b9bcee83bf..18c981fdf17 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -1,6 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeagent; +import com.yahoo.vespa.hosted.node.admin.container.ContainerStats; + +import java.util.Optional; + /** * Responsible for management of a single node over its lifecycle. * May own its own resources, threads etc. 
Runs independently, but receives signals @@ -28,9 +32,9 @@ public interface NodeAgent { void stopForRemoval(NodeAgentContext context); /** - * Updates metric receiver with the latest node-agent stats + * Updates metric receiver with the latest node-agent stats, and returns the container stats if available. */ - default void updateContainerNodeMetrics(NodeAgentContext context, boolean isSuspended) {} + default Optional updateContainerNodeMetrics(NodeAgentContext context, boolean isSuspended) { return Optional.empty(); } /** * Returns and resets number of unhandled exceptions diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integration/ContainerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integration/ContainerTester.java index a2312a23925..1773eb4be25 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integration/ContainerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integration/ContainerTester.java @@ -17,6 +17,8 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.maintenance.servicedump.VespaServiceDumper; import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminImpl; import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.ProcMeminfo; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.ProcMeminfoReader; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextFactory; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; @@ -36,7 +38,6 @@ import java.util.Optional; import java.util.concurrent.Phaser; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Logger; import static org.mockito.ArgumentMatchers.any; @@ -87,6 +88,8 @@ 
public class ContainerTester implements AutoCloseable { Clock clock = Clock.systemUTC(); Metrics metrics = new Metrics(); FileSystem fileSystem = TestFileSystem.create(); + ProcMeminfoReader procMeminfoReader = mock(ProcMeminfoReader.class); + when(procMeminfoReader.read()).thenReturn(new ProcMeminfo(1, 2)); NodeAgentFactory nodeAgentFactory = (contextSupplier, nodeContext) -> new NodeAgentImpl(contextSupplier, nodeRepository, orchestrator, containerOperations, () -> RegistryCredentials.none, @@ -106,7 +109,7 @@ public class ContainerTester implements AutoCloseable { phaser.arriveAndDeregister(); } }; - nodeAdmin = new NodeAdminImpl(nodeAgentFactory, metrics, clock, Duration.ofMillis(10), Duration.ZERO); + nodeAdmin = new NodeAdminImpl(nodeAgentFactory, metrics, clock, Duration.ofMillis(10), Duration.ZERO, procMeminfoReader); NodeAgentContextFactory nodeAgentContextFactory = (nodeSpec, acl) -> NodeAgentContextImpl.builder(nodeSpec).acl(acl).fileSystem(fileSystem).build(); nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeAgentContextFactory, nodeRepository, orchestrator, diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java index 8504a724417..96c18517bfe 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImplTest.java @@ -35,9 +35,9 @@ public class NodeAdminImplTest { private final NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory = mock(NodeAgentWithSchedulerFactory.class); private final ManualClock clock = new ManualClock(); - + private final ProcMeminfoReader procMeminfoReader = mock(ProcMeminfoReader.class); private final NodeAdminImpl nodeAdmin = new NodeAdminImpl(nodeAgentWithSchedulerFactory, - new Metrics(), clock, Duration.ZERO, Duration.ZERO); + new Metrics(), 
clock, Duration.ZERO, Duration.ZERO, procMeminfoReader); @Test void nodeAgentsAreProperlyLifeCycleManaged() { -- cgit v1.2.3 From e77c830792a9e6a452e6f632e90afb1c3f39ab88 Mon Sep 17 00:00:00 2001 From: Håkon Hallingstad Date: Fri, 5 Aug 2022 15:16:29 +0200 Subject: true --- .../hosted/node/admin/nodeadmin/NodeAdminImpl.java | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index cdf2dcd514d..a371cdcde25 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -115,19 +115,14 @@ public class NodeAdminImpl implements NodeAdmin { public void updateMetrics(boolean isSuspended) { long numContainers = 0; long totalContainerMemoryBytes = 0; - final long invalidTotalContainerMemoryBytes = -1; for (NodeAgentWithScheduler nodeAgentWithScheduler : nodeAgentWithSchedulerByHostname.values()) { - ++numContainers; int count = nodeAgentWithScheduler.getAndResetNumberOfUnhandledExceptions(); if (!isSuspended) numberOfUnhandledExceptions.add(count); Optional containerStats = nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended); - if (totalContainerMemoryBytes != invalidTotalContainerMemoryBytes) { - if (containerStats.isPresent()) { - totalContainerMemoryBytes += containerStats.get().getMemoryStats().getUsage(); - } else { - totalContainerMemoryBytes = invalidTotalContainerMemoryBytes; - } + if (containerStats.isPresent()) { + ++numContainers; + totalContainerMemoryBytes += containerStats.get().getMemoryStats().getUsage(); } } @@ -139,8 +134,10 @@ public class NodeAdminImpl implements NodeAdmin { jvmHeapFree.sample(freeMemory); jvmHeapUsed.sample(usedMemory); jvmHeapTotal.sample(totalMemory); - 
containerCount.sample(numContainers); - if (totalContainerMemoryBytes != invalidTotalContainerMemoryBytes) { + + // Container stats are not available while suspended, so skip sampling the container count and memory overhead in that case. + if (!isSuspended) { + containerCount.sample(numContainers); ProcMeminfo meminfo = procMeminfoReader.read(); memoryOverhead.sample(meminfo.memTotalBytes() - meminfo.memAvailableBytes() - totalContainerMemoryBytes); } -- cgit v1.2.3