diff options
Diffstat (limited to 'node-admin/src/main/java/com/yahoo/vespa/hosted')
4 files changed, 97 insertions, 17 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index 6de4f8aede1..a371cdcde25 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeadmin; +import com.yahoo.vespa.hosted.node.admin.container.ContainerStats; import com.yahoo.vespa.hosted.node.admin.container.metrics.Counter; import com.yahoo.vespa.hosted.node.admin.container.metrics.Dimensions; import com.yahoo.vespa.hosted.node.admin.container.metrics.Gauge; @@ -11,12 +12,14 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextManager; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentFactory; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentScheduler; +import java.nio.file.FileSystem; import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; @@ -42,24 +45,28 @@ public class NodeAdminImpl implements NodeAdmin { private Instant startOfFreezeConvergence; private final Map<String, NodeAgentWithScheduler> nodeAgentWithSchedulerByHostname = new ConcurrentHashMap<>(); + private final ProcMeminfoReader procMeminfoReader; private final Gauge jvmHeapUsed; private final Gauge jvmHeapFree; private final Gauge jvmHeapTotal; + private final Gauge memoryOverhead; + private final Gauge containerCount; private final Counter numberOfUnhandledExceptions; - public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, Clock clock) { + public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, Clock clock, FileSystem fileSystem) { this(nodeAgentContext -> create(clock, nodeAgentFactory, nodeAgentContext), - metrics, clock, NODE_AGENT_FREEZE_TIMEOUT, NODE_AGENT_SPREAD); + metrics, clock, NODE_AGENT_FREEZE_TIMEOUT, NODE_AGENT_SPREAD, new ProcMeminfoReader(fileSystem)); } public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, - Clock clock, Duration freezeTimeout, Duration spread) { + Clock clock, Duration freezeTimeout, Duration spread, ProcMeminfoReader procMeminfoReader) { this(nodeAgentContext -> create(clock, nodeAgentFactory, nodeAgentContext), - metrics, clock, freezeTimeout, spread); + metrics, clock, freezeTimeout, spread, procMeminfoReader); } NodeAdminImpl(NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory, - Metrics metrics, Clock clock, Duration freezeTimeout, Duration spread) { + Metrics metrics, Clock clock, Duration freezeTimeout, Duration spread, + ProcMeminfoReader procMeminfoReader) { this.nodeAgentWithSchedulerFactory = nodeAgentWithSchedulerFactory; this.clock = clock; this.freezeTimeout = freezeTimeout; @@ -71,9 +78,12 @@ public class NodeAdminImpl implements NodeAdmin { this.numberOfUnhandledExceptions = metrics.declareCounter("unhandled_exceptions", new Dimensions(Map.of("src", "node-agents"))); + this.procMeminfoReader = procMeminfoReader; this.jvmHeapUsed = metrics.declareGauge("mem.heap.used"); this.jvmHeapFree = metrics.declareGauge("mem.heap.free"); this.jvmHeapTotal = metrics.declareGauge("mem.heap.total"); + this.memoryOverhead = metrics.declareGauge("mem.system.overhead"); + this.containerCount = metrics.declareGauge("container.count"); } @Override @@ -103,21 +113,33 @@ public class NodeAdminImpl implements NodeAdmin { @Override public void updateMetrics(boolean isSuspended) { + long numContainers = 0; + long totalContainerMemoryBytes = 0; + for (NodeAgentWithScheduler nodeAgentWithScheduler : nodeAgentWithSchedulerByHostname.values()) { int count = nodeAgentWithScheduler.getAndResetNumberOfUnhandledExceptions(); if (!isSuspended) numberOfUnhandledExceptions.add(count); - nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended); + Optional<ContainerStats> containerStats = nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended); + if (containerStats.isPresent()) { + ++numContainers; + totalContainerMemoryBytes += containerStats.get().getMemoryStats().getUsage(); + } } + Runtime runtime = Runtime.getRuntime(); + runtime.gc(); + long freeMemory = runtime.freeMemory(); + long totalMemory = runtime.totalMemory(); + long usedMemory = totalMemory - freeMemory; + jvmHeapFree.sample(freeMemory); + jvmHeapUsed.sample(usedMemory); + jvmHeapTotal.sample(totalMemory); + + // No container stats are found while suspended, so skip setting these if so. if (!isSuspended) { - Runtime runtime = Runtime.getRuntime(); - runtime.gc(); - long freeMemory = runtime.freeMemory(); - long totalMemory = runtime.totalMemory(); - long usedMemory = totalMemory - freeMemory; - jvmHeapFree.sample(freeMemory); - jvmHeapUsed.sample(usedMemory); - jvmHeapTotal.sample(totalMemory); + containerCount.sample(numContainers); + ProcMeminfo meminfo = procMeminfoReader.read(); + memoryOverhead.sample(meminfo.memTotalBytes() - meminfo.memAvailableBytes() - totalContainerMemoryBytes); } } @@ -206,7 +228,7 @@ public class NodeAdminImpl implements NodeAdmin { void start() { nodeAgent.start(currentContext()); } void stopForHostSuspension() { nodeAgent.stopForHostSuspension(currentContext()); } void stopForRemoval() { nodeAgent.stopForRemoval(currentContext()); } - void updateContainerNodeMetrics(boolean isSuspended) { nodeAgent.updateContainerNodeMetrics(currentContext(), isSuspended); } + Optional<ContainerStats> updateContainerNodeMetrics(boolean isSuspended) { return nodeAgent.updateContainerNodeMetrics(currentContext(), isSuspended); } int getAndResetNumberOfUnhandledExceptions() { return nodeAgent.getAndResetNumberOfUnhandledExceptions(); } @Override public void scheduleTickWith(NodeAgentContext context, Instant at) { nodeAgentScheduler.scheduleTickWith(context, at); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfo.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfo.java new file mode 100644 index 00000000000..a1f750a34e3 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfo.java @@ -0,0 +1,12 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeadmin; + +/** + * Represents /proc/meminfo, see proc(5). + * + * @param memTotalBytes Total usable RAM (i.e., physical RAM minus a few reserved bits and the kernel binary code). + * @param memAvailableBytes An estimate of how much memory is available for starting new applications, without swapping. + * + * @author hakon + */ +public record ProcMeminfo(long memTotalBytes, long memAvailableBytes) { } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfoReader.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfoReader.java new file mode 100644 index 00000000000..17abe6c7b46 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/ProcMeminfoReader.java @@ -0,0 +1,42 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeadmin; + +import com.yahoo.yolean.Exceptions; + +import java.nio.file.FileSystem; +import java.nio.file.Files; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Reads /proc/meminfo, see proc(5). + * + * @author hakon + */ +public class ProcMeminfoReader { + private static final String PROC_MEMINFO = "/proc/meminfo"; + private static final Pattern MEM_TOTAL_PATTERN = Pattern.compile("MemTotal: *([0-9]+) kB"); + private static final Pattern MEM_AVAILABLE_PATTERN = Pattern.compile("MemAvailable: *([0-9]+) kB"); + + private final FileSystem fileSystem; + + public ProcMeminfoReader(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + + public ProcMeminfo read() { + return read(Exceptions.uncheck(() -> Files.readString(fileSystem.getPath(PROC_MEMINFO)))); + } + + static ProcMeminfo read(String meminfoContent) { + return new ProcMeminfo(readKbGroup(meminfoContent, MEM_TOTAL_PATTERN), + readKbGroup(meminfoContent, MEM_AVAILABLE_PATTERN)); + } + + private static long readKbGroup(String string, Pattern pattern) { + Matcher matcher = pattern.matcher(string); + if (!matcher.find()) + throw new IllegalArgumentException(pattern + " did not match anything in " + PROC_MEMINFO); + return Long.parseLong(matcher.group(1)) * 1024; + } +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index 5b9bcee83bf..18c981fdf17 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -1,6 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeagent; +import com.yahoo.vespa.hosted.node.admin.container.ContainerStats; + +import java.util.Optional; + /** * Responsible for management of a single node over its lifecycle. * May own its own resources, threads etc. Runs independently, but receives signals @@ -28,9 +32,9 @@ public interface NodeAgent { void stopForRemoval(NodeAgentContext context); /** - * Updates metric receiver with the latest node-agent stats + * Updates metric receiver with the latest node-agent stats, and returns the container stats if available. */ - default void updateContainerNodeMetrics(NodeAgentContext context, boolean isSuspended) {} + default Optional<ContainerStats> updateContainerNodeMetrics(NodeAgentContext context, boolean isSuspended) { return Optional.empty(); } /** * Returns and resets number of unhandled exceptions |