summaryrefslogtreecommitdiffstats
path: root/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
diff options
context:
space:
mode:
Diffstat (limited to 'node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java52
1 files changed, 37 insertions, 15 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index 6de4f8aede1..a371cdcde25 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.nodeadmin;
+import com.yahoo.vespa.hosted.node.admin.container.ContainerStats;
import com.yahoo.vespa.hosted.node.admin.container.metrics.Counter;
import com.yahoo.vespa.hosted.node.admin.container.metrics.Dimensions;
import com.yahoo.vespa.hosted.node.admin.container.metrics.Gauge;
@@ -11,12 +12,14 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextManager;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentFactory;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentScheduler;
+import java.nio.file.FileSystem;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
@@ -42,24 +45,28 @@ public class NodeAdminImpl implements NodeAdmin {
private Instant startOfFreezeConvergence;
private final Map<String, NodeAgentWithScheduler> nodeAgentWithSchedulerByHostname = new ConcurrentHashMap<>();
+ private final ProcMeminfoReader procMeminfoReader;
private final Gauge jvmHeapUsed;
private final Gauge jvmHeapFree;
private final Gauge jvmHeapTotal;
+ private final Gauge memoryOverhead;
+ private final Gauge containerCount;
private final Counter numberOfUnhandledExceptions;
- public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, Clock clock) {
+ public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics, Clock clock, FileSystem fileSystem) {
this(nodeAgentContext -> create(clock, nodeAgentFactory, nodeAgentContext),
- metrics, clock, NODE_AGENT_FREEZE_TIMEOUT, NODE_AGENT_SPREAD);
+ metrics, clock, NODE_AGENT_FREEZE_TIMEOUT, NODE_AGENT_SPREAD, new ProcMeminfoReader(fileSystem));
}
public NodeAdminImpl(NodeAgentFactory nodeAgentFactory, Metrics metrics,
- Clock clock, Duration freezeTimeout, Duration spread) {
+ Clock clock, Duration freezeTimeout, Duration spread, ProcMeminfoReader procMeminfoReader) {
this(nodeAgentContext -> create(clock, nodeAgentFactory, nodeAgentContext),
- metrics, clock, freezeTimeout, spread);
+ metrics, clock, freezeTimeout, spread, procMeminfoReader);
}
NodeAdminImpl(NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory,
- Metrics metrics, Clock clock, Duration freezeTimeout, Duration spread) {
+ Metrics metrics, Clock clock, Duration freezeTimeout, Duration spread,
+ ProcMeminfoReader procMeminfoReader) {
this.nodeAgentWithSchedulerFactory = nodeAgentWithSchedulerFactory;
this.clock = clock;
this.freezeTimeout = freezeTimeout;
@@ -71,9 +78,12 @@ public class NodeAdminImpl implements NodeAdmin {
this.numberOfUnhandledExceptions = metrics.declareCounter("unhandled_exceptions",
new Dimensions(Map.of("src", "node-agents")));
+ this.procMeminfoReader = procMeminfoReader;
this.jvmHeapUsed = metrics.declareGauge("mem.heap.used");
this.jvmHeapFree = metrics.declareGauge("mem.heap.free");
this.jvmHeapTotal = metrics.declareGauge("mem.heap.total");
+ this.memoryOverhead = metrics.declareGauge("mem.system.overhead");
+ this.containerCount = metrics.declareGauge("container.count");
}
@Override
@@ -103,21 +113,33 @@ public class NodeAdminImpl implements NodeAdmin {
@Override
public void updateMetrics(boolean isSuspended) {
+ long numContainers = 0;
+ long totalContainerMemoryBytes = 0;
+
for (NodeAgentWithScheduler nodeAgentWithScheduler : nodeAgentWithSchedulerByHostname.values()) {
int count = nodeAgentWithScheduler.getAndResetNumberOfUnhandledExceptions();
if (!isSuspended) numberOfUnhandledExceptions.add(count);
- nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended);
+ Optional<ContainerStats> containerStats = nodeAgentWithScheduler.updateContainerNodeMetrics(isSuspended);
+ if (containerStats.isPresent()) {
+ ++numContainers;
+ totalContainerMemoryBytes += containerStats.get().getMemoryStats().getUsage();
+ }
}
+ Runtime runtime = Runtime.getRuntime();
+ runtime.gc();
+ long freeMemory = runtime.freeMemory();
+ long totalMemory = runtime.totalMemory();
+ long usedMemory = totalMemory - freeMemory;
+ jvmHeapFree.sample(freeMemory);
+ jvmHeapUsed.sample(usedMemory);
+ jvmHeapTotal.sample(totalMemory);
+
+ // No container stats are found while suspended, so skip setting these if so.
if (!isSuspended) {
- Runtime runtime = Runtime.getRuntime();
- runtime.gc();
- long freeMemory = runtime.freeMemory();
- long totalMemory = runtime.totalMemory();
- long usedMemory = totalMemory - freeMemory;
- jvmHeapFree.sample(freeMemory);
- jvmHeapUsed.sample(usedMemory);
- jvmHeapTotal.sample(totalMemory);
+ containerCount.sample(numContainers);
+ ProcMeminfo meminfo = procMeminfoReader.read();
+ memoryOverhead.sample(meminfo.memTotalBytes() - meminfo.memAvailableBytes() - totalContainerMemoryBytes);
}
}
@@ -206,7 +228,7 @@ public class NodeAdminImpl implements NodeAdmin {
void start() { nodeAgent.start(currentContext()); }
void stopForHostSuspension() { nodeAgent.stopForHostSuspension(currentContext()); }
void stopForRemoval() { nodeAgent.stopForRemoval(currentContext()); }
- void updateContainerNodeMetrics(boolean isSuspended) { nodeAgent.updateContainerNodeMetrics(currentContext(), isSuspended); }
+ Optional<ContainerStats> updateContainerNodeMetrics(boolean isSuspended) { return nodeAgent.updateContainerNodeMetrics(currentContext(), isSuspended); }
int getAndResetNumberOfUnhandledExceptions() { return nodeAgent.getAndResetNumberOfUnhandledExceptions(); }
@Override public void scheduleTickWith(NodeAgentContext context, Instant at) { nodeAgentScheduler.scheduleTickWith(context, at); }