diff options
author | valerijf <valerijf@yahoo-inc.com> | 2017-03-15 13:24:48 +0100 |
---|---|---|
committer | valerijf <valerijf@yahoo-inc.com> | 2017-03-15 13:24:48 +0100 |
commit | 4edb40a2037bdc770026372279661c66ba3748bc (patch) | |
tree | 4be3ef3aad993c1386b29e64920684c909857149 /node-admin | |
parent | 910907eb531275a5009dac4aebf6e4b4ef2934dc (diff) |
Update CPU usage metric
Diffstat (limited to 'node-admin')
5 files changed, 18 insertions, 10 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index 35ef4072d10..ad9352896d6 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -74,7 +74,7 @@ public class NodeAdminImpl implements NodeAdmin { scheduler.scheduleWithFixedDelay(() -> { try { - nodeAgents.values().forEach(NodeAgent::updateContainerNodeMetrics); + nodeAgents.values().forEach(nodeAgent -> nodeAgent.updateContainerNodeMetrics(nodeAgents.size())); } catch (Throwable e) { logger.warning("Metric fetcher scheduler failed", e); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index e4ea3acef11..dcaa82db81f 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -64,7 +64,7 @@ public interface NodeAgent { /** * Updates metric receiver with the latest node-agent stats */ - void updateContainerNodeMetrics(); + void updateContainerNodeMetrics(int numAllocatedContainersOnHost); ContainerName getContainerName(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 7a88bcad024..f91b8d2ad47 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -503,7 +503,7 @@ public class NodeAgentImpl implements NodeAgent { } @SuppressWarnings("unchecked") - public void updateContainerNodeMetrics() 
{ + public void updateContainerNodeMetrics(int numAllocatedContainersOnHost) { ContainerNodeSpec nodeSpec; synchronized (monitor) { nodeSpec = lastNodeSpec; @@ -541,8 +541,12 @@ public class NodeAgentImpl implements NodeAgent { long currentCpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue(); long currentCpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue(); - double cpuPercentage = lastCpuMetric.getCpuUsagePercentage(currentCpuContainerTotalTime, currentCpuSystemTotalTime); - metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.cpu.busy.pct").sample(cpuPercentage); + // A container's CPU usage is the CPU time used by the container divided by the CPU time used by the entire + // system. Because each container is allocated the same amount of CPU shares, no container should use more than + // 1/n of the total CPU time, where n is the number of containers allocated on the host. 
+ double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(currentCpuContainerTotalTime, currentCpuSystemTotalTime); + double cpuPercentageOfAllocated = numAllocatedContainersOnHost * cpuPercentageOfHost; + metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.cpu.busy.pct").sample(cpuPercentageOfAllocated); addIfNotNull(dimensions, "node.cpu.throttled_time", stats.getCpuStats().get("throttling_data"), "throttled_time"); addIfNotNull(dimensions, "node.memory.limit", stats.getMemoryStats(), "limit"); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 47e59aca4c7..cdb415de945 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -497,10 +497,14 @@ public class NodeAgentImplTest { long totalContainerCpuTime = (long) ((Map) cpu_stats.get("cpu_usage")).get("total_usage"); long totalSystemCpuTime = (long) cpu_stats.get("system_cpu_usage"); - nodeAgent.lastCpuMetric.getCpuUsagePercentage(totalContainerCpuTime - 456_789_123, (long) (totalSystemCpuTime - 1e9)); - // During the last 10^9 total cpu ns, 456,789,123ns were spent on running the container. That means the expected - // cpu usage percentage is 100 * (456,789,123 / 10^9) = 45.6789123% - nodeAgent.updateContainerNodeMetrics(); + nodeAgent.lastCpuMetric.getCpuUsagePercentage(totalContainerCpuTime - 123_456_789, (long) (totalSystemCpuTime - 1e9)); + int numAllocatedContainersOnHost = 4; + // During the last 10^9 total CPU ns, 123,456,789ns were spent on running the container. That means the container + // used 100 * (123,456,789 / 10^9) = 12.3456789% of total system CPU time. 
+ // There are a total of 4 allocated nodes on this host, which means that the container only has 100 / 4 = 25% + // of total system CPU time at its disposal. Therefore, the expected CPU usage by this container is: + // 12.3456789% / 25% = 49.3827156% + nodeAgent.updateContainerNodeMetrics(4); Set<Map<String, Object>> actualMetrics = new HashSet<>(); for (MetricReceiverWrapper.DimensionMetrics dimensionMetrics : metricReceiver.getMetrics(MetricReceiverWrapper.APPLICATION_DOCKER)) { diff --git a/node-admin/src/test/resources/docker.stats.metrics.expected.json b/node-admin/src/test/resources/docker.stats.metrics.expected.json index 0845f5acb82..8222c6beca0 100644 --- a/node-admin/src/test/resources/docker.stats.metrics.expected.json +++ b/node-admin/src/test/resources/docker.stats.metrics.expected.json @@ -76,7 +76,7 @@ "zone":"dev.us-east-1" }, "metrics":{ - "node.cpu.busy.pct": 45.6789123, + "node.cpu.busy.pct": 49.3827156, "node.cpu.throttled_time": 4523.0, "node.memory.usage":1.326026752E9, "node.memory.limit":4.294967296E9, |