diff options
author | Valerij Fredriksen <valerijf@oath.com> | 2017-11-23 14:38:03 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@oath.com> | 2017-11-24 15:20:17 +0100 |
commit | 090b2faec4fe68689521872db90043665cdf1018 (patch) | |
tree | 0c2eb9f2f2c4af3f87bfc2f8e3509d92b50d0238 | |
parent | 8407446fe4e8df5fa16ef3b019ccb1bea2f87099 (diff) |
Add metric for kernel cpu usage in docker container
3 files changed, 37 insertions, 16 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 09eb14039e8..b66ef50236c 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -524,6 +524,7 @@ public class NodeAgentImpl implements NodeAgent { Docker.ContainerStats stats = containerStats.get(); final String APP = MetricReceiverWrapper.APPLICATION_NODE; final int totalNumCpuCores = ((List<Number>) ((Map) stats.getCpuStats().get("cpu_usage")).get("percpu_usage")).size(); + final long cpuContainerKernelTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("usage_in_kernelmode")).longValue(); final long cpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue(); final long cpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue(); final long memoryTotalBytes = ((Number) stats.getMemoryStats().get("limit")).longValue(); @@ -532,26 +533,32 @@ public class NodeAgentImpl implements NodeAgent { final long diskTotalBytes = (long) (nodeSpec.minDiskAvailableGb * BYTES_IN_GB); final Optional<Long> diskTotalBytesUsed = storageMaintainer.getDiskUsageFor(containerName); + lastCpuMetric.updateCpuDeltas(cpuSystemTotalTime, cpuContainerTotalTime, cpuContainerKernelTime); + // CPU usage by a container as percentage of total host CPU, cpuPercentageOfHost, is given by dividing used - // CPU time by the container with CPU time used by the entire system. + // CPU time used by the container with CPU time used by the entire system. + double cpuUsageRatioOfHost = lastCpuMetric.getCpuUsageRatio(); + // CPU usage by a container as percentage of total CPU allocated to it is given by dividing the // cpuPercentageOfHost with the ratio of container minCpuCores by total number of CPU cores. - double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(cpuContainerTotalTime, cpuSystemTotalTime); - double cpuPercentageOfAllocated = totalNumCpuCores * cpuPercentageOfHost / nodeSpec.minCpuCores; + double cpuUsageRatioOfAllocated = totalNumCpuCores * cpuUsageRatioOfHost / nodeSpec.minCpuCores; + double cpuKernelUsageRatioOfAllocated = cpuUsageRatioOfAllocated * lastCpuMetric.getCpuKernelUsageRatio(); + long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache; - double memoryPercentUsed = 100.0 * memoryTotalBytesUsed / memoryTotalBytes; - Optional<Double> diskPercentUsed = diskTotalBytesUsed.map(used -> 100.0 * used / diskTotalBytes); + double memoryUsageRatio = (double) memoryTotalBytesUsed / memoryTotalBytes; + Optional<Double> diskUsageRatio = diskTotalBytesUsed.map(used -> (double) used / diskTotalBytes); List<DimensionMetrics> metrics = new ArrayList<>(); DimensionMetrics.Builder systemMetricsBuilder = new DimensionMetrics.Builder(APP, dimensions) .withMetric("mem.limit", memoryTotalBytes) .withMetric("mem.used", memoryTotalBytesUsed) - .withMetric("mem.util", memoryPercentUsed) - .withMetric("cpu.util", cpuPercentageOfAllocated) + .withMetric("mem.util", 100 * memoryUsageRatio) + .withMetric("cpu.util", 100 * cpuUsageRatioOfAllocated) + .withMetric("cpu.sys.util", 100 * cpuKernelUsageRatioOfAllocated) .withMetric("disk.limit", diskTotalBytes); diskTotalBytesUsed.ifPresent(diskUsed -> systemMetricsBuilder.withMetric("disk.used", diskUsed)); - diskPercentUsed.ifPresent(diskUtil -> systemMetricsBuilder.withMetric("disk.util", diskUtil)); + diskUsageRatio.ifPresent(diskRatio -> systemMetricsBuilder.withMetric("disk.util", 100 * diskRatio)); metrics.add(systemMetricsBuilder.build()); stats.getNetworks().forEach((interfaceName, interfaceStats) -> { @@ -612,17 +619,30 @@ public class NodeAgentImpl implements NodeAgent { } class CpuUsageReporter { + private long containerKernelUsage = 0; private long totalContainerUsage = 0; private long totalSystemUsage = 0; - double getCpuUsagePercentage(long currentContainerUsage, long currentSystemUsage) { - long deltaSystemUsage = currentSystemUsage - totalSystemUsage; - double cpuUsagePct = (deltaSystemUsage == 0 || totalSystemUsage == 0) ? - 0 : 100.0 * (currentContainerUsage - totalContainerUsage) / deltaSystemUsage; + private long deltaContainerKernelUsage; + private long deltaContainerUsage; + private long deltaSystemUsage; + + private void updateCpuDeltas(long totalSystemUsage, long totalContainerUsage, long containerKernelUsage) { + deltaSystemUsage = totalSystemUsage - this.totalSystemUsage; + deltaContainerUsage = totalContainerUsage - this.totalContainerUsage; + deltaContainerKernelUsage = containerKernelUsage - this.containerKernelUsage; + + this.totalSystemUsage = totalSystemUsage; + this.totalContainerUsage = totalContainerUsage; + this.containerKernelUsage = containerKernelUsage; + } + + double getCpuKernelUsageRatio() { + return deltaContainerUsage == 0 ? 0 : (double) deltaContainerKernelUsage / deltaContainerUsage; + } - totalContainerUsage = currentContainerUsage; - totalSystemUsage = currentSystemUsage; - return cpuUsagePct; + double getCpuUsageRatio() { + return deltaSystemUsage == 0 ? 0 : (double) deltaContainerUsage / deltaSystemUsage; } } diff --git a/node-admin/src/test/resources/docker.stats.json b/node-admin/src/test/resources/docker.stats.json index 3b1087b9202..ff4a2fde943 100644 --- a/node-admin/src/test/resources/docker.stats.json +++ b/node-admin/src/test/resources/docker.stats.json @@ -36,7 +36,7 @@ 44567860460, 39049895962 ], - "usage_in_kernelmode":44050000000, + "usage_in_kernelmode":44106083850, "usage_in_usermode":158950000000 }, "system_cpu_usage":5876882680000000, diff --git a/node-admin/src/test/resources/expected.container.system.metrics.txt b/node-admin/src/test/resources/expected.container.system.metrics.txt index 8a4d696b08e..023d3958c60 100644 --- a/node-admin/src/test/resources/expected.container.system.metrics.txt +++ b/node-admin/src/test/resources/expected.container.system.metrics.txt @@ -11,6 +11,7 @@ s: "mem.limit": 4294967296, "mem.used": 1073741824, "disk.used": 39625000000, + "cpu.sys.util": 3.402, "disk.util": 15.85, "cpu.util": 5.4, "mem.util": 25.0, |