diff options
Field | Value | Date |
---|---|---|
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-06-21 20:24:33 +0200 |
committer | GitHub <noreply@github.com> | 2019-06-21 20:24:33 +0200 |
commit | 25ae53f995d0110d27afcc430bcf19b1e6ef5755 (patch) | |
tree | 1820143783af76a764de106f4a88a8fd5be7cbe0 | |
parent | e6ffcc205f280c8c4bb317de64682d9d73cf939f (diff) | |
parent | 229e190c6f6bd3602fbbfc8118fbbed7685aeb69 (diff) |
Merge pull request #9869 from vespa-engine/freva/add-throttled-time
Add throttled cpu time metric
6 files changed, 74 insertions, 22 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java index e857ce6da33..b49fd36da78 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java @@ -12,6 +12,8 @@ import java.util.Set; public class SystemMetrics { public static final String CPU_UTIL = "cpu.util"; public static final String CPU_SYS_UTIL = "cpu.sys.util"; + public static final String CPU_THROTTLED_TIME = "cpu.throttled_time.rate"; + public static final String CPU_THROTTLED_CPU_TIME = "cpu.throttled_cpu_time.rate"; public static final String CPU_VCPUS = "cpu.vcpus"; public static final String DISK_LIMIT = "disk.limit"; public static final String DISK_USED = "disk.used"; @@ -28,6 +30,8 @@ public class SystemMetrics { Set<Metric> dockerNodeMetrics = ImmutableSet.of(new Metric(CPU_UTIL), new Metric(CPU_SYS_UTIL), + new Metric(CPU_THROTTLED_TIME), + new Metric(CPU_THROTTLED_CPU_TIME), new Metric(CPU_VCPUS), new Metric(DISK_LIMIT), new Metric(DISK_USED), diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java index 70ba58cd9cf..bd8ffb0163c 100644 --- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java +++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java @@ -9,7 +9,7 @@ import java.util.Objects; public class ContainerResources { public static final ContainerResources UNLIMITED = ContainerResources.from(0, 0, 0); - private static final int CPU_PERIOD = 100_000; // 100 µs + public static final int CPU_PERIOD_US = 100_000; // 100 ms /** * Hard limit on container's CPU usage: Implemented using Completely Fair Scheduler (CFS) by allocating a given @@ -65,11 +65,12 @@ 
public class ContainerResources { // Although docker allows to update cpu quota to 0, this is not a legal value, must be set -1 for unlimited // See: https://github.com/docker/for-linux/issues/558 public int cpuQuota() { - return cpus > 0 ? (int) (cpus * CPU_PERIOD) : -1; + return cpus > 0 ? (int) (cpus * CPU_PERIOD_US) : -1; } + /** Duration (in µs) of a single period used as the basis for process scheduling */ public int cpuPeriod() { - return CPU_PERIOD; + return CPU_PERIOD_US; } public int cpuShares() { diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java index d33ddadb52c..797dffdef1f 100644 --- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java +++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java @@ -95,6 +95,9 @@ public class ContainerStats { private final long systemCpuUsage; private final long totalUsage; private final long usageInKernelMode; + private final long throttledTime; + private final long throttlingActivePeriods; + private final long throttledPeriods; public CpuStats(CpuStatsConfig cpuStats) { // Added in 1.27 @@ -102,12 +105,30 @@ public class ContainerStats { this.systemCpuUsage = cpuStats.getSystemCpuUsage(); this.totalUsage = cpuStats.getCpuUsage().getTotalUsage(); this.usageInKernelMode = cpuStats.getCpuUsage().getUsageInKernelmode(); + this.throttledTime = cpuStats.getThrottlingData().getThrottledTime(); + this.throttlingActivePeriods = cpuStats.getThrottlingData().getPeriods(); + this.throttledPeriods = cpuStats.getThrottlingData().getThrottledPeriods(); } public int getOnlineCpus() { return this.onlineCpus; } + + /** Total CPU time (in ns) spent executing all the processes on this host */ public long getSystemCpuUsage() { return this.systemCpuUsage; } + + /** Total CPU time (in ns) spent running all the processes in this container */ public long getTotalUsage() { 
return totalUsage; } + + /** Total CPU time (in ns) spent in kernel mode while executing processes in this container */ public long getUsageInKernelMode() { return usageInKernelMode; } + + /** Total CPU time (in ns) processes in this container were throttled for */ + public long getThrottledTime() { return throttledTime; } + + /** Number of periods with throttling enabled for this container */ + public long getThrottlingActivePeriods() { return throttlingActivePeriods; } + + /** Number of periods this container hit the throttling limit */ + public long getThrottledPeriods() { return throttledPeriods; } } // For testing only, create ContainerStats from JSON returned by docker daemon stats API diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 977f1016ed8..44bcae3e838 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -534,21 +534,20 @@ public class NodeAgentImpl implements NodeAgent { ContainerStats stats = containerStats.get(); final String APP = Metrics.APPLICATION_NODE; final int totalNumCpuCores = stats.getCpuStats().getOnlineCpus(); - final long cpuContainerKernelTime = stats.getCpuStats().getUsageInKernelMode(); - final long cpuContainerTotalTime = stats.getCpuStats().getTotalUsage(); - final long cpuSystemTotalTime = stats.getCpuStats().getSystemCpuUsage(); final long memoryTotalBytes = stats.getMemoryStats().getLimit(); final long memoryTotalBytesUsage = stats.getMemoryStats().getUsage(); final long memoryTotalBytesCache = stats.getMemoryStats().getCache(); final long diskTotalBytes = (long) (node.diskGb() * BYTES_IN_GB); final Optional<Long> diskTotalBytesUsed = storageMaintainer.getDiskUsageFor(context); - lastCpuMetric.updateCpuDeltas(cpuSystemTotalTime, 
cpuContainerTotalTime, cpuContainerKernelTime); + lastCpuMetric.updateCpuDeltas(stats.getCpuStats()); // Ratio of CPU cores allocated to this container to total number of CPU cores on this host final double allocatedCpuRatio = node.vcpus() / totalNumCpuCores; double cpuUsageRatioOfAllocated = lastCpuMetric.getCpuUsageRatio() / allocatedCpuRatio; double cpuKernelUsageRatioOfAllocated = lastCpuMetric.getCpuKernelUsageRatio() / allocatedCpuRatio; + double cpuThrottledTimeRate = lastCpuMetric.getThrottledTimeRate(); + double cpuThrottledCpuTimeRate = lastCpuMetric.getThrottledCpuTimeRate(); long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache; double memoryUsageRatio = (double) memoryTotalBytesUsed / memoryTotalBytes; @@ -564,6 +563,8 @@ public class NodeAgentImpl implements NodeAgent { .withMetric("mem_total.util", 100 * memoryTotalUsageRatio) .withMetric("cpu.util", 100 * cpuUsageRatioOfAllocated) .withMetric("cpu.sys.util", 100 * cpuKernelUsageRatioOfAllocated) + .withMetric("cpu.throttled_time.rate", cpuThrottledTimeRate) + .withMetric("cpu.throttled_cpu_time.rate", cpuThrottledCpuTimeRate) .withMetric("cpu.vcpus", node.vcpus()) .withMetric("disk.limit", diskTotalBytes); @@ -621,22 +622,35 @@ public class NodeAgentImpl implements NodeAgent { } class CpuUsageReporter { + private static final double PERIOD_IN_NANOSECONDS = 1_000d * ContainerResources.CPU_PERIOD_US; private long containerKernelUsage = 0; private long totalContainerUsage = 0; private long totalSystemUsage = 0; + private long throttledTime = 0; + private long throttlingActivePeriods = 0; + private long throttledPeriods = 0; private long deltaContainerKernelUsage; private long deltaContainerUsage; private long deltaSystemUsage; - - private void updateCpuDeltas(long totalSystemUsage, long totalContainerUsage, long containerKernelUsage) { - deltaSystemUsage = this.totalSystemUsage == 0 ? 
0 : (totalSystemUsage - this.totalSystemUsage); - deltaContainerUsage = totalContainerUsage - this.totalContainerUsage; - deltaContainerKernelUsage = containerKernelUsage - this.containerKernelUsage; - - this.totalSystemUsage = totalSystemUsage; - this.totalContainerUsage = totalContainerUsage; - this.containerKernelUsage = containerKernelUsage; + private long deltaThrottledTime; + private long deltaThrottlingActivePeriods; + private long deltaThrottledPeriods; + + private void updateCpuDeltas(ContainerStats.CpuStats cpuStats) { + deltaSystemUsage = totalSystemUsage == 0 ? 0 : (cpuStats.getSystemCpuUsage() - totalSystemUsage); + deltaContainerUsage = cpuStats.getTotalUsage() - totalContainerUsage; + deltaContainerKernelUsage = cpuStats.getUsageInKernelMode() - containerKernelUsage; + deltaThrottledTime = cpuStats.getThrottledTime() - throttledTime; + deltaThrottlingActivePeriods = cpuStats.getThrottlingActivePeriods() - throttlingActivePeriods; + deltaThrottledPeriods = cpuStats.getThrottledPeriods() - throttledPeriods; + + totalSystemUsage = cpuStats.getSystemCpuUsage(); + totalContainerUsage = cpuStats.getTotalUsage(); + containerKernelUsage = cpuStats.getUsageInKernelMode(); + throttledTime = cpuStats.getThrottledTime(); + throttlingActivePeriods = cpuStats.getThrottlingActivePeriods(); + throttledPeriods = cpuStats.getThrottledPeriods(); } /** @@ -651,6 +665,16 @@ public class NodeAgentImpl implements NodeAgent { double getCpuKernelUsageRatio() { return deltaSystemUsage == 0 ? Double.NaN : (double) deltaContainerKernelUsage / deltaSystemUsage; } + + double getThrottledTimeRate() { + return deltaThrottlingActivePeriods == 0 ? Double.NaN : + (double) deltaThrottledPeriods / deltaThrottlingActivePeriods; + } + + double getThrottledCpuTimeRate() { + return deltaThrottlingActivePeriods == 0 ? 
Double.NaN : + deltaThrottledTime / (PERIOD_IN_NANOSECONDS * deltaThrottlingActivePeriods); + } } // TODO: Also skip orchestration if we're downgrading in test/staging diff --git a/node-admin/src/test/resources/docker.stats.json b/node-admin/src/test/resources/docker.stats.json index ff4a2fde943..5b42d9a2428 100644 --- a/node-admin/src/test/resources/docker.stats.json +++ b/node-admin/src/test/resources/docker.stats.json @@ -18,9 +18,9 @@ }, "system_cpu_usage":5876874910000000, "throttling_data":{ - "periods":3212, - "throttled_periods":322, - "throttled_time":4490 + "periods":820694, + "throttled_periods":177731, + "throttled_time":81891944744550 } }, "cpu_stats":{ @@ -41,9 +41,9 @@ }, "system_cpu_usage":5876882680000000, "throttling_data":{ - "periods":3242, - "throttled_periods":332, - "throttled_time":4523 + "periods":821264, + "throttled_periods":178201, + "throttled_time":82181944744550 } }, "memory_stats":{ diff --git a/node-admin/src/test/resources/expected.container.system.metrics.txt b/node-admin/src/test/resources/expected.container.system.metrics.txt index ec750798c98..54d4d36c7d0 100644 --- a/node-admin/src/test/resources/expected.container.system.metrics.txt +++ b/node-admin/src/test/resources/expected.container.system.metrics.txt @@ -10,6 +10,8 @@ s: }, "metrics": { "cpu.sys.util": 3.402, + "cpu.throttled_cpu_time.rate": 5.087, + "cpu.throttled_time.rate": 0.824, "cpu.util": 5.4, "cpu.vcpus": 2.0, "disk.limit": 250000000000, |