about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Håkon Hallingstad <hakon@verizonmedia.com> 2019-06-21 20:24:33 +0200
committer GitHub <noreply@github.com> 2019-06-21 20:24:33 +0200
commit 25ae53f995d0110d27afcc430bcf19b1e6ef5755 (patch)
tree 1820143783af76a764de106f4a88a8fd5be7cbe0
parent e6ffcc205f280c8c4bb317de64682d9d73cf939f (diff)
parent 229e190c6f6bd3602fbbfc8118fbbed7685aeb69 (diff)
Merge pull request #9869 from vespa-engine/freva/add-throttled-time
Add throttled cpu time metric
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java | 4
-rw-r--r-- docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java | 7
-rw-r--r-- docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java | 21
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 50
-rw-r--r-- node-admin/src/test/resources/docker.stats.json | 12
-rw-r--r-- node-admin/src/test/resources/expected.container.system.metrics.txt | 2
6 files changed, 74 insertions, 22 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java
index e857ce6da33..b49fd36da78 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/admin/monitoring/SystemMetrics.java
@@ -12,6 +12,8 @@ import java.util.Set;
public class SystemMetrics {
public static final String CPU_UTIL = "cpu.util";
public static final String CPU_SYS_UTIL = "cpu.sys.util";
+ public static final String CPU_THROTTLED_TIME = "cpu.throttled_time.rate";
+ public static final String CPU_THROTTLED_CPU_TIME = "cpu.throttled_cpu_time.rate";
public static final String CPU_VCPUS = "cpu.vcpus";
public static final String DISK_LIMIT = "disk.limit";
public static final String DISK_USED = "disk.used";
@@ -28,6 +30,8 @@ public class SystemMetrics {
Set<Metric> dockerNodeMetrics =
ImmutableSet.of(new Metric(CPU_UTIL),
new Metric(CPU_SYS_UTIL),
+ new Metric(CPU_THROTTLED_TIME),
+ new Metric(CPU_THROTTLED_CPU_TIME),
new Metric(CPU_VCPUS),
new Metric(DISK_LIMIT),
new Metric(DISK_USED),
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java
index 70ba58cd9cf..bd8ffb0163c 100644
--- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java
+++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerResources.java
@@ -9,7 +9,7 @@ import java.util.Objects;
public class ContainerResources {
public static final ContainerResources UNLIMITED = ContainerResources.from(0, 0, 0);
- private static final int CPU_PERIOD = 100_000; // 100 µs
+ public static final int CPU_PERIOD_US = 100_000; // 100 ms
/**
* Hard limit on container's CPU usage: Implemented using Completely Fair Scheduler (CFS) by allocating a given
@@ -65,11 +65,12 @@ public class ContainerResources {
// Although docker allows to update cpu quota to 0, this is not a legal value, must be set -1 for unlimited
// See: https://github.com/docker/for-linux/issues/558
public int cpuQuota() {
- return cpus > 0 ? (int) (cpus * CPU_PERIOD) : -1;
+ return cpus > 0 ? (int) (cpus * CPU_PERIOD_US) : -1;
}
+ /** Duration (in µs) of a single period used as the basis for process scheduling */
public int cpuPeriod() {
- return CPU_PERIOD;
+ return CPU_PERIOD_US;
}
public int cpuShares() {
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java
index d33ddadb52c..797dffdef1f 100644
--- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java
+++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/ContainerStats.java
@@ -95,6 +95,9 @@ public class ContainerStats {
private final long systemCpuUsage;
private final long totalUsage;
private final long usageInKernelMode;
+ private final long throttledTime;
+ private final long throttlingActivePeriods;
+ private final long throttledPeriods;
public CpuStats(CpuStatsConfig cpuStats) {
// Added in 1.27
@@ -102,12 +105,30 @@ public class ContainerStats {
this.systemCpuUsage = cpuStats.getSystemCpuUsage();
this.totalUsage = cpuStats.getCpuUsage().getTotalUsage();
this.usageInKernelMode = cpuStats.getCpuUsage().getUsageInKernelmode();
+ this.throttledTime = cpuStats.getThrottlingData().getThrottledTime();
+ this.throttlingActivePeriods = cpuStats.getThrottlingData().getPeriods();
+ this.throttledPeriods = cpuStats.getThrottlingData().getThrottledPeriods();
}
public int getOnlineCpus() { return this.onlineCpus; }
+
+ /** Total CPU time (in ns) spent executing all the processes on this host */
public long getSystemCpuUsage() { return this.systemCpuUsage; }
+
+ /** Total CPU time (in ns) spent running all the processes in this container */
public long getTotalUsage() { return totalUsage; }
+
+ /** Total CPU time (in ns) spent in kernel mode while executing processes in this container */
public long getUsageInKernelMode() { return usageInKernelMode; }
+
+ /** Total CPU time (in ns) processes in this container were throttled for */
+ public long getThrottledTime() { return throttledTime; }
+
+ /** Number of periods with throttling enabled for this container */
+ public long getThrottlingActivePeriods() { return throttlingActivePeriods; }
+
+ /** Number of periods this container hit the throttling limit */
+ public long getThrottledPeriods() { return throttledPeriods; }
}
// For testing only, create ContainerStats from JSON returned by docker daemon stats API
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 977f1016ed8..44bcae3e838 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -534,21 +534,20 @@ public class NodeAgentImpl implements NodeAgent {
ContainerStats stats = containerStats.get();
final String APP = Metrics.APPLICATION_NODE;
final int totalNumCpuCores = stats.getCpuStats().getOnlineCpus();
- final long cpuContainerKernelTime = stats.getCpuStats().getUsageInKernelMode();
- final long cpuContainerTotalTime = stats.getCpuStats().getTotalUsage();
- final long cpuSystemTotalTime = stats.getCpuStats().getSystemCpuUsage();
final long memoryTotalBytes = stats.getMemoryStats().getLimit();
final long memoryTotalBytesUsage = stats.getMemoryStats().getUsage();
final long memoryTotalBytesCache = stats.getMemoryStats().getCache();
final long diskTotalBytes = (long) (node.diskGb() * BYTES_IN_GB);
final Optional<Long> diskTotalBytesUsed = storageMaintainer.getDiskUsageFor(context);
- lastCpuMetric.updateCpuDeltas(cpuSystemTotalTime, cpuContainerTotalTime, cpuContainerKernelTime);
+ lastCpuMetric.updateCpuDeltas(stats.getCpuStats());
// Ratio of CPU cores allocated to this container to total number of CPU cores on this host
final double allocatedCpuRatio = node.vcpus() / totalNumCpuCores;
double cpuUsageRatioOfAllocated = lastCpuMetric.getCpuUsageRatio() / allocatedCpuRatio;
double cpuKernelUsageRatioOfAllocated = lastCpuMetric.getCpuKernelUsageRatio() / allocatedCpuRatio;
+ double cpuThrottledTimeRate = lastCpuMetric.getThrottledTimeRate();
+ double cpuThrottledCpuTimeRate = lastCpuMetric.getThrottledCpuTimeRate();
long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache;
double memoryUsageRatio = (double) memoryTotalBytesUsed / memoryTotalBytes;
@@ -564,6 +563,8 @@ public class NodeAgentImpl implements NodeAgent {
.withMetric("mem_total.util", 100 * memoryTotalUsageRatio)
.withMetric("cpu.util", 100 * cpuUsageRatioOfAllocated)
.withMetric("cpu.sys.util", 100 * cpuKernelUsageRatioOfAllocated)
+ .withMetric("cpu.throttled_time.rate", cpuThrottledTimeRate)
+ .withMetric("cpu.throttled_cpu_time.rate", cpuThrottledCpuTimeRate)
.withMetric("cpu.vcpus", node.vcpus())
.withMetric("disk.limit", diskTotalBytes);
@@ -621,22 +622,35 @@ public class NodeAgentImpl implements NodeAgent {
}
class CpuUsageReporter {
+ private static final double PERIOD_IN_NANOSECONDS = 1_000d * ContainerResources.CPU_PERIOD_US;
private long containerKernelUsage = 0;
private long totalContainerUsage = 0;
private long totalSystemUsage = 0;
+ private long throttledTime = 0;
+ private long throttlingActivePeriods = 0;
+ private long throttledPeriods = 0;
private long deltaContainerKernelUsage;
private long deltaContainerUsage;
private long deltaSystemUsage;
-
- private void updateCpuDeltas(long totalSystemUsage, long totalContainerUsage, long containerKernelUsage) {
- deltaSystemUsage = this.totalSystemUsage == 0 ? 0 : (totalSystemUsage - this.totalSystemUsage);
- deltaContainerUsage = totalContainerUsage - this.totalContainerUsage;
- deltaContainerKernelUsage = containerKernelUsage - this.containerKernelUsage;
-
- this.totalSystemUsage = totalSystemUsage;
- this.totalContainerUsage = totalContainerUsage;
- this.containerKernelUsage = containerKernelUsage;
+ private long deltaThrottledTime;
+ private long deltaThrottlingActivePeriods;
+ private long deltaThrottledPeriods;
+
+ private void updateCpuDeltas(ContainerStats.CpuStats cpuStats) {
+ deltaSystemUsage = totalSystemUsage == 0 ? 0 : (cpuStats.getSystemCpuUsage() - totalSystemUsage);
+ deltaContainerUsage = cpuStats.getTotalUsage() - totalContainerUsage;
+ deltaContainerKernelUsage = cpuStats.getUsageInKernelMode() - containerKernelUsage;
+ deltaThrottledTime = cpuStats.getThrottledTime() - throttledTime;
+ deltaThrottlingActivePeriods = cpuStats.getThrottlingActivePeriods() - throttlingActivePeriods;
+ deltaThrottledPeriods = cpuStats.getThrottledPeriods() - throttledPeriods;
+
+ totalSystemUsage = cpuStats.getSystemCpuUsage();
+ totalContainerUsage = cpuStats.getTotalUsage();
+ containerKernelUsage = cpuStats.getUsageInKernelMode();
+ throttledTime = cpuStats.getThrottledTime();
+ throttlingActivePeriods = cpuStats.getThrottlingActivePeriods();
+ throttledPeriods = cpuStats.getThrottledPeriods();
}
/**
@@ -651,6 +665,16 @@ public class NodeAgentImpl implements NodeAgent {
double getCpuKernelUsageRatio() {
return deltaSystemUsage == 0 ? Double.NaN : (double) deltaContainerKernelUsage / deltaSystemUsage;
}
+
+ double getThrottledTimeRate() {
+ return deltaThrottlingActivePeriods == 0 ? Double.NaN :
+ (double) deltaThrottledPeriods / deltaThrottlingActivePeriods;
+ }
+
+ double getThrottledCpuTimeRate() {
+ return deltaThrottlingActivePeriods == 0 ? Double.NaN :
+ deltaThrottledTime / (PERIOD_IN_NANOSECONDS * deltaThrottlingActivePeriods);
+ }
}
// TODO: Also skip orchestration if we're downgrading in test/staging
diff --git a/node-admin/src/test/resources/docker.stats.json b/node-admin/src/test/resources/docker.stats.json
index ff4a2fde943..5b42d9a2428 100644
--- a/node-admin/src/test/resources/docker.stats.json
+++ b/node-admin/src/test/resources/docker.stats.json
@@ -18,9 +18,9 @@
},
"system_cpu_usage":5876874910000000,
"throttling_data":{
- "periods":3212,
- "throttled_periods":322,
- "throttled_time":4490
+ "periods":820694,
+ "throttled_periods":177731,
+ "throttled_time":81891944744550
}
},
"cpu_stats":{
@@ -41,9 +41,9 @@
},
"system_cpu_usage":5876882680000000,
"throttling_data":{
- "periods":3242,
- "throttled_periods":332,
- "throttled_time":4523
+ "periods":821264,
+ "throttled_periods":178201,
+ "throttled_time":82181944744550
}
},
"memory_stats":{
diff --git a/node-admin/src/test/resources/expected.container.system.metrics.txt b/node-admin/src/test/resources/expected.container.system.metrics.txt
index ec750798c98..54d4d36c7d0 100644
--- a/node-admin/src/test/resources/expected.container.system.metrics.txt
+++ b/node-admin/src/test/resources/expected.container.system.metrics.txt
@@ -10,6 +10,8 @@ s:
},
"metrics": {
"cpu.sys.util": 3.402,
+ "cpu.throttled_cpu_time.rate": 5.087,
+ "cpu.throttled_time.rate": 0.824,
"cpu.util": 5.4,
"cpu.vcpus": 2.0,
"disk.limit": 250000000000,