summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java1
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java57
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java1
-rw-r--r--node-admin/src/test/resources/docker.stats.json2
-rw-r--r--node-admin/src/test/resources/docker.stats.metrics.active.expected.json103
-rw-r--r--node-admin/src/test/resources/docker.stats.metrics.ready.expected.json19
6 files changed, 168 insertions, 15 deletions
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java
index 94940601f80..cdbe94d05f0 100644
--- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java
+++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java
@@ -26,6 +26,7 @@ import java.util.stream.Collectors;
public class MetricReceiverWrapper {
// Application names used
public static final String APPLICATION_DOCKER = "docker";
+ public static final String APPLICATION_NODE = "node";
public static final String APPLICATION_HOST_LIFE = "host_life";
private final static ObjectMapper objectMapper = new ObjectMapper();
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index b4bfaf5c9b0..8e290725fcb 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -522,24 +522,58 @@ public class NodeAgentImpl implements NodeAgent {
.add("clustertype", membership.clusterType)
.add("clusterid", membership.clusterId));
Dimensions dimensions = dimensionsBuilder.build();
+ metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_NODE, dimensions, "alive").sample(1);
+ // TODO: Remove the legacy "node.alive" gauge below once all alerts and dashboards use the new metric names
metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.alive").sample(1);
-
// The remaining metrics require the container to exist and be running
if (containerState == ABSENT) return;
Optional<Docker.ContainerStats> containerStats = dockerOperations.getContainerStats(containerName);
if (!containerStats.isPresent()) return;
Docker.ContainerStats stats = containerStats.get();
-
- long currentCpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue();
- long currentCpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue();
+ final String APP = MetricReceiverWrapper.APPLICATION_NODE;
+ final long bytesInGB = 1 << 30;
+ final long cpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue();
+ final long cpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue();
+ final long memoryTotalBytes = ((Number) stats.getMemoryStats().get("limit")).longValue();
+ final long memoryTotalBytesUsage = ((Number) stats.getMemoryStats().get("usage")).longValue();
+ final long memoryTotalBytesCache = ((Number) ((Map) stats.getMemoryStats().get("stats")).get("cache")).longValue();
+ final Optional<Long> diskTotalBytes = nodeSpec.minDiskAvailableGb.map(size -> (long) (size * bytesInGB));
+ final Optional<Long> diskTotalBytesUsed = storageMaintainer.flatMap(maintainer -> maintainer
+ .updateIfNeededAndGetDiskMetricsFor(containerName));
// CPU usage by a container is given by dividing the CPU time used by the container by the CPU time used by the
// entire system. Because each container is allocated the same amount of CPU shares, no container should use more
// than 1/n of the total CPU time, where n is the number of running containers.
- double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(currentCpuContainerTotalTime, currentCpuSystemTotalTime);
+ double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(cpuContainerTotalTime, cpuSystemTotalTime);
double cpuPercentageOfAllocated = numAllocatedContainersOnHost * cpuPercentageOfHost;
+ long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache;
+ double memoryPercentUsed = 100.0 * memoryTotalBytesUsed / memoryTotalBytes;
+ Optional<Double> diskPercentUsed = diskTotalBytes.flatMap(total -> diskTotalBytesUsed.map(used -> 100.0 * used / total));
+
+ metricReceiver.declareGauge(APP, dimensions, "cpu.util").sample(cpuPercentageOfAllocated);
+ metricReceiver.declareGauge(APP, dimensions, "mem.limit").sample(memoryTotalBytes);
+ metricReceiver.declareGauge(APP, dimensions, "mem.used").sample(memoryTotalBytesUsed);
+ metricReceiver.declareGauge(APP, dimensions, "mem.util").sample(memoryPercentUsed);
+ diskTotalBytes.ifPresent(diskLimit -> metricReceiver.declareGauge(APP, dimensions, "disk.limit").sample(diskLimit));
+ diskTotalBytesUsed.ifPresent(diskUsed -> metricReceiver.declareGauge(APP, dimensions, "disk.used").sample(diskUsed));
+ diskPercentUsed.ifPresent(diskUtil -> metricReceiver.declareGauge(APP, dimensions, "disk.util").sample(diskUtil));
+
+ stats.getNetworks().forEach((interfaceName, interfaceStats) -> {
+ Dimensions netDims = dimensionsBuilder.add("interface", interfaceName).build();
+ Map<String, Number> infStats = (Map<String, Number>) interfaceStats;
+
+ metricReceiver.declareGauge(APP, netDims, "net.in.bytes").sample(infStats.get("rx_bytes").longValue());
+ metricReceiver.declareGauge(APP, netDims, "net.in.errors").sample(infStats.get("rx_errors").longValue());
+ metricReceiver.declareGauge(APP, netDims, "net.in.dropped").sample(infStats.get("rx_dropped").longValue());
+ metricReceiver.declareGauge(APP, netDims, "net.out.bytes").sample(infStats.get("tx_bytes").longValue());
+ metricReceiver.declareGauge(APP, netDims, "net.out.errors").sample(infStats.get("tx_errors").longValue());
+ metricReceiver.declareGauge(APP, netDims, "net.out.dropped").sample(infStats.get("tx_dropped").longValue());
+ });
+
+
+ // TODO: Remove when all alerts and dashboards have been updated to use new metric names
metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.cpu.busy.pct").sample(cpuPercentageOfAllocated);
addIfNotNull(dimensions, "node.cpu.throttled_time", stats.getCpuStats().get("throttling_data"), "throttled_time");
@@ -561,14 +595,11 @@ public class NodeAgentImpl implements NodeAgent {
addIfNotNull(netDims, "node.net.out.dropped", interfaceStats, "tx_dropped");
});
- long bytesInGB = 1 << 30;
- nodeSpec.minDiskAvailableGb.ifPresent(diskGB -> metricReceiver
- .declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.disk.limit").sample(diskGB * bytesInGB));
-
- storageMaintainer.ifPresent(maintainer -> maintainer
- .updateIfNeededAndGetDiskMetricsFor(containerName)
- .forEach((metricName, metricValue) ->
- metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, metricName).sample(metricValue.doubleValue())));
+ diskTotalBytes.ifPresent(diskLimit ->
+ metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.disk.limit").sample(diskLimit));
+ diskTotalBytesUsed.ifPresent(diskUsed ->
+ metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.disk.used").sample(diskUsed));
+ // TODO END REMOVE
metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_HOST_LIFE, dimensions, "uptime").sample(lastCpuMetric.getUptime());
metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_HOST_LIFE, dimensions, "alive").sample(1);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 64c37fedd53..f74f4e0af78 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -484,6 +484,7 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec));
+ when(storageMaintainer.updateIfNeededAndGetDiskMetricsFor(eq(containerName))).thenReturn(Optional.of(42547019776L));
when(dockerOperations.shouldScheduleDownloadOfImage(eq(dockerImage))).thenReturn(false);
when(dockerOperations.getContainerStats(eq(containerName)))
.thenReturn(Optional.of(stats1))
diff --git a/node-admin/src/test/resources/docker.stats.json b/node-admin/src/test/resources/docker.stats.json
index d266b01f12d..3b1087b9202 100644
--- a/node-admin/src/test/resources/docker.stats.json
+++ b/node-admin/src/test/resources/docker.stats.json
@@ -52,7 +52,7 @@
"stats":{
"active_anon":1326051328,
"active_file":188919808,
- "cache":426680320,
+ "cache":678965248,
"hierarchical_memory_limit":4294967296,
"hierarchical_memsw_limit":8589934592,
"inactive_anon":0,
diff --git a/node-admin/src/test/resources/docker.stats.metrics.active.expected.json b/node-admin/src/test/resources/docker.stats.metrics.active.expected.json
index 2ed42cdb7c4..b5fe0999e18 100644
--- a/node-admin/src/test/resources/docker.stats.metrics.active.expected.json
+++ b/node-admin/src/test/resources/docker.stats.metrics.active.expected.json
@@ -30,6 +30,106 @@
}
},
{
+ "application": "node",
+ "dimensions": {
+ "flavor": "docker",
+ "applicationName": "testapp",
+ "instanceName": "testinstance",
+ "applicationId": "tester.testapp.testinstance",
+ "app": "testapp.testinstance",
+ "clustertype": "clustType",
+ "role": "tenants",
+ "tenantName": "tester",
+ "host": "host1.test.yahoo.com",
+ "vespaVersion": "1.2.3",
+ "state": "active",
+ "clusterid": "clustId",
+ "parentHostname": "parent.host.name.yahoo.com",
+ "zone": "dev.us-east-1"
+ },
+ "metrics": {
+ "alive": 1.0,
+ "cpu.util": 6.75,
+ "mem.limit": 4.294967296E9,
+ "mem.used": 1.073741824E9,
+ "mem.util": 25.0,
+ "disk.limit": 2.68435456E11,
+ "disk.used": 4.2547019776E10,
+ "disk.util": 15.85
+ },
+ "routing": {
+ "yamas": {
+ "namespaces": ["Vespa"]
+ }
+ }
+ },
+ {
+ "application": "node",
+ "dimensions": {
+ "flavor": "docker",
+ "applicationName": "testapp",
+ "instanceName": "testinstance",
+ "applicationId": "tester.testapp.testinstance",
+ "app": "testapp.testinstance",
+ "clustertype": "clustType",
+ "role": "tenants",
+ "tenantName": "tester",
+ "host": "host1.test.yahoo.com",
+ "vespaVersion": "1.2.3",
+ "state": "active",
+ "clusterid": "clustId",
+ "parentHostname": "parent.host.name.yahoo.com",
+ "zone": "dev.us-east-1",
+ "interface": "eth1"
+ },
+ "metrics": {
+ "net.out.bytes": 5.4246745E7,
+ "net.out.errors": 0.0,
+ "net.out.dropped": 0.0,
+ "net.in.bytes": 3245766.0,
+ "net.in.errors": 0.0,
+ "net.in.dropped": 0.0
+ },
+ "routing": {
+ "yamas": {
+ "namespaces": ["Vespa"]
+ }
+ }
+ },
+ {
+ "application": "node",
+ "dimensions": {
+ "flavor": "docker",
+ "applicationName": "testapp",
+ "instanceName": "testinstance",
+ "applicationId": "tester.testapp.testinstance",
+ "app": "testapp.testinstance",
+ "clustertype": "clustType",
+ "role": "tenants",
+ "tenantName": "tester",
+ "host": "host1.test.yahoo.com",
+ "vespaVersion": "1.2.3",
+ "state": "active",
+ "clusterid": "clustId",
+ "parentHostname": "parent.host.name.yahoo.com",
+ "zone": "dev.us-east-1",
+ "interface": "eth0"
+ },
+ "metrics": {
+ "net.out.bytes": 2.0303455E7,
+ "net.out.errors": 3.0,
+ "net.out.dropped": 13.0,
+ "net.in.bytes": 1.949927E7,
+ "net.in.errors": 55.0,
+ "net.in.dropped": 4.0
+ },
+ "routing": {
+ "yamas": {
+ "namespaces": ["Vespa"]
+ }
+ }
+ },
+ {
"application": "docker",
"dimensions": {
"flavor": "docker",
@@ -84,8 +184,9 @@
"node.alive": 1.0,
"node.cpu.busy.pct": 6.75,
"node.cpu.throttled_time": 4523.0,
- "node.memory.usage": 1.326026752E9,
+ "node.memory.usage": 1.073741824E9,
"node.memory.limit": 4.294967296E9,
+ "node.disk.used": 4.2547019776E10,
"node.disk.limit": 2.68435456E11
},
"routing": {
diff --git a/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json b/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json
index de3e26c8e13..ddfb1c61c06 100644
--- a/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json
+++ b/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json
@@ -1,5 +1,24 @@
[
{
+ "application": "node",
+ "dimensions": {
+ "flavor": "docker",
+ "role": "tenants",
+ "host": "host1.test.yahoo.com",
+ "state": "ready",
+ "parentHostname": "parent.host.name.yahoo.com",
+ "zone": "dev.us-east-1"
+ },
+ "metrics": {
+ "alive": 1.0
+ },
+ "routing": {
+ "yamas": {
+ "namespaces": ["Vespa"]
+ }
+ }
+ },
+ {
"application": "docker",
"dimensions": {
"flavor": "docker",