diff options
6 files changed, 168 insertions, 15 deletions
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java index 94940601f80..cdbe94d05f0 100644 --- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java +++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/metrics/MetricReceiverWrapper.java @@ -26,6 +26,7 @@ import java.util.stream.Collectors; public class MetricReceiverWrapper { // Application names used public static final String APPLICATION_DOCKER = "docker"; + public static final String APPLICATION_NODE = "node"; public static final String APPLICATION_HOST_LIFE = "host_life"; private final static ObjectMapper objectMapper = new ObjectMapper(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index b4bfaf5c9b0..8e290725fcb 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -522,24 +522,58 @@ public class NodeAgentImpl implements NodeAgent { .add("clustertype", membership.clusterType) .add("clusterid", membership.clusterId)); Dimensions dimensions = dimensionsBuilder.build(); + metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_NODE, dimensions, "alive").sample(1); + // TODO: REMOVE metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.alive").sample(1); - // The remaining metrics require container to exists and be running if (containerState == ABSENT) return; Optional<Docker.ContainerStats> containerStats = dockerOperations.getContainerStats(containerName); if (!containerStats.isPresent()) return; Docker.ContainerStats stats = containerStats.get(); - - long currentCpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue(); - long currentCpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue(); + final String APP = MetricReceiverWrapper.APPLICATION_NODE; + final long bytesInGB = 1 << 30; + final long cpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue(); + final long cpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue(); + final long memoryTotalBytes = ((Number) stats.getMemoryStats().get("limit")).longValue(); + final long memoryTotalBytesUsage = ((Number) stats.getMemoryStats().get("usage")).longValue(); + final long memoryTotalBytesCache = ((Number) ((Map) stats.getMemoryStats().get("stats")).get("cache")).longValue(); + final Optional<Long> diskTotalBytes = nodeSpec.minDiskAvailableGb.map(size -> (long) (size * bytesInGB)); + final Optional<Long> diskTotalBytesUsed = storageMaintainer.flatMap(maintainer -> maintainer + .updateIfNeededAndGetDiskMetricsFor(containerName)); // CPU usage by a container is given by dividing used CPU time by the container with CPU time used by the entire // system. Because each container is allocated same amount of CPU shares, no container should use more than 1/n // of the total CPU time, where n is the number of running containers. - double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(currentCpuContainerTotalTime, currentCpuSystemTotalTime); + double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(cpuContainerTotalTime, cpuSystemTotalTime); double cpuPercentageOfAllocated = numAllocatedContainersOnHost * cpuPercentageOfHost; + long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache; + double memoryPercentUsed = 100.0 * memoryTotalBytesUsed / memoryTotalBytes; + Optional<Double> diskPercentUsed = diskTotalBytes.flatMap(total -> diskTotalBytesUsed.map(used -> 100.0 * used / total)); + + metricReceiver.declareGauge(APP, dimensions, "cpu.util").sample(cpuPercentageOfAllocated); + metricReceiver.declareGauge(APP, dimensions, "mem.limit").sample(memoryTotalBytes); + metricReceiver.declareGauge(APP, dimensions, "mem.used").sample(memoryTotalBytesUsed); + metricReceiver.declareGauge(APP, dimensions, "mem.util").sample(memoryPercentUsed); + diskTotalBytes.ifPresent(diskLimit -> metricReceiver.declareGauge(APP, dimensions, "disk.limit").sample(diskLimit)); + diskTotalBytesUsed.ifPresent(diskUsed -> metricReceiver.declareGauge(APP, dimensions, "disk.used").sample(diskUsed)); + diskPercentUsed.ifPresent(diskUtil -> metricReceiver.declareGauge(APP, dimensions, "disk.util").sample(diskUtil)); + + stats.getNetworks().forEach((interfaceName, interfaceStats) -> { + Dimensions netDims = dimensionsBuilder.add("interface", interfaceName).build(); + Map<String, Number> infStats = (Map<String, Number>) interfaceStats; + + metricReceiver.declareGauge(APP, netDims, "net.in.bytes").sample(infStats.get("rx_bytes").longValue()); + metricReceiver.declareGauge(APP, netDims, "net.in.errors").sample(infStats.get("rx_errors").longValue()); + metricReceiver.declareGauge(APP, netDims, "net.in.dropped").sample(infStats.get("rx_dropped").longValue()); + metricReceiver.declareGauge(APP, netDims, "net.out.bytes").sample(infStats.get("tx_bytes").longValue()); + metricReceiver.declareGauge(APP, netDims, "net.out.errors").sample(infStats.get("tx_errors").longValue()); + metricReceiver.declareGauge(APP, netDims, "net.out.dropped").sample(infStats.get("tx_dropped").longValue()); + }); + + + // TODO: Remove when all alerts and dashboards have been updated to use new metric names metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.cpu.busy.pct").sample(cpuPercentageOfAllocated); addIfNotNull(dimensions, "node.cpu.throttled_time", stats.getCpuStats().get("throttling_data"), "throttled_time"); @@ -561,14 +595,11 @@ public class NodeAgentImpl implements NodeAgent { addIfNotNull(netDims, "node.net.out.dropped", interfaceStats, "tx_dropped"); }); - long bytesInGB = 1 << 30; - nodeSpec.minDiskAvailableGb.ifPresent(diskGB -> metricReceiver - .declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.disk.limit").sample(diskGB * bytesInGB)); - - storageMaintainer.ifPresent(maintainer -> maintainer - .updateIfNeededAndGetDiskMetricsFor(containerName) - .forEach((metricName, metricValue) -> - metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, metricName).sample(metricValue.doubleValue()))); + diskTotalBytes.ifPresent(diskLimit -> + metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.disk.limit").sample(diskLimit)); + diskTotalBytesUsed.ifPresent(diskUsed -> + metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.disk.used").sample(diskUsed)); + // TODO END REMOVE metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_HOST_LIFE, dimensions, "uptime").sample(lastCpuMetric.getUptime()); metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_HOST_LIFE, dimensions, "alive").sample(1); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 64c37fedd53..f74f4e0af78 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -484,6 +484,7 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec)); + when(storageMaintainer.updateIfNeededAndGetDiskMetricsFor(eq(containerName))).thenReturn(Optional.of(42547019776L)); when(dockerOperations.shouldScheduleDownloadOfImage(eq(dockerImage))).thenReturn(false); when(dockerOperations.getContainerStats(eq(containerName))) .thenReturn(Optional.of(stats1)) diff --git a/node-admin/src/test/resources/docker.stats.json b/node-admin/src/test/resources/docker.stats.json index d266b01f12d..3b1087b9202 100644 --- a/node-admin/src/test/resources/docker.stats.json +++ b/node-admin/src/test/resources/docker.stats.json @@ -52,7 +52,7 @@ "stats":{ "active_anon":1326051328, "active_file":188919808, - "cache":426680320, + "cache":678965248, "hierarchical_memory_limit":4294967296, "hierarchical_memsw_limit":8589934592, "inactive_anon":0, diff --git a/node-admin/src/test/resources/docker.stats.metrics.active.expected.json b/node-admin/src/test/resources/docker.stats.metrics.active.expected.json index 2ed42cdb7c4..b5fe0999e18 100644 --- a/node-admin/src/test/resources/docker.stats.metrics.active.expected.json +++ b/node-admin/src/test/resources/docker.stats.metrics.active.expected.json @@ -30,6 +30,106 @@ } }, { + "application": "node", + "dimensions": { + "flavor": "docker", + "applicationName": "testapp", + "instanceName": "testinstance", + "applicationId": "tester.testapp.testinstance", + "app": "testapp.testinstance", + "clustertype": "clustType", + "role": "tenants", + "tenantName": "tester", + "host": "host1.test.yahoo.com", + "vespaVersion": "1.2.3", + "state": "active", + "clusterid": "clustId", + "parentHostname": "parent.host.name.yahoo.com", + "zone": "dev.us-east-1" + }, + "metrics": { + "alive": 1.0, + "cpu.util": 6.75, + "mem.limit": 4.294967296E9, + "mem.used": 1.073741824E9, + "mem.util": 25.0, + "disk.limit": 2.68435456E11, + "disk.used": 4.2547019776E10, + "disk.util": 15.85 + }, + "routing": { + "yamas": { + "namespaces": ["Vespa"] + } + } + }, + { + "application": "node", + "dimensions": { + "flavor": "docker", + "applicationName": "testapp", + "instanceName": "testinstance", + "applicationId": "tester.testapp.testinstance", + "app": "testapp.testinstance", + "clustertype": "clustType", + "role": "tenants", + "tenantName": "tester", + "host": "host1.test.yahoo.com", + "vespaVersion": "1.2.3", + "state": "active", + "clusterid": "clustId", + "parentHostname": "parent.host.name.yahoo.com", + "zone": "dev.us-east-1", + "interface": "eth1" + }, + "metrics": { + "net.out.bytes": 5.4246745E7, + "net.out.errors": 0.0, + "net.out.dropped": 0.0, + "net.in.bytes": 3245766.0, + "net.in.errors": 0.0, + "net.in.dropped": 0.0 + }, + "routing": { + "yamas": { + "namespaces": ["Vespa"] + } + } + }, + { + "application": "node", + "dimensions": { + "flavor": "docker", + "applicationName": "testapp", + "instanceName": "testinstance", + "applicationId": "tester.testapp.testinstance", + "app": "testapp.testinstance", + "clustertype": "clustType", + "role": "tenants", + "tenantName": "tester", + "host": "host1.test.yahoo.com", + "vespaVersion": "1.2.3", + "state": "active", + "clusterid": "clustId", + "parentHostname": "parent.host.name.yahoo.com", + "zone": "dev.us-east-1", + "interface": "eth0" + }, + "metrics": { + "net.out.bytes": 2.0303455E7, + "net.out.errors": 3.0, + "net.out.dropped": 13.0, + "net.in.bytes": 1.949927E7, + "net.in.errors": 55.0, + "net.in.dropped": 4.0 + }, + "routing": { + "yamas": { + "namespaces": ["Vespa"] + } + } + }, + { "application": "docker", "dimensions": { "flavor": "docker", @@ -84,8 +184,9 @@ "node.alive": 1.0, "node.cpu.busy.pct": 6.75, "node.cpu.throttled_time": 4523.0, - "node.memory.usage": 1.326026752E9, + "node.memory.usage": 1.073741824E9, "node.memory.limit": 4.294967296E9, + "node.disk.used": 4.2547019776E10, "node.disk.limit": 2.68435456E11 }, "routing": { diff --git a/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json b/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json index de3e26c8e13..ddfb1c61c06 100644 --- a/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json +++ b/node-admin/src/test/resources/docker.stats.metrics.ready.expected.json @@ -1,5 +1,24 @@ [ { + "application": "node", + "dimensions": { + "flavor": "docker", + "role": "tenants", + "host": "host1.test.yahoo.com", + "state": "ready", + "parentHostname": "parent.host.name.yahoo.com", + "zone": "dev.us-east-1" + }, + "metrics": { + "alive": 1.0 + }, + "routing": { + "yamas": { + "namespaces": ["Vespa"] + } + } + }, + { "application": "docker", "dimensions": { "flavor": "docker", |