diff options
7 files changed, 154 insertions, 14 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index fd5029162bc..baab7c92b0d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -6,6 +6,7 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.collections.Pair; import com.yahoo.io.IOUtils; +import com.yahoo.log.LogLevel; import com.yahoo.net.HostName; import com.yahoo.system.ProcessExecuter; import com.yahoo.vespa.hosted.dockerapi.ContainerName; @@ -35,6 +36,8 @@ import java.util.Map; import java.util.Optional; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; +import java.util.logging.Logger; +import java.util.regex.Matcher; import java.util.regex.Pattern; import static com.yahoo.vespa.defaults.Defaults.getDefaults; @@ -43,17 +46,20 @@ import static com.yahoo.vespa.defaults.Defaults.getDefaults; * @author freva */ public class StorageMaintainer { + private static final Pattern TOTAL_MEMORY_PATTERN = Pattern.compile("^MemTotal:\\s*(?<totalMem>\\d+) kB$", Pattern.MULTILINE); private static final ContainerName NODE_ADMIN = new ContainerName("node-admin"); private static final ObjectMapper objectMapper = new ObjectMapper(); private static Optional<String> kernelVersion = Optional.empty(); private static final long intervalSec = 1000; + private final Logger logger = Logger.getLogger(StorageMaintainer.class.getName()); private final Object monitor = new Object(); private final CounterWrapper numberOfNodeAdminMaintenanceFails; private final Docker docker; private final Environment environment; private final Clock clock; + private double hostTotalMemoryGb = 0; private Map<ContainerName, MaintenanceThrottler> maintenanceThrottlerByContainerName = new ConcurrentHashMap<>(); @@ -167,6 +173,29 @@ public class StorageMaintainer { return diskUsageKB * 1024; } + Optional<String> readMeminfo() { + try { + return Optional.of(new String(Files.readAllBytes(Paths.get("/proc/meminfo")))); + } catch (IOException e) { + logger.log(LogLevel.WARNING, "Failed to read meminfo", e); + return Optional.empty(); + } + } + + public double getHostTotalMemoryGb() { + if (hostTotalMemoryGb == 0) { + readMeminfo().ifPresent(memInfo -> { + Matcher matcher = TOTAL_MEMORY_PATTERN.matcher(memInfo); + if (matcher.find()) { + hostTotalMemoryGb = Integer.valueOf(matcher.group("totalMem")) / 1024d / 1024; + } else { + logger.log(LogLevel.WARNING, "Failed to parse total memory from meminfo: " + memInfo); + } + }); + } + + return hostTotalMemoryGb; + } /** * Deletes old log files for vespa, nginx, logstash, etc. diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index eaae5030b50..84679d1dadd 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -83,7 +83,7 @@ public class NodeAdminImpl implements NodeAdmin { metricsScheduler.scheduleAtFixedRate(() -> { try { - nodeAgents.values().forEach(nodeAgent -> nodeAgent.updateContainerNodeMetrics(nodeAgents.size())); + nodeAgents.values().forEach(NodeAgent::updateContainerNodeMetrics); } catch (Throwable e) { logger.warning("Metric fetcher scheduler failed", e); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index b0facdec09d..5d31c10fcc1 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -41,7 +41,7 @@ public interface NodeAgent { /** * Updates metric receiver with the latest node-agent stats */ - void updateContainerNodeMetrics(int numAllocatedContainersOnHost); + void updateContainerNodeMetrics(); String getHostname(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 08fae2b707a..6145e2e372e 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -501,7 +501,7 @@ public class NodeAgentImpl implements NodeAgent { } @SuppressWarnings("unchecked") - public void updateContainerNodeMetrics(int numAllocatedContainersOnHost) { + public void updateContainerNodeMetrics() { final ContainerNodeSpec nodeSpec = lastNodeSpec; if (nodeSpec == null || containerState == ABSENT) return; @@ -527,11 +527,14 @@ public class NodeAgentImpl implements NodeAgent { final Optional<Long> diskTotalBytesUsed = storageMaintainer.flatMap(maintainer -> maintainer .updateIfNeededAndGetDiskMetricsFor(containerName)); - // CPU usage by a container is given by dividing used CPU time by the container with CPU time used by the entire - // system. Because each container is allocated same amount of CPU shares, no container should use more than 1/n - // of the total CPU time, where n is the number of running containers. + // CPU usage by a container as percentage of total host CPU, cpuPercentageOfHost, is given by dividing used + // CPU time by the container with CPU time used by the entire system. + // CPU usage by a container as percentage of total CPU allocated to it is given by dividing the + // cpuPercentageOfHost with the ratio of container resources over total host resources. This calculation + // assumes that the ratio between container and host resources for disk, memory, and cpu is roughly equal + // and therefore only calculates the ratio of container memory against host memory. double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(cpuContainerTotalTime, cpuSystemTotalTime); - double cpuPercentageOfAllocated = numAllocatedContainersOnHost * cpuPercentageOfHost; + double cpuPercentageOfAllocated = getInverseContainerShareOfHost(nodeSpec) * cpuPercentageOfHost; long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache; double memoryPercentUsed = 100.0 * memoryTotalBytesUsed / memoryTotalBytes; Optional<Double> diskPercentUsed = diskTotalBytes.flatMap(total -> diskTotalBytesUsed.map(used -> 100.0 * used / total)); @@ -603,6 +606,14 @@ public class NodeAgentImpl implements NodeAgent { return temp; } + private double getInverseContainerShareOfHost(ContainerNodeSpec nodeSpec) { + return nodeSpec.minMainMemoryAvailableGb + .map(memory -> { + double hostMemory = storageMaintainer.map(StorageMaintainer::getHostTotalMemoryGb).orElse(0d); + return hostMemory / memory; + }).orElse(0d); + } + class CpuUsageReporter { private long totalContainerUsage = 0; private long totalSystemUsage = 0; diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java index 4092c967bb7..74f09d36e9d 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java @@ -19,6 +19,7 @@ import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.time.Duration; +import java.util.Optional; import static org.junit.Assert.*; import static org.mockito.Matchers.any; @@ -119,6 +120,18 @@ public class StorageMaintainerTest { verify(docker, times(2)).executeInContainerAsRoot(any(), anyVararg()); } + @Test + public void testGetTotalMemory() { + StorageMaintainer storageMaintainer = mock(StorageMaintainer.class); + when(storageMaintainer.getHostTotalMemoryGb()).thenCallRealMethod(); + + when(storageMaintainer.readMeminfo()).thenReturn(Optional.empty()); + assertEquals(0d, storageMaintainer.getHostTotalMemoryGb(), 0); + + when(storageMaintainer.readMeminfo()).thenReturn(Optional.of("MemTotal: 1572864 kB\nMemUsed: 1000000 kB\n")); + assertEquals(1.5d, storageMaintainer.getHostTotalMemoryGb(), 0); + } + private static void writeNBytesToFile(File file, int nBytes) throws IOException { Files.write(file.toPath(), new byte[nBytes]); } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 91d9b382b7c..d2a90abaffb 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -26,6 +26,8 @@ import org.mockito.InOrder; import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.time.Duration; import java.util.Collections; import java.util.Map; @@ -38,6 +40,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyVararg; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doNothing; @@ -515,35 +518,43 @@ public class NodeAgentImplTest { .vespaVersion(vespaVersion) .owner(owner) .membership(membership) + .minMainMemoryAvailableGb(2) .build(); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec)); when(storageMaintainer.updateIfNeededAndGetDiskMetricsFor(eq(containerName))).thenReturn(Optional.of(42547019776L)); + when(storageMaintainer.getHostTotalMemoryGb()).thenReturn(10d); when(dockerOperations.getContainerStats(eq(containerName))) .thenReturn(Optional.of(stats1)) .thenReturn(Optional.of(stats2)); nodeAgent.converge(); // Run the converge loop once to initialize lastNodeSpec - nodeAgent.updateContainerNodeMetrics(5); // Update metrics once to init and lastCpuMetric + nodeAgent.updateContainerNodeMetrics(); // Update metrics once to init and lastCpuMetric clock.advance(Duration.ofSeconds(1234)); - nodeAgent.updateContainerNodeMetrics(5); - String[] expectedCommand = {"rpc_invoke", "-t", "1", "tcp/localhost:19091", "setExtraMetrics", - "s:{\"routing\":{\"yamas\":{\"namespaces\":[\"Vespa\"]}},\"application\":\"vespa.node\",\"metrics\":{\"mem.limit\":4294967296,\"mem.used\":1073741824,\"disk.used\":42547019776,\"disk.util\":15.85,\"cpu.util\":0.0,\"mem.util\":25.0,\"disk.limit\":268435456000},\"dimensions\":{\"app\":\"testapp.testinstance\",\"role\":\"tenants\",\"instanceName\":\"testinstance\",\"vespaVersion\":\"1.2.3\",\"clusterid\":\"clustId\",\"parentHostname\":\"parent.host.name.yahoo.com\",\"flavor\":\"docker\",\"clustertype\":\"clustType\",\"tenantName\":\"tester\",\"zone\":\"dev.us-east-1\",\"host\":\"host1.test.yahoo.com\",\"state\":\"active\",\"applicationId\":\"tester.testapp.testinstance\",\"applicationName\":\"testapp\"},\"timestamp\":0}{\"routing\":{\"yamas\":{\"namespaces\":[\"Vespa\"]}},\"application\":\"vespa.node\",\"metrics\":{\"net.out.bytes\":20303455,\"net.in.dropped\":4,\"net.out.dropped\":13,\"net.in.bytes\":19499270,\"net.out.errors\":3,\"net.in.errors\":55},\"dimensions\":{\"app\":\"testapp.testinstance\",\"role\":\"tenants\",\"instanceName\":\"testinstance\",\"vespaVersion\":\"1.2.3\",\"clusterid\":\"clustId\",\"interface\":\"eth0\",\"parentHostname\":\"parent.host.name.yahoo.com\",\"flavor\":\"docker\",\"clustertype\":\"clustType\",\"tenantName\":\"tester\",\"zone\":\"dev.us-east-1\",\"host\":\"host1.test.yahoo.com\",\"state\":\"active\",\"applicationId\":\"tester.testapp.testinstance\",\"applicationName\":\"testapp\"},\"timestamp\":0}{\"routing\":{\"yamas\":{\"namespaces\":[\"Vespa\"]}},\"application\":\"vespa.node\",\"metrics\":{\"net.out.bytes\":54246745,\"net.in.dropped\":0,\"net.out.dropped\":0,\"net.in.bytes\":3245766,\"net.out.errors\":0,\"net.in.errors\":0},\"dimensions\":{\"app\":\"testapp.testinstance\",\"role\":\"tenants\",\"instanceName\":\"testinstance\",\"vespaVersion\":\"1.2.3\",\"clusterid\":\"clustId\",\"interface\":\"eth1\",\"parentHostname\":\"parent.host.name.yahoo.com\",\"flavor\":\"docker\",\"clustertype\":\"clustType\",\"tenantName\":\"tester\",\"zone\":\"dev.us-east-1\",\"host\":\"host1.test.yahoo.com\",\"state\":\"active\",\"applicationId\":\"tester.testapp.testinstance\",\"applicationName\":\"testapp\"},\"timestamp\":0}"}; + Path pathToExpectedMetrics = Paths.get(classLoader.getResource("expected.container.system.metrics.txt").getPath()); + String expectedMetrics = new String(Files.readAllBytes(pathToExpectedMetrics)) + .replaceAll("\\s", "") + .replaceAll("\\n", ""); + + String[] expectedCommand = {"rpc_invoke", "-t", "1", "tcp/localhost:19091", "setExtraMetrics", expectedMetrics}; doAnswer(invocation -> { ContainerName calledContainerName = (ContainerName) invocation.getArguments()[0]; long calledTimeout = (long) invocation.getArguments()[1]; - String[] calledCommand = (String[]) invocation.getArguments()[2]; + String[] calledCommand = new String[invocation.getArguments().length - 2]; + System.arraycopy(invocation.getArguments(), 2, calledCommand, 0, calledCommand.length); calledCommand[calledCommand.length - 1] = calledCommand[calledCommand.length - 1].replaceAll("\"timestamp\":\\d+", "\"timestamp\":0"); assertEquals(containerName, calledContainerName); assertEquals(5L, calledTimeout); assertArrayEquals(expectedCommand, calledCommand); return null; - }).when(dockerOperations).executeCommandInContainerAsRoot(any(), any(), any()); + }).when(dockerOperations).executeCommandInContainerAsRoot(any(), any(), anyVararg()); + + nodeAgent.updateContainerNodeMetrics(); } @Test @@ -559,7 +570,7 @@ public class NodeAgentImplTest { nodeAgent.converge(); // Run the converge loop once to initialize lastNodeSpec - nodeAgent.updateContainerNodeMetrics(5); + nodeAgent.updateContainerNodeMetrics(); Set<Map<String, Object>> actualMetrics = metricReceiver.getAllMetricsRaw(); assertEquals(Collections.emptySet(), actualMetrics); diff --git a/node-admin/src/test/resources/expected.container.system.metrics.txt b/node-admin/src/test/resources/expected.container.system.metrics.txt new file mode 100644 index 00000000000..cfe01ae34f6 --- /dev/null +++ b/node-admin/src/test/resources/expected.container.system.metrics.txt @@ -0,0 +1,76 @@ +s: +{ + "routing": { + "yamas": { + "namespaces": + ["Vespa"] + } + }, + "application": "vespa.node", + "metrics": { + "mem.limit": 4294967296, + "mem.used": 1073741824, + "disk.used": 42547019776, + "disk.util": 15.85, + "cpu.util": 6.75, + "mem.util": 25.0, + "disk.limit": 268435456000 + }, + "dimensions": { + "host": "host1.test.yahoo.com", + "role": "tenants", + "state": "active", + "parentHostname": "parent.host.name.yahoo.com" + }, + "timestamp": 0 +} +{ + "routing": { + "yamas": { + "namespaces": + ["Vespa"] + } + }, + "application": "vespa.node", + "metrics": { + "net.out.bytes": 20303455, + "net.in.dropped": 4, + "net.out.dropped": 13, + "net.in.bytes": 19499270, + "net.out.errors": 3, + "net.in.errors": 55 + }, + "dimensions": { + "host": "host1.test.yahoo.com", + "role": "tenants", + "state": "active", + "interface": "eth0", + "parentHostname": "parent.host.name.yahoo.com" + }, + "timestamp": 0 +} +{ + "routing": { + "yamas": { + "namespaces": + ["Vespa"] + } + }, + "application": "vespa.node", + "metrics": { + "net.out.bytes": 54246745, + "net.in.dropped": 0, + "net.out.dropped": 0, + "net.in.bytes": 3245766, + "net.out.errors": 0, + "net.in.errors": 0 + }, + "dimensions": { + "host": "host1.test.yahoo.com", + "role": "tenants", + "state": "active", + "interface": "eth1", + "parentHostname": "parent.host.name.yahoo.com" + }, + "timestamp": 0 +}
\ No newline at end of file |