aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2017-06-20 15:46:37 +0200
committerGitHub <noreply@github.com>2017-06-20 15:46:37 +0200
commit52c57a784be4fc09ad57255829d61392cbda610d (patch)
treed5954eb10dd16d0ae1cf0d5400270489cc6af155
parentbae91b6aac5080948796e9d5c411805083dafb94 (diff)
parent5ed3a292967e03d32cb9e4d59803b169deb0d253 (diff)
Merge pull request #2840 from yahoo/freva/fix-container-cpu-metrics
Freva/fix container cpu metrics
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java29
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java21
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java13
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java25
-rw-r--r--node-admin/src/test/resources/expected.container.system.metrics.txt76
7 files changed, 154 insertions, 14 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index fd5029162bc..baab7c92b0d 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -6,6 +6,7 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.collections.Pair;
import com.yahoo.io.IOUtils;
+import com.yahoo.log.LogLevel;
import com.yahoo.net.HostName;
import com.yahoo.system.ProcessExecuter;
import com.yahoo.vespa.hosted.dockerapi.ContainerName;
@@ -35,6 +36,8 @@ import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.yahoo.vespa.defaults.Defaults.getDefaults;
@@ -43,17 +46,20 @@ import static com.yahoo.vespa.defaults.Defaults.getDefaults;
* @author freva
*/
public class StorageMaintainer {
+ private static final Pattern TOTAL_MEMORY_PATTERN = Pattern.compile("^MemTotal:\\s*(?<totalMem>\\d+) kB$", Pattern.MULTILINE);
private static final ContainerName NODE_ADMIN = new ContainerName("node-admin");
private static final ObjectMapper objectMapper = new ObjectMapper();
private static Optional<String> kernelVersion = Optional.empty();
private static final long intervalSec = 1000;
+ private final Logger logger = Logger.getLogger(StorageMaintainer.class.getName());
private final Object monitor = new Object();
private final CounterWrapper numberOfNodeAdminMaintenanceFails;
private final Docker docker;
private final Environment environment;
private final Clock clock;
+ private double hostTotalMemoryGb = 0;
private Map<ContainerName, MaintenanceThrottler> maintenanceThrottlerByContainerName = new ConcurrentHashMap<>();
@@ -167,6 +173,29 @@ public class StorageMaintainer {
return diskUsageKB * 1024;
}
+    /**
+     * Reads the content of {@code /proc/meminfo}.
+     *
+     * @return the full content of the file, or empty if reading it failed (the failure is logged as a warning)
+     */
+    Optional<String> readMeminfo() {
+        try {
+            byte[] content = Files.readAllBytes(Paths.get("/proc/meminfo"));
+            // Decode with an explicit charset instead of relying on the platform default
+            return Optional.of(new String(content, java.nio.charset.StandardCharsets.UTF_8));
+        } catch (IOException e) {
+            logger.log(LogLevel.WARNING, "Failed to read meminfo", e);
+            return Optional.empty();
+        }
+    }
+
+    /**
+     * Returns the total memory of the host in GB, parsed from the {@code MemTotal} entry of meminfo.
+     * The value is stored in {@code hostTotalMemoryGb} once parsed; as long as that field is 0,
+     * every call reads and parses meminfo again.
+     *
+     * @return total host memory in GB, or 0 if meminfo could not be read or did not contain a parsable MemTotal
+     */
+    public double getHostTotalMemoryGb() {
+        if (hostTotalMemoryGb == 0) {
+            readMeminfo().ifPresent(memInfo -> {
+                Matcher matcher = TOTAL_MEMORY_PATTERN.matcher(memInfo);
+                if (matcher.find()) {
+                    // MemTotal is given in kB. Parse it as a long: on hosts with 2 TB of memory or more the
+                    // value exceeds Integer.MAX_VALUE and parsing it as an int throws NumberFormatException.
+                    hostTotalMemoryGb = Long.parseLong(matcher.group("totalMem")) / 1024d / 1024;
+                } else {
+                    logger.log(LogLevel.WARNING, "Failed to parse total memory from meminfo: " + memInfo);
+                }
+            });
+        }
+
+        return hostTotalMemoryGb;
+    }
/**
* Deletes old log files for vespa, nginx, logstash, etc.
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index eaae5030b50..84679d1dadd 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -83,7 +83,7 @@ public class NodeAdminImpl implements NodeAdmin {
metricsScheduler.scheduleAtFixedRate(() -> {
try {
- nodeAgents.values().forEach(nodeAgent -> nodeAgent.updateContainerNodeMetrics(nodeAgents.size()));
+ nodeAgents.values().forEach(NodeAgent::updateContainerNodeMetrics);
} catch (Throwable e) {
logger.warning("Metric fetcher scheduler failed", e);
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
index b0facdec09d..5d31c10fcc1 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
@@ -41,7 +41,7 @@ public interface NodeAgent {
/**
* Updates metric receiver with the latest node-agent stats
*/
- void updateContainerNodeMetrics(int numAllocatedContainersOnHost);
+ void updateContainerNodeMetrics();
String getHostname();
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 08fae2b707a..6145e2e372e 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -501,7 +501,7 @@ public class NodeAgentImpl implements NodeAgent {
}
@SuppressWarnings("unchecked")
- public void updateContainerNodeMetrics(int numAllocatedContainersOnHost) {
+ public void updateContainerNodeMetrics() {
final ContainerNodeSpec nodeSpec = lastNodeSpec;
if (nodeSpec == null || containerState == ABSENT) return;
@@ -527,11 +527,14 @@ public class NodeAgentImpl implements NodeAgent {
final Optional<Long> diskTotalBytesUsed = storageMaintainer.flatMap(maintainer -> maintainer
.updateIfNeededAndGetDiskMetricsFor(containerName));
- // CPU usage by a container is given by dividing used CPU time by the container with CPU time used by the entire
- // system. Because each container is allocated same amount of CPU shares, no container should use more than 1/n
- // of the total CPU time, where n is the number of running containers.
+ // CPU usage by a container as a percentage of total host CPU, cpuPercentageOfHost, is given by dividing the
+ // CPU time used by the container by the CPU time used by the entire system.
+ // CPU usage by a container as a percentage of the total CPU allocated to it is given by dividing
+ // cpuPercentageOfHost by the ratio of container resources to total host resources. This calculation
+ // assumes that the ratio between container and host resources is roughly equal for disk, memory, and CPU,
+ // and therefore only calculates the ratio of container memory to host memory.
double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(cpuContainerTotalTime, cpuSystemTotalTime);
- double cpuPercentageOfAllocated = numAllocatedContainersOnHost * cpuPercentageOfHost;
+ double cpuPercentageOfAllocated = getInverseContainerShareOfHost(nodeSpec) * cpuPercentageOfHost;
long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache;
double memoryPercentUsed = 100.0 * memoryTotalBytesUsed / memoryTotalBytes;
Optional<Double> diskPercentUsed = diskTotalBytes.flatMap(total -> diskTotalBytesUsed.map(used -> 100.0 * used / total));
@@ -603,6 +606,14 @@ public class NodeAgentImpl implements NodeAgent {
return temp;
}
+    /**
+     * Returns the ratio of total host memory to the memory allocated to the given node, i.e. the inverse
+     * of the container's share of the host, using memory as the only measure of that share.
+     *
+     * @param nodeSpec spec of the node; its {@code minMainMemoryAvailableGb} is used as the container memory
+     * @return host memory divided by container memory, or 0 if the container memory is unset or not
+     *         positive, or if no storage maintainer is available to report the host memory
+     */
+    private double getInverseContainerShareOfHost(ContainerNodeSpec nodeSpec) {
+        return nodeSpec.minMainMemoryAvailableGb
+                // A container memory of 0 would make the division below yield Infinity or NaN
+                .filter(memory -> memory > 0)
+                .map(memory -> {
+                    double hostMemory = storageMaintainer.map(StorageMaintainer::getHostTotalMemoryGb).orElse(0d);
+                    return hostMemory / memory;
+                }).orElse(0d);
+    }
+
class CpuUsageReporter {
private long totalContainerUsage = 0;
private long totalSystemUsage = 0;
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java
index 4092c967bb7..74f09d36e9d 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java
@@ -19,6 +19,7 @@ import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.time.Duration;
+import java.util.Optional;
import static org.junit.Assert.*;
import static org.mockito.Matchers.any;
@@ -119,6 +120,18 @@ public class StorageMaintainerTest {
verify(docker, times(2)).executeInContainerAsRoot(any(), anyVararg());
}
+    /**
+     * Verifies that getHostTotalMemoryGb() returns 0 when meminfo is unavailable, and the MemTotal
+     * value converted from kB to GB when meminfo can be read.
+     */
+    @Test
+    public void testGetTotalMemory() {
+        // Only getHostTotalMemoryGb() runs real code; readMeminfo() is stubbed below
+        final StorageMaintainer maintainer = mock(StorageMaintainer.class);
+        when(maintainer.getHostTotalMemoryGb()).thenCallRealMethod();
+
+        // Without meminfo the total memory is reported as 0
+        final Optional<String> missingMeminfo = Optional.empty();
+        when(maintainer.readMeminfo()).thenReturn(missingMeminfo);
+        final double memoryWithoutMeminfo = maintainer.getHostTotalMemoryGb();
+        assertEquals(0d, memoryWithoutMeminfo, 0);
+
+        // 1572864 kB / 1024 / 1024 = 1.5
+        final String meminfo = "MemTotal: 1572864 kB\nMemUsed: 1000000 kB\n";
+        when(maintainer.readMeminfo()).thenReturn(Optional.of(meminfo));
+        final double memoryWithMeminfo = maintainer.getHostTotalMemoryGb();
+        assertEquals(1.5d, memoryWithMeminfo, 0);
+    }
+
private static void writeNBytesToFile(File file, int nBytes) throws IOException {
Files.write(file.toPath(), new byte[nBytes]);
}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 91d9b382b7c..d2a90abaffb 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -26,6 +26,8 @@ import org.mockito.InOrder;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.time.Duration;
import java.util.Collections;
import java.util.Map;
@@ -38,6 +40,7 @@ import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyVararg;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.doNothing;
@@ -515,35 +518,43 @@ public class NodeAgentImplTest {
.vespaVersion(vespaVersion)
.owner(owner)
.membership(membership)
+ .minMainMemoryAvailableGb(2)
.build();
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec));
when(storageMaintainer.updateIfNeededAndGetDiskMetricsFor(eq(containerName))).thenReturn(Optional.of(42547019776L));
+ when(storageMaintainer.getHostTotalMemoryGb()).thenReturn(10d);
when(dockerOperations.getContainerStats(eq(containerName)))
.thenReturn(Optional.of(stats1))
.thenReturn(Optional.of(stats2));
nodeAgent.converge(); // Run the converge loop once to initialize lastNodeSpec
- nodeAgent.updateContainerNodeMetrics(5); // Update metrics once to init and lastCpuMetric
+ nodeAgent.updateContainerNodeMetrics(); // Update metrics once to init and lastCpuMetric
clock.advance(Duration.ofSeconds(1234));
- nodeAgent.updateContainerNodeMetrics(5);
- String[] expectedCommand = {"rpc_invoke", "-t", "1", "tcp/localhost:19091", "setExtraMetrics",
- "s:{\"routing\":{\"yamas\":{\"namespaces\":[\"Vespa\"]}},\"application\":\"vespa.node\",\"metrics\":{\"mem.limit\":4294967296,\"mem.used\":1073741824,\"disk.used\":42547019776,\"disk.util\":15.85,\"cpu.util\":0.0,\"mem.util\":25.0,\"disk.limit\":268435456000},\"dimensions\":{\"app\":\"testapp.testinstance\",\"role\":\"tenants\",\"instanceName\":\"testinstance\",\"vespaVersion\":\"1.2.3\",\"clusterid\":\"clustId\",\"parentHostname\":\"parent.host.name.yahoo.com\",\"flavor\":\"docker\",\"clustertype\":\"clustType\",\"tenantName\":\"tester\",\"zone\":\"dev.us-east-1\",\"host\":\"host1.test.yahoo.com\",\"state\":\"active\",\"applicationId\":\"tester.testapp.testinstance\",\"applicationName\":\"testapp\"},\"timestamp\":0}{\"routing\":{\"yamas\":{\"namespaces\":[\"Vespa\"]}},\"application\":\"vespa.node\",\"metrics\":{\"net.out.bytes\":20303455,\"net.in.dropped\":4,\"net.out.dropped\":13,\"net.in.bytes\":19499270,\"net.out.errors\":3,\"net.in.errors\":55},\"dimensions\":{\"app\":\"testapp.testinstance\",\"role\":\"tenants\",\"instanceName\":\"testinstance\",\"vespaVersion\":\"1.2.3\",\"clusterid\":\"clustId\",\"interface\":\"eth0\",\"parentHostname\":\"parent.host.name.yahoo.com\",\"flavor\":\"docker\",\"clustertype\":\"clustType\",\"tenantName\":\"tester\",\"zone\":\"dev.us-east-1\",\"host\":\"host1.test.yahoo.com\",\"state\":\"active\",\"applicationId\":\"tester.testapp.testinstance\",\"applicationName\":\"testapp\"},\"timestamp\":0}{\"routing\":{\"yamas\":{\"namespaces\":[\"Vespa\"]}},\"application\":\"vespa.node\",\"metrics\":{\"net.out.bytes\":54246745,\"net.in.dropped\":0,\"net.out.dropped\":0,\"net.in.bytes\":3245766,\"net.out.errors\":0,\"net.in.errors\":0},\"dimensions\":{\"app\":\"testapp.testinstance\",\"role\":\"tenants\",\"instanceName\":\"testinstance\",\"vespaVersion\":\"1.2.3\",\"clusterid\":\"clustId\",\"interface\":\"eth1\",\"parentHostname\":\"parent.host.name.yahoo.com\",\"flavor\":\"docker\",\"clustertype\":\"clustType\",\"tenantName\":\"tester\",\"
zone\":\"dev.us-east-1\",\"host\":\"host1.test.yahoo.com\",\"state\":\"active\",\"applicationId\":\"tester.testapp.testinstance\",\"applicationName\":\"testapp\"},\"timestamp\":0}"};
+ Path pathToExpectedMetrics = Paths.get(classLoader.getResource("expected.container.system.metrics.txt").getPath());
+ String expectedMetrics = new String(Files.readAllBytes(pathToExpectedMetrics))
+ .replaceAll("\\s", "")
+ .replaceAll("\\n", "");
+
+ String[] expectedCommand = {"rpc_invoke", "-t", "1", "tcp/localhost:19091", "setExtraMetrics", expectedMetrics};
doAnswer(invocation -> {
ContainerName calledContainerName = (ContainerName) invocation.getArguments()[0];
long calledTimeout = (long) invocation.getArguments()[1];
- String[] calledCommand = (String[]) invocation.getArguments()[2];
+ String[] calledCommand = new String[invocation.getArguments().length - 2];
+ System.arraycopy(invocation.getArguments(), 2, calledCommand, 0, calledCommand.length);
calledCommand[calledCommand.length - 1] = calledCommand[calledCommand.length - 1].replaceAll("\"timestamp\":\\d+", "\"timestamp\":0");
assertEquals(containerName, calledContainerName);
assertEquals(5L, calledTimeout);
assertArrayEquals(expectedCommand, calledCommand);
return null;
- }).when(dockerOperations).executeCommandInContainerAsRoot(any(), any(), any());
+ }).when(dockerOperations).executeCommandInContainerAsRoot(any(), any(), anyVararg());
+
+ nodeAgent.updateContainerNodeMetrics();
}
@Test
@@ -559,7 +570,7 @@ public class NodeAgentImplTest {
nodeAgent.converge(); // Run the converge loop once to initialize lastNodeSpec
- nodeAgent.updateContainerNodeMetrics(5);
+ nodeAgent.updateContainerNodeMetrics();
Set<Map<String, Object>> actualMetrics = metricReceiver.getAllMetricsRaw();
assertEquals(Collections.emptySet(), actualMetrics);
diff --git a/node-admin/src/test/resources/expected.container.system.metrics.txt b/node-admin/src/test/resources/expected.container.system.metrics.txt
new file mode 100644
index 00000000000..cfe01ae34f6
--- /dev/null
+++ b/node-admin/src/test/resources/expected.container.system.metrics.txt
@@ -0,0 +1,76 @@
+s:
+{
+ "routing": {
+ "yamas": {
+ "namespaces":
+ ["Vespa"]
+ }
+ },
+ "application": "vespa.node",
+ "metrics": {
+ "mem.limit": 4294967296,
+ "mem.used": 1073741824,
+ "disk.used": 42547019776,
+ "disk.util": 15.85,
+ "cpu.util": 6.75,
+ "mem.util": 25.0,
+ "disk.limit": 268435456000
+ },
+ "dimensions": {
+ "host": "host1.test.yahoo.com",
+ "role": "tenants",
+ "state": "active",
+ "parentHostname": "parent.host.name.yahoo.com"
+ },
+ "timestamp": 0
+}
+{
+ "routing": {
+ "yamas": {
+ "namespaces":
+ ["Vespa"]
+ }
+ },
+ "application": "vespa.node",
+ "metrics": {
+ "net.out.bytes": 20303455,
+ "net.in.dropped": 4,
+ "net.out.dropped": 13,
+ "net.in.bytes": 19499270,
+ "net.out.errors": 3,
+ "net.in.errors": 55
+ },
+ "dimensions": {
+ "host": "host1.test.yahoo.com",
+ "role": "tenants",
+ "state": "active",
+ "interface": "eth0",
+ "parentHostname": "parent.host.name.yahoo.com"
+ },
+ "timestamp": 0
+}
+{
+ "routing": {
+ "yamas": {
+ "namespaces":
+ ["Vespa"]
+ }
+ },
+ "application": "vespa.node",
+ "metrics": {
+ "net.out.bytes": 54246745,
+ "net.in.dropped": 0,
+ "net.out.dropped": 0,
+ "net.in.bytes": 3245766,
+ "net.out.errors": 0,
+ "net.in.errors": 0
+ },
+ "dimensions": {
+ "host": "host1.test.yahoo.com",
+ "role": "tenants",
+ "state": "active",
+ "interface": "eth1",
+ "parentHostname": "parent.host.name.yahoo.com"
+ },
+ "timestamp": 0
+} \ No newline at end of file