summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2017-09-07 08:48:16 +0200
committerGitHub <noreply@github.com>2017-09-07 08:48:16 +0200
commite42cd7c1d9b13835163ee0fbf4876f9c86fcf998 (patch)
tree3e418399fa1682d171350cb327662ef2847c10cf
parent69d1d5e192ec55601bbd54374a67d3a5489630b9 (diff)
parentef1cc7f7bb80c7ea95870f9057d650697c942d5c (diff)
Merge pull request #3333 from vespa-engine/freva/fix-storage-maintainer
Fix storage maintainer
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java44
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java14
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java34
-rw-r--r--node-admin/src/test/resources/expected.container.system.metrics.txt4
4 files changed, 54 insertions, 42 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index 37467d2338b..2450759a5c2 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -47,7 +47,6 @@ import static com.yahoo.vespa.defaults.Defaults.getDefaults;
public class StorageMaintainer {
private static final ContainerName NODE_ADMIN = new ContainerName("node-admin");
private static final ObjectMapper objectMapper = new ObjectMapper();
- private static Optional<String> kernelVersion = Optional.empty();
private final CounterWrapper numberOfNodeAdminMaintenanceFails;
private final Docker docker;
@@ -143,12 +142,6 @@ public class StorageMaintainer {
throw new RuntimeException("Disk usage command timedout, aborting.");
}
String output = IOUtils.readAll(new InputStreamReader(duCommand.getInputStream()));
- String error = IOUtils.readAll(new InputStreamReader(duCommand.getErrorStream()));
-
- if (! error.isEmpty()) {
- throw new RuntimeException("Disk usage wrote to error log: " + error);
- }
-
String[] results = output.split("\t");
if (results.length != 2) {
throw new RuntimeException("Result from disk usage command not as expected: " + output);
@@ -187,17 +180,19 @@ public class StorageMaintainer {
maintainerExecutor.addJob("delete-files")
.withArgument("basePath", path)
.withArgument("maxAgeSeconds", Duration.ofDays(3).getSeconds())
- .withArgument("fileNameRegex", ".*\\.log\\..+")
- .withArgument("recursive", false);
-
- maintainerExecutor.addJob("delete-files")
- .withArgument("basePath", path)
- .withArgument("maxAgeSeconds", Duration.ofDays(3).getSeconds())
- .withArgument("fileNameRegex", ".*QueryAccessLog.*")
+ .withArgument("fileNameRegex", ".*\\.log.+")
.withArgument("recursive", false);
}
}
+ Path qrsDir = environment.pathInNodeAdminFromPathInNode(
+ containerName, getDefaults().underVespaHome("logs/vespa/qrs"));
+ maintainerExecutor.addJob("delete-files")
+ .withArgument("basePath", qrsDir)
+ .withArgument("maxAgeSeconds", Duration.ofDays(3).getSeconds())
+ .withArgument("fileNameRegex", ".*QueryAccessLog.*")
+ .withArgument("recursive", false);
+
Path logArchiveDir = environment.pathInNodeAdminFromPathInNode(
containerName, getDefaults().underVespaHome("logs/vespa/logarchive"));
maintainerExecutor.addJob("delete-files")
@@ -235,11 +230,7 @@ public class StorageMaintainer {
attributes.put("region", environment.getRegion());
attributes.put("environment", environment.getEnvironment());
attributes.put("flavor", nodeSpec.nodeFlavor);
- try {
- attributes.put("kernel_version", getKernelVersion());
- } catch (Throwable ignored) {
- attributes.put("kernel_version", "unknown");
- }
+ attributes.put("kernel_version", System.getProperty("os.version"));
nodeSpec.currentDockerImage.ifPresent(image -> attributes.put("docker_image", image.asString()));
nodeSpec.vespaVersion.ifPresent(version -> attributes.put("vespa_version", version));
@@ -326,21 +317,6 @@ public class StorageMaintainer {
}
-
- private String getKernelVersion() throws IOException, InterruptedException {
- if (! kernelVersion.isPresent()) {
- Pair<Integer, String> result = new ProcessExecuter().exec(new String[]{"uname", "-r"});
- if (result.getFirst() == 0) {
- kernelVersion = Optional.of(result.getSecond().trim());
- } else {
- throw new RuntimeException("Failed to get kernel version\n" + result);
- }
- }
-
- return kernelVersion.orElse("unknown");
- }
-
-
private String executeMaintainer(String mainClass, String... args) {
String[] command = Stream.concat(
Stream.of("sudo", "-E", getDefaults().underVespaHome("libexec/vespa/node-admin/maintenance.sh"), mainClass),
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 0d110adf5a4..29484c08472 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -50,6 +50,10 @@ import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.Containe
* @author bakksjo
*/
public class NodeAgentImpl implements NodeAgent {
+ // This is used as a definition of 1 GB when comparing flavor specs in node-repo
+ private static final long BYTES_IN_GB = 1_000_000_000L;
+
+
private final AtomicBoolean terminated = new AtomicBoolean(false);
private boolean isFrozen = true;
private boolean wantFrozen = false;
@@ -451,8 +455,12 @@ public class NodeAgentImpl implements NodeAgent {
break;
case active:
storageMaintainer.ifPresent(maintainer -> {
- maintainer.removeOldFilesFromNode(containerName);
maintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false);
+
+ maintainer.getDiskUsageFor(containerName)
+ .map(diskUsage -> (double) diskUsage / BYTES_IN_GB / nodeSpec.minDiskAvailableGb)
+ .filter(diskUtil -> diskUtil >= 0.8)
+ .ifPresent(diskUtil -> maintainer.removeOldFilesFromNode(containerName));
});
scheduleDownLoadIfNeeded(nodeSpec);
if (isDownloadingImage()) {
@@ -481,7 +489,6 @@ public class NodeAgentImpl implements NodeAgent {
orchestrator.resume(hostname);
break;
case inactive:
- storageMaintainer.ifPresent(maintainer -> maintainer.removeOldFilesFromNode(containerName));
removeContainerIfNeededUpdateContainerState(nodeSpec, container);
updateNodeRepoWithCurrentAttributes(nodeSpec);
break;
@@ -517,14 +524,13 @@ public class NodeAgentImpl implements NodeAgent {
Docker.ContainerStats stats = containerStats.get();
final String APP = MetricReceiverWrapper.APPLICATION_NODE;
- final long bytesInGB = 1 << 30;
final int totalNumCpuCores = ((List<Number>) ((Map) stats.getCpuStats().get("cpu_usage")).get("percpu_usage")).size();
final long cpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue();
final long cpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue();
final long memoryTotalBytes = ((Number) stats.getMemoryStats().get("limit")).longValue();
final long memoryTotalBytesUsage = ((Number) stats.getMemoryStats().get("usage")).longValue();
final long memoryTotalBytesCache = ((Number) ((Map) stats.getMemoryStats().get("stats")).get("cache")).longValue();
- final long diskTotalBytes = (long) (nodeSpec.minDiskAvailableGb * bytesInGB);
+ final long diskTotalBytes = (long) (nodeSpec.minDiskAvailableGb * BYTES_IN_GB);
final Optional<Long> diskTotalBytesUsed = storageMaintainer.flatMap(maintainer -> maintainer
.getDiskUsageFor(containerName));
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 1c63c70453e..09e897e1fc2 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -109,12 +109,14 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec));
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(187500000000L));
nodeAgent.converge();
verify(dockerOperations, never()).removeContainer(any());
verify(orchestrator, never()).suspend(any(String.class));
verify(dockerOperations, never()).pullImageAsyncIfNeeded(any());
+ verify(storageMaintainer, never()).removeOldFilesFromNode(eq(containerName));
final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository);
// TODO: Verify this isn't run unless 1st time
@@ -131,6 +133,31 @@ public class NodeAgentImplTest {
}
@Test
+ public void verifyRemoveOldFilesIfDiskFull() throws Exception {
+ final long restartGeneration = 1;
+ final long rebootGeneration = 0;
+ final ContainerNodeSpec nodeSpec = nodeSpecBuilder
+ .wantedDockerImage(dockerImage)
+ .currentDockerImage(dockerImage)
+ .nodeState(Node.State.active)
+ .wantedVespaVersion(vespaVersion)
+ .vespaVersion(vespaVersion)
+ .wantedRestartGeneration(restartGeneration)
+ .currentRestartGeneration(restartGeneration)
+ .wantedRebootGeneration(rebootGeneration)
+ .build();
+
+ NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
+ when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec));
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(217432719360L));
+
+ nodeAgent.converge();
+
+ verify(storageMaintainer, times(1)).removeOldFilesFromNode(eq(containerName));
+ }
+
+
+ @Test
public void absentContainerCausesStart() throws Exception {
final long restartGeneration = 1;
final long rebootGeneration = 0;
@@ -148,6 +175,7 @@ public class NodeAgentImplTest {
when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec));
when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo"));
when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L));
nodeAgent.converge();
@@ -187,6 +215,7 @@ public class NodeAgentImplTest {
when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec));
when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true);
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L));
nodeAgent.converge();
@@ -309,7 +338,6 @@ public class NodeAgentImplTest {
nodeAgent.converge();
final InOrder inOrder = inOrder(storageMaintainer, dockerOperations);
- inOrder.verify(storageMaintainer, times(1)).removeOldFilesFromNode(eq(containerName));
inOrder.verify(dockerOperations, never()).removeContainer(any());
verify(orchestrator, never()).resume(any(String.class));
@@ -420,6 +448,7 @@ public class NodeAgentImplTest {
when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec));
when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo"));
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L));
nodeAgent.tick();
@@ -442,6 +471,7 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec));
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L));
final InOrder inOrder = inOrder(orchestrator, dockerOperations, nodeRepository);
doThrow(new RuntimeException("Failed 1st time"))
@@ -523,7 +553,7 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec));
- when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(42547019776L));
+ when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(39625000000L));
when(dockerOperations.getContainerStats(eq(containerName)))
.thenReturn(Optional.of(stats1))
.thenReturn(Optional.of(stats2));
diff --git a/node-admin/src/test/resources/expected.container.system.metrics.txt b/node-admin/src/test/resources/expected.container.system.metrics.txt
index d3875113d33..8a4d696b08e 100644
--- a/node-admin/src/test/resources/expected.container.system.metrics.txt
+++ b/node-admin/src/test/resources/expected.container.system.metrics.txt
@@ -10,11 +10,11 @@ s:
"metrics": {
"mem.limit": 4294967296,
"mem.used": 1073741824,
- "disk.used": 42547019776,
+ "disk.used": 39625000000,
"disk.util": 15.85,
"cpu.util": 5.4,
"mem.util": 25.0,
- "disk.limit": 268435456000
+ "disk.limit": 250000000000
},
"dimensions": {
"host": "host1.test.yahoo.com",