diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2017-09-07 08:48:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-09-07 08:48:16 +0200 |
commit | e42cd7c1d9b13835163ee0fbf4876f9c86fcf998 (patch) | |
tree | 3e418399fa1682d171350cb327662ef2847c10cf | |
parent | 69d1d5e192ec55601bbd54374a67d3a5489630b9 (diff) | |
parent | ef1cc7f7bb80c7ea95870f9057d650697c942d5c (diff) |
Merge pull request #3333 from vespa-engine/freva/fix-storage-maintainer
Fix storage maintainer
4 files changed, 54 insertions, 42 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index 37467d2338b..2450759a5c2 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -47,7 +47,6 @@ import static com.yahoo.vespa.defaults.Defaults.getDefaults; public class StorageMaintainer { private static final ContainerName NODE_ADMIN = new ContainerName("node-admin"); private static final ObjectMapper objectMapper = new ObjectMapper(); - private static Optional<String> kernelVersion = Optional.empty(); private final CounterWrapper numberOfNodeAdminMaintenanceFails; private final Docker docker; @@ -143,12 +142,6 @@ public class StorageMaintainer { throw new RuntimeException("Disk usage command timedout, aborting."); } String output = IOUtils.readAll(new InputStreamReader(duCommand.getInputStream())); - String error = IOUtils.readAll(new InputStreamReader(duCommand.getErrorStream())); - - if (! error.isEmpty()) { - throw new RuntimeException("Disk usage wrote to error log: " + error); - } - String[] results = output.split("\t"); if (results.length != 2) { throw new RuntimeException("Result from disk usage command not as expected: " + output); @@ -187,17 +180,19 @@ public class StorageMaintainer { maintainerExecutor.addJob("delete-files") .withArgument("basePath", path) .withArgument("maxAgeSeconds", Duration.ofDays(3).getSeconds()) - .withArgument("fileNameRegex", ".*\\.log\\..+") - .withArgument("recursive", false); - - maintainerExecutor.addJob("delete-files") - .withArgument("basePath", path) - .withArgument("maxAgeSeconds", Duration.ofDays(3).getSeconds()) - .withArgument("fileNameRegex", ".*QueryAccessLog.*") + .withArgument("fileNameRegex", ".*\\.log.+") .withArgument("recursive", false); } } + Path qrsDir = environment.pathInNodeAdminFromPathInNode( + containerName, getDefaults().underVespaHome("logs/vespa/qrs")); + maintainerExecutor.addJob("delete-files") + .withArgument("basePath", qrsDir) + .withArgument("maxAgeSeconds", Duration.ofDays(3).getSeconds()) + .withArgument("fileNameRegex", ".*QueryAccessLog.*") + .withArgument("recursive", false); + Path logArchiveDir = environment.pathInNodeAdminFromPathInNode( containerName, getDefaults().underVespaHome("logs/vespa/logarchive")); maintainerExecutor.addJob("delete-files") @@ -235,11 +230,7 @@ public class StorageMaintainer { attributes.put("region", environment.getRegion()); attributes.put("environment", environment.getEnvironment()); attributes.put("flavor", nodeSpec.nodeFlavor); - try { - attributes.put("kernel_version", getKernelVersion()); - } catch (Throwable ignored) { - attributes.put("kernel_version", "unknown"); - } + attributes.put("kernel_version", System.getProperty("os.version")); nodeSpec.currentDockerImage.ifPresent(image -> attributes.put("docker_image", image.asString())); nodeSpec.vespaVersion.ifPresent(version -> attributes.put("vespa_version", version)); @@ -326,21 +317,6 @@ public class StorageMaintainer { } - - private String getKernelVersion() throws IOException, InterruptedException { - if (! kernelVersion.isPresent()) { - Pair<Integer, String> result = new ProcessExecuter().exec(new String[]{"uname", "-r"}); - if (result.getFirst() == 0) { - kernelVersion = Optional.of(result.getSecond().trim()); - } else { - throw new RuntimeException("Failed to get kernel version\n" + result); - } - } - - return kernelVersion.orElse("unknown"); - } - - private String executeMaintainer(String mainClass, String... args) { String[] command = Stream.concat( Stream.of("sudo", "-E", getDefaults().underVespaHome("libexec/vespa/node-admin/maintenance.sh"), mainClass), diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 0d110adf5a4..29484c08472 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -50,6 +50,10 @@ import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.Containe * @author bakksjo */ public class NodeAgentImpl implements NodeAgent { + // This is used as a definition of 1 GB when comparing flavor specs in node-repo + private static final long BYTES_IN_GB = 1_000_000_000L; + + private final AtomicBoolean terminated = new AtomicBoolean(false); private boolean isFrozen = true; private boolean wantFrozen = false; @@ -451,8 +455,12 @@ public class NodeAgentImpl implements NodeAgent { break; case active: storageMaintainer.ifPresent(maintainer -> { - maintainer.removeOldFilesFromNode(containerName); maintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false); + + maintainer.getDiskUsageFor(containerName) + .map(diskUsage -> (double) diskUsage / BYTES_IN_GB / nodeSpec.minDiskAvailableGb) + .filter(diskUtil -> diskUtil >= 0.8) + .ifPresent(diskUtil -> maintainer.removeOldFilesFromNode(containerName)); }); scheduleDownLoadIfNeeded(nodeSpec); if (isDownloadingImage()) { @@ -481,7 +489,6 @@ public class NodeAgentImpl implements NodeAgent { orchestrator.resume(hostname); break; case inactive: - storageMaintainer.ifPresent(maintainer -> maintainer.removeOldFilesFromNode(containerName)); removeContainerIfNeededUpdateContainerState(nodeSpec, container); updateNodeRepoWithCurrentAttributes(nodeSpec); break; @@ -517,14 +524,13 @@ public class NodeAgentImpl implements NodeAgent { Docker.ContainerStats stats = containerStats.get(); final String APP = MetricReceiverWrapper.APPLICATION_NODE; - final long bytesInGB = 1 << 30; final int totalNumCpuCores = ((List<Number>) ((Map) stats.getCpuStats().get("cpu_usage")).get("percpu_usage")).size(); final long cpuContainerTotalTime = ((Number) ((Map) stats.getCpuStats().get("cpu_usage")).get("total_usage")).longValue(); final long cpuSystemTotalTime = ((Number) stats.getCpuStats().get("system_cpu_usage")).longValue(); final long memoryTotalBytes = ((Number) stats.getMemoryStats().get("limit")).longValue(); final long memoryTotalBytesUsage = ((Number) stats.getMemoryStats().get("usage")).longValue(); final long memoryTotalBytesCache = ((Number) ((Map) stats.getMemoryStats().get("stats")).get("cache")).longValue(); - final long diskTotalBytes = (long) (nodeSpec.minDiskAvailableGb * bytesInGB); + final long diskTotalBytes = (long) (nodeSpec.minDiskAvailableGb * BYTES_IN_GB); final Optional<Long> diskTotalBytesUsed = storageMaintainer.flatMap(maintainer -> maintainer .getDiskUsageFor(containerName)); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 1c63c70453e..09e897e1fc2 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -109,12 +109,14 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec)); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(187500000000L)); nodeAgent.converge(); verify(dockerOperations, never()).removeContainer(any()); verify(orchestrator, never()).suspend(any(String.class)); verify(dockerOperations, never()).pullImageAsyncIfNeeded(any()); + verify(storageMaintainer, never()).removeOldFilesFromNode(eq(containerName)); final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository); // TODO: Verify this isn't run unless 1st time @@ -131,6 +133,31 @@ public class NodeAgentImplTest { } @Test + public void verifyRemoveOldFilesIfDiskFull() throws Exception { + final long restartGeneration = 1; + final long rebootGeneration = 0; + final ContainerNodeSpec nodeSpec = nodeSpecBuilder + .wantedDockerImage(dockerImage) + .currentDockerImage(dockerImage) + .nodeState(Node.State.active) + .wantedVespaVersion(vespaVersion) + .vespaVersion(vespaVersion) + .wantedRestartGeneration(restartGeneration) + .currentRestartGeneration(restartGeneration) + .wantedRebootGeneration(rebootGeneration) + .build(); + + NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); + when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec)); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(217432719360L)); + + nodeAgent.converge(); + + verify(storageMaintainer, times(1)).removeOldFilesFromNode(eq(containerName)); + } + + + @Test public void absentContainerCausesStart() throws Exception { final long restartGeneration = 1; final long rebootGeneration = 0; @@ -148,6 +175,7 @@ public class NodeAgentImplTest { when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec)); when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo")); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L)); nodeAgent.converge(); @@ -187,6 +215,7 @@ public class NodeAgentImplTest { when(nodeRepository.getContainerNodeSpec(hostName)).thenReturn(Optional.of(nodeSpec)); when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L)); nodeAgent.converge(); @@ -309,7 +338,6 @@ public class NodeAgentImplTest { nodeAgent.converge(); final InOrder inOrder = inOrder(storageMaintainer, dockerOperations); - inOrder.verify(storageMaintainer, times(1)).removeOldFilesFromNode(eq(containerName)); inOrder.verify(dockerOperations, never()).removeContainer(any()); verify(orchestrator, never()).resume(any(String.class)); @@ -420,6 +448,7 @@ public class NodeAgentImplTest { when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec)); when(pathResolver.getApplicationStoragePathForNodeAdmin()).thenReturn(Files.createTempDirectory("foo")); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L)); nodeAgent.tick(); @@ -442,6 +471,7 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec)); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(201326592000L)); final InOrder inOrder = inOrder(orchestrator, dockerOperations, nodeRepository); doThrow(new RuntimeException("Failed 1st time")) @@ -523,7 +553,7 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getContainerNodeSpec(eq(hostName))).thenReturn(Optional.of(nodeSpec)); - when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(42547019776L)); + when(storageMaintainer.getDiskUsageFor(eq(containerName))).thenReturn(Optional.of(39625000000L)); when(dockerOperations.getContainerStats(eq(containerName))) .thenReturn(Optional.of(stats1)) .thenReturn(Optional.of(stats2)); diff --git a/node-admin/src/test/resources/expected.container.system.metrics.txt b/node-admin/src/test/resources/expected.container.system.metrics.txt index d3875113d33..8a4d696b08e 100644 --- a/node-admin/src/test/resources/expected.container.system.metrics.txt +++ b/node-admin/src/test/resources/expected.container.system.metrics.txt @@ -10,11 +10,11 @@ s: "metrics": { "mem.limit": 4294967296, "mem.used": 1073741824, - "disk.used": 42547019776, + "disk.used": 39625000000, "disk.util": 15.85, "cpu.util": 5.4, "mem.util": 25.0, - "disk.limit": 268435456000 + "disk.limit": 250000000000 }, "dimensions": { "host": "host1.test.yahoo.com", |