diff options
author | Valerij Fredriksen <valerij92@gmail.com> | 2020-04-22 14:02:14 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerijf@verizonmedia.com> | 2020-04-22 16:08:47 +0200 |
commit | f5b0f594949a63836a662efaba8900c2f1e904d6 (patch) | |
tree | 8c3fad6aaecd75abea523aacdca82dcbebe4143c /node-admin | |
parent | d2090d1f4b4a6c56fb528d819af7ee5b680f3ef7 (diff) |
Use DiskCleanup to clean disk when full
Diffstat (limited to 'node-admin')
4 files changed, 98 insertions, 63 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index c1897c3c416..7a7a0a8e2bc 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -3,10 +3,17 @@ package com.yahoo.vespa.hosted.node.admin.maintenance; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; +import com.yahoo.config.provision.NodeType; import com.yahoo.log.LogLevel; import com.yahoo.vespa.hosted.dockerapi.Container; +import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.node.admin.component.TaskContext; import com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoredumpHandler; +import com.yahoo.vespa.hosted.node.admin.maintenance.disk.CoredumpCleanupRule; +import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanup; +import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanupRule; +import com.yahoo.vespa.hosted.node.admin.maintenance.disk.LinearCleanupRule; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; @@ -17,18 +24,20 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.time.Clock; import java.time.Duration; +import java.time.Instant; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.TimeUnit; +import java.util.function.Function; import java.util.logging.Logger; -import java.util.regex.Pattern; -import static com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder.nameMatches; -import static com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder.olderThan; +import static com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanupRule.Priority; import static com.yahoo.yolean.Exceptions.uncheck; /** @@ -42,34 +51,34 @@ public class StorageMaintainer { private final Terminal terminal; private final CoredumpHandler coredumpHandler; private final Path archiveContainerStoragePath; + private final DiskCleanup diskCleanup; private final Clock clock; // We cache disk usage to avoid doing expensive disk operations so often - private final Cache<Path, Long> diskUsage = CacheBuilder.newBuilder() + private final Cache<ContainerName, Long> diskUsage = CacheBuilder.newBuilder() .maximumSize(100) .expireAfterWrite(5, TimeUnit.MINUTES) .build(); public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath) { - this(terminal, coredumpHandler, archiveContainerStoragePath, Clock.systemUTC()); + this(terminal, coredumpHandler, archiveContainerStoragePath, new DiskCleanup(), Clock.systemUTC()); } - public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath, Clock clock) { + public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath, DiskCleanup diskCleanup, Clock clock) { this.terminal = terminal; this.coredumpHandler = coredumpHandler; this.archiveContainerStoragePath = archiveContainerStoragePath; + this.diskCleanup = diskCleanup; this.clock = clock; } public Optional<Long> getDiskUsageFor(NodeAgentContext context) { try { - Path path = context.pathOnHostFromPathInNode("/"); - - Long cachedDiskUsage = diskUsage.getIfPresent(path); + Long cachedDiskUsage = diskUsage.getIfPresent(context.containerName()); if (cachedDiskUsage != null) return Optional.of(cachedDiskUsage); - long diskUsageBytes = getDiskUsedInBytes(context, path); - diskUsage.put(path, diskUsageBytes); + long diskUsageBytes = getDiskUsedInBytes(context, context.pathOnHostFromPathInNode("/")); + diskUsage.put(context.containerName(), diskUsageBytes); return Optional.of(diskUsageBytes); } catch (Exception e) { context.log(logger, LogLevel.WARNING, "Failed to get disk usage", e); @@ -95,37 +104,44 @@ public class StorageMaintainer { return 1024 * Long.parseLong(results[0]); } + public boolean cleanDiskIfFull(NodeAgentContext context) { + double totalBytes = context.node().diskGb() * 1_000_000_000L; + // Delete enough bytes to get below 80% disk usage, but only if we are already using more than 90% disk + long bytesToRemove = getDiskUsageFor(context) + .map(diskUsage -> (long) (diskUsage - 0.8 * totalBytes)) + .filter(bytes -> bytes > totalBytes * 0.1) + .orElse(0L); + + if (bytesToRemove > 0 && diskCleanup.cleanup(context, createCleanupRules(context), bytesToRemove)) { + diskUsage.invalidate(context.containerName()); + return true; + } + return false; + } - /** Deletes old log files for vespa and nginx */ - public void removeOldFilesFromNode(NodeAgentContext context) { - Path[] logPaths = { - context.pathInNodeUnderVespaHome("logs/nginx"), - context.pathInNodeUnderVespaHome("logs/vespa") - }; + private List<DiskCleanupRule> createCleanupRules(NodeAgentContext context) { + Instant start = clock.instant(); + double oneMonthSeconds = Duration.ofDays(30).getSeconds(); + Function<Instant, Double> monthNormalizer = instant -> Duration.between(instant, start).getSeconds() / oneMonthSeconds; + Function<String, Path> pathOnHostUnderContainerVespaHome = path -> + context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome(path)); + List<DiskCleanupRule> rules = new ArrayList<>(); - for (Path pathToClean : logPaths) { - Path path = context.pathOnHostFromPathInNode(pathToClean); - FileFinder.files(path) - .match(olderThan(Duration.ofDays(3)).and(nameMatches(Pattern.compile(".*\\.log.+")))) - .maxDepth(1) - .deleteRecursively(context); - } + rules.add(CoredumpCleanupRule.forContainer(pathOnHostUnderContainerVespaHome.apply("var/crash"))); - FileFinder.files(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("logs/vespa/qrs"))) - .match(olderThan(Duration.ofDays(3))) - .deleteRecursively(context); + if (context.node().membership().map(m -> m.type().isContainer()).orElse(false)) + rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/vespa/qrs")).list(), + fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.HIGHEST)); - FileFinder.files(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("logs/vespa/logarchive"))) - .match(olderThan(Duration.ofDays(31))) - .deleteRecursively(context); + if (context.nodeType() == NodeType.tenant && context.node().membership().map(m -> m.type().isAdmin()).orElse(false)) + rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/vespa/logarchive")).list(), + fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.HIGHEST)); - FileFinder.directories(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("var/db/vespa/filedistribution"))) - .match(olderThan(Duration.ofDays(31))) - .deleteRecursively(context); + if (context.nodeType() == NodeType.proxy) + rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/nginx")).list(), + fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.MEDIUM)); - FileFinder.directories(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("var/db/vespa/download"))) - .match(olderThan(Duration.ofDays(31))) - .deleteRecursively(context); + return rules; } /** Checks if container has any new coredumps, reports and archives them if so */ diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 9c8b0ec2b86..1a79edca203 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -48,9 +48,6 @@ import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.Containe */ public class NodeAgentImpl implements NodeAgent { - // This is used as a definition of 1 GB when comparing flavor specs in node-repo - private static final long BYTES_IN_GB = 1_000_000_000L; - // Container is started with uncapped CPU and is kept that way until the first successful health check + this duration // Subtract 1 second to avoid warmup coming in lockstep with tick time and always end up using an extra tick when there are just a few ms left private static final Duration DEFAULT_WARM_UP_DURATION = Duration.ofSeconds(90).minus(Duration.ofSeconds(1)); @@ -445,13 +442,9 @@ public class NodeAgentImpl implements NodeAgent { stopServicesIfNeeded(context); break; case active: + storageMaintainer.cleanDiskIfFull(context); storageMaintainer.handleCoreDumpsForContainer(context, container); - storageMaintainer.getDiskUsageFor(context) - .map(diskUsage -> (double) diskUsage / BYTES_IN_GB / node.diskGb()) - .filter(diskUtil -> diskUtil >= 0.8) - .ifPresent(diskUtil -> storageMaintainer.removeOldFilesFromNode(context)); - if (downloadImageIfNeeded(node, container)) { context.log(logger, "Waiting for image to download " + context.node().wantedDockerImage().get().asString()); return; diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java index 50a8da63b96..47f24ba67c3 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java @@ -1,7 +1,10 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.maintenance; +import com.yahoo.config.provision.NodeResources; import com.yahoo.test.ManualClock; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanup; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder; @@ -25,6 +28,11 @@ import java.util.Set; import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoInteractions; /** * @author dybis @@ -85,7 +93,7 @@ public class StorageMaintainerTest { // Archive container-1 ManualClock clock = new ManualClock(Instant.ofEpochSecond(1234567890)); - StorageMaintainer storageMaintainer = new StorageMaintainer(null, null, pathToArchiveDir, clock); + StorageMaintainer storageMaintainer = new StorageMaintainer(null, null, pathToArchiveDir, null, clock); storageMaintainer.archiveNodeStorage(context1); clock.advance(Duration.ofSeconds(3)); @@ -140,4 +148,39 @@ public class StorageMaintainerTest { return context; } } + + public static class CleanupTests { + + private final TestTerminal terminal = new TestTerminal(); + private final DiskCleanup diskCleanup = mock(DiskCleanup.class); + private final ManualClock clock = new ManualClock(); + private final StorageMaintainer storageMaintainer = new StorageMaintainer(terminal, null, null, diskCleanup, clock); + private final FileSystem fileSystem = TestFileSystem.create(); + private final NodeAgentContext context = new NodeAgentContextImpl + .Builder(NodeSpec.Builder.testSpec("h123a.domain.tld").resources(new NodeResources(1, 1, 1, 1)).build()) + .pathToContainerStorageFromFileSystem(fileSystem).build(); + + @Test + public void not_run_if_not_enough_used() throws IOException { + Files.createDirectories(context.pathOnHostFromPathInNode("/")); + mockDiskUsage(500L); + + storageMaintainer.cleanDiskIfFull(context); + verifyNoInteractions(diskCleanup); + } + + @Test + public void deletes_correct_amount() throws IOException { + Files.createDirectories(context.pathOnHostFromPathInNode("/")); + mockDiskUsage(950_000L); + + storageMaintainer.cleanDiskIfFull(context); + // Allocated size: 1 GB, usage: 950_000 kiB (972.8 MB). Wanted usage: 80% => 800 MB + verify(diskCleanup).cleanup(eq(context), any(), eq(172_800_000L)); + } + + private void mockDiskUsage(long kBytes) { + terminal.expectCommand("du -xsk /home/docker/h123a 2>&1", 0, kBytes + "\t/path"); + } + } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index b895300d7b1..dc1c0466d39 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -78,14 +78,12 @@ public class NodeAgentImplTest { NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(187500000000L)); nodeAgent.doConverge(context); verify(dockerOperations, never()).removeContainer(eq(context), any()); verify(orchestrator, never()).suspend(any(String.class)); verify(dockerOperations, never()).pullImageAsyncIfNeeded(any()); - verify(storageMaintainer, never()).removeOldFilesFromNode(eq(context)); final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository); // TODO: Verify this isn't run unless 1st time @@ -104,11 +102,10 @@ public class NodeAgentImplTest { NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(217432719360L)); nodeAgent.doConverge(context); - verify(storageMaintainer, times(1)).removeOldFilesFromNode(eq(context)); + verify(storageMaintainer, times(1)).cleanDiskIfFull(eq(context)); } @Test @@ -122,7 +119,6 @@ public class NodeAgentImplTest { NodeAgentContext context = createContext(node); NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(187500000000L)); nodeAgent.doConverge(context); inOrder.verify(dockerOperations, never()).startServices(eq(context)); @@ -158,7 +154,6 @@ public class NodeAgentImplTest { when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); nodeAgent.doConverge(context); @@ -191,7 +186,6 @@ public class NodeAgentImplTest { when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); nodeAgent.doConverge(context); @@ -214,7 +208,6 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true); - when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); InOrder inOrder = inOrder(orchestrator, dockerOperations); @@ -262,7 +255,6 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true); - when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); nodeAgent.doConverge(firstContext); NodeAgentContext secondContext = createContext(specBuilder.memoryGb(20).build()); @@ -323,7 +315,6 @@ public class NodeAgentImplTest { when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); doThrow(new ConvergenceException("Connection refused")).doNothing() .when(healthChecker).verifyHealth(eq(context)); @@ -495,7 +486,6 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, false); when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node)); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); nodeAgent.doConverge(context); @@ -517,7 +507,6 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true); when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node)); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); final InOrder inOrder = inOrder(orchestrator, dockerOperations, nodeRepository); doThrow(new RuntimeException("Failed 1st time")) @@ -553,7 +542,6 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = spy(makeNodeAgent(null, false)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); doThrow(new DockerException("Failed to set up network")).doNothing().when(dockerOperations).startContainer(eq(context)); try { @@ -591,7 +579,6 @@ public class NodeAgentImplTest { when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node)); when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false); - when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L)); nodeAgent.doConverge(context); @@ -629,8 +616,6 @@ public class NodeAgentImplTest { NodeAgentContext context = createContext(specBuilder.build()); NodeAgentImpl nodeAgent = makeNodeAgent(null, false, Duration.ofSeconds(30)); - when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); - InOrder inOrder = inOrder(orchestrator, dockerOperations); ConvergenceException healthCheckException = new ConvergenceException("Not yet up"); @@ -685,8 +670,6 @@ public class NodeAgentImplTest { NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true, Duration.ofSeconds(30)); mockGetContainer(dockerImage, ContainerResources.from(0, 2, 16), true); - when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L)); - InOrder inOrder = inOrder(orchestrator, dockerOperations); nodeAgent.doConverge(context); |