summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <valerij92@gmail.com>2020-04-22 14:02:14 +0200
committerValerij Fredriksen <valerijf@verizonmedia.com>2020-04-22 16:08:47 +0200
commitf5b0f594949a63836a662efaba8900c2f1e904d6 (patch)
tree8c3fad6aaecd75abea523aacdca82dcbebe4143c /node-admin
parentd2090d1f4b4a6c56fb528d819af7ee5b680f3ef7 (diff)
Use DiskCleanup to clean disk when full
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java88
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java9
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java45
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java19
4 files changed, 98 insertions, 63 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index c1897c3c416..7a7a0a8e2bc 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -3,10 +3,17 @@ package com.yahoo.vespa.hosted.node.admin.maintenance;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
+import com.yahoo.config.provision.NodeType;
import com.yahoo.log.LogLevel;
import com.yahoo.vespa.hosted.dockerapi.Container;
+import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.node.admin.component.TaskContext;
import com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoredumpHandler;
+import com.yahoo.vespa.hosted.node.admin.maintenance.disk.CoredumpCleanupRule;
+import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanup;
+import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanupRule;
+import com.yahoo.vespa.hosted.node.admin.maintenance.disk.LinearCleanupRule;
+import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder;
import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath;
@@ -17,18 +24,20 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Clock;
import java.time.Duration;
+import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
+import java.util.function.Function;
import java.util.logging.Logger;
-import java.util.regex.Pattern;
-import static com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder.nameMatches;
-import static com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder.olderThan;
+import static com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanupRule.Priority;
import static com.yahoo.yolean.Exceptions.uncheck;
/**
@@ -42,34 +51,34 @@ public class StorageMaintainer {
private final Terminal terminal;
private final CoredumpHandler coredumpHandler;
private final Path archiveContainerStoragePath;
+ private final DiskCleanup diskCleanup;
private final Clock clock;
// We cache disk usage to avoid doing expensive disk operations so often
- private final Cache<Path, Long> diskUsage = CacheBuilder.newBuilder()
+ private final Cache<ContainerName, Long> diskUsage = CacheBuilder.newBuilder()
.maximumSize(100)
.expireAfterWrite(5, TimeUnit.MINUTES)
.build();
public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath) {
- this(terminal, coredumpHandler, archiveContainerStoragePath, Clock.systemUTC());
+ this(terminal, coredumpHandler, archiveContainerStoragePath, new DiskCleanup(), Clock.systemUTC());
}
- public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath, Clock clock) {
+ public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath, DiskCleanup diskCleanup, Clock clock) {
this.terminal = terminal;
this.coredumpHandler = coredumpHandler;
this.archiveContainerStoragePath = archiveContainerStoragePath;
+ this.diskCleanup = diskCleanup;
this.clock = clock;
}
public Optional<Long> getDiskUsageFor(NodeAgentContext context) {
try {
- Path path = context.pathOnHostFromPathInNode("/");
-
- Long cachedDiskUsage = diskUsage.getIfPresent(path);
+ Long cachedDiskUsage = diskUsage.getIfPresent(context.containerName());
if (cachedDiskUsage != null) return Optional.of(cachedDiskUsage);
- long diskUsageBytes = getDiskUsedInBytes(context, path);
- diskUsage.put(path, diskUsageBytes);
+ long diskUsageBytes = getDiskUsedInBytes(context, context.pathOnHostFromPathInNode("/"));
+ diskUsage.put(context.containerName(), diskUsageBytes);
return Optional.of(diskUsageBytes);
} catch (Exception e) {
context.log(logger, LogLevel.WARNING, "Failed to get disk usage", e);
@@ -95,37 +104,44 @@ public class StorageMaintainer {
return 1024 * Long.parseLong(results[0]);
}
+ public boolean cleanDiskIfFull(NodeAgentContext context) {
+ double totalBytes = context.node().diskGb() * 1_000_000_000L;
+ // Delete enough bytes to get below 80% disk usage, but only if we are already using more than 90% disk
+ long bytesToRemove = getDiskUsageFor(context)
+ .map(diskUsage -> (long) (diskUsage - 0.8 * totalBytes))
+ .filter(bytes -> bytes > totalBytes * 0.1)
+ .orElse(0L);
+
+ if (bytesToRemove > 0 && diskCleanup.cleanup(context, createCleanupRules(context), bytesToRemove)) {
+ diskUsage.invalidate(context.containerName());
+ return true;
+ }
+ return false;
+ }
- /** Deletes old log files for vespa and nginx */
- public void removeOldFilesFromNode(NodeAgentContext context) {
- Path[] logPaths = {
- context.pathInNodeUnderVespaHome("logs/nginx"),
- context.pathInNodeUnderVespaHome("logs/vespa")
- };
+ private List<DiskCleanupRule> createCleanupRules(NodeAgentContext context) {
+ Instant start = clock.instant();
+ double oneMonthSeconds = Duration.ofDays(30).getSeconds();
+ Function<Instant, Double> monthNormalizer = instant -> Duration.between(instant, start).getSeconds() / oneMonthSeconds;
+ Function<String, Path> pathOnHostUnderContainerVespaHome = path ->
+ context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome(path));
+ List<DiskCleanupRule> rules = new ArrayList<>();
- for (Path pathToClean : logPaths) {
- Path path = context.pathOnHostFromPathInNode(pathToClean);
- FileFinder.files(path)
- .match(olderThan(Duration.ofDays(3)).and(nameMatches(Pattern.compile(".*\\.log.+"))))
- .maxDepth(1)
- .deleteRecursively(context);
- }
+ rules.add(CoredumpCleanupRule.forContainer(pathOnHostUnderContainerVespaHome.apply("var/crash")));
- FileFinder.files(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("logs/vespa/qrs")))
- .match(olderThan(Duration.ofDays(3)))
- .deleteRecursively(context);
+ if (context.node().membership().map(m -> m.type().isContainer()).orElse(false))
+ rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/vespa/qrs")).list(),
+ fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.HIGHEST));
- FileFinder.files(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("logs/vespa/logarchive")))
- .match(olderThan(Duration.ofDays(31)))
- .deleteRecursively(context);
+ if (context.nodeType() == NodeType.tenant && context.node().membership().map(m -> m.type().isAdmin()).orElse(false))
+ rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/vespa/logarchive")).list(),
+ fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.HIGHEST));
- FileFinder.directories(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("var/db/vespa/filedistribution")))
- .match(olderThan(Duration.ofDays(31)))
- .deleteRecursively(context);
+ if (context.nodeType() == NodeType.proxy)
+ rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/nginx")).list(),
+ fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.MEDIUM));
- FileFinder.directories(context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome("var/db/vespa/download")))
- .match(olderThan(Duration.ofDays(31)))
- .deleteRecursively(context);
+ return rules;
}
/** Checks if container has any new coredumps, reports and archives them if so */
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 9c8b0ec2b86..1a79edca203 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -48,9 +48,6 @@ import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.Containe
*/
public class NodeAgentImpl implements NodeAgent {
- // This is used as a definition of 1 GB when comparing flavor specs in node-repo
- private static final long BYTES_IN_GB = 1_000_000_000L;
-
// Container is started with uncapped CPU and is kept that way until the first successful health check + this duration
// Subtract 1 second to avoid warmup coming in lockstep with tick time and always end up using an extra tick when there are just a few ms left
private static final Duration DEFAULT_WARM_UP_DURATION = Duration.ofSeconds(90).minus(Duration.ofSeconds(1));
@@ -445,13 +442,9 @@ public class NodeAgentImpl implements NodeAgent {
stopServicesIfNeeded(context);
break;
case active:
+ storageMaintainer.cleanDiskIfFull(context);
storageMaintainer.handleCoreDumpsForContainer(context, container);
- storageMaintainer.getDiskUsageFor(context)
- .map(diskUsage -> (double) diskUsage / BYTES_IN_GB / node.diskGb())
- .filter(diskUtil -> diskUtil >= 0.8)
- .ifPresent(diskUtil -> storageMaintainer.removeOldFilesFromNode(context));
-
if (downloadImageIfNeeded(node, container)) {
context.log(logger, "Waiting for image to download " + context.node().wantedDockerImage().get().asString());
return;
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java
index 50a8da63b96..47f24ba67c3 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java
@@ -1,7 +1,10 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.maintenance;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.test.ManualClock;
+import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
+import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanup;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl;
import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder;
@@ -25,6 +28,11 @@ import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoInteractions;
/**
* @author dybis
@@ -85,7 +93,7 @@ public class StorageMaintainerTest {
// Archive container-1
ManualClock clock = new ManualClock(Instant.ofEpochSecond(1234567890));
- StorageMaintainer storageMaintainer = new StorageMaintainer(null, null, pathToArchiveDir, clock);
+ StorageMaintainer storageMaintainer = new StorageMaintainer(null, null, pathToArchiveDir, null, clock);
storageMaintainer.archiveNodeStorage(context1);
clock.advance(Duration.ofSeconds(3));
@@ -140,4 +148,39 @@ public class StorageMaintainerTest {
return context;
}
}
+
+ public static class CleanupTests {
+
+ private final TestTerminal terminal = new TestTerminal();
+ private final DiskCleanup diskCleanup = mock(DiskCleanup.class);
+ private final ManualClock clock = new ManualClock();
+ private final StorageMaintainer storageMaintainer = new StorageMaintainer(terminal, null, null, diskCleanup, clock);
+ private final FileSystem fileSystem = TestFileSystem.create();
+ private final NodeAgentContext context = new NodeAgentContextImpl
+ .Builder(NodeSpec.Builder.testSpec("h123a.domain.tld").resources(new NodeResources(1, 1, 1, 1)).build())
+ .pathToContainerStorageFromFileSystem(fileSystem).build();
+
+ @Test
+ public void not_run_if_not_enough_used() throws IOException {
+ Files.createDirectories(context.pathOnHostFromPathInNode("/"));
+ mockDiskUsage(500L);
+
+ storageMaintainer.cleanDiskIfFull(context);
+ verifyNoInteractions(diskCleanup);
+ }
+
+ @Test
+ public void deletes_correct_amount() throws IOException {
+ Files.createDirectories(context.pathOnHostFromPathInNode("/"));
+ mockDiskUsage(950_000L);
+
+ storageMaintainer.cleanDiskIfFull(context);
+ // Allocated size: 1 GB, usage: 950_000 kiB (972.8 MB). Wanted usage: 80% => 800 MB
+ verify(diskCleanup).cleanup(eq(context), any(), eq(172_800_000L));
+ }
+
+ private void mockDiskUsage(long kBytes) {
+ terminal.expectCommand("du -xsk /home/docker/h123a 2>&1", 0, kBytes + "\t/path");
+ }
+ }
}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index b895300d7b1..dc1c0466d39 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -78,14 +78,12 @@ public class NodeAgentImplTest {
NodeAgentContext context = createContext(node);
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(187500000000L));
nodeAgent.doConverge(context);
verify(dockerOperations, never()).removeContainer(eq(context), any());
verify(orchestrator, never()).suspend(any(String.class));
verify(dockerOperations, never()).pullImageAsyncIfNeeded(any());
- verify(storageMaintainer, never()).removeOldFilesFromNode(eq(context));
final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository);
// TODO: Verify this isn't run unless 1st time
@@ -104,11 +102,10 @@ public class NodeAgentImplTest {
NodeAgentContext context = createContext(node);
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(217432719360L));
nodeAgent.doConverge(context);
- verify(storageMaintainer, times(1)).removeOldFilesFromNode(eq(context));
+ verify(storageMaintainer, times(1)).cleanDiskIfFull(eq(context));
}
@Test
@@ -122,7 +119,6 @@ public class NodeAgentImplTest {
NodeAgentContext context = createContext(node);
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(187500000000L));
nodeAgent.doConverge(context);
inOrder.verify(dockerOperations, never()).startServices(eq(context));
@@ -158,7 +154,6 @@ public class NodeAgentImplTest {
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
nodeAgent.doConverge(context);
@@ -191,7 +186,6 @@ public class NodeAgentImplTest {
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true);
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
nodeAgent.doConverge(context);
@@ -214,7 +208,6 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true);
- when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L));
InOrder inOrder = inOrder(orchestrator, dockerOperations);
@@ -262,7 +255,6 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(dockerOperations.pullImageAsyncIfNeeded(any())).thenReturn(true);
- when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L));
nodeAgent.doConverge(firstContext);
NodeAgentContext secondContext = createContext(specBuilder.memoryGb(20).build());
@@ -323,7 +315,6 @@ public class NodeAgentImplTest {
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
doThrow(new ConvergenceException("Connection refused")).doNothing()
.when(healthChecker).verifyHealth(eq(context));
@@ -495,7 +486,6 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, false);
when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node));
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
nodeAgent.doConverge(context);
@@ -517,7 +507,6 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true);
when(nodeRepository.getOptionalNode(eq(hostName))).thenReturn(Optional.of(node));
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
final InOrder inOrder = inOrder(orchestrator, dockerOperations, nodeRepository);
doThrow(new RuntimeException("Failed 1st time"))
@@ -553,7 +542,6 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = spy(makeNodeAgent(null, false));
when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
doThrow(new DockerException("Failed to set up network")).doNothing().when(dockerOperations).startContainer(eq(context));
try {
@@ -591,7 +579,6 @@ public class NodeAgentImplTest {
when(nodeRepository.getOptionalNode(hostName)).thenReturn(Optional.of(node));
when(dockerOperations.pullImageAsyncIfNeeded(eq(dockerImage))).thenReturn(false);
- when(storageMaintainer.getDiskUsageFor(eq(context))).thenReturn(Optional.of(201326592000L));
nodeAgent.doConverge(context);
@@ -629,8 +616,6 @@ public class NodeAgentImplTest {
NodeAgentContext context = createContext(specBuilder.build());
NodeAgentImpl nodeAgent = makeNodeAgent(null, false, Duration.ofSeconds(30));
- when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L));
-
InOrder inOrder = inOrder(orchestrator, dockerOperations);
ConvergenceException healthCheckException = new ConvergenceException("Not yet up");
@@ -685,8 +670,6 @@ public class NodeAgentImplTest {
NodeAgentImpl nodeAgent = makeNodeAgent(dockerImage, true, Duration.ofSeconds(30));
mockGetContainer(dockerImage, ContainerResources.from(0, 2, 16), true);
- when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.of(201326592000L));
-
InOrder inOrder = inOrder(orchestrator, dockerOperations);
nodeAgent.doConverge(context);