From bbba42fe12adf8d11caeb3e4a4e39fe3ef814d26 Mon Sep 17 00:00:00 2001 From: valerijf Date: Tue, 23 May 2017 15:26:12 +0200 Subject: Run node-maintainer once an hour --- .../node/admin/maintenance/StorageMaintainer.java | 79 +++++++++++++++---- .../admin/provider/ComponentsProviderImpl.java | 7 +- .../node/admin/integrationTests/DockerTester.java | 7 +- .../integrationTests/StorageMaintainerMock.java | 5 +- .../admin/maintenance/StorageMaintainerTest.java | 90 +++++++++++++++++++++- 5 files changed, 161 insertions(+), 27 deletions(-) (limited to 'node-admin') diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index 0b11696ef99..10066567018 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -16,7 +16,6 @@ import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; import com.yahoo.vespa.hosted.node.admin.logging.FilebeatConfigProvider; -import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl; import com.yahoo.vespa.hosted.node.admin.util.Environment; import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger; import com.yahoo.vespa.hosted.node.admin.util.SecretAgentScheduleMaker; @@ -26,6 +25,7 @@ import java.io.InputStreamReader; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; @@ -53,13 +53,15 @@ public class StorageMaintainer { private final CounterWrapper numberOfNodeAdminMaintenanceFails; private final Docker docker; private final Environment environment; + private final Clock clock; - private Map metricsCacheByContainerName = new ConcurrentHashMap<>(); + private Map maintenanceThrottlerByContainerName = new ConcurrentHashMap<>(); - public StorageMaintainer(Docker docker, MetricReceiverWrapper metricReceiver, Environment environment) { + public StorageMaintainer(Docker docker, MetricReceiverWrapper metricReceiver, Environment environment, Clock clock) { this.docker = docker; this.environment = environment; + this.clock = clock; Dimensions dimensions = new Dimensions.Builder() .add("host", HostName.getLocalhost()) @@ -122,27 +124,22 @@ public class StorageMaintainer { // Calculating disk usage is IO expensive operation and its value changes relatively slowly, we want to perform // that calculation rarely. Additionally, we spread out the calculation for different containers by adding // a random deviation. - if (! metricsCacheByContainerName.containsKey(containerName) || - metricsCacheByContainerName.get(containerName).nextUpdateAt.isBefore(Instant.now())) { - long distributedSecs = (long) (intervalSec * (0.5 + Math.random())); - MetricsCache metricsCache = new MetricsCache(Instant.now().plusSeconds(distributedSecs)); - + MaintenanceThrottler maintenanceThrottler = getMaintenanceThrottlerFor(containerName); + if (maintenanceThrottler.shouldUpdateDiskUsageNow()) { // Throttle to one disk usage calculation at a time. synchronized (monitor) { PrefixLogger logger = PrefixLogger.getNodeAgentLogger(StorageMaintainer.class, containerName); Path containerDir = environment.pathInNodeAdminFromPathInNode(containerName, "/home/"); try { long used = getDiscUsedInBytes(containerDir); - metricsCache.setDiskUsage(used); + maintenanceThrottler.setDiskUsage(used); } catch (Throwable e) { logger.error("Problems during disk usage calculations: " + e.getMessage()); } } - - metricsCacheByContainerName.put(containerName, metricsCache); } - return metricsCacheByContainerName.get(containerName).diskUsage; + return maintenanceThrottler.diskUsage; } // Public for testing @@ -175,6 +172,8 @@ public class StorageMaintainer { * Deletes old log files for vespa, nginx, logstash, etc. */ public void removeOldFilesFromNode(ContainerName containerName) { + if (! getMaintenanceThrottlerFor(containerName).shouldRemoveOldFilesNow()) return; + MaintainerExecutor maintainerExecutor = new MaintainerExecutor(); String[] pathsToClean = {"/home/y/logs/elasticsearch2", "/home/y/logs/logstash2", "/home/y/logs/daemontools_y", "/home/y/logs/nginx", "/home/y/logs/vespa"}; @@ -209,12 +208,15 @@ public class StorageMaintainer { .withArgument("recursive", false); maintainerExecutor.execute(); + getMaintenanceThrottlerFor(containerName).updateNextRemoveOldFilesTime(); } /** * Checks if container has any new coredumps, reports and archives them if so */ public void handleCoreDumpsForContainer(ContainerName containerName, ContainerNodeSpec nodeSpec, Environment environment) { + if (! getMaintenanceThrottlerFor(containerName).shouldHandleCoredumpsNow()) return; + Map attributes = new HashMap<>(); attributes.put("hostname", nodeSpec.hostname); attributes.put("parent_hostname", HostName.getLocalhost()); @@ -240,7 +242,9 @@ public class StorageMaintainer { .withArgument("doneCoredumpsPath", environment.pathInNodeAdminToDoneCoredumps()) .withArgument("coredumpsPath", environment.pathInNodeAdminFromPathInNode(containerName, "/home/y/var/crash")) .withArgument("attributes", attributes); + maintainerExecutor.execute(); + getMaintenanceThrottlerFor(containerName).updateNextHandleCoredumpsTime(); } /** @@ -249,6 +253,8 @@ public class StorageMaintainer { * * JDisc logs */ public void cleanNodeAdmin() { + if (! getMaintenanceThrottlerFor(NODE_ADMIN).shouldRemoveOldFilesNow()) return; + MaintainerExecutor maintainerExecutor = new MaintainerExecutor(); maintainerExecutor.addJob("delete-directories") .withArgument("basePath", environment.getPathResolver().getApplicationStoragePathForNodeAdmin()) @@ -260,7 +266,9 @@ public class StorageMaintainer { .withArgument("basePath", nodeAdminJDiskLogsPath) .withArgument("maxAgeSeconds", Duration.ofDays(31).getSeconds()) .withArgument("recursive", false); + maintainerExecutor.execute(); + getMaintenanceThrottlerFor(NODE_ADMIN).updateNextRemoveOldFilesTime(); } /** @@ -276,6 +284,7 @@ public class StorageMaintainer { .withArgument("to", environment.pathInNodeAdminToNodeCleanup(containerName)); maintainerExecutor.execute(); + getMaintenanceThrottlerFor(containerName).reset(); } @@ -357,16 +366,54 @@ public class StorageMaintainer { } } - private static class MetricsCache { - private final Instant nextUpdateAt; + private MaintenanceThrottler getMaintenanceThrottlerFor(ContainerName containerName) { + if (! maintenanceThrottlerByContainerName.containsKey(containerName)) { + maintenanceThrottlerByContainerName.put(containerName, new MaintenanceThrottler()); + } + + return maintenanceThrottlerByContainerName.get(containerName); + } + + private class MaintenanceThrottler { + private Instant nextDiskUsageUpdateAt; + private Instant nextRemoveOldFilesAt; + private Instant nextHandleOldCoredumpsAt; private Optional diskUsage = Optional.empty(); - MetricsCache(Instant nextUpdateAt) { - this.nextUpdateAt = nextUpdateAt; + MaintenanceThrottler() { + reset(); + } + + boolean shouldUpdateDiskUsageNow() { + return !nextDiskUsageUpdateAt.isAfter(clock.instant()); } void setDiskUsage(long diskUsage) { this.diskUsage = Optional.of(diskUsage); + long distributedSecs = (long) (intervalSec * (0.5 + Math.random())); + nextDiskUsageUpdateAt = clock.instant().plusSeconds(distributedSecs); + } + + void updateNextRemoveOldFilesTime() { + nextRemoveOldFilesAt = clock.instant().plus(Duration.ofHours(1)); + } + + boolean shouldRemoveOldFilesNow() { + return !nextRemoveOldFilesAt.isAfter(clock.instant()); + } + + void updateNextHandleCoredumpsTime() { + nextHandleOldCoredumpsAt = clock.instant().plus(Duration.ofHours(1)); + } + + boolean shouldHandleCoredumpsNow() { + return !nextHandleOldCoredumpsAt.isAfter(clock.instant()); + } + + void reset() { + nextDiskUsageUpdateAt = clock.instant().minus(Duration.ofDays(1)); + nextRemoveOldFilesAt = clock.instant().minus(Duration.ofDays(1)); + nextHandleOldCoredumpsAt = clock.instant().minus(Duration.ofDays(1)); } } } \ No newline at end of file diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java index 5892b800cde..32e18459c48 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/ComponentsProviderImpl.java @@ -58,22 +58,23 @@ public class ComponentsProviderImpl implements ComponentsProvider { throw new IllegalStateException("Environment setting for config servers missing or empty."); } + Clock clock = Clock.systemUTC(); ConfigServerHttpRequestExecutor requestExecutor = ConfigServerHttpRequestExecutor.create(configServerHosts); Orchestrator orchestrator = new OrchestratorImpl(requestExecutor); NodeRepository nodeRepository = new NodeRepositoryImpl(requestExecutor, WEB_SERVICE_PORT, baseHostName); DockerOperations dockerOperations = new DockerOperationsImpl(docker, environment); Optional storageMaintainer = isRunningLocally ? - Optional.empty() : Optional.of(new StorageMaintainer(docker, metricReceiver, environment)); + Optional.empty() : Optional.of(new StorageMaintainer(docker, metricReceiver, environment, clock)); Optional aclMaintainer = isRunningLocally ? Optional.empty() : Optional.of(new AclMaintainer(dockerOperations, nodeRepository, baseHostName)); Function nodeAgentFactory = (hostName) -> new NodeAgentImpl(hostName, nodeRepository, orchestrator, dockerOperations, - storageMaintainer, metricReceiver, environment, Clock.systemUTC(), aclMaintainer); + storageMaintainer, metricReceiver, environment, clock, aclMaintainer); NodeAdmin nodeAdmin = new NodeAdminImpl(dockerOperations, nodeAgentFactory, storageMaintainer, NODE_AGENT_SCAN_INTERVAL_MILLIS, metricReceiver, aclMaintainer); - nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, nodeAdmin, Clock.systemUTC(), orchestrator, baseHostName); + nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, nodeAdmin, clock, orchestrator, baseHostName); nodeAdminStateUpdater.start(NODE_ADMIN_CONVERGE_STATE_INTERVAL_MILLIS); metricReceiverWrapper = metricReceiver; diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index e5176247dac..e52052e4cfa 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -57,15 +57,16 @@ public class DockerTester implements AutoCloseable { orchestratorMock = new OrchestratorMock(callOrderVerifier); nodeRepositoryMock = new NodeRepoMock(callOrderVerifier); dockerMock = new DockerMock(callOrderVerifier); - StorageMaintainerMock storageMaintainer = new StorageMaintainerMock(dockerMock, environment, callOrderVerifier); + Clock clock = Clock.systemUTC(); + StorageMaintainerMock storageMaintainer = new StorageMaintainerMock(dockerMock, environment, callOrderVerifier, clock); MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation); final DockerOperations dockerOperations = new DockerOperationsImpl(dockerMock, environment); Function nodeAgentFactory = (hostName) -> new NodeAgentImpl(hostName, nodeRepositoryMock, - orchestratorMock, dockerOperations, Optional.of(storageMaintainer), mr, environment, Clock.systemUTC(), Optional.empty()); + orchestratorMock, dockerOperations, Optional.of(storageMaintainer), mr, environment, clock, Optional.empty()); nodeAdmin = new NodeAdminImpl(dockerOperations, nodeAgentFactory, Optional.of(storageMaintainer), 100, mr, Optional.empty()); - updater = new NodeAdminStateUpdater(nodeRepositoryMock, nodeAdmin, Clock.systemUTC(), orchestratorMock, "basehostname"); + updater = new NodeAdminStateUpdater(nodeRepositoryMock, nodeAdmin, clock, orchestratorMock, "basehostname"); updater.start(5); } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/StorageMaintainerMock.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/StorageMaintainerMock.java index 7f862180b44..30b365c6d0e 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/StorageMaintainerMock.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/StorageMaintainerMock.java @@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.util.Environment; +import java.time.Clock; import java.util.Optional; /** @@ -17,8 +18,8 @@ import java.util.Optional; public class StorageMaintainerMock extends StorageMaintainer { private final CallOrderVerifier callOrderVerifier; - public StorageMaintainerMock(Docker docker, Environment environment, CallOrderVerifier callOrderVerifier) { - super(docker, new MetricReceiverWrapper(MetricReceiver.nullImplementation), environment); + public StorageMaintainerMock(Docker docker, Environment environment, CallOrderVerifier callOrderVerifier, Clock clock) { + super(docker, new MetricReceiverWrapper(MetricReceiver.nullImplementation), environment, clock); this.callOrderVerifier = callOrderVerifier; } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java index 25d267eef28..a4556b5c8bc 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainerTest.java @@ -1,8 +1,15 @@ package com.yahoo.vespa.hosted.node.admin.maintenance; import com.yahoo.metrics.simple.MetricReceiver; +import com.yahoo.test.ManualClock; +import com.yahoo.vespa.hosted.dockerapi.ContainerName; +import com.yahoo.vespa.hosted.dockerapi.Docker; +import com.yahoo.vespa.hosted.dockerapi.ProcessResult; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; +import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; import com.yahoo.vespa.hosted.node.admin.util.Environment; +import com.yahoo.vespa.hosted.node.admin.util.PathResolver; +import com.yahoo.vespa.hosted.provision.Node; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -10,13 +17,26 @@ import org.junit.rules.TemporaryFolder; import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.time.Duration; import static org.junit.Assert.*; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyVararg; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; /** * @author dybis */ public class StorageMaintainerTest { + private final ManualClock clock = new ManualClock(); + private final Environment environment = new Environment.Builder() + .pathResolver(new PathResolver()).build(); + private final Docker docker = mock(Docker.class); + private final StorageMaintainer storageMaintainer = new StorageMaintainer(docker, + new MetricReceiverWrapper(MetricReceiver.nullImplementation), environment, clock); @Rule public TemporaryFolder folder = new TemporaryFolder(); @@ -26,14 +46,78 @@ public class StorageMaintainerTest { int writeSize = 10000; writeNBytesToFile(folder.newFile(), writeSize); - Environment environment = new Environment.Builder().build(); - StorageMaintainer storageMaintainer = new StorageMaintainer(null, - new MetricReceiverWrapper(MetricReceiver.nullImplementation), environment); long usedBytes = storageMaintainer.getDiscUsedInBytes(folder.getRoot().toPath()); if (usedBytes * 4 < writeSize || usedBytes > writeSize * 4) fail("Used bytes is " + usedBytes + ", but wrote " + writeSize + " bytes, not even close."); } + @Test + public void testMaintenanceThrottlingAfterSuccessfulMaintenance() { + String hostname = "node-123.us-north-3.test.yahoo.com"; + ContainerName containerName = ContainerName.fromHostname(hostname); + ContainerNodeSpec nodeSpec = new ContainerNodeSpec.Builder() + .hostname(hostname) + .nodeState(Node.State.ready) + .nodeType("tenants") + .nodeFlavor("docker").build(); + + when(docker.executeInContainerAsRoot(any(), anyVararg())).thenReturn(new ProcessResult(0, "", "")); + storageMaintainer.removeOldFilesFromNode(containerName); + verify(docker, times(1)).executeInContainerAsRoot(any(), anyVararg()); + // Will not actually run maintenance job until an hour passes + storageMaintainer.removeOldFilesFromNode(containerName); + verify(docker, times(1)).executeInContainerAsRoot(any(), anyVararg()); + + // Coredump handler has its own throttler + storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, environment); + verify(docker, times(2)).executeInContainerAsRoot(any(), anyVararg()); + + + clock.advance(Duration.ofMinutes(61)); + storageMaintainer.removeOldFilesFromNode(containerName); + verify(docker, times(3)).executeInContainerAsRoot(any(), anyVararg()); + + storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, environment); + verify(docker, times(4)).executeInContainerAsRoot(any(), anyVararg()); + + storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, environment); + verify(docker, times(4)).executeInContainerAsRoot(any(), anyVararg()); + + + // archiveNodeData is unthrottled and it should reset previous times + storageMaintainer.archiveNodeData(containerName); + verify(docker, times(5)).executeInContainerAsRoot(any(), anyVararg()); + storageMaintainer.archiveNodeData(containerName); + verify(docker, times(6)).executeInContainerAsRoot(any(), anyVararg()); + + storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, environment); + verify(docker, times(7)).executeInContainerAsRoot(any(), anyVararg()); + } + + @Test + public void testMaintenanceThrottlingAfterFailedMaintenance() { + String hostname = "node-123.us-north-3.test.yahoo.com"; + ContainerName containerName = ContainerName.fromHostname(hostname); + ContainerNodeSpec nodeSpec = new ContainerNodeSpec.Builder() + .hostname(hostname) + .nodeState(Node.State.ready) + .nodeType("tenants") + .nodeFlavor("docker").build(); + + when(docker.executeInContainerAsRoot(any(), anyVararg())) + .thenThrow(new RuntimeException("Something went wrong")) + .thenReturn(new ProcessResult(0, "", "")); + try { + storageMaintainer.removeOldFilesFromNode(containerName); + fail("Maintenance job should've failed!"); + } catch (RuntimeException ignored) { } + verify(docker, times(1)).executeInContainerAsRoot(any(), anyVararg()); + + // Maintenance job failed, we should be able to immediately re-run it + storageMaintainer.removeOldFilesFromNode(containerName); + verify(docker, times(2)).executeInContainerAsRoot(any(), anyVararg()); + } + private static void writeNBytesToFile(File file, int nBytes) throws IOException { Files.write(file.toPath(), new byte[nBytes]); } -- cgit v1.2.3