diff options
author | Valerij Fredriksen <valerij92@gmail.com> | 2021-04-16 11:23:18 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerij92@gmail.com> | 2021-04-16 11:23:18 +0200 |
commit | 5f35cf74205c782a9d9b8598d058937212fe63d1 (patch) | |
tree | ee5c22be5436630ee17e6ba1bba16aa9d7ec35d7 /node-admin/src | |
parent | b4cdcb29514b43faf8062d47244f16835db616c4 (diff) |
Do not start processing core/heap dumps until they've been fully written
Diffstat (limited to 'node-admin/src')
2 files changed, 27 insertions, 16 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java index f1a0ecdb1a3..a912de18b94 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java @@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.node.admin.task.util.process.Terminal; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Clock; import java.time.Duration; import java.util.Comparator; import java.util.HashMap; @@ -53,8 +54,9 @@ public class CoredumpHandler { private final Path crashPatchInContainer; private final Path doneCoredumpsPath; private final String operatorGroupName; - private final Supplier<String> coredumpIdSupplier; private final Metrics metrics; + private final Clock clock; + private final Supplier<String> coredumpIdSupplier; /** * @param crashPathInContainer path inside the container where core dump are dumped @@ -64,11 +66,12 @@ public class CoredumpHandler { public CoredumpHandler(Terminal terminal, CoreCollector coreCollector, CoredumpReporter coredumpReporter, Path crashPathInContainer, Path doneCoredumpsPath, String operatorGroupName, Metrics metrics) { this(terminal, coreCollector, coredumpReporter, crashPathInContainer, doneCoredumpsPath, - operatorGroupName, metrics, () -> UUID.randomUUID().toString()); + operatorGroupName, metrics, Clock.systemUTC(), () -> UUID.randomUUID().toString()); } CoredumpHandler(Terminal terminal, CoreCollector coreCollector, CoredumpReporter coredumpReporter, - Path crashPathInContainer, Path doneCoredumpsPath, String operatorGroupName, Metrics metrics, Supplier<String> coredumpIdSupplier) { + Path crashPathInContainer, Path doneCoredumpsPath, String operatorGroupName, Metrics metrics, + Clock clock, Supplier<String> coredumpIdSupplier) { this.terminal = terminal; this.coreCollector = coreCollector; this.coredumpReporter = coredumpReporter; @@ -76,6 +79,7 @@ public class CoredumpHandler { this.doneCoredumpsPath = doneCoredumpsPath; this.operatorGroupName = operatorGroupName; this.metrics = metrics; + this.clock = clock; this.coredumpIdSupplier = coredumpIdSupplier; } @@ -110,7 +114,7 @@ public class CoredumpHandler { */ Optional<Path> enqueueCoredump(Path containerCrashPathOnHost, Path containerProcessingPathOnHost) { List<Path> toProcess = FileFinder.files(containerCrashPathOnHost) - .match(nameStartsWith(".").negate()) // Skip core dump files currently being written + .match(this::isReadyForProcessing) .maxDepth(1) .stream() .sorted(Comparator.comparing(FileFinder.FileAttributes::lastModifiedTime)) @@ -250,4 +254,10 @@ public class CoredumpHandler { return dimensionsBuilder.build(); } + private boolean isReadyForProcessing(FileFinder.FileAttributes fileAttributes) { + // Wait at least a minute until we start processing a core/heap dump to ensure that + // kernel/JVM has finished writing it + return clock.instant().minusSeconds(60).isAfter(fileAttributes.lastModifiedTime()); + } + } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java index fe0a7a52a62..4f2f2f985b6 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java @@ -1,6 +1,7 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.maintenance.coredump; +import com.yahoo.test.ManualClock; import com.yahoo.vespa.hosted.dockerapi.metrics.DimensionMetrics; import com.yahoo.vespa.hosted.dockerapi.metrics.Metrics; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; @@ -19,7 +20,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.FileTime; import java.time.Duration; -import java.time.Instant; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -54,10 +54,11 @@ public class CoredumpHandlerTest { private final CoreCollector coreCollector = mock(CoreCollector.class); private final CoredumpReporter coredumpReporter = mock(CoredumpReporter.class); private final Metrics metrics = new Metrics(); + private final ManualClock clock = new ManualClock(); @SuppressWarnings("unchecked") private final Supplier<String> coredumpIdSupplier = mock(Supplier.class); private final CoredumpHandler coredumpHandler = new CoredumpHandler(terminal, coreCollector, coredumpReporter, - crashPathInContainer, doneCoredumpsPath, "users", metrics, coredumpIdSupplier); + crashPathInContainer, doneCoredumpsPath, "users", metrics, clock, coredumpIdSupplier); @Test @@ -66,14 +67,14 @@ public class CoredumpHandlerTest { final Path processingDir = fileSystem.getPath("/home/docker/container-1/some/other/processing"); Files.createDirectories(crashPathOnHost); - createFileAged(crashPathOnHost.resolve(".bash.core.431"), Duration.ZERO); + createFileAged(crashPathOnHost.resolve("bash.core.431"), Duration.ZERO); - assertFolderContents(crashPathOnHost, ".bash.core.431"); + assertFolderContents(crashPathOnHost, "bash.core.431"); Optional<Path> enqueuedPath = coredumpHandler.enqueueCoredump(crashPathOnHost, processingDir); assertEquals(Optional.empty(), enqueuedPath); // bash.core.431 finished writing... and 2 more have since been written - Files.move(crashPathOnHost.resolve(".bash.core.431"), crashPathOnHost.resolve("bash.core.431")); + clock.advance(Duration.ofMinutes(3)); createFileAged(crashPathOnHost.resolve("vespa-proton.core.119"), Duration.ofMinutes(10)); createFileAged(crashPathOnHost.resolve("vespa-slobrok.core.673"), Duration.ofMinutes(5)); @@ -100,12 +101,12 @@ public class CoredumpHandlerTest { final Path processingDir = fileSystem.getPath("/home/docker/container-1/some/other/processing"); Files.createDirectories(crashPathOnHost); - createFileAged(crashPathOnHost.resolve("java.core.69"), Duration.ofSeconds(15)); - createFileAged(crashPathOnHost.resolve("hs_err_pid69.log"), Duration.ofSeconds(20)); + createFileAged(crashPathOnHost.resolve("java.core.69"), Duration.ofSeconds(515)); + createFileAged(crashPathOnHost.resolve("hs_err_pid69.log"), Duration.ofSeconds(520)); - createFileAged(crashPathOnHost.resolve("java.core.2420"), Duration.ofSeconds(40)); - createFileAged(crashPathOnHost.resolve("hs_err_pid2420.log"), Duration.ofSeconds(49)); - createFileAged(crashPathOnHost.resolve("hs_err_pid2421.log"), Duration.ofSeconds(50)); + createFileAged(crashPathOnHost.resolve("java.core.2420"), Duration.ofSeconds(540)); + createFileAged(crashPathOnHost.resolve("hs_err_pid2420.log"), Duration.ofSeconds(549)); + createFileAged(crashPathOnHost.resolve("hs_err_pid2421.log"), Duration.ofSeconds(550)); when(coredumpIdSupplier.get()).thenReturn("id-123").thenReturn("id-321"); Optional<Path> enqueuedPath = coredumpHandler.enqueueCoredump(crashPathOnHost, processingDir); @@ -255,9 +256,9 @@ public class CoredumpHandlerTest { assertEquals(expectedContentsOfFolder, actualContentsOfFolder); } - private static Path createFileAged(Path path, Duration age) { + private Path createFileAged(Path path, Duration age) { return uncheck(() -> Files.setLastModifiedTime( Files.createFile(path), - FileTime.from(Instant.now().minus(age)))); + FileTime.from(clock.instant().minus(age)))); } } |