diff options
author | Håkon Hallingstad <hakon@yahooinc.com> | 2022-11-02 13:34:03 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@yahooinc.com> | 2022-11-02 13:34:03 +0100 |
commit | 8e599c8cf8fa47fd3d4f12fa648ff086010d11ea (patch) | |
tree | 3bfc2c0844462c766114560a87af6204c13900ad /node-admin | |
parent | 82a57bbed60b624999da93f18eb05746d0ede3f7 (diff) |
Log core dump processing
Diffstat (limited to 'node-admin')
5 files changed, 164 insertions, 20 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java index b54eae0b276..3a61f8b2619 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java @@ -20,7 +20,10 @@ import com.yahoo.vespa.hosted.node.admin.container.metrics.Metrics; import com.yahoo.vespa.hosted.node.admin.maintenance.sync.ZstdCompressingInputStream; import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; +import com.yahoo.vespa.hosted.node.admin.task.util.file.FileDeleter; import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder; +import com.yahoo.vespa.hosted.node.admin.task.util.file.FileMover; +import com.yahoo.vespa.hosted.node.admin.task.util.file.MakeDirectory; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; import com.yahoo.vespa.hosted.node.admin.task.util.fs.ContainerPath; @@ -39,6 +42,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.UUID; +import java.util.function.Predicate; import java.util.function.Supplier; import java.util.logging.Logger; import java.util.regex.Pattern; @@ -126,7 +130,7 @@ public class CoredumpHandler { } // Check if we have already started to process a core dump or we can enqueue a new core one - getCoredumpToProcess(containerCrashPath, containerProcessingPath) + getCoredumpToProcess(context, containerCrashPath, containerProcessingPath) .ifPresent(path -> { if (reportCoresViaCfgFlag.with(FetchVector.Dimension.NODE_TYPE, context.nodeType().name()).value()) { processAndReportSingleCoreDump2(context, path, dockerImage); @@ -137,12 +141,12 @@ public class CoredumpHandler { } /** @return path to directory inside processing directory that contains a core dump file to process */ - Optional<ContainerPath> getCoredumpToProcess(ContainerPath containerCrashPath, ContainerPath containerProcessingPath) { + Optional<ContainerPath> getCoredumpToProcess(NodeAgentContext context, ContainerPath containerCrashPath, ContainerPath containerProcessingPath) { return FileFinder.directories(containerProcessingPath).stream() .map(FileFinder.FileAttributes::path) .findAny() .map(ContainerPath.class::cast) - .or(() -> enqueueCoredump(containerCrashPath, containerProcessingPath)); + .or(() -> enqueueCoredump(context, containerCrashPath, containerProcessingPath)); } /** @@ -154,9 +158,19 @@ public class CoredumpHandler { * * @return path to directory inside processing directory which contains the enqueued core dump file */ - Optional<ContainerPath> enqueueCoredump(ContainerPath containerCrashPath, ContainerPath containerProcessingPath) { + Optional<ContainerPath> enqueueCoredump(NodeAgentContext context, ContainerPath containerCrashPath, ContainerPath containerProcessingPath) { + Predicate<String> isCoreDump = filename -> !HS_ERR_PATTERN.matcher(filename).matches(); + List<Path> toProcess = FileFinder.files(containerCrashPath) - .match(this::isReadyForProcessing) + .match(attributes -> { + if (isReadyForProcessing(attributes)) { + return true; + } else { + if (isCoreDump.test(attributes.filename())) + context.log(logger, attributes.path() + " is still being written"); + return false; + } + }) .maxDepth(1) .stream() .sorted(Comparator.comparing(FileFinder.FileAttributes::lastModifiedTime)) @@ -164,19 +178,20 @@ public class CoredumpHandler { .toList(); int coredumpIndex = IntStream.range(0, toProcess.size()) - .filter(i -> !HS_ERR_PATTERN.matcher(toProcess.get(i).getFileName().toString()).matches()) + .filter(i -> isCoreDump.test(toProcess.get(i).getFileName().toString())) .findFirst() .orElse(-1); // Either there are no files in crash directory, or all the files are hs_err files. if (coredumpIndex == -1) return Optional.empty(); - ContainerPath enqueuedDir = (ContainerPath) uncheck(() -> Files.createDirectories(containerProcessingPath.resolve(coredumpIdSupplier.get()))); + ContainerPath enqueuedDir = containerProcessingPath.resolve(coredumpIdSupplier.get()); + new MakeDirectory(enqueuedDir).createParents().converge(context); IntStream.range(0, coredumpIndex + 1) .forEach(i -> { Path path = toProcess.get(i); String prefix = i == coredumpIndex ? COREDUMP_FILENAME_PREFIX : ""; - uncheck(() -> Files.move(path, enqueuedDir.resolve(prefix + path.getFileName()))); + new FileMover(path, enqueuedDir.resolve(prefix + path.getFileName())).converge(context); }); return Optional.of(enqueuedDir); } @@ -258,12 +273,13 @@ public class CoredumpHandler { } catch (IOException e) { throw new UncheckedIOException(e); } - uncheck(() -> Files.delete(coreFile)); + new FileDeleter(coreFile).converge(context); Path newCoredumpDirectory = doneCoredumpsPath.resolve(context.containerName().asString()); - uncheck(() -> Files.createDirectories(newCoredumpDirectory)); + new MakeDirectory(newCoredumpDirectory).createParents().converge(context); // Files.move() does not support moving non-empty directories across providers, move using host paths - uncheck(() -> Files.move(coredumpDirectory.pathOnHost(), newCoredumpDirectory.resolve(coredumpDirectory.getFileName().toString()))); + new FileMover(coredumpDirectory.pathOnHost(), newCoredumpDirectory.resolve(coredumpDirectory.getFileName().toString())) + .converge(context); } ContainerPath findCoredumpFileInProcessingDirectory(ContainerPath coredumpProccessingDirectory) { @@ -348,8 +364,8 @@ public class CoredumpHandler { String coreDumpId = coreDumpDirectory.getFileName().toString(); cores.report(context.hostname(), coreDumpId, metadata); + context.log(logger, "Core dump reported: " + coreDumpId); finishProcessing(context, coreDumpDirectory, sharedCoreKey); - context.log(logger, "Successfully reported core dump " + coreDumpId); } private CoreDumpMetadata gatherMetadata(NodeAgentContext context, ContainerPath coreDumpDirectory) { diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileDeleter.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileDeleter.java index 6f3f0c06344..92dc34d5e8b 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileDeleter.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileDeleter.java @@ -26,7 +26,7 @@ public class FileDeleter { public boolean converge(TaskContext context) { boolean deleted = uncheck(() -> Files.deleteIfExists(path)); if (deleted) { - context.recordSystemModification(logger, "Deleted file or directory " + path); + context.recordSystemModification(logger, "Deleted " + path); } return deleted; diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileMover.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileMover.java new file mode 100644 index 00000000000..a5ba78e524e --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileMover.java @@ -0,0 +1,55 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.task.util.file; + +import com.yahoo.vespa.hosted.node.admin.component.TaskContext; + +import java.nio.file.CopyOption; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Logger; + +import static com.yahoo.yolean.Exceptions.uncheck; + +/** + * Utility for idempotent move of (any type of) file. + * + * @author hakonhall + */ +public class FileMover { + private static final Logger logger = Logger.getLogger(FileMover.class.getName()); + + private final Path source; + private final Path destination; + private final Set<CopyOption> moveOptions = new HashSet<>(); + + public FileMover(Path source, Path destination) { + this.source = source; + this.destination = destination; + } + + public FileMover replaceExisting() { + moveOptions.add(StandardCopyOption.REPLACE_EXISTING); + return this; + } + + public FileMover atomic() { + moveOptions.add(StandardCopyOption.ATOMIC_MOVE); + return this; + } + + /** + * Move file. + * + * @return false if the source doesn't exist while the destination do. + * @see Files#move(Path, Path, CopyOption...) Files.move() + */ + public boolean converge(TaskContext context) { + if (!Files.exists(source) && Files.exists(destination)) return false; + uncheck(() -> Files.move(source, destination, moveOptions.toArray(CopyOption[]::new))); + context.recordSystemModification(logger, "Moved " + source + " to " + destination); + return true; + } +} diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java index 1d53f0974ab..b748c067fe2 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java @@ -77,7 +77,7 @@ public class CoredumpHandlerTest { createFileAged(crashPath.resolve("bash.core.431"), Duration.ZERO); assertFolderContents(crashPath, "bash.core.431"); - Optional<ContainerPath> enqueuedPath = coredumpHandler.enqueueCoredump(crashPath, processingDir); + Optional<ContainerPath> enqueuedPath = coredumpHandler.enqueueCoredump(context, crashPath, processingDir); assertEquals(Optional.empty(), enqueuedPath); // bash.core.431 finished writing... and 2 more have since been written @@ -86,7 +86,7 @@ public class CoredumpHandlerTest { createFileAged(crashPath.resolve("vespa-slobrok.core.673"), Duration.ofMinutes(5)); when(coredumpIdSupplier.get()).thenReturn("id-123").thenReturn("id-321"); - enqueuedPath = coredumpHandler.enqueueCoredump(crashPath, processingDir); + enqueuedPath = coredumpHandler.enqueueCoredump(context, crashPath, processingDir); assertEquals(Optional.of(processingDir.resolve("id-123")), enqueuedPath); assertFolderContents(crashPath, "bash.core.431", "vespa-slobrok.core.673"); assertFolderContents(processingDir, "id-123"); @@ -94,7 +94,7 @@ public class CoredumpHandlerTest { verify(coredumpIdSupplier, times(1)).get(); // Enqueue another - enqueuedPath = coredumpHandler.enqueueCoredump(crashPath, processingDir); + enqueuedPath = coredumpHandler.enqueueCoredump(context, crashPath, processingDir); assertEquals(Optional.of(processingDir.resolve("id-321")), enqueuedPath); assertFolderContents(crashPath, "bash.core.431"); assertFolderContents(processingDir, "id-123", "id-321"); @@ -116,7 +116,7 @@ public class CoredumpHandlerTest { createFileAged(crashPath.resolve("hs_err_pid2421.log"), Duration.ofSeconds(550)); when(coredumpIdSupplier.get()).thenReturn("id-123").thenReturn("id-321"); - Optional<ContainerPath> enqueuedPath = coredumpHandler.enqueueCoredump(crashPath, processingDir); + Optional<ContainerPath> enqueuedPath = coredumpHandler.enqueueCoredump(context, crashPath, processingDir); assertEquals(Optional.of(processingDir.resolve("id-123")), enqueuedPath); assertFolderContents(crashPath, "hs_err_pid69.log", "java.core.69"); assertFolderContents(processingDir, "id-123"); @@ -128,7 +128,7 @@ public class CoredumpHandlerTest { ContainerPath processingDir = context.paths().of("/some/other/processing"); // Initially there are no core dumps - Optional<ContainerPath> enqueuedPath = coredumpHandler.enqueueCoredump(containerCrashPath, processingDir); + Optional<ContainerPath> enqueuedPath = coredumpHandler.enqueueCoredump(context, containerCrashPath, processingDir); assertEquals(Optional.empty(), enqueuedPath); // 3 core dumps occur @@ -138,11 +138,11 @@ public class CoredumpHandlerTest { createFileAged(containerCrashPath.resolve("vespa-slobrok.core.673"), Duration.ofMinutes(5)); when(coredumpIdSupplier.get()).thenReturn("id-123"); - enqueuedPath = coredumpHandler.getCoredumpToProcess(containerCrashPath, processingDir); + enqueuedPath = coredumpHandler.getCoredumpToProcess(context, containerCrashPath, processingDir); assertEquals(Optional.of(processingDir.resolve("id-123")), enqueuedPath); // Running this again wont enqueue new core dumps as we are still processing the one enqueued previously - enqueuedPath = coredumpHandler.getCoredumpToProcess(containerCrashPath, processingDir); + enqueuedPath = coredumpHandler.getCoredumpToProcess(context, containerCrashPath, processingDir); assertEquals(Optional.of(processingDir.resolve("id-123")), enqueuedPath); verify(coredumpIdSupplier, times(1)).get(); } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileMoverTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileMoverTest.java new file mode 100644 index 00000000000..5eb02dfc7fa --- /dev/null +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/task/util/file/FileMoverTest.java @@ -0,0 +1,73 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.task.util.file; + +import com.yahoo.vespa.hosted.node.admin.component.TaskContext; +import com.yahoo.vespa.test.file.TestFileSystem; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.FileAlreadyExistsException; +import java.nio.file.FileSystem; +import java.nio.file.NoSuchFileException; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; +import static org.mockito.Mockito.mock; + +/** + * @author hakonhall + */ +class FileMoverTest { + private final FileSystem fileSystem = TestFileSystem.create(); + private final TaskContext context = mock(TaskContext.class); + private final UnixPath source = new UnixPath(fileSystem.getPath("/from/source")); + private final UnixPath destination = new UnixPath(fileSystem.getPath("/to/destination")); + private final FileMover mover = new FileMover(source.toPath(), destination.toPath()); + + @Test + void movingRegularFile() { + assertConvergeThrows(() -> mover.converge(context), NoSuchFileException.class, "/from/source"); + + source.createParents().writeUtf8File("content"); + assertConvergeThrows(() -> mover.converge(context), NoSuchFileException.class, "/to/destination"); + + destination.createParents(); + assertTrue(mover.converge(context)); + assertFalse(source.exists()); + assertTrue(destination.exists()); + assertEquals("content", destination.readUtf8File()); + + assertFalse(mover.converge(context)); + + source.writeUtf8File("content 2"); + assertConvergeThrows(() -> mover.converge(context), FileAlreadyExistsException.class, "/to/destination"); + + mover.replaceExisting(); + assertTrue(mover.converge(context)); + + source.writeUtf8File("content 3"); + destination.deleteIfExists(); + destination.createDirectory(); + assertTrue(mover.converge(context)); + } + + private void assertConvergeThrows(Runnable runnable, Class<?> expectedRootExceptionClass, String expectedMessage) { + try { + runnable.run(); + fail(); + } catch (Throwable t) { + Throwable rootCause = t; + do { + Throwable cause = rootCause.getCause(); + if (cause == null) break; + rootCause = cause; + } while (true); + + assertTrue(expectedRootExceptionClass.isInstance(rootCause), "Unexpected root cause: " + rootCause); + assertEquals(expectedMessage, rootCause.getMessage()); + } + } +}
\ No newline at end of file |