diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2021-05-19 23:37:51 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-19 23:37:51 +0200 |
commit | 3eceb9c73b15a352c6425cdc0510280c38895dab (patch) | |
tree | 80f3d235b490372aa067d58be9ee0125ae5c8cc4 | |
parent | 9ab61df1e4788afec510696e172e9655d8a7baef (diff) | |
parent | 0d2b37680c89f4c640f4b50ca2367c1608eb8d4a (diff) |
Merge pull request #17904 from vespa-engine/freva/throw-while-core-writing
Throw while core is being written
5 files changed, 25 insertions, 10 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index 93717543a1c..10e0dd50761 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -165,9 +165,9 @@ public class StorageMaintainer { } /** Checks if container has any new coredumps, reports and archives them if so */ - public void handleCoreDumpsForContainer(NodeAgentContext context, Optional<Container> container) { + public void handleCoreDumpsForContainer(NodeAgentContext context, Optional<Container> container, boolean throwIfCoreBeingWritten) { if (context.isDisabled(NodeAgentTask.CoreDumps)) return; - coredumpHandler.converge(context, () -> getCoredumpNodeAttributes(context, container)); + coredumpHandler.converge(context, () -> getCoredumpNodeAttributes(context, container), throwIfCoreBeingWritten); } private Map<String, Object> getCoredumpNodeAttributes(NodeAgentContext context, Optional<Container> container) { diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java index eb629ee6165..4c384b09fad 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.node.admin.maintenance.coredump; import com.yahoo.vespa.hosted.dockerapi.ProcessResult; import com.yahoo.vespa.hosted.node.admin.docker.ContainerOperations; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; 
import java.nio.file.Path; @@ -45,7 +46,7 @@ public class CoreCollector { Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput()); if (! matcher.find()) { - throw new RuntimeException(String.format("Failed to extract binary path from GDB, result: %s, command: %s", + throw new ConvergenceException(String.format("Failed to extract binary path from GDB, result: %s, command: %s", result, Arrays.toString(wrappedCommand))); } return Paths.get(matcher.group("path").split(" ")[0]); @@ -56,7 +57,7 @@ public class CoreCollector { try { ProcessResult result = docker.executeCommandInContainerAsRoot(context, command); if (result.getExitStatus() != 0) { - throw new RuntimeException("file command failed with " + result); + throw new ConvergenceException("file command failed with " + result); } Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput()); @@ -82,7 +83,7 @@ public class CoreCollector { ProcessResult result = docker.executeCommandInContainerAsRoot(context, command); if (result.getExitStatus() != 0) - throw new RuntimeException("Failed to read backtrace " + result + ", Command: " + Arrays.toString(command)); + throw new ConvergenceException("Failed to read backtrace " + result + ", Command: " + Arrays.toString(command)); return List.of(result.getOutput().split("\n")); } @@ -92,7 +93,7 @@ public class CoreCollector { ProcessResult result = docker.executeCommandInContainerAsRoot(context, command); if (result.getExitStatus() != 0) - throw new RuntimeException("Failed to read jstack " + result + ", Command: " + Arrays.toString(command)); + throw new ConvergenceException("Failed to read jstack " + result + ", Command: " + Arrays.toString(command)); return List.of(result.getOutput().split("\n")); } @@ -118,6 +119,8 @@ public class CoreCollector { data.put("backtrace", readBacktrace(context, coredumpPath, binPath, false)); data.put("backtrace_all_threads", readBacktrace(context, coredumpPath, binPath, true)); } + } catch (ConvergenceException e) 
{ + context.log(logger, Level.WARNING, "Failed to extract backtrace: " + e.getMessage()); } catch (RuntimeException e) { context.log(logger, Level.WARNING, "Failed to extract backtrace", e); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java index a912de18b94..09c0a4ae491 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java @@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions; import com.yahoo.vespa.hosted.dockerapi.metrics.Metrics; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; @@ -84,12 +85,23 @@ public class CoredumpHandler { } - public void converge(NodeAgentContext context, Supplier<Map<String, Object>> nodeAttributesSupplier) { + public void converge(NodeAgentContext context, Supplier<Map<String, Object>> nodeAttributesSupplier, boolean throwIfCoreBeingWritten) { Path containerCrashPathOnHost = context.pathOnHostFromPathInNode(crashPatchInContainer); Path containerProcessingPathOnHost = containerCrashPathOnHost.resolve(PROCESSING_DIRECTORY_NAME); updateMetrics(context, containerCrashPathOnHost); + if (throwIfCoreBeingWritten) { + List<String> pendingCores = FileFinder.files(containerCrashPathOnHost) + .match(fileAttributes -> !isReadyForProcessing(fileAttributes)) + .maxDepth(1).stream() + .map(FileFinder.FileAttributes::filename) + 
.collect(Collectors.toUnmodifiableList()); + if (!pendingCores.isEmpty()) + throw new ConvergenceException(String.format("Cannot process %s coredumps: Still being written", + pendingCores.size() < 5 ? pendingCores : pendingCores.size())); + } + // Check if we have already started to process a core dump or we can enqueue a new core one getCoredumpToProcess(containerCrashPathOnHost, containerProcessingPathOnHost) .ifPresent(path -> processAndReportSingleCoredump(context, path, nodeAttributesSupplier)); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index c23e1899257..df3f075e8d9 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -366,7 +366,7 @@ public class NodeAgentImpl implements NodeAgent { } } - storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer)); + storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer), true); containerOperations.removeContainer(context, existingContainer); containerState = ABSENT; context.log(logger, "Container successfully removed, new containerState is " + containerState); @@ -469,7 +469,7 @@ public class NodeAgentImpl implements NodeAgent { case active: storageMaintainer.syncLogs(context, true); storageMaintainer.cleanDiskIfFull(context); - storageMaintainer.handleCoreDumpsForContainer(context, container); + storageMaintainer.handleCoreDumpsForContainer(context, container, false); if (downloadImageIfNeeded(context, container)) { context.log(logger, "Waiting for image to download " + context.node().wantedDockerImage().get().asString()); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java 
b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 9475e3720c2..34c4bc15ee9 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -449,7 +449,7 @@ public class NodeAgentImplTest { final InOrder inOrder = inOrder(storageMaintainer, containerOperations, nodeRepository); inOrder.verify(containerOperations, times(1)).stopServices(eq(context)); - inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), any()); + inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), any(), eq(true)); inOrder.verify(containerOperations, times(1)).removeContainer(eq(context), any()); inOrder.verify(storageMaintainer, times(1)).archiveNodeStorage(eq(context)); inOrder.verify(nodeRepository, times(1)).setNodeState(eq(hostName), eq(NodeState.ready)); |