From 0d2b37680c89f4c640f4b50ca2367c1608eb8d4a Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Wed, 19 May 2021 16:38:21 +0200 Subject: Throw if a coredump is being written while trying to remove linux container --- .../hosted/node/admin/maintenance/StorageMaintainer.java | 4 ++-- .../node/admin/maintenance/coredump/CoredumpHandler.java | 14 +++++++++++++- .../vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 4 ++-- .../hosted/node/admin/nodeagent/NodeAgentImplTest.java | 2 +- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java index 93717543a1c..10e0dd50761 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java @@ -165,9 +165,9 @@ public class StorageMaintainer { } /** Checks if container has any new coredumps, reports and archives them if so */ - public void handleCoreDumpsForContainer(NodeAgentContext context, Optional container) { + public void handleCoreDumpsForContainer(NodeAgentContext context, Optional container, boolean throwIfCoreBeingWritten) { if (context.isDisabled(NodeAgentTask.CoreDumps)) return; - coredumpHandler.converge(context, () -> getCoredumpNodeAttributes(context, container)); + coredumpHandler.converge(context, () -> getCoredumpNodeAttributes(context, container), throwIfCoreBeingWritten); } private Map getCoredumpNodeAttributes(NodeAgentContext context, Optional container) { diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java index a912de18b94..09c0a4ae491 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java @@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions; import com.yahoo.vespa.hosted.dockerapi.metrics.Metrics; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; @@ -84,12 +85,23 @@ public class CoredumpHandler { } - public void converge(NodeAgentContext context, Supplier> nodeAttributesSupplier) { + public void converge(NodeAgentContext context, Supplier> nodeAttributesSupplier, boolean throwIfCoreBeingWritten) { Path containerCrashPathOnHost = context.pathOnHostFromPathInNode(crashPatchInContainer); Path containerProcessingPathOnHost = containerCrashPathOnHost.resolve(PROCESSING_DIRECTORY_NAME); updateMetrics(context, containerCrashPathOnHost); + if (throwIfCoreBeingWritten) { + List pendingCores = FileFinder.files(containerCrashPathOnHost) + .match(fileAttributes -> !isReadyForProcessing(fileAttributes)) + .maxDepth(1).stream() + .map(FileFinder.FileAttributes::filename) + .collect(Collectors.toUnmodifiableList()); + if (!pendingCores.isEmpty()) + throw new ConvergenceException(String.format("Cannot process %s coredumps: Still being written", + pendingCores.size() < 5 ? pendingCores : pendingCores.size())); + } + // Check if we have already started to process a core dump or we can enqueue a new core one getCoredumpToProcess(containerCrashPathOnHost, containerProcessingPathOnHost) .ifPresent(path -> processAndReportSingleCoredump(context, path, nodeAttributesSupplier)); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index c23e1899257..df3f075e8d9 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -366,7 +366,7 @@ public class NodeAgentImpl implements NodeAgent { } } - storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer)); + storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer), true); containerOperations.removeContainer(context, existingContainer); containerState = ABSENT; context.log(logger, "Container successfully removed, new containerState is " + containerState); @@ -469,7 +469,7 @@ public class NodeAgentImpl implements NodeAgent { case active: storageMaintainer.syncLogs(context, true); storageMaintainer.cleanDiskIfFull(context); - storageMaintainer.handleCoreDumpsForContainer(context, container); + storageMaintainer.handleCoreDumpsForContainer(context, container, false); if (downloadImageIfNeeded(context, container)) { context.log(logger, "Waiting for image to download " + context.node().wantedDockerImage().get().asString()); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 9475e3720c2..34c4bc15ee9 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -449,7 +449,7 @@ public class NodeAgentImplTest { final InOrder inOrder = inOrder(storageMaintainer, containerOperations, nodeRepository); inOrder.verify(containerOperations, times(1)).stopServices(eq(context)); - inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), any()); + inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), any(), eq(true)); inOrder.verify(containerOperations, times(1)).removeContainer(eq(context), any()); inOrder.verify(storageMaintainer, times(1)).archiveNodeStorage(eq(context)); inOrder.verify(nodeRepository, times(1)).setNodeState(eq(hostName), eq(NodeState.ready)); -- cgit v1.2.3