summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Håkon Hallingstad <hakon@verizonmedia.com> 2021-05-19 23:37:51 +0200
committer GitHub <noreply@github.com> 2021-05-19 23:37:51 +0200
commit 3eceb9c73b15a352c6425cdc0510280c38895dab (patch)
tree 80f3d235b490372aa067d58be9ee0125ae5c8cc4
parent 9ab61df1e4788afec510696e172e9655d8a7baef (diff)
parent 0d2b37680c89f4c640f4b50ca2367c1608eb8d4a (diff)
Merge pull request #17904 from vespa-engine/freva/throw-while-core-writing
Throw while core is being written
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java 4
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java 11
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java 14
-rw-r--r-- node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java 4
-rw-r--r-- node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java 2
5 files changed, 25 insertions, 10 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
index 93717543a1c..10e0dd50761 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
@@ -165,9 +165,9 @@ public class StorageMaintainer {
}
/** Checks if container has any new coredumps, reports and archives them if so */
- public void handleCoreDumpsForContainer(NodeAgentContext context, Optional<Container> container) {
+ public void handleCoreDumpsForContainer(NodeAgentContext context, Optional<Container> container, boolean throwIfCoreBeingWritten) {
if (context.isDisabled(NodeAgentTask.CoreDumps)) return;
- coredumpHandler.converge(context, () -> getCoredumpNodeAttributes(context, container));
+ coredumpHandler.converge(context, () -> getCoredumpNodeAttributes(context, container), throwIfCoreBeingWritten);
}
private Map<String, Object> getCoredumpNodeAttributes(NodeAgentContext context, Optional<Container> container) {
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
index eb629ee6165..4c384b09fad 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
import com.yahoo.vespa.hosted.dockerapi.ProcessResult;
import com.yahoo.vespa.hosted.node.admin.docker.ContainerOperations;
+import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import java.nio.file.Path;
@@ -45,7 +46,7 @@ public class CoreCollector {
Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput());
if (! matcher.find()) {
- throw new RuntimeException(String.format("Failed to extract binary path from GDB, result: %s, command: %s",
+ throw new ConvergenceException(String.format("Failed to extract binary path from GDB, result: %s, command: %s",
result, Arrays.toString(wrappedCommand)));
}
return Paths.get(matcher.group("path").split(" ")[0]);
@@ -56,7 +57,7 @@ public class CoreCollector {
try {
ProcessResult result = docker.executeCommandInContainerAsRoot(context, command);
if (result.getExitStatus() != 0) {
- throw new RuntimeException("file command failed with " + result);
+ throw new ConvergenceException("file command failed with " + result);
}
Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput());
@@ -82,7 +83,7 @@ public class CoreCollector {
ProcessResult result = docker.executeCommandInContainerAsRoot(context, command);
if (result.getExitStatus() != 0)
- throw new RuntimeException("Failed to read backtrace " + result + ", Command: " + Arrays.toString(command));
+ throw new ConvergenceException("Failed to read backtrace " + result + ", Command: " + Arrays.toString(command));
return List.of(result.getOutput().split("\n"));
}
@@ -92,7 +93,7 @@ public class CoreCollector {
ProcessResult result = docker.executeCommandInContainerAsRoot(context, command);
if (result.getExitStatus() != 0)
- throw new RuntimeException("Failed to read jstack " + result + ", Command: " + Arrays.toString(command));
+ throw new ConvergenceException("Failed to read jstack " + result + ", Command: " + Arrays.toString(command));
return List.of(result.getOutput().split("\n"));
}
@@ -118,6 +119,8 @@ public class CoreCollector {
data.put("backtrace", readBacktrace(context, coredumpPath, binPath, false));
data.put("backtrace_all_threads", readBacktrace(context, coredumpPath, binPath, true));
}
+ } catch (ConvergenceException e) {
+ context.log(logger, Level.WARNING, "Failed to extract backtrace: " + e.getMessage());
} catch (RuntimeException e) {
context.log(logger, Level.WARNING, "Failed to extract backtrace", e);
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
index a912de18b94..09c0a4ae491 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
@@ -5,6 +5,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions;
import com.yahoo.vespa.hosted.dockerapi.metrics.Metrics;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
+import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder;
import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath;
@@ -84,12 +85,23 @@ public class CoredumpHandler {
}
- public void converge(NodeAgentContext context, Supplier<Map<String, Object>> nodeAttributesSupplier) {
+ public void converge(NodeAgentContext context, Supplier<Map<String, Object>> nodeAttributesSupplier, boolean throwIfCoreBeingWritten) {
Path containerCrashPathOnHost = context.pathOnHostFromPathInNode(crashPatchInContainer);
Path containerProcessingPathOnHost = containerCrashPathOnHost.resolve(PROCESSING_DIRECTORY_NAME);
updateMetrics(context, containerCrashPathOnHost);
+ if (throwIfCoreBeingWritten) {
+ List<String> pendingCores = FileFinder.files(containerCrashPathOnHost)
+ .match(fileAttributes -> !isReadyForProcessing(fileAttributes))
+ .maxDepth(1).stream()
+ .map(FileFinder.FileAttributes::filename)
+ .collect(Collectors.toUnmodifiableList());
+ if (!pendingCores.isEmpty())
+ throw new ConvergenceException(String.format("Cannot process %s coredumps: Still being written",
+ pendingCores.size() < 5 ? pendingCores : pendingCores.size()));
+ }
+
// Check if we have already started to process a core dump or we can enqueue a new core one
getCoredumpToProcess(containerCrashPathOnHost, containerProcessingPathOnHost)
.ifPresent(path -> processAndReportSingleCoredump(context, path, nodeAttributesSupplier));
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index c23e1899257..df3f075e8d9 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -366,7 +366,7 @@ public class NodeAgentImpl implements NodeAgent {
}
}
- storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer));
+ storageMaintainer.handleCoreDumpsForContainer(context, Optional.of(existingContainer), true);
containerOperations.removeContainer(context, existingContainer);
containerState = ABSENT;
context.log(logger, "Container successfully removed, new containerState is " + containerState);
@@ -469,7 +469,7 @@ public class NodeAgentImpl implements NodeAgent {
case active:
storageMaintainer.syncLogs(context, true);
storageMaintainer.cleanDiskIfFull(context);
- storageMaintainer.handleCoreDumpsForContainer(context, container);
+ storageMaintainer.handleCoreDumpsForContainer(context, container, false);
if (downloadImageIfNeeded(context, container)) {
context.log(logger, "Waiting for image to download " + context.node().wantedDockerImage().get().asString());
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 9475e3720c2..34c4bc15ee9 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -449,7 +449,7 @@ public class NodeAgentImplTest {
final InOrder inOrder = inOrder(storageMaintainer, containerOperations, nodeRepository);
inOrder.verify(containerOperations, times(1)).stopServices(eq(context));
- inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), any());
+ inOrder.verify(storageMaintainer, times(1)).handleCoreDumpsForContainer(eq(context), any(), eq(true));
inOrder.verify(containerOperations, times(1)).removeContainer(eq(context), any());
inOrder.verify(storageMaintainer, times(1)).archiveNodeStorage(eq(context));
inOrder.verify(nodeRepository, times(1)).setNodeState(eq(hostName), eq(NodeState.ready));