From 8bebe88526e449bbca0514d8bdfc87390cb26db4 Mon Sep 17 00:00:00 2001 From: Ola Aunrønning Date: Thu, 16 Apr 2020 10:38:41 +0200 Subject: Add metrics for enqueued and processed coredumps --- .../maintenance/coredump/CoredumpHandler.java | 63 ++++++++++++++++++++-- .../maintenance/coredump/CoredumpHandlerTest.java | 28 ++++++++-- 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java index 9b0a35d4b96..9924202e2ae 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java @@ -2,6 +2,11 @@ package com.yahoo.vespa.hosted.node.admin.maintenance.coredump; import com.fasterxml.jackson.databind.ObjectMapper; +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions; +import com.yahoo.vespa.hosted.dockerapi.metrics.Metrics; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeMembership; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; @@ -51,6 +56,7 @@ public class CoredumpHandler { private final Path doneCoredumpsPath; private final String operatorGroupName; private final Supplier coredumpIdSupplier; + private final Metrics metrics; /** * @param crashPathInContainer path inside the container where core dump are dumped @@ -58,19 +64,20 @@ public class CoredumpHandler { * @param operatorGroupName name of the group that will be set as the owner of the processed coredump */ public CoredumpHandler(Terminal terminal, CoreCollector coreCollector, CoredumpReporter coredumpReporter, - Path crashPathInContainer, Path doneCoredumpsPath, String operatorGroupName) { + Path crashPathInContainer, Path doneCoredumpsPath, String operatorGroupName, Metrics metrics) { this(terminal, coreCollector, coredumpReporter, crashPathInContainer, doneCoredumpsPath, - operatorGroupName, () -> UUID.randomUUID().toString()); + operatorGroupName, metrics, () -> UUID.randomUUID().toString()); } CoredumpHandler(Terminal terminal, CoreCollector coreCollector, CoredumpReporter coredumpReporter, - Path crashPathInContainer, Path doneCoredumpsPath, String operatorGroupName, Supplier coredumpIdSupplier) { + Path crashPathInContainer, Path doneCoredumpsPath, String operatorGroupName, Metrics metrics, Supplier coredumpIdSupplier) { this.terminal = terminal; this.coreCollector = coreCollector; this.coredumpReporter = coredumpReporter; this.crashPatchInContainer = crashPathInContainer; this.doneCoredumpsPath = doneCoredumpsPath; this.operatorGroupName = operatorGroupName; + this.metrics = metrics; this.coredumpIdSupplier = coredumpIdSupplier; } @@ -88,6 +95,8 @@ public class CoredumpHandler { // Check if we have already started to process a core dump or we can enqueue a new core one getCoredumpToProcess(containerCrashPathOnHost, containerProcessingPathOnHost) .ifPresent(path -> processAndReportSingleCoredump(context, path, nodeAttributesSupplier)); + + updateMetrics(context, containerCrashPathOnHost); } /** @return path to directory inside processing directory that contains a core dump file to process */ @@ -180,8 +189,9 @@ public class CoredumpHandler { new UnixPath(compressedCoreFile).setGroup(operatorGroupName).setPermissions("rw-r-----"); Files.delete(coreFile); - Path newCoredumpDirectory = doneCoredumpsPath.resolve(coredumpDirectory.getFileName()); - Files.move(coredumpDirectory, newCoredumpDirectory); + Path newCoredumpDirectory = doneCoredumpsPath.resolve(context.containerName().asString()); + uncheck(() -> Files.createDirectories(newCoredumpDirectory)); + Files.move(coredumpDirectory, newCoredumpDirectory.resolve(coredumpDirectory.getFileName())); } Path findCoredumpFileInProcessingDirectory(Path coredumpProccessingDirectory) { @@ -194,4 +204,47 @@ public class CoredumpHandler { .orElseThrow(() -> new IllegalStateException( "No coredump file found in processing directory " + coredumpProccessingDirectory)); } + + void updateMetrics(NodeAgentContext context, Path containerCrashPathOnHost) { + Dimensions dimensions = generateDimensions(context); + + // Unprocessed coredumps + int numberOfUnprocessedCoredumps = FileFinder.files(containerCrashPathOnHost) + .match(nameStartsWith(".").negate()) + .list().size(); + + metrics.declareGauge(Metrics.APPLICATION_NODE, "coredumps.enqueued", dimensions, Metrics.DimensionType.PRETAGGED).sample(numberOfUnprocessedCoredumps); + + // Processed coredumps + Path processedCoredumpsPath = doneCoredumpsPath.resolve(context.containerName().asString()); + int numberOfProcessedCoredumps = FileFinder.files(processedCoredumpsPath) + .list().size(); + + metrics.declareGauge(Metrics.APPLICATION_NODE, "coredumps.processed", dimensions, Metrics.DimensionType.PRETAGGED).sample(numberOfProcessedCoredumps); + } + + private Dimensions generateDimensions(NodeAgentContext context) { + NodeSpec node = context.node(); + ApplicationId owner = node.owner().get(); + NodeMembership membership = node.membership().get(); + Dimensions.Builder dimensionsBuilder = new Dimensions.Builder() + .add("host", node.hostname()) + .add("flavor", node.flavor()) + .add("state", node.state().toString()) + .add("zone", context.zone().getId().value()) + .add("tenantName", owner.tenant().value()) + .add("applicationName", owner.application().value()) + .add("instanceName", owner.instance().value()) + .add("app", String.join(".", owner.application().value(), owner.instance().value())) + .add("applicationId", owner.toFullString()) + .add("clustertype", membership.clusterType()) + .add("clusterid", membership.clusterId()); + node.parentHostname().ifPresent(parent -> dimensionsBuilder.add("parentHostname", parent)); + node.allowedToBeDown().ifPresent(allowed -> + dimensionsBuilder.add("orchestratorState", allowed ? "ALLOWED_TO_BE_DOWN" : "NO_REMARKS")); + node.currentVespaVersion().ifPresent(vespaVersion -> dimensionsBuilder.add("vespaVersion", vespaVersion.toFullString())); + + return dimensionsBuilder.build(); + } + } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java index 3d9e3c08276..62bf9e3f9c2 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java @@ -1,6 +1,8 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.maintenance.coredump; +import com.yahoo.vespa.hosted.dockerapi.metrics.DimensionMetrics; +import com.yahoo.vespa.hosted.dockerapi.metrics.Metrics; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath; @@ -46,16 +48,17 @@ public class CoredumpHandlerTest { private final Path donePath = fileSystem.getPath("/home/docker/dumps"); private final NodeAgentContext context = new NodeAgentContextImpl.Builder("container-123.domain.tld") .fileSystem(fileSystem).build(); - private final Path crashPathInContainer = Paths.get("/var/crash"); + private final Path crashPathInContainer = fileSystem.getPath("/var/crash"); private final Path doneCoredumpsPath = fileSystem.getPath("/home/docker/dumps"); private final TestTerminal terminal = new TestTerminal(); private final CoreCollector coreCollector = mock(CoreCollector.class); private final CoredumpReporter coredumpReporter = mock(CoredumpReporter.class); + private final Metrics metrics = new Metrics(); @SuppressWarnings("unchecked") private final Supplier coredumpIdSupplier = mock(Supplier.class); private final CoredumpHandler coredumpHandler = new CoredumpHandler(terminal, coreCollector, coredumpReporter, - crashPathInContainer, doneCoredumpsPath, "users", coredumpIdSupplier); + crashPathInContainer, doneCoredumpsPath, "users", metrics, coredumpIdSupplier); @Test @@ -206,13 +209,28 @@ public class CoredumpHandlerTest { verify(coreCollector, never()).collect(any(), any()); verify(coredumpReporter, times(1)).reportCoredump(eq("id-123"), eq("metadata")); assertFalse(Files.exists(coredumpDirectory)); - assertFolderContents(doneCoredumpsPath, "id-123"); - assertFolderContents(doneCoredumpsPath.resolve("id-123"), "metadata.json", "dump_bash.core.431.lz4"); + assertFolderContents(doneCoredumpsPath.resolve("container-123"), "id-123"); + assertFolderContents(doneCoredumpsPath.resolve("container-123").resolve("id-123"), "metadata.json", "dump_bash.core.431.lz4"); + } + + @Test + public void report_enqueued_and_processed_metrics() throws IOException { + Files.createFile(crashPathInContainer.resolve("dump-1")); + Files.createFile(crashPathInContainer.resolve("dump-2")); + Files.createFile(doneCoredumpsPath.resolve("container-123").resolve("dump-3")); + + coredumpHandler.updateMetrics(context, crashPathInContainer); + List updatedMetrics = metrics.getMetricsByType(Metrics.DimensionType.PRETAGGED); + assertEquals(1, updatedMetrics.size()); + Map values = updatedMetrics.get(0).getMetrics(); + assertEquals(2, values.get("coredumps.enqueued").intValue()); + assertEquals(1, values.get("coredumps.processed").intValue()); } @Before public void setup() throws IOException { - Files.createDirectories(donePath); + Files.createDirectories(donePath.resolve("container-123")); + Files.createDirectories(crashPathInContainer); } @After -- cgit v1.2.3