aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorfreva <valerijf@yahoo-inc.com>2016-11-22 14:41:39 +0100
committerfreva <valerijf@yahoo-inc.com>2016-11-22 14:41:39 +0100
commite36b7c99d536cb9fa249e77f2e628690aa344617 (patch)
treefa162e6d3559040b8aa629f366164f0b2a707af8
parent66ff6b8848a2245ce457b70f1dada6a4869b6b40 (diff)
Created coredump handler for docker containers
-rw-r--r--node-admin/pom.xml8
-rwxr-xr-xnode-admin/scripts/maintenance.sh2
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoreCollector.java113
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoredumpHandler.java144
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java116
5 files changed, 369 insertions, 14 deletions
diff --git a/node-admin/pom.xml b/node-admin/pom.xml
index 1cc89a1fd09..159124f6657 100644
--- a/node-admin/pom.xml
+++ b/node-admin/pom.xml
@@ -100,6 +100,13 @@
<artifactId>airline</artifactId>
<version>0.7</version>
</dependency>
+
+ <!-- JSON parser for Maintenance JVM -->
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ <version>2.6.2</version>
+ </dependency>
</dependencies>
<build>
@@ -126,7 +133,6 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
-
<finalName>node-admin-maintenance</finalName>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
diff --git a/node-admin/scripts/maintenance.sh b/node-admin/scripts/maintenance.sh
index f637bdca035..1a3b7fcd4a9 100755
--- a/node-admin/scripts/maintenance.sh
+++ b/node-admin/scripts/maintenance.sh
@@ -2,7 +2,7 @@
if [ -z "$CLASSPATH" ]; then
- CLASSPATH=/home/y/lib/jars/node-admin-maintenance-jar-with-dependencies.jar:/home/y/lib/jars/docker-api-jar-with-dependencies.jar:/home/y/lib/jars/vespalog.jar
+ CLASSPATH=/home/y/lib/jars/node-admin-maintenance-jar-with-dependencies.jar:/home/y/lib/jars/docker-api-jar-with-dependencies.jar:/home/y/lib/jars/vespalog.jar:/home/y/lib/jars/vespajlib.jar
fi
java \
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoreCollector.java
new file mode 100644
index 00000000000..9e9c0d3525d
--- /dev/null
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoreCollector.java
@@ -0,0 +1,113 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.node.maintenance;
+
+import com.yahoo.vespa.hosted.dockerapi.ProcessResult;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Takes in a compressed (lz4) or uncompressed core dump and collects relevant metadata.
+ *
+ * @author freva
+ */
+public class CoreCollector {
+    private static final String GDB_PATH = "/home/y/bin64/gdb";
+    private static final Pattern CORE_GENERATOR_PATH_PATTERN = Pattern.compile("^Core was generated by `(?<path>.*?)'.$");
+    private static final Pattern EXECFN_PATH_PATTERN = Pattern.compile("^.* execfn: '(?<path>.*?)'");
+    private static final Pattern FROM_PATH_PATTERN = Pattern.compile("^.* from '(?<path>.*?)'");
+    private static final Pattern TOTAL_MEMORY_PATTERN = Pattern.compile("^MemTotal:\\s*(?<totalMem>\\d+) kB$", Pattern.MULTILINE);
+
+    /**
+     * Fallback for finding the path of the binary that produced the core dump: ask gdb,
+     * whose output contains a line "Core was generated by `<path> <args>'.".
+     */
+    private static Path readBinPathFallback(Path coredumpPath) throws IOException, InterruptedException {
+        String command = GDB_PATH + " -n -batch -core " + coredumpPath + " | grep '^Core was generated by'";
+        // The pipeline must be handed to "sh -c" as a single, unquoted argument: exec()
+        // passes arguments verbatim (no shell re-parsing), so the previous extra escaped
+        // quotes made sh look for one command literally named "gdb ... | grep ..." and
+        // this fallback always failed.
+        ProcessResult result = Maintainer.exec("sh", "-c", command);
+
+        Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput());
+        if (! matcher.find()) {
+            throw new RuntimeException("Failed to extract binary path from " + result);
+        }
+        // The matched group may contain arguments after the binary path; keep only the path.
+        return Paths.get(matcher.group("path").split(" ")[0]);
+    }
+
+    /**
+     * Finds the path of the binary that produced the core dump from the output of file(1),
+     * falling back to gdb when neither the "execfn:" nor the "from" field is present.
+     */
+    private static Path readBinPath(Path coredumpPath) throws IOException, InterruptedException {
+        ProcessResult result = Maintainer.exec("file", coredumpPath.toString());
+
+        Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput());
+        if (execfnMatcher.find()) {
+            return Paths.get(execfnMatcher.group("path").split(" ")[0]);
+        }
+
+        Matcher fromMatcher = FROM_PATH_PATTERN.matcher(result.getOutput());
+        if (fromMatcher.find()) {
+            return Paths.get(fromMatcher.group("path").split(" ")[0]);
+        }
+
+        return readBinPathFallback(coredumpPath);
+    }
+
+    /** Runs gdb backtrace ("bt", or "thread apply all bt" when allThreads) on the core dump. */
+    private static List<String> readBacktrace(Path coredumpPath, Path binPath, boolean allThreads) throws IOException, InterruptedException {
+        ProcessResult result = Maintainer.exec(GDB_PATH, "-n", "-ex", (allThreads ? "thread apply all bt" : "bt"),
+                "-batch", binPath.toString(), coredumpPath.toString());
+        return Arrays.asList(result.getOutput().split("\n"));
+    }
+
+    /**
+     * Collects metadata for the given core dump: the path of the crashed binary, the
+     * backtrace of the crashing thread, and the backtrace of all threads. Transparently
+     * decompresses .lz4 dumps first and deletes the decompressed copy afterwards.
+     */
+    public Map<String, Object> collect(Path coredumpPath) throws IOException, InterruptedException {
+        coredumpPath = decompressCoredump(coredumpPath);
+        Path binPath = readBinPath(coredumpPath);
+
+        Map<String, Object> data = new LinkedHashMap<>();
+        data.put("bin_path", binPath.toString());
+        data.put("backtrace", readBacktrace(coredumpPath, binPath, false));
+        data.put("backtrace_all_threads", readBacktrace(coredumpPath, binPath, true));
+
+        deleteDecompressedCoredump(coredumpPath);
+        return data;
+    }
+
+    /**
+     * Decompresses a .lz4 core dump next to the original and returns the decompressed path;
+     * returns the input unchanged when it is not .lz4. Refuses to decompress when there
+     * may not be enough free disk space to hold the result.
+     */
+    private static Path decompressCoredump(Path coredumpPath) throws IOException, InterruptedException {
+        if (! coredumpPath.toString().endsWith(".lz4")) return coredumpPath;
+
+        if (! diskSpaceAvailable(coredumpPath)) {
+            throw new RuntimeException("Not decompressing " + coredumpPath + " due to not enough disk space available");
+        }
+
+        Path decompressedPath = Paths.get(coredumpPath.toString().replaceFirst("\\.lz4$", ""));
+
+        ProcessResult result = Maintainer.exec("/home/y/bin64/lz4", "-d", coredumpPath.toString(), decompressedPath.toString());
+        if (! result.isSuccess()) {
+            throw new RuntimeException("Failed to decompress file " + coredumpPath + ": " + result);
+        }
+        return decompressedPath;
+    }
+
+    /**
+     * Delete the coredump unless:
+     * - The file is compressed
+     * - There is no compressed file (i.e. it was not decompressed in the first place)
+     */
+    private static void deleteDecompressedCoredump(Path coredumpPath) throws IOException {
+        if (! coredumpPath.toString().endsWith(".lz4") && Paths.get(coredumpPath.toString() + ".lz4").toFile().exists()) {
+            Files.delete(coredumpPath);
+        }
+    }
+
+    /**
+     * Estimates whether the partition holding {@code path} can fit a decompressed core dump.
+     * A core dump can be at most the size of the machine's memory, so MemTotal from
+     * /proc/meminfo is used as an upper bound on the decompressed size.
+     */
+    private static boolean diskSpaceAvailable(Path path) throws IOException {
+        String memInfo = new String(Files.readAllBytes(Paths.get("/proc/meminfo")));
+
+        Matcher matcher = TOTAL_MEMORY_PATTERN.matcher(memInfo);
+        if (!matcher.find()) return false;
+        // MemTotal is reported in kB while File.getFreeSpace() returns bytes: convert to
+        // bytes before comparing. (The original compared kB directly against bytes, making
+        // the check pass with ~1000x too little free space.) Parsed as long to avoid
+        // overflow on large-memory hosts.
+        long totalMemBytes = Long.parseLong(matcher.group("totalMem")) * 1024;
+
+        return path.toFile().getFreeSpace() > totalMemBytes;
+    }
+}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoredumpHandler.java
new file mode 100644
index 00000000000..6b8e7043369
--- /dev/null
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoredumpHandler.java
@@ -0,0 +1,144 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.node.maintenance;
+
+import com.google.gson.Gson;
+import org.apache.http.HttpHeaders;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Duration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+/**
+ * Finds coredumps, collects metadata and reports them
+ *
+ * @author freva
+ */
+public class CoredumpHandler {
+ public static final String FEED_ENDPOINT = "http://panic.vespa.us-west-1.prod.vespa.yahooapis.com:4080/document/v1/panic/core_dump/docid";
+ public static final String PROCESSING_DIRECTORY_NAME = "processing";
+ public static final String DONE_DIRECTORY_NAME = "done";
+ public static final String METADATA_FILE_NAME = "metadata.json";
+
+ private static final Logger logger = Logger.getLogger(CoredumpHandler.class.getName());
+ private static final Gson gson = new Gson();
+ private final Path path;
+
+ private final HttpClient httpClient;
+ private final CoreCollector coreCollector;
+ private final Path processingDir;
+ private final Path doneDir;
+
+ private final Map<String, Object> nodeAttributes;
+
+ public CoredumpHandler(HttpClient httpClient, CoreCollector coreCollector, Path path, Map<String, Object> nodeAttributes) {
+ this.httpClient = httpClient;
+ this.coreCollector = coreCollector;
+ this.path = path;
+ this.processingDir = path.resolve(PROCESSING_DIRECTORY_NAME);
+ this.doneDir = path.resolve(DONE_DIRECTORY_NAME);
+ this.nodeAttributes = nodeAttributes;
+ }
+
+ public void processAll() throws IOException {
+ removeJavaCoredumps();
+ processCoredumps();
+ reportCoredumps();
+ removeOldCoredumps();
+ }
+
+ private void removeJavaCoredumps() {
+ DeleteOldAppData.deleteFiles(path.toString(), 0, "^java_pid.*\\.hprof$", false);
+ }
+
+ void processCoredumps() throws IOException {
+ processingDir.toFile().mkdirs();
+
+ Files.list(path)
+ .filter(path -> path.toFile().isFile())
+ .forEach(coredumpPath -> {
+ try {
+ coredumpPath = startProcessing(coredumpPath);
+
+ Path metadataPath = coredumpPath.getParent().resolve(METADATA_FILE_NAME);
+ Map<String, Object> metadata = collectMetada(coredumpPath);
+ writeMetadata(metadataPath, metadata);
+ } catch (Throwable e) {
+ logger.log(Level.WARNING, "Failed to process coredump " + coredumpPath, e);
+ }
+ });
+ }
+
+ void reportCoredumps() throws IOException {
+ doneDir.toFile().mkdirs();
+
+ Files.list(processingDir)
+ .filter(path -> path.toFile().isDirectory())
+ .forEach(coredumpDirectory -> {
+ try {
+ report(coredumpDirectory);
+ finishProcessing(coredumpDirectory);
+ } catch (Throwable e) {
+ logger.log(Level.WARNING, "Failed to report coredump " + coredumpDirectory, e);
+ }
+ });
+ }
+
+ private void removeOldCoredumps() {
+ DeleteOldAppData.deleteDirectories(doneDir.toString(), Duration.ofDays(10).getSeconds(), null);
+ }
+
+ private Path startProcessing(Path coredumpPath) throws IOException {
+ Path folder = processingDir.resolve(UUID.randomUUID().toString());
+ folder.toFile().mkdirs();
+ return Files.move(coredumpPath, folder.resolve(coredumpPath.getFileName()));
+ }
+
+ private Map<String, Object> collectMetada(Path coredumpPath) throws IOException, InterruptedException {
+ Map<String, Object> metadata = coreCollector.collect(coredumpPath);
+ metadata.putAll(nodeAttributes);
+
+ Map<String, Object> fields = new HashMap<>();
+ fields.put("fields", metadata);
+ return fields;
+ }
+
+ private void writeMetadata(Path metadataPath, Map<String, Object> metadata) throws IOException {
+ Files.write(metadataPath, gson.toJson(metadata).getBytes());
+ }
+
+ private void report(Path coredumpDirectory) throws IOException, InterruptedException {
+ // Use core dump UUID as document ID
+ String documentId = coredumpDirectory.getFileName().toString();
+ String metadata = new String(Files.readAllBytes(coredumpDirectory.resolve(METADATA_FILE_NAME)));
+
+ HttpPost post = new HttpPost(FEED_ENDPOINT + "/" + documentId);
+ post.setHeader(HttpHeaders.CONTENT_TYPE, "application/json");
+ post.setEntity(new StringEntity(metadata));
+
+ HttpResponse response = httpClient.execute(post);
+ if (response.getStatusLine().getStatusCode() / 100 != 2) {
+ String result = new BufferedReader(new InputStreamReader(response.getEntity().getContent()))
+ .lines().collect(Collectors.joining("\n"));
+ throw new RuntimeException("POST to " + post.getURI() + " failed with HTTP: " +
+ response.getStatusLine().getStatusCode() + " [" + result + "]");
+ }
+ logger.info("Successfully reported coredump " + documentId);
+ }
+
+ private void finishProcessing(Path coredumpDirectory) throws IOException {
+ Files.move(coredumpDirectory, doneDir.resolve(coredumpDirectory.getFileName()));
+ }
+}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java
index 02a05f9fa9d..56ad4de5b31 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java
@@ -1,16 +1,23 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.maintenance;
+import com.google.gson.Gson;
import com.yahoo.io.IOUtils;
import com.yahoo.log.LogSetup;
import com.yahoo.vespa.hosted.dockerapi.ContainerName;
+import com.yahoo.vespa.hosted.dockerapi.ProcessResult;
+import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec;
+import com.yahoo.vespa.hosted.node.admin.util.Environment;
import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger;
import io.airlift.airline.Arguments;
import io.airlift.airline.Cli;
import io.airlift.airline.Command;
import io.airlift.airline.Help;
+import io.airlift.airline.Option;
import io.airlift.airline.ParseArgumentsUnexpectedException;
import io.airlift.airline.ParseOptionMissingException;
+import org.apache.http.client.HttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
import java.io.File;
import java.io.IOException;
@@ -24,14 +31,17 @@ import java.time.Duration;
import java.time.Instant;
import java.util.Arrays;
import java.util.Date;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.TimeZone;
+import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
/**
- * @author valerijf
+ * @author freva
*/
public class Maintainer {
private static final Path ROOT = Paths.get("/");
@@ -40,12 +50,19 @@ public class Maintainer {
private static final Path APPLICATION_STORAGE_PATH_FOR_HOST = ROOT.resolve(RELATIVE_APPLICATION_STORAGE_PATH);
private static final String APPLICATION_STORAGE_CLEANUP_PATH_PREFIX = "cleanup_";
+ private static final HttpClient HTTP_CLIENT = HttpClientBuilder.create().build();
+ private static final CoreCollector CORE_COLLECTOR = new CoreCollector();
+ private static final Gson gson = new Gson();
+
private static DateFormat filenameFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
- public static final String JOB_DELETE_OLD_APP_DATA = "delete-old-app-data";
- public static final String JOB_ARCHIVE_APP_DATA = "archive-app-data";
- public static final String JOB_CLEAN_CORE_DUMPS = "clean-core-dumps";
- public static final String JOB_CLEAN_HOME = "clean-home";
+ private static final String JOB_DELETE_OLD_APP_DATA = "delete-old-app-data";
+ private static final String JOB_ARCHIVE_APP_DATA = "archive-app-data";
+ private static final String JOB_CLEAN_CORE_DUMPS = "clean-core-dumps";
+ private static final String JOB_CLEAN_HOME = "clean-home";
+ private static final String JOB_HANDLE_CORE_DUMPS = "handle-core-dumps";
+
+ private static Optional<String> kernelVersion = Optional.empty();
static {
filenameFormatter.setTimeZone(TimeZone.getTimeZone("UTC"));
@@ -62,7 +79,8 @@ public class Maintainer {
DeleteOldAppDataArguments.class,
CleanCoreDumpsArguments.class,
CleanHomeArguments.class,
- ArchiveApplicationData.class);
+ ArchiveApplicationData.class,
+ HandleCoreDumpsForContainer.class);
Cli<Runnable> gitParser = builder.build();
try {
@@ -89,6 +107,31 @@ public class Maintainer {
executeMaintainer(logger, JOB_ARCHIVE_APP_DATA, containerName.asString());
}
+ /**
+ * Collects node-level attributes (hostname, region, environment, flavor, kernel version
+ * and, when present, docker image, vespa version and owner) and invokes the
+ * "handle-core-dumps" maintenance job with them serialized as JSON.
+ */
+ public static void handleCoreDumpsForContainer(PrefixLogger logger, ContainerNodeSpec nodeSpec, Environment environment) {
+ Map<String, Object> attributes = new HashMap<>();
+ attributes.put("hostname", nodeSpec.hostname);
+ attributes.put("region", environment.getRegion());
+ attributes.put("environment", environment.getEnvironment());
+ attributes.put("flavor", nodeSpec.nodeFlavor);
+ try {
+ attributes.put("kernel_version", getKernelVersion());
+ } catch (Throwable ignored) {
+ // Best effort: the kernel version is nice-to-have metadata, never fail the job over it.
+ attributes.put("kernel_version", "unknown");
+ }
+
+ // Optional node fields are only included when present.
+ if (nodeSpec.wantedDockerImage.isPresent()) attributes.put("docker_image", nodeSpec.wantedDockerImage.get().asString());
+ if (nodeSpec.vespaVersion.isPresent()) attributes.put("vespa_version", nodeSpec.vespaVersion.get());
+ if (nodeSpec.owner.isPresent()) {
+ attributes.put("tenant", nodeSpec.owner.get().tenant);
+ attributes.put("application", nodeSpec.owner.get().application);
+ attributes.put("instance", nodeSpec.owner.get().instance);
+ }
+
+ // The JSON string is parsed back with gson in HandleCoreDumpsForContainer.run().
+ executeMaintainer(logger, JOB_HANDLE_CORE_DUMPS,
+ "--container", nodeSpec.containerName.asString(),
+ "--attributes", gson.toJson(attributes));
+ }
+
private static void executeMaintainer(PrefixLogger logger, String... params) {
String[] baseArguments = {"sudo", "/home/y/libexec/vespa/node-admin/maintenance.sh"};
String[] args = concatenateArrays(baseArguments, params);
@@ -97,17 +140,24 @@ public class Maintainer {
env.put("VESPA_SERVICE_NAME", "maintainer");
try {
- Process process = processBuilder.start();
- String output = IOUtils.readAll(new InputStreamReader(process.getInputStream()));
- String errors = IOUtils.readAll(new InputStreamReader(process.getErrorStream()));
+ ProcessResult result = exec(args);
- if (! output.isEmpty()) logger.info(output);
- if (! errors.isEmpty()) logger.error(errors);
- } catch (IOException e) {
+ if (! result.getOutput().isEmpty()) logger.info(result.getOutput());
+ if (! result.getErrors().isEmpty()) logger.error(result.getErrors());
+ } catch (IOException | InterruptedException e) {
logger.warning("Failed to execute command " + Arrays.toString(args), e);
}
}
+ /**
+ * Runs the given command directly (no shell) and waits for it to terminate, returning
+ * its exit status, stdout and stderr as a ProcessResult.
+ *
+ * NOTE(review): stdout is read to completion before stderr is read; a child process
+ * that fills the stderr pipe buffer while we are still reading stdout could block
+ * indefinitely -- confirm the invoked commands produce modest stderr output.
+ */
+ public static ProcessResult exec(String... args) throws IOException, InterruptedException {
+ ProcessBuilder processBuilder = new ProcessBuilder(args);
+ Process process = processBuilder.start();
+ String output = IOUtils.readAll(new InputStreamReader(process.getInputStream()));
+ String errors = IOUtils.readAll(new InputStreamReader(process.getErrorStream()));
+
+ return new ProcessResult(process.waitFor(), output, errors);
+ }
+
public static String[] concatenateArrays(String[] ar1, String... ar2) {
String[] concatenated = new String[ar1.length + ar2.length];
System.arraycopy(ar1, 0, concatenated, 0, ar1.length);
@@ -199,6 +249,35 @@ public class Maintainer {
}
}
+    @SuppressWarnings("unchecked")
+    @Command(name = JOB_HANDLE_CORE_DUMPS, description = "Finds container's coredumps, collects metadata and reports them")
+    public static class HandleCoreDumpsForContainer implements Runnable {
+        @Option(name = "--container", description = "Name of the container")
+        public String container;
+
+        // The value is produced by handleCoreDumpsForContainer() with gson.toJson() and
+        // parsed below with gson.fromJson() -- it is a JSON object, not key=value pairs,
+        // so the help text is corrected accordingly.
+        @Option(name = "--attributes", description = "JSON object of attributes to attach to each coredump report")
+        public String attributes;
+
+        @Override
+        public void run() {
+            Logger logger = Logger.getLogger(HandleCoreDumpsForContainer.class.getName());
+
+            if (container == null) {
+                throw new IllegalArgumentException("<container> is required");
+            }
+
+            try {
+                Map<String, Object> attributesMap = (Map<String, Object>) gson.fromJson(attributes, Map.class);
+
+                // Coredumps land in the container's /home/y/var/crash; resolve that path
+                // as seen from the node-admin container.
+                Path path = new Maintainer().pathInNodeAdminFromPathInNode(new ContainerName(container), "/home/y/var/crash");
+                CoredumpHandler coredumpHandler = new CoredumpHandler(HTTP_CLIENT, CORE_COLLECTOR, path, attributesMap);
+                coredumpHandler.processAll();
+            } catch (Throwable e) {
+                // Coredump handling is best-effort maintenance: log and never propagate.
+                logger.log(Level.WARNING, "Could not process coredumps", e);
+            }
+        }
+    }
+
/**
* Absolute path in node admin container to the node cleanup directory.
@@ -241,4 +320,17 @@ public class Maintainer {
.resolve(containerName.asString())
.resolve(ROOT.relativize(pathInNode));
}
+
+ /**
+ * Returns the kernel version as reported by "uname -r", caching the result after the
+ * first successful call.
+ *
+ * NOTE(review): the cache is an unsynchronized static field -- assumed to only be
+ * accessed from a single thread; confirm if this is ever called concurrently.
+ *
+ * @throws RuntimeException if "uname -r" exits with a non-zero status
+ */
+ public static String getKernelVersion() throws IOException, InterruptedException {
+ if (! kernelVersion.isPresent()) {
+ ProcessResult result = exec("uname", "-r");
+ if (result.isSuccess()) {
+ kernelVersion = Optional.of(result.getOutput().trim());
+ } else {
+ throw new RuntimeException("Failed to get kernel version\n" + result);
+ }
+ }
+
+ return kernelVersion.get();
+ }
}