diff options
author | freva <valerijf@yahoo-inc.com> | 2016-11-22 14:41:39 +0100 |
---|---|---|
committer | freva <valerijf@yahoo-inc.com> | 2016-11-22 14:41:39 +0100 |
commit | e36b7c99d536cb9fa249e77f2e628690aa344617 (patch) | |
tree | fa162e6d3559040b8aa629f366164f0b2a707af8 /node-admin | |
parent | 66ff6b8848a2245ce457b70f1dada6a4869b6b40 (diff) |
Created coredump handler for docker containers
Diffstat (limited to 'node-admin')
5 files changed, 369 insertions, 14 deletions
diff --git a/node-admin/pom.xml b/node-admin/pom.xml index 1cc89a1fd09..159124f6657 100644 --- a/node-admin/pom.xml +++ b/node-admin/pom.xml @@ -100,6 +100,13 @@ <artifactId>airline</artifactId> <version>0.7</version> </dependency> + + <!-- JSON parser for Maintenance JVM --> + <dependency> + <groupId>com.google.code.gson</groupId> + <artifactId>gson</artifactId> + <version>2.6.2</version> + </dependency> </dependencies> <build> @@ -126,7 +133,6 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-assembly-plugin</artifactId> <configuration> - <finalName>node-admin-maintenance</finalName> <descriptorRefs> <descriptorRef>jar-with-dependencies</descriptorRef> diff --git a/node-admin/scripts/maintenance.sh b/node-admin/scripts/maintenance.sh index f637bdca035..1a3b7fcd4a9 100755 --- a/node-admin/scripts/maintenance.sh +++ b/node-admin/scripts/maintenance.sh @@ -2,7 +2,7 @@ if [ -z "$CLASSPATH" ]; then - CLASSPATH=/home/y/lib/jars/node-admin-maintenance-jar-with-dependencies.jar:/home/y/lib/jars/docker-api-jar-with-dependencies.jar:/home/y/lib/jars/vespalog.jar + CLASSPATH=/home/y/lib/jars/node-admin-maintenance-jar-with-dependencies.jar:/home/y/lib/jars/docker-api-jar-with-dependencies.jar:/home/y/lib/jars/vespalog.jar:/home/y/lib/jars/vespajlib.jar fi java \ diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoreCollector.java new file mode 100644 index 00000000000..9e9c0d3525d --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoreCollector.java @@ -0,0 +1,113 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.vespa.hosted.node.maintenance;
+
+import com.yahoo.vespa.hosted.dockerapi.ProcessResult;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Takes in a compressed (lz4) or uncompressed core dump and collects relevant metadata.
+ *
+ * The metadata is produced by running the external tools {@code file}, {@code gdb} and {@code lz4}
+ * through {@link Maintainer#exec(String...)}.
+ *
+ * @author freva
+ */
+public class CoreCollector {
+    private static final String GDB_PATH = "/home/y/bin64/gdb";
+    private static final String LZ4_PATH = "/home/y/bin64/lz4";
+    private static final String LZ4_SUFFIX = ".lz4";
+    /** /proc/meminfo reports MemTotal in kB, while free disk space is reported in bytes. */
+    private static final long BYTES_PER_KB = 1024;
+
+    // MULTILINE: the line is picked out of gdb's complete output instead of being pre-filtered by grep
+    private static final Pattern CORE_GENERATOR_PATH_PATTERN =
+            Pattern.compile("^Core was generated by `(?<path>.*?)'.$", Pattern.MULTILINE);
+    private static final Pattern EXECFN_PATH_PATTERN = Pattern.compile("^.* execfn: '(?<path>.*?)'");
+    private static final Pattern FROM_PATH_PATTERN = Pattern.compile("^.* from '(?<path>.*?)'");
+    private static final Pattern TOTAL_MEMORY_PATTERN = Pattern.compile("^MemTotal:\\s*(?<totalMem>\\d+) kB$", Pattern.MULTILINE);
+
+
+    /** Returns the first space separated token of a command line, i.e. the binary without its arguments. */
+    private static Path binPathOf(String commandLine) {
+        return Paths.get(commandLine.split(" ")[0]);
+    }
+
+    /**
+     * Finds the binary that produced the core dump from gdb's "Core was generated by" line.
+     *
+     * gdb is started directly rather than through {@code sh -c}: the core dump path is then passed as a
+     * plain argument and is never interpreted by a shell.
+     *
+     * @throws RuntimeException if gdb's output does not contain the expected line
+     */
+    private static Path readBinPathFallback(Path coredumpPath) throws IOException, InterruptedException {
+        ProcessResult result = Maintainer.exec(GDB_PATH, "-n", "-batch", "-core", coredumpPath.toString());
+
+        Matcher matcher = CORE_GENERATOR_PATH_PATTERN.matcher(result.getOutput());
+        if (! matcher.find()) {
+            throw new RuntimeException("Failed to extract binary path from " + result);
+        }
+        return binPathOf(matcher.group("path"));
+    }
+
+    /**
+     * Finds the binary that produced the core dump. The output of {@code file} is tried first
+     * ("execfn: '...'", then "from '...'"); gdb is only consulted when neither is present.
+     */
+    private static Path readBinPath(Path coredumpPath) throws IOException, InterruptedException {
+        ProcessResult result = Maintainer.exec("file", coredumpPath.toString());
+
+        Matcher execfnMatcher = EXECFN_PATH_PATTERN.matcher(result.getOutput());
+        if (execfnMatcher.find()) {
+            return binPathOf(execfnMatcher.group("path"));
+        }
+
+        Matcher fromMatcher = FROM_PATH_PATTERN.matcher(result.getOutput());
+        if (fromMatcher.find()) {
+            return binPathOf(fromMatcher.group("path"));
+        }
+
+        return readBinPathFallback(coredumpPath);
+    }
+
+    /**
+     * Runs gdb in batch mode on the given binary and core dump.
+     *
+     * @param allThreads if true the backtrace of every thread is requested ("thread apply all bt"),
+     *                   otherwise only "bt"
+     * @return gdb's standard output, one element per line
+     */
+    private static List<String> readBacktrace(Path coredumpPath, Path binPath, boolean allThreads) throws IOException, InterruptedException {
+        ProcessResult result = Maintainer.exec(GDB_PATH, "-n", "-ex", (allThreads ? "thread apply all bt" : "bt"),
+                "-batch", binPath.toString(), coredumpPath.toString());
+        return Arrays.asList(result.getOutput().split("\n"));
+    }
+
+    /**
+     * Collects metadata about a core dump.
+     *
+     * @param coredumpPath path to the core dump; a path ending in ".lz4" is decompressed first
+     * @return insertion ordered map with the keys "bin_path", "backtrace" and "backtrace_all_threads"
+     * @throws RuntimeException if the core dump cannot be decompressed or the binary path cannot be determined
+     */
+    public Map<String, Object> collect(Path coredumpPath) throws IOException, InterruptedException {
+        Path decompressedPath = decompressCoredump(coredumpPath);
+        try {
+            Path binPath = readBinPath(decompressedPath);
+
+            Map<String, Object> data = new LinkedHashMap<>();
+            data.put("bin_path", binPath.toString());
+            data.put("backtrace", readBacktrace(decompressedPath, binPath, false));
+            data.put("backtrace_all_threads", readBacktrace(decompressedPath, binPath, true));
+            return data;
+        } finally {
+            // Also clean up when metadata extraction fails, otherwise the decompressed copy is left on disk
+            deleteDecompressedCoredump(decompressedPath);
+        }
+    }
+
+
+    /**
+     * Decompresses the core dump next to the compressed file if its name ends in ".lz4".
+     *
+     * @return the path of the decompressed file, or the given path if it is not compressed
+     * @throws RuntimeException if there is too little free disk space or lz4 fails
+     */
+    private static Path decompressCoredump(Path coredumpPath) throws IOException, InterruptedException {
+        if (! coredumpPath.toString().endsWith(LZ4_SUFFIX)) return coredumpPath;
+
+        if (! diskSpaceAvailable(coredumpPath)) {
+            throw new RuntimeException("Not decompressing " + coredumpPath + " due to not enough disk space available");
+        }
+
+        Path decompressedPath = Paths.get(coredumpPath.toString().replaceFirst("\\.lz4$", ""));
+
+        ProcessResult result = Maintainer.exec(LZ4_PATH, "-d", coredumpPath.toString(), decompressedPath.toString());
+        if (! result.isSuccess()) {
+            throw new RuntimeException("Failed to decompress file " + coredumpPath + ": " + result);
+        }
+        return decompressedPath;
+    }
+
+    /**
+     * Delete the coredump unless:
+     * - The file is compressed
+     * - There is no compressed file (i.e. it was not decompressed in the first place)
+     */
+    private static void deleteDecompressedCoredump(Path coredumpPath) throws IOException {
+        if (! coredumpPath.toString().endsWith(LZ4_SUFFIX) && Paths.get(coredumpPath.toString() + LZ4_SUFFIX).toFile().exists()) {
+            Files.delete(coredumpPath);
+        }
+    }
+
+    /**
+     * Returns true if the free space reported for the given path exceeds the MemTotal value of
+     * /proc/meminfo, false if it does not or if MemTotal cannot be found.
+     */
+    private static boolean diskSpaceAvailable(Path path) throws IOException {
+        String memInfo = new String(Files.readAllBytes(Paths.get("/proc/meminfo")), StandardCharsets.UTF_8);
+
+        Matcher matcher = TOTAL_MEMORY_PATTERN.matcher(memInfo);
+        if (! matcher.find()) return false;
+
+        // MemTotal is given in kB and getFreeSpace() in bytes: convert before comparing, and use long
+        // since the byte count does not fit in an int
+        long totalMemBytes = Long.parseLong(matcher.group("totalMem")) * BYTES_PER_KB;
+
+        return path.toFile().getFreeSpace() > totalMemBytes;
+    }
+}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoredumpHandler.java
new file mode 100644
index 00000000000..6b8e7043369
--- /dev/null
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/CoredumpHandler.java
@@ -0,0 +1,144 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.node.maintenance;
+
+import com.google.gson.Gson;
+import org.apache.http.HttpHeaders;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Duration;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.function.Predicate;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Finds coredumps, collects metadata and reports them
+ *
+ * A core dump found directly in the crash directory is moved to
+ * {@code processing/<random UUID>/}, where its metadata is written to {@code metadata.json}.
+ * Once the metadata has been POSTed to the feed endpoint the directory is moved to {@code done/}.
+ *
+ * @author freva
+ */
+public class CoredumpHandler {
+    public static final String FEED_ENDPOINT = "http://panic.vespa.us-west-1.prod.vespa.yahooapis.com:4080/document/v1/panic/core_dump/docid";
+    public static final String PROCESSING_DIRECTORY_NAME = "processing";
+    public static final String DONE_DIRECTORY_NAME = "done";
+    public static final String METADATA_FILE_NAME = "metadata.json";
+
+    private static final Logger logger = Logger.getLogger(CoredumpHandler.class.getName());
+    private static final Gson gson = new Gson();
+    private final Path path;
+
+    private final HttpClient httpClient;
+    private final CoreCollector coreCollector;
+    private final Path processingDir;
+    private final Path doneDir;
+
+    private final Map<String, Object> nodeAttributes;
+
+    /**
+     * @param httpClient     client used to POST the metadata to {@link #FEED_ENDPOINT}
+     * @param coreCollector  extracts the metadata from each core dump
+     * @param path           directory to look for core dumps in; the processing and done directories
+     *                       are created beneath it
+     * @param nodeAttributes attributes that are added to the metadata of every core dump
+     */
+    public CoredumpHandler(HttpClient httpClient, CoreCollector coreCollector, Path path, Map<String, Object> nodeAttributes) {
+        this.httpClient = httpClient;
+        this.coreCollector = coreCollector;
+        this.path = path;
+        this.processingDir = path.resolve(PROCESSING_DIRECTORY_NAME);
+        this.doneDir = path.resolve(DONE_DIRECTORY_NAME);
+        this.nodeAttributes = nodeAttributes;
+    }
+
+    /** Removes Java heap dumps, processes and reports new core dumps, then removes old reported ones. */
+    public void processAll() throws IOException {
+        removeJavaCoredumps();
+        processCoredumps();
+        reportCoredumps();
+        removeOldCoredumps();
+    }
+
+    private void removeJavaCoredumps() {
+        DeleteOldAppData.deleteFiles(path.toString(), 0, "^java_pid.*\\.hprof$", false);
+    }
+
+    /**
+     * Moves every regular file in the crash directory into its own processing directory and writes
+     * its metadata there. A failure for one core dump is logged and does not stop the others.
+     */
+    void processCoredumps() throws IOException {
+        Files.createDirectories(processingDir);
+
+        for (Path coredumpPath : listEntries(path, Files::isRegularFile)) {
+            processCoredump(coredumpPath);
+        }
+    }
+
+    /**
+     * POSTs the metadata of every directory in the processing directory and moves successfully
+     * reported directories to the done directory. A failure for one directory is logged and does
+     * not stop the others.
+     */
+    void reportCoredumps() throws IOException {
+        Files.createDirectories(doneDir);
+
+        for (Path coredumpDirectory : listEntries(processingDir, Files::isDirectory)) {
+            try {
+                report(coredumpDirectory);
+                finishProcessing(coredumpDirectory);
+            } catch (Throwable e) {
+                logger.log(Level.WARNING, "Failed to report coredump " + coredumpDirectory, e);
+            }
+        }
+    }
+
+    private void removeOldCoredumps() {
+        DeleteOldAppData.deleteDirectories(doneDir.toString(), Duration.ofDays(10).getSeconds(), null);
+    }
+
+    /**
+     * Returns a snapshot of the entries of the directory that match the filter. The directory stream
+     * is closed before returning, so callers may move entries out of the directory while iterating.
+     */
+    private static List<Path> listEntries(Path directory, Predicate<Path> filter) throws IOException {
+        try (Stream<Path> entries = Files.list(directory)) {
+            return entries.filter(filter).collect(Collectors.toList());
+        }
+    }
+
+    /** Processes a single core dump, logging instead of propagating any failure. */
+    private void processCoredump(Path coredumpPath) {
+        // Tracks where the core dump currently is so that the log message points at the right file
+        Path currentPath = coredumpPath;
+        try {
+            currentPath = startProcessing(coredumpPath);
+
+            Path metadataPath = currentPath.getParent().resolve(METADATA_FILE_NAME);
+            writeMetadata(metadataPath, collectMetadata(currentPath));
+        } catch (Throwable e) {
+            // Do not lose the interrupt: collecting metadata waits for external processes
+            if (e instanceof InterruptedException) Thread.currentThread().interrupt();
+            // NOTE(review): a core dump that fails here stays in the processing directory without
+            // metadata.json, so reportCoredumps() will presumably fail for it on every run - confirm
+            logger.log(Level.WARNING, "Failed to process coredump " + currentPath, e);
+        }
+    }
+
+    /** Moves the core dump into a new, randomly named directory below the processing directory. */
+    private Path startProcessing(Path coredumpPath) throws IOException {
+        Path folder = Files.createDirectories(processingDir.resolve(UUID.randomUUID().toString()));
+        return Files.move(coredumpPath, folder.resolve(coredumpPath.getFileName()));
+    }
+
+    /** Returns the collected metadata merged with the node attributes, wrapped in a "fields" object. */
+    private Map<String, Object> collectMetadata(Path coredumpPath) throws IOException, InterruptedException {
+        Map<String, Object> metadata = coreCollector.collect(coredumpPath);
+        metadata.putAll(nodeAttributes);
+
+        Map<String, Object> fields = new HashMap<>();
+        fields.put("fields", metadata);
+        return fields;
+    }
+
+    private void writeMetadata(Path metadataPath, Map<String, Object> metadata) throws IOException {
+        Files.write(metadataPath, gson.toJson(metadata).getBytes(StandardCharsets.UTF_8));
+    }
+
+    /**
+     * POSTs the metadata file of the given directory to the feed endpoint.
+     *
+     * @throws RuntimeException if the response status is not 2xx
+     */
+    private void report(Path coredumpDirectory) throws IOException {
+        // Use core dump UUID as document ID
+        String documentId = coredumpDirectory.getFileName().toString();
+        String metadata = new String(Files.readAllBytes(coredumpDirectory.resolve(METADATA_FILE_NAME)), StandardCharsets.UTF_8);
+
+        HttpPost post = new HttpPost(FEED_ENDPOINT + "/" + documentId);
+        post.setHeader(HttpHeaders.CONTENT_TYPE, "application/json");
+        post.setEntity(new StringEntity(metadata, StandardCharsets.UTF_8));
+
+        HttpResponse response = httpClient.execute(post);
+        // Always read the body, also on success: the connection is not released until it is consumed
+        String result = readBody(response);
+        int statusCode = response.getStatusLine().getStatusCode();
+        if (statusCode / 100 != 2) {
+            throw new RuntimeException("POST to " + post.getURI() + " failed with HTTP: " +
+                    statusCode + " [" + result + "]");
+        }
+        logger.info("Successfully reported coredump " + documentId);
+    }
+
+    /** Reads the complete response body and closes its stream; returns "" if there is no body. */
+    private static String readBody(HttpResponse response) throws IOException {
+        if (response.getEntity() == null) return "";
+
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader(response.getEntity().getContent(), StandardCharsets.UTF_8))) {
+            return reader.lines().collect(Collectors.joining("\n"));
+        }
+    }
+
+    private void finishProcessing(Path coredumpDirectory) throws IOException {
+        Files.move(coredumpDirectory, doneDir.resolve(coredumpDirectory.getFileName()));
+    }
+}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java
index 02a05f9fa9d..56ad4de5b31 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/maintenance/Maintainer.java
@@ -1,16 +1,23 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.maintenance; +import com.google.gson.Gson; import com.yahoo.io.IOUtils; import com.yahoo.log.LogSetup; import com.yahoo.vespa.hosted.dockerapi.ContainerName; +import com.yahoo.vespa.hosted.dockerapi.ProcessResult; +import com.yahoo.vespa.hosted.node.admin.ContainerNodeSpec; +import com.yahoo.vespa.hosted.node.admin.util.Environment; import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger; import io.airlift.airline.Arguments; import io.airlift.airline.Cli; import io.airlift.airline.Command; import io.airlift.airline.Help; +import io.airlift.airline.Option; import io.airlift.airline.ParseArgumentsUnexpectedException; import io.airlift.airline.ParseOptionMissingException; +import org.apache.http.client.HttpClient; +import org.apache.http.impl.client.HttpClientBuilder; import java.io.File; import java.io.IOException; @@ -24,14 +31,17 @@ import java.time.Duration; import java.time.Instant; import java.util.Arrays; import java.util.Date; +import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.TimeZone; +import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Pattern; /** - * @author valerijf + * @author freva */ public class Maintainer { private static final Path ROOT = Paths.get("/"); @@ -40,12 +50,19 @@ public class Maintainer { private static final Path APPLICATION_STORAGE_PATH_FOR_HOST = ROOT.resolve(RELATIVE_APPLICATION_STORAGE_PATH); private static final String APPLICATION_STORAGE_CLEANUP_PATH_PREFIX = "cleanup_"; + private static final HttpClient HTTP_CLIENT = HttpClientBuilder.create().build(); + private static final CoreCollector CORE_COLLECTOR = new CoreCollector(); + private static final Gson gson = new Gson(); + private static DateFormat filenameFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); - public static final String JOB_DELETE_OLD_APP_DATA = "delete-old-app-data"; - public static final String 
JOB_ARCHIVE_APP_DATA = "archive-app-data"; - public static final String JOB_CLEAN_CORE_DUMPS = "clean-core-dumps"; - public static final String JOB_CLEAN_HOME = "clean-home"; + private static final String JOB_DELETE_OLD_APP_DATA = "delete-old-app-data"; + private static final String JOB_ARCHIVE_APP_DATA = "archive-app-data"; + private static final String JOB_CLEAN_CORE_DUMPS = "clean-core-dumps"; + private static final String JOB_CLEAN_HOME = "clean-home"; + private static final String JOB_HANDLE_CORE_DUMPS = "handle-core-dumps"; + + private static Optional<String> kernelVersion = Optional.empty(); static { filenameFormatter.setTimeZone(TimeZone.getTimeZone("UTC")); @@ -62,7 +79,8 @@ public class Maintainer { DeleteOldAppDataArguments.class, CleanCoreDumpsArguments.class, CleanHomeArguments.class, - ArchiveApplicationData.class); + ArchiveApplicationData.class, + HandleCoreDumpsForContainer.class); Cli<Runnable> gitParser = builder.build(); try { @@ -89,6 +107,31 @@ public class Maintainer { executeMaintainer(logger, JOB_ARCHIVE_APP_DATA, containerName.asString()); } + public static void handleCoreDumpsForContainer(PrefixLogger logger, ContainerNodeSpec nodeSpec, Environment environment) { + Map<String, Object> attributes = new HashMap<>(); + attributes.put("hostname", nodeSpec.hostname); + attributes.put("region", environment.getRegion()); + attributes.put("environment", environment.getEnvironment()); + attributes.put("flavor", nodeSpec.nodeFlavor); + try { + attributes.put("kernel_version", getKernelVersion()); + } catch (Throwable ignored) { + attributes.put("kernel_version", "unknown"); + } + + if (nodeSpec.wantedDockerImage.isPresent()) attributes.put("docker_image", nodeSpec.wantedDockerImage.get().asString()); + if (nodeSpec.vespaVersion.isPresent()) attributes.put("vespa_version", nodeSpec.vespaVersion.get()); + if (nodeSpec.owner.isPresent()) { + attributes.put("tenant", nodeSpec.owner.get().tenant); + attributes.put("application", 
nodeSpec.owner.get().application); + attributes.put("instance", nodeSpec.owner.get().instance); + } + + executeMaintainer(logger, JOB_HANDLE_CORE_DUMPS, + "--container", nodeSpec.containerName.asString(), + "--attributes", gson.toJson(attributes)); + } + private static void executeMaintainer(PrefixLogger logger, String... params) { String[] baseArguments = {"sudo", "/home/y/libexec/vespa/node-admin/maintenance.sh"}; String[] args = concatenateArrays(baseArguments, params); @@ -97,17 +140,24 @@ public class Maintainer { env.put("VESPA_SERVICE_NAME", "maintainer"); try { - Process process = processBuilder.start(); - String output = IOUtils.readAll(new InputStreamReader(process.getInputStream())); - String errors = IOUtils.readAll(new InputStreamReader(process.getErrorStream())); + ProcessResult result = exec(args); - if (! output.isEmpty()) logger.info(output); - if (! errors.isEmpty()) logger.error(errors); - } catch (IOException e) { + if (! result.getOutput().isEmpty()) logger.info(result.getOutput()); + if (! result.getErrors().isEmpty()) logger.error(result.getErrors()); + } catch (IOException | InterruptedException e) { logger.warning("Failed to execute command " + Arrays.toString(args), e); } } + public static ProcessResult exec(String... args) throws IOException, InterruptedException { + ProcessBuilder processBuilder = new ProcessBuilder(args); + Process process = processBuilder.start(); + String output = IOUtils.readAll(new InputStreamReader(process.getInputStream())); + String errors = IOUtils.readAll(new InputStreamReader(process.getErrorStream())); + + return new ProcessResult(process.waitFor(), output, errors); + } + public static String[] concatenateArrays(String[] ar1, String... 
ar2) { String[] concatenated = new String[ar1.length + ar2.length]; System.arraycopy(ar1, 0, concatenated, 0, ar1.length); @@ -199,6 +249,35 @@ public class Maintainer { } } + @SuppressWarnings("unchecked") + @Command(name = JOB_HANDLE_CORE_DUMPS, description = "Finds container's coredumps, collects metadata and reports them") + public static class HandleCoreDumpsForContainer implements Runnable { + @Option(name = "--container", description = "Name of the container") + public String container; + + @Option(name = "--attributes", description = "Comma separated key=value pairs") + public String attributes; + + @Override + public void run() { + Logger logger = Logger.getLogger(HandleCoreDumpsForContainer.class.getName()); + + if (container == null) { + throw new IllegalArgumentException("<container> is required"); + } + + try { + Map<String, Object> attributesMap = (Map<String, Object>) gson.fromJson(attributes, Map.class); + + Path path = new Maintainer().pathInNodeAdminFromPathInNode(new ContainerName(container), "/home/y/var/crash"); + CoredumpHandler coredumpHandler = new CoredumpHandler(HTTP_CLIENT, CORE_COLLECTOR, path, attributesMap); + coredumpHandler.processAll(); + } catch (Throwable e) { + logger.log(Level.WARNING, "Could not process coredumps", e); + } + } + } + /** * Absolute path in node admin container to the node cleanup directory. @@ -241,4 +320,17 @@ public class Maintainer { .resolve(containerName.asString()) .resolve(ROOT.relativize(pathInNode)); } + + public static String getKernelVersion() throws IOException, InterruptedException { + if (! kernelVersion.isPresent()) { + ProcessResult result = exec("uname", "-r"); + if (result.isSuccess()) { + kernelVersion = Optional.of(result.getOutput().trim()); + } else { + throw new RuntimeException("Failed to get kernel version\n" + result); + } + } + + return kernelVersion.get(); + } } |