18 files changed, 322 insertions, 297 deletions
diff --git a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
index 91904deefd5..7934215c165 100644
--- a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
+++ b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
@@ -404,13 +404,6 @@ public class Flags {
             "Enable mail verification",
             "Takes effect immediately");
 
-    public static final UnboundBooleanFlag REPORT_CORES_VIA_CFG = defineFeatureFlag(
-            "report-cores-via-cfg", true,
-            List.of("hakonhall"), "2022-11-01", "2022-12-01",
-            "If true, report core dumps to the config server instead of directly to the panic app.",
-            "Takes effect on the next tick.",
-            ZONE_ID, NODE_TYPE, HOSTNAME);
-
     public static final UnboundStringFlag CORE_ENCRYPTION_PUBLIC_KEY_ID = defineStringFlag(
             "core-encryption-public-key-id", "",
             List.of("vekterli"), "2022-11-03", "2023-02-01",
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
index d99b45dbbbc..cfb2d240bfa 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
@@ -10,9 +10,7 @@ import com.yahoo.vespa.hosted.node.admin.task.util.process.CommandResult;
 
 import java.nio.file.Path;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.regex.Matcher;
@@ -32,9 +30,6 @@ public class CoreCollector {
     private static final Pattern FROM_PATH_PATTERN = Pattern.compile("^.* from '(?<path>.*?)'");
     static final String GDB_PATH_RHEL8 = "/opt/rh/gcc-toolset-11/root/bin/gdb";
 
-    static final Map<String, Object> JAVA_HEAP_DUMP_METADATA =
-            Map.of("bin_path", "java", "backtrace", List.of("Heap dump, no backtrace available"));
-
     private final ContainerOperations container;
 
     public CoreCollector(ContainerOperations container) {
@@ -100,36 +95,7 @@ public class CoreCollector {
         return List.of(result.getOutput().split("\n"));
     }
 
-    /**
-     * Collects metadata about a given core dump
-     * @param context context of the NodeAgent that owns the core dump
-     * @param coredumpPath path to core dump file inside the container
-     * @return map of relevant metadata about the core dump
-     */
-    Map<String, Object> collect(NodeAgentContext context, ContainerPath coredumpPath) {
-        if (JAVA_HEAP_DUMP_PATTERN.matcher(coredumpPath.getFileName().toString()).find())
-            return JAVA_HEAP_DUMP_METADATA;
-
-        Map<String, Object> data = new HashMap<>();
-        try {
-            String binPath = readBinPath(context, coredumpPath);
-
-            data.put("bin_path", binPath);
-            if (Path.of(binPath).getFileName().toString().equals("java")) {
-                data.put("backtrace_all_threads", readJstack(context, coredumpPath, binPath));
-            } else {
-                data.put("backtrace", readBacktrace(context, coredumpPath, binPath, false));
-                data.put("backtrace_all_threads", readBacktrace(context, coredumpPath, binPath, true));
-            }
-        } catch (ConvergenceException e) {
-            context.log(logger, Level.WARNING, "Failed to extract backtrace: " + e.getMessage());
-        } catch (RuntimeException e) {
-            context.log(logger, Level.WARNING, "Failed to extract backtrace", e);
-        }
-        return data;
-    }
-
-    CoreDumpMetadata collect2(NodeAgentContext context, ContainerPath coredumpPath) {
+    CoreDumpMetadata collect(NodeAgentContext context, ContainerPath coredumpPath) {
         var metadata = new CoreDumpMetadata();
 
         if (JAVA_HEAP_DUMP_PATTERN.matcher(coredumpPath.getFileName().toString()).find()) {
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
index 4a8353d92ec..94f402d5332 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
@@ -1,14 +1,10 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
 
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.node.ObjectNode;
 import com.yahoo.config.provision.DockerImage;
 import com.yahoo.security.KeyId;
 import com.yahoo.security.SecretSharedKey;
 import com.yahoo.security.SharedKeyGenerator;
-import com.yahoo.vespa.flags.BooleanFlag;
 import com.yahoo.vespa.flags.FetchVector;
 import com.yahoo.vespa.flags.FlagSource;
 import com.yahoo.vespa.flags.Flags;
@@ -26,19 +22,16 @@ import com.yahoo.vespa.hosted.node.admin.task.util.file.FileDeleter;
 import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder;
 import com.yahoo.vespa.hosted.node.admin.task.util.file.FileMover;
 import com.yahoo.vespa.hosted.node.admin.task.util.file.MakeDirectory;
-import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath;
 import com.yahoo.vespa.hosted.node.admin.task.util.fs.ContainerPath;
 
 import java.io.IOException;
 import java.io.OutputStream;
 import java.io.UncheckedIOException;
-import java.nio.file.FileSystems;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.time.Clock;
 import java.util.Comparator;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
@@ -61,64 +54,49 @@ import static com.yahoo.yolean.Exceptions.uncheck;
  */
 public class CoredumpHandler {
 
+    public static final String COREDUMP_FILENAME_PREFIX = "dump_";
+
+    private static final Logger logger = Logger.getLogger(CoredumpHandler.class.getName());
     private static final Pattern HS_ERR_PATTERN = Pattern.compile("hs_err_pid[0-9]+\\.log");
     private static final String PROCESSING_DIRECTORY_NAME = "processing";
-    private static final String METADATA_FILE_NAME = "metadata.json";
     private static final String METADATA2_FILE_NAME = "metadata2.json";
     private static final String COMPRESSED_EXTENSION = ".zst";
     private static final String ENCRYPTED_EXTENSION = ".enc";
-    public static final String COREDUMP_FILENAME_PREFIX = "dump_";
-
-    private final Logger logger = Logger.getLogger(CoredumpHandler.class.getName());
-    private final ObjectMapper objectMapper = new ObjectMapper();
 
     private final CoreCollector coreCollector;
     private final Cores cores;
-    private final CoredumpReporter coredumpReporter;
     private final String crashPatchInContainer;
     private final Path doneCoredumpsPath;
    private final Metrics metrics;
     private final Clock clock;
     private final Supplier<String> coredumpIdSupplier;
     private final SecretSharedKeySupplier secretSharedKeySupplier;
-    private final BooleanFlag reportCoresViaCfgFlag;
     private final StringFlag coreEncryptionPublicKeyIdFlag;
 
     /**
     * @param crashPathInContainer path inside the container where core dump are dumped
     * @param doneCoredumpsPath path on host where processed core dumps are stored
     */
-    public CoredumpHandler(CoreCollector coreCollector, Cores cores, CoredumpReporter coredumpReporter,
-                           String crashPathInContainer, Path doneCoredumpsPath, Metrics metrics,
-                           FlagSource flagSource) {
-        this(coreCollector, cores, coredumpReporter, crashPathInContainer, doneCoredumpsPath,
-             metrics, Clock.systemUTC(), () -> UUID.randomUUID().toString(), (ctx) -> Optional.empty() /*TODO*/,
-             flagSource);
-    }
-
-    // TODO remove redundant constructor once internal callsite has been updated
-    public CoredumpHandler(CoreCollector coreCollector, Cores cores, CoredumpReporter coredumpReporter,
+    public CoredumpHandler(CoreCollector coreCollector, Cores cores,
                            String crashPathInContainer, Path doneCoredumpsPath, Metrics metrics,
                            SecretSharedKeySupplier secretSharedKeySupplier, FlagSource flagSource) {
-        this(coreCollector, cores, coredumpReporter, crashPathInContainer, doneCoredumpsPath,
+        this(coreCollector, cores, crashPathInContainer, doneCoredumpsPath,
             metrics, Clock.systemUTC(), () -> UUID.randomUUID().toString(), secretSharedKeySupplier, flagSource);
     }
 
-    CoredumpHandler(CoreCollector coreCollector, Cores cores, CoredumpReporter coredumpReporter,
+    CoredumpHandler(CoreCollector coreCollector, Cores cores,
                     String crashPathInContainer, Path doneCoredumpsPath, Metrics metrics,
                     Clock clock, Supplier<String> coredumpIdSupplier,
                     SecretSharedKeySupplier secretSharedKeySupplier, FlagSource flagSource) {
         this.coreCollector = coreCollector;
         this.cores = cores;
-        this.coredumpReporter = coredumpReporter;
         this.crashPatchInContainer = crashPathInContainer;
         this.doneCoredumpsPath = doneCoredumpsPath;
         this.metrics = metrics;
         this.clock = clock;
         this.coredumpIdSupplier = coredumpIdSupplier;
         this.secretSharedKeySupplier = secretSharedKeySupplier;
-        this.reportCoresViaCfgFlag = Flags.REPORT_CORES_VIA_CFG.bindTo(flagSource);
         this.coreEncryptionPublicKeyIdFlag = Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.bindTo(flagSource);
     }
@@ -143,13 +121,7 @@ public class CoredumpHandler {
 
         // Check if we have already started to process a core dump or we can enqueue a new core one
         getCoredumpToProcess(context, containerCrashPath, containerProcessingPath)
-                .ifPresent(path -> {
-                    if (reportCoresViaCfgFlag.with(FetchVector.Dimension.NODE_TYPE, context.nodeType().name()).value()) {
-                        processAndReportSingleCoreDump2(context, path, dockerImage);
-                    } else {
-                        processAndReportSingleCoredump(context, path, nodeAttributesSupplier);
-                    }
-                });
+                .ifPresent(path -> processAndReportSingleCoreDump(context, path, dockerImage));
     }
 
     /** @return path to directory inside processing directory that contains a core dump file to process */
@@ -212,65 +184,6 @@ public class CoredumpHandler {
         return coreEncryptionPublicKeyIdFlag.with(FetchVector.Dimension.NODE_TYPE, context.nodeType().name()).value();
     }
 
-    void processAndReportSingleCoredump(NodeAgentContext context, ContainerPath coredumpDirectory, Supplier<Map<String, Object>> nodeAttributesSupplier) {
-        try {
-            Optional<SecretSharedKey> sharedCoreKey = Optional.of(corePublicKeyFlagValue(context))
-                    .filter(k -> !k.isEmpty())
-                    .map(KeyId::ofString)
-                    .flatMap(secretSharedKeySupplier::create);
-            Optional<String> decryptionToken = sharedCoreKey.map(k -> k.sealedSharedKey().toTokenString());
-            String metadata = getMetadata(context, coredumpDirectory, nodeAttributesSupplier, decryptionToken);
-            String coredumpId = coredumpDirectory.getFileName().toString();
-            coredumpReporter.reportCoredump(coredumpId, metadata);
-            finishProcessing(context, coredumpDirectory, sharedCoreKey);
-            context.log(logger, "Successfully reported coredump " + coredumpId);
-        } catch (Exception e) {
-            throw new RuntimeException("Failed to process coredump " + coredumpDirectory, e);
-        }
-    }
-
-    /**
-     * @return coredump metadata from metadata.json if present, otherwise attempts to get metadata using
-     * {@link CoreCollector} and stores it to metadata.json
-     */
-    String getMetadata(NodeAgentContext context, ContainerPath coredumpDirectory, Supplier<Map<String, Object>> nodeAttributesSupplier, Optional<String> decryptionToken) throws IOException {
-        UnixPath metadataPath = new UnixPath(coredumpDirectory.resolve(METADATA_FILE_NAME));
-        if (!metadataPath.exists()) {
-            ContainerPath coredumpFile = findCoredumpFileInProcessingDirectory(coredumpDirectory);
-            Map<String, Object> metadata = new HashMap<>(coreCollector.collect(context, coredumpFile));
-            metadata.putAll(nodeAttributesSupplier.get());
-            metadata.put("coredump_path", doneCoredumpsPath
-                    .resolve(context.containerName().asString())
-                    .resolve(coredumpDirectory.getFileName().toString())
-                    .resolve(coredumpFile.getFileName().toString()).toString());
-            decryptionToken.ifPresent(token -> metadata.put("decryption_token", token));
-
-            String metadataFields = objectMapper.writeValueAsString(Map.of("fields", metadata));
-            metadataPath.writeUtf8File(metadataFields);
-            return metadataFields;
-        } else {
-            if (decryptionToken.isPresent()) {
-                // Since encryption keys are single-use and generated for each core dump processing invocation,
-                // we must ensure we store and report the token associated with the _latest_ (i.e. current)
-                // attempt at processing the core dump. Patch and rewrite the file with a new token, if present.
-                String metadataFields = metadataWithPatchedTokenValue(metadataPath, decryptionToken.get());
-                metadataPath.deleteIfExists();
-                metadataPath.writeUtf8File(metadataFields);
-                return metadataFields;
-            } else {
-                return metadataPath.readUtf8File();
-            }
-        }
-    }
-
-    private String metadataWithPatchedTokenValue(UnixPath metadataPath, String decryptionToken) throws JsonProcessingException {
-        var jsonRoot = objectMapper.readTree(metadataPath.readUtf8File());
-        if (jsonRoot.path("fields").isObject()) {
-            ((ObjectNode)jsonRoot.get("fields")).put("decryption_token", decryptionToken);
-        } // else: unit testing case without real metadata
-        return objectMapper.writeValueAsString(jsonRoot);
-    }
-
     static OutputStream maybeWrapWithEncryption(OutputStream wrappedStream, Optional<SecretSharedKey> sharedCoreKey) {
         return sharedCoreKey
                 .map(key -> SharedKeyGenerator.makeAesGcmEncryptionCipher(key).wrapOutputStream(wrappedStream))
@@ -372,8 +285,8 @@ public class CoredumpHandler {
         return clock.instant().minusSeconds(60).isAfter(fileAttributes.lastModifiedTime());
     }
 
-    void processAndReportSingleCoreDump2(NodeAgentContext context, ContainerPath coreDumpDirectory,
-                                         Optional<DockerImage> dockerImage) {
+    void processAndReportSingleCoreDump(NodeAgentContext context, ContainerPath coreDumpDirectory,
+                                        Optional<DockerImage> dockerImage) {
         CoreDumpMetadata metadata = gatherMetadata(context, coreDumpDirectory);
         dockerImage.ifPresent(metadata::setDockerImage);
         dockerImage.flatMap(DockerImage::tag).ifPresent(metadata::setVespaVersion);
@@ -390,20 +303,17 @@ public class CoredumpHandler {
         finishProcessing(context, coreDumpDirectory, sharedCoreKey);
     }
 
-    private CoreDumpMetadata gatherMetadata(NodeAgentContext context, ContainerPath coreDumpDirectory) {
+    CoreDumpMetadata gatherMetadata(NodeAgentContext context, ContainerPath coreDumpDirectory) {
         ContainerPath metadataPath = coreDumpDirectory.resolve(METADATA2_FILE_NAME);
         Optional<ReportCoreDumpRequest> request = ReportCoreDumpRequest.load(metadataPath);
         if (request.isPresent()) {
-            return request.map(requestInstance -> {
-                var metadata = new CoreDumpMetadata();
-                requestInstance.populateMetadata(metadata, FileSystems.getDefault());
-                return metadata;
-            })
-            .get();
+            var metadata = new CoreDumpMetadata();
+            request.get().populateMetadata(metadata, doneCoredumpsPath.getFileSystem());
+            return metadata;
         }
 
         ContainerPath coreDumpFile = findCoredumpFileInProcessingDirectory(coreDumpDirectory);
-        CoreDumpMetadata metadata = coreCollector.collect2(context, coreDumpFile);
+        CoreDumpMetadata metadata = coreCollector.collect(context, coreDumpFile);
         metadata.setCpuMicrocodeVersion(getMicrocodeVersion())
                 .setKernelVersion(System.getProperty("os.version"))
                 .setCoreDumpPath(doneCoredumpsPath.resolve(context.containerName().asString())
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpReporter.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpReporter.java
deleted file mode 100644
index 0e2bb724f50..00000000000
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpReporter.java
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
-
-/**
- * @author freva
- */
-public interface CoredumpReporter {
-
-    /** Report a coredump with a given ID and given metadata */
-    void reportCoredump(String id, String metadata);
-}
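The gatherMetadata() flow above persists collected metadata to metadata2.json and reuses it on later attempts, since backtrace extraction via gdb/jstack is expensive. A minimal sketch of that load-if-present-else-collect-and-persist shape, assuming hypothetical Metadata/MetadataCache stand-ins rather than the real CoreDumpMetadata and ReportCoreDumpRequest types:

```java
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Supplier;

// Hypothetical stand-in for the real metadata type, for illustration only.
record Metadata(String json) {}

class MetadataCache {

    /** Returns previously persisted metadata if an earlier attempt collected it, else collects and persists. */
    static Metadata gather(Path coreDumpDirectory, Supplier<Metadata> collector) {
        Path persisted = coreDumpDirectory.resolve("metadata2.json"); // same idea as METADATA2_FILE_NAME
        try {
            if (Files.exists(persisted))
                return new Metadata(Files.readString(persisted)); // reuse: collection is expensive
            Metadata fresh = collector.get();
            Files.writeString(persisted, fresh.json());           // persist so a retried report skips collection
            return fresh;
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    public static void main(String[] args) {
        Path dir = Path.of(System.getProperty("java.io.tmpdir"));
        System.out.println(gather(dir, () -> new Metadata("{\"fields\":{}}")).json());
        // Second call reads from disk; the collector is never invoked again.
        System.out.println(gather(dir, () -> { throw new IllegalStateException("not re-collected"); }).json());
    }
}
```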
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java
index ea73fd4af6f..8f251bf72c5 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
 
+import com.yahoo.vespa.hosted.node.admin.configserver.cores.CoreDumpMetadata;
 import com.yahoo.vespa.hosted.node.admin.container.ContainerOperations;
 import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
 import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl;
@@ -10,10 +11,8 @@ import com.yahoo.vespa.test.file.TestFileSystem;
 import org.junit.jupiter.api.Test;
 
 import java.util.List;
-import java.util.Map;
 
 import static com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoreCollector.GDB_PATH_RHEL8;
-import static com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoreCollector.JAVA_HEAP_DUMP_METADATA;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.fail;
 import static org.mockito.Mockito.mock;
@@ -23,7 +22,6 @@ import static org.mockito.Mockito.when;
  * @author freva
 */
 public class CoreCollectorTest {
-    private final String JDK_PATH = "/path/to/jdk/java";
     private final ContainerOperations docker = mock(ContainerOperations.class);
     private final CoreCollector coreCollector = new CoreCollector(docker);
     private final NodeAgentContext context = NodeAgentContextImpl.builder("container-123.domain.tld")
@@ -144,11 +142,10 @@ public class CoreCollectorTest {
                          "/usr/bin/program", "/tmp/core.1234"},
                 String.join("\n", GDB_BACKTRACE));
 
-        Map<String, Object> expectedData = Map.of(
-                "bin_path", TEST_BIN_PATH,
-                "backtrace", GDB_BACKTRACE,
-                "backtrace_all_threads", GDB_BACKTRACE);
-        assertEquals(expectedData, coreCollector.collect(context, TEST_CORE_PATH));
+        var expected = new CoreDumpMetadata().setBinPath(TEST_BIN_PATH)
+                                             .setBacktrace(GDB_BACKTRACE)
+                                             .setBacktraceAllThreads(GDB_BACKTRACE);
+        assertEquals(expected, coreCollector.collect(context, TEST_CORE_PATH));
     }
 
     @Test
@@ -159,8 +156,8 @@ public class CoreCollectorTest {
         mockExec(new String[]{GDB_PATH_RHEL8 + " -n -ex set print frame-arguments none -ex bt -batch /usr/bin/program /tmp/core.1234"},
                 "", "Failure");
 
-        Map<String, Object> expectedData = Map.of("bin_path", TEST_BIN_PATH);
-        assertEquals(expectedData, coreCollector.collect(context, TEST_CORE_PATH));
+        var expected = new CoreDumpMetadata().setBinPath(TEST_BIN_PATH);
+        assertEquals(expected, coreCollector.collect(context, TEST_CORE_PATH));
     }
 
     @Test
@@ -168,22 +165,25 @@ public class CoreCollectorTest {
         mockExec(new String[]{"file", TEST_CORE_PATH.pathInContainer()},
                 "dump.core.5954: ELF 64-bit LSB core file x86-64, version 1 (SYSV), too many program header sections (33172)");
 
+        String jdkPath = "/path/to/jdk/java";
         mockExec(new String[]{GDB_PATH_RHEL8, "-n", "-batch", "-core", "/tmp/core.1234"},
-                "Core was generated by `" + JDK_PATH + " -Dconfig.id=default/container.11 -XX:+Pre'.");
+                "Core was generated by `" + jdkPath + " -Dconfig.id=default/container.11 -XX:+Pre'.");
 
         String jstack = "jstack11";
-        mockExec(new String[]{"jhsdb", "jstack", "--exe", JDK_PATH, "--core", "/tmp/core.1234"},
+        mockExec(new String[]{"jhsdb", "jstack", "--exe", jdkPath, "--core", "/tmp/core.1234"},
                 jstack);
 
-        Map<String, Object> expectedData = Map.of(
-                "bin_path", JDK_PATH,
-                "backtrace_all_threads", List.of(jstack));
-        assertEquals(expectedData, coreCollector.collect(context, TEST_CORE_PATH));
+        var expected = new CoreDumpMetadata().setBinPath(jdkPath)
+                                             .setBacktraceAllThreads(List.of(jstack));
+        assertEquals(expected, coreCollector.collect(context, TEST_CORE_PATH));
     }
 
     @Test
     void metadata_for_java_heap_dump() {
-        assertEquals(JAVA_HEAP_DUMP_METADATA, coreCollector.collect(context, context.paths().of("/dump_java_pid123.hprof")));
+        var expected = new CoreDumpMetadata().setBinPath("java")
+                                             .setBacktrace(List.of("Heap dump, no backtrace available"));
+
+        assertEquals(expected, coreCollector.collect(context, context.paths().of("/dump_java_pid123.hprof")));
     }
 
     private void mockExec(String[] cmd, String output) {
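The rewritten CoreCollectorTest assertions compare whole CoreDumpMetadata instances instead of Map<String, Object> literals, which only works because the metadata type is a fluent bean with value-based equality. A rough sketch of that style — DumpInfo and its field set are invented for illustration, not the real class:

```java
import java.util.List;
import java.util.Objects;

// Minimal fluent, equals-comparable metadata bean in the style the tests rely on.
class DumpInfo {
    private String binPath;
    private List<String> backtrace;

    DumpInfo setBinPath(String binPath) { this.binPath = binPath; return this; }
    DumpInfo setBacktrace(List<String> backtrace) { this.backtrace = backtrace; return this; }

    // Value-based equality is what lets assertEquals(expected, actual) compare whole objects.
    @Override public boolean equals(Object o) {
        return o instanceof DumpInfo other
                && Objects.equals(binPath, other.binPath)
                && Objects.equals(backtrace, other.backtrace);
    }
    @Override public int hashCode() { return Objects.hash(binPath, backtrace); }
    @Override public String toString() { return "DumpInfo{" + binPath + ", " + backtrace + "}"; }

    public static void main(String[] args) {
        var expected = new DumpInfo().setBinPath("/bin/bash").setBacktrace(List.of("frame 1"));
        var actual = new DumpInfo().setBinPath("/bin/bash").setBacktrace(List.of("frame 1"));
        System.out.println(expected.equals(actual)); // true -> assertEquals would pass
    }
}
```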
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java
index b096db2eaf4..1fd688558a0 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java
@@ -1,12 +1,14 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
 
+import com.yahoo.config.provision.DockerImage;
 import com.yahoo.security.KeyId;
 import com.yahoo.security.SealedSharedKey;
 import com.yahoo.security.SecretSharedKey;
 import com.yahoo.test.ManualClock;
 import com.yahoo.vespa.flags.Flags;
 import com.yahoo.vespa.flags.InMemoryFlagSource;
+import com.yahoo.vespa.hosted.node.admin.configserver.cores.CoreDumpMetadata;
 import com.yahoo.vespa.hosted.node.admin.configserver.cores.Cores;
 import com.yahoo.vespa.hosted.node.admin.container.metrics.DimensionMetrics;
 import com.yahoo.vespa.hosted.node.admin.container.metrics.Metrics;
@@ -26,7 +28,6 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.attribute.FileTime;
 import java.time.Duration;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
@@ -36,9 +37,12 @@ import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import static com.yahoo.yolean.Exceptions.uncheck;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.doThrow;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.never;
 import static org.mockito.Mockito.times;
@@ -57,7 +61,6 @@ public class CoredumpHandlerTest {
 
     private final CoreCollector coreCollector = mock(CoreCollector.class);
     private final Cores cores = mock(Cores.class);
-    private final CoredumpReporter coredumpReporter = mock(CoredumpReporter.class);
     private final Metrics metrics = new Metrics();
     private final ManualClock clock = new ManualClock();
     @SuppressWarnings("unchecked")
@@ -65,7 +68,7 @@ public class CoredumpHandlerTest {
     private final SecretSharedKeySupplier secretSharedKeySupplier = mock(SecretSharedKeySupplier.class);
     private final InMemoryFlagSource flagSource = new InMemoryFlagSource();
     private final CoredumpHandler coredumpHandler =
-            new CoredumpHandler(coreCollector, cores, coredumpReporter, containerCrashPath.pathInContainer(),
+            new CoredumpHandler(coreCollector, cores, containerCrashPath.pathInContainer(),
                                 doneCoredumpsPath, metrics, clock, coredumpIdSupplier, secretSharedKeySupplier,
                                 flagSource);
 
@@ -148,31 +151,14 @@ public class CoredumpHandlerTest {
         verify(coredumpIdSupplier, times(1)).get();
     }
 
-    private static String buildExpectedMetadataString(Optional<String> decryptionToken) {
-        return "{\"fields\":{" +
-               "\"hostname\":\"host123.yahoo.com\"," +
-               "\"kernel_version\":\"3.10.0-862.9.1.el7.x86_64\"," +
-               "\"backtrace\":[\"call 1\",\"function 2\",\"something something\"]," +
-               "\"vespa_version\":\"6.48.4\"," +
-               "\"bin_path\":\"/bin/bash\"," +
-               "\"coredump_path\":\"/home/docker/dumps/container-123/id-123/dump_core.456\"," +
-               "\"docker_image\":\"vespa/ci:6.48.4\"" +
-               decryptionToken.map(",\"decryption_token\":\"%s\""::formatted).orElse("") +
-               "}}";
-    }
-
-    void do_get_metadata_test(Optional<String> oldDecryptionToken, Optional<String> newDecryptionToken) throws IOException {
-        Map<String, Object> metadata = new HashMap<>();
-        metadata.put("bin_path", "/bin/bash");
-        metadata.put("backtrace", List.of("call 1", "function 2", "something something"));
-
-        Map<String, Object> attributes = Map.of(
-                "hostname", "host123.yahoo.com",
-                "vespa_version", "6.48.4",
-                "kernel_version", "3.10.0-862.9.1.el7.x86_64",
-                "docker_image", "vespa/ci:6.48.4");
-
-        String expectedMetadataStr = buildExpectedMetadataString(oldDecryptionToken);
+    @Test
+    void gather_metadata_test() throws IOException {
+        var metadata = new CoreDumpMetadata().setKernelVersion("3.10.0-862.9.1.el7.x86_64")
+                                             .setBacktrace(List.of("call 1", "function 2", "something something"))
+                                             .setVespaVersion("6.48.4")
+                                             .setBinPath("/bin/bash")
+                                             .setCoreDumpPath(context.paths().of("/home/docker/dumps/container-123/id-123/dump_core.456"))
+                                             .setDockerImage(DockerImage.fromString("example.com/vespa/ci:6.48.4"));
 
         ContainerPath coredumpDirectory = context.paths().of("/var/crash/id-123");
         Files.createDirectories(coredumpDirectory.pathOnHost());
@@ -180,40 +166,20 @@ public class CoredumpHandlerTest {
         when(coreCollector.collect(eq(context), eq(coredumpDirectory.resolve("dump_core.456"))))
                 .thenReturn(metadata);
 
-        assertEquals(expectedMetadataStr, coredumpHandler.getMetadata(context, coredumpDirectory, () -> attributes, oldDecryptionToken));
+        assertEquals(metadata, coredumpHandler.gatherMetadata(context, coredumpDirectory));
         verify(coreCollector, times(1)).collect(any(), any());
 
-        // Calling it again will read the previously generated metadata from disk and selectively
-        // patch in an updated decryption token value, if one is provided.
-        // This avoids having to re-run a potentially expensive collector step.
-        expectedMetadataStr = buildExpectedMetadataString(newDecryptionToken);
-        assertEquals(expectedMetadataStr, coredumpHandler.getMetadata(context, coredumpDirectory, () -> attributes, newDecryptionToken));
+        // On the second invocation the metadata already exists on disk, so times(1) is not incremented
+        assertEquals(metadata, coredumpHandler.gatherMetadata(context, coredumpDirectory));
+        doThrow(new IllegalStateException("Should not be invoked"))
+                .when(coreCollector).collect(any(), any());
         verify(coreCollector, times(1)).collect(any(), any());
     }
 
     @Test
-    void get_metadata_test_without_encryption() throws IOException {
-        do_get_metadata_test(Optional.empty(), Optional.empty()); // No token in metadata
-    }
-
-    @Test
-    void get_metadata_test_with_encryption() throws IOException {
-        when(secretSharedKeySupplier.create(any())).thenReturn(Optional.of(makeFixedSecretSharedKey()));
-        do_get_metadata_test(Optional.of("AVeryCoolToken"), Optional.of("AnEvenCoolerToken"));
-    }
-
-    @Test
-    void get_metadata_test_without_encryption_then_with_encryption() throws IOException {
-        // Edge where encryption was enabled between attempted processing runs.
-        // We don't bother with testing the opposite edge case (encryption -> no encryption), since
-        // in that case the core dump itself won't be encrypted so the token will be a no-op.
-        do_get_metadata_test(Optional.empty(), Optional.of("TheSwaggestToken"));
-    }
-
-    @Test
     void cant_get_metadata_if_no_core_file() {
         assertThrows(IllegalStateException.class, () -> {
-            coredumpHandler.getMetadata(context, context.paths().of("/fake/path"), Map::of, Optional.empty());
+            coredumpHandler.gatherMetadata(context, context.paths().of("/fake/path"));
         });
     }
 
@@ -228,31 +194,34 @@ public class CoredumpHandlerTest {
         });
     }
 
-    void do_process_single_coredump_test(String expectedCoreFileName) throws IOException {
+    void do_process_single_coredump_test(String expectedCoreFileName, boolean encrypt) throws IOException {
         ContainerPath coredumpDirectory = context.paths().of("/path/to/coredump/proccessing/id-123");
         Files.createDirectories(coredumpDirectory);
-        Files.write(coredumpDirectory.resolve("metadata.json"), "{\"test-metadata\":{}}".getBytes());
+        Files.write(coredumpDirectory.resolve("metadata2.json"), "{\"test-metadata\":{}}".getBytes());
         Files.createFile(coredumpDirectory.resolve("dump_bash.core.431"));
-        assertFolderContents(coredumpDirectory, "metadata.json", "dump_bash.core.431");
+        assertFolderContents(coredumpDirectory, "metadata2.json", "dump_bash.core.431");
+        CoreDumpMetadata expectedMetadata = new CoreDumpMetadata();
+        if (encrypt)
+            expectedMetadata.setDecryptionToken("XWTN195KaZecNCloqRICq9YwxwWJEKGqyMYcTgHtwaZfnXtzHzd7bPrPPR60VdxxzA");
 
-        coredumpHandler.processAndReportSingleCoredump(context, coredumpDirectory, Map::of);
+        coredumpHandler.processAndReportSingleCoreDump(context, coredumpDirectory, Optional.empty());
         verify(coreCollector, never()).collect(any(), any());
-        verify(coredumpReporter, times(1)).reportCoredump(eq("id-123"), eq("{\"test-metadata\":{}}"));
+        verify(cores, times(1)).report(eq(context.hostname()), eq("id-123"), eq(expectedMetadata));
         assertFalse(Files.exists(coredumpDirectory));
         assertFolderContents(doneCoredumpsPath.resolve("container-123"), "id-123");
-        assertFolderContents(doneCoredumpsPath.resolve("container-123").resolve("id-123"), "metadata.json", expectedCoreFileName);
+        assertFolderContents(doneCoredumpsPath.resolve("container-123").resolve("id-123"), "metadata2.json", expectedCoreFileName);
     }
 
     @Test
     void process_single_coredump_test_without_encryption() throws IOException {
-        do_process_single_coredump_test("dump_bash.core.431.zst");
+        do_process_single_coredump_test("dump_bash.core.431.zst", false);
     }
 
     @Test
     void process_single_coredump_test_with_encryption() throws IOException {
         flagSource.withStringFlag(Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.id(), "bar-key");
         when(secretSharedKeySupplier.create(KeyId.ofString("bar-key"))).thenReturn(Optional.of(makeFixedSecretSharedKey()));
-        do_process_single_coredump_test("dump_bash.core.431.zst.enc");
+        do_process_single_coredump_test("dump_bash.core.431.zst.enc", true);
     }
 
     // TODO fail closed instead of open
@@ -260,7 +229,7 @@ public class CoredumpHandlerTest {
     void encryption_disabled_when_no_public_key_set_in_feature_flag() throws IOException {
         flagSource.withStringFlag(Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.id(), ""); // empty -> not set
         verify(secretSharedKeySupplier, never()).create(any());
-        do_process_single_coredump_test("dump_bash.core.431.zst"); // No .enc suffix; not encrypted
+        do_process_single_coredump_test("dump_bash.core.431.zst", false); // No .enc suffix; not encrypted
     }
 
     // TODO fail closed instead of open
@@ -268,7 +237,7 @@ public class CoredumpHandlerTest {
     void encryption_disabled_when_no_key_returned_for_key_id_specified_by_feature_flag() throws IOException {
         flagSource.withStringFlag(Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.id(), "baz-key");
         when(secretSharedKeySupplier.create(KeyId.ofString("baz-key"))).thenReturn(Optional.empty());
-        do_process_single_coredump_test("dump_bash.core.431.zst"); // No .enc suffix; not encrypted
+        do_process_single_coredump_test("dump_bash.core.431.zst", false); // No .enc suffix; not encrypted
     }
 
     @Test
@@ -278,7 +247,7 @@ public class CoredumpHandlerTest {
         Files.createFile(containerCrashPath.resolve("dump-2"));
         Files.createFile(containerCrashPath.resolve("hs_err_pid2.log"));
         Files.createDirectory(processingPath);
-        Files.createFile(processingPath.resolve("metadata.json"));
+        Files.createFile(processingPath.resolve("metadata2.json"));
         Files.createFile(processingPath.resolve("dump-3"));
 
         new UnixPath(doneCoredumpsPath.resolve("container-123").resolve("dump-3-folder").resolve("dump-3"))
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
index 3d76c8e3f94..1406aaecb71 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
@@ -167,13 +167,18 @@ public class AllocatableClusterResources {
         advertisedResources = systemLimits.enlargeToLegal(advertisedResources, clusterSpec, exclusive); // Ask for something legal
         advertisedResources = applicationLimits.cap(advertisedResources); // Overrides other conditions, even if it will then fail
         var realResources = nodeRepository.resourcesCalculator().requestToReal(advertisedResources, exclusive); // What we'll really get
+        if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec) && advertisedResources.storageType() == NodeResources.StorageType.any) {
+            // Since local disk reserves some of the storage, try to constrain to remote disk
+            advertisedResources = advertisedResources.with(NodeResources.StorageType.remote);
+            realResources = nodeRepository.resourcesCalculator().requestToReal(advertisedResources, exclusive);
+        }
         if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec))
             return Optional.empty();
         if (anySatisfies(realResources, availableRealHostResources))
-            return Optional.of(new AllocatableClusterResources(wantedResources.with(realResources),
-                                                               advertisedResources,
-                                                               wantedResources,
-                                                               clusterSpec));
+            return Optional.of(new AllocatableClusterResources(wantedResources.with(realResources),
+                                                               advertisedResources,
+                                                               wantedResources,
+                                                               clusterSpec));
         else
             return Optional.empty();
     }
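The AllocatableClusterResources change above retries the real-resource computation with storage pinned to remote when an unconstrained (storage type any) request falls below real limits, because local disk carries a host reservation overhead that remote disk does not. A self-contained sketch of that control flow, with all types and numbers hypothetical:

```java
import java.util.Optional;
import java.util.function.Function;

enum Storage { any, local, remote }

record Request(double diskGb, Storage storage) {
    Request withStorage(Storage s) { return new Request(diskGb, s); }
}

class Allocator {
    /** Returns the request if its real resources meet the limit, retrying with remote storage when unconstrained. */
    static Optional<Request> allocate(Request advertised,
                                      Function<Request, Double> toRealDiskGb,
                                      double minRealDiskGb) {
        double real = toRealDiskGb.apply(advertised);
        if (real < minRealDiskGb && advertised.storage() == Storage.any) {
            advertised = advertised.withStorage(Storage.remote); // remote disk: no host reservation
            real = toRealDiskGb.apply(advertised);
        }
        return real < minRealDiskGb ? Optional.empty() : Optional.of(advertised);
    }

    public static void main(String[] args) {
        // Pretend 10 GB is reserved on local disk: a 40 GB request only yields 30 GB real locally.
        Optional<Request> result = allocate(new Request(40, Storage.any),
                r -> r.storage() == Storage.remote ? r.diskGb() : r.diskGb() - 10,
                35);
        System.out.println(result); // falls back to remote storage and succeeds
    }
}
```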
fixture.tester().assertResources("Query load is 4x write load -> scale up more", - 7, 1, 1.8, 8.9, 40.4, + 6, 1, 2.1, 10.6, 66.5, fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); @@ -589,7 +605,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Write load is 10x query load -> scale down", - 6, 1, 1.1, 10.6, 48.5, + 5, 1, 1.4, 13.3, 83.2, fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); @@ -597,7 +613,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Query only -> largest possible", - 7, 1, 3.5, 8.9, 40.2, + 7, 1, 3.5, 8.9, 55.5, fixture.autoscale()); fixture.tester().clock().advance(Duration.ofDays(2)); @@ -605,7 +621,7 @@ public class AutoscalingTest { fixture.tester.clock().advance(timeAdded.negated()); fixture.loader().addCpuMeasurements(0.4, 200); fixture.tester().assertResources("Write only -> smallest possible", - 4, 1, 1.1, 17.2, 80.4, + 4, 1, 1.1, 17.2, 110.9, fixture.autoscale()); } @@ -666,24 +682,27 @@ public class AutoscalingTest { @Test public void test_changing_exclusivity() { + var min = new ClusterResources( 2, 1, new NodeResources( 1, 4, 100, 1)); + var max = new ClusterResources(20, 1, new NodeResources(100, 1000, 1000, 1)); var fixture = AutoscalingTester.fixture() .awsProdSetup(true) .cluster(clusterSpec(true)) + .capacity(Capacity.from(min, max)) .initialResources(Optional.empty()) .build(); fixture.tester().assertResources("Initial deployment at minimum", - 2, 1, 2, 4, 10, + 2, 1, 2, 4, 100, fixture.currentResources().advertisedResources()); fixture.tester().deploy(fixture.applicationId(), clusterSpec(false), fixture.capacity()); fixture.tester().assertResources("With non-exclusive nodes, a better solution is " + "50% more nodes with half the cpu", - 3, 1, 1, 4, 10.2, + 3, 1, 1, 4, 145.6, fixture.autoscale()); fixture.tester().deploy(fixture.applicationId(), clusterSpec(true), fixture.capacity()); fixture.tester().assertResources("Reverts to the initial resources", - 2, 1, 2, 4, 10, + 2, 1, 2, 4, 100, fixture.currentResources().advertisedResources()); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java index ff04083ebde..bba06fb9080 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java @@ -189,14 +189,18 @@ public class Fixture { } public Fixture.Builder awsProdSetup(boolean allowHostSharing) { - return this.awsHostFlavors() - .awsResourceCalculator() - .zone(new Zone(Cloud.builder().dynamicProvisioning(true) - .allowHostSharing(allowHostSharing) - .build(), - SystemName.Public, - Environment.prod, - RegionName.from("aws-eu-west-1a"))); + return awsSetup(allowHostSharing, Environment.prod); + } + + public Fixture.Builder awsSetup(boolean allowHostSharing, Environment environment) { + return this.awsHostFlavors() + .awsResourceCalculator() + .zone(new Zone(Cloud.builder().dynamicProvisioning(true) + .allowHostSharing(allowHostSharing) + .build(), + SystemName.Public, + environment, + RegionName.from("aws-eu-west-1a"))); } public Fixture.Builder vespaVersion(Version version) { diff --git 
a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java index d148f6d3cc7..2ae1fe18714 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java @@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.provision.autoscale.awsnodes; import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.Nodelike; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java index 63f6d50ab2e..96fa143dc57 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java @@ -2,6 +2,8 @@ package com.yahoo.vespa.hosted.provision.autoscale.awsnodes; import com.yahoo.config.provision.NodeResources; +import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.Zone; /** * Calculations and logic on node resources common to provision-service and host-admin (at least). @@ -10,8 +12,12 @@ import com.yahoo.config.provision.NodeResources; */ public class AwsResourcesCalculator { + private final ReservedSpacePolicyImpl reservedSpacePolicy; private final double hostMemory = 0.6; - private final double hostDiskOverhead = 1; + + public AwsResourcesCalculator() { + this.reservedSpacePolicy = new ReservedSpacePolicyImpl(); + } /** The real resources of a parent host node in the node repository, given the real resources of the flavor. */ public NodeResources realResourcesOfParentHost(NodeResources realResourcesOfFlavor) { @@ -52,6 +58,7 @@ public class AwsResourcesCalculator { */ public double diskOverhead(VespaFlavor flavor, NodeResources resources, boolean real, boolean exclusive) { if ( flavor.realResources().storageType() != NodeResources.StorageType.local) return 0; + double hostDiskOverhead = reservedSpacePolicy.getPartitionSizeInBase2Gb(NodeType.host, ! exclusive); double diskShare = resources.diskGb() / ( flavor.advertisedResources().diskGb() - ( real ? 
hostDiskOverhead : 0) ); return hostDiskOverhead * diskShare; diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java new file mode 100644 index 00000000000..000d08b59f8 --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java @@ -0,0 +1,50 @@ +package com.yahoo.vespa.hosted.provision.autoscale.awsnodes; + +import com.yahoo.config.provision.NodeType; + +/** + * Matches the internal repo implementation + * + * @author hakonhall + * @author musum + */ +public class ReservedSpacePolicyImpl { + + public long getPartitionSizeInBase2Gb(NodeType nodeType, boolean sharedHost) { + return new PartitionSizer(nodeType, sharedHost).getPartitionSize(); + } + + private static class PartitionSizer { + + private static final long imageCountForSharedHost = 6; + private static final long imageCountForNonSharedHost = 3; + + // Add a buffer to allow a small increase in image size + private static final long bufferSharedHost = 5; + private static final long bufferNonSharedHost = 3; + + private final boolean sharedHost; + + PartitionSizer(NodeType nodeType, boolean sharedHost) { + this.sharedHost = sharedHost; + } + + long getPartitionSize() { + return imageSize() * imageCount() + buffer(); + } + + private long imageSize() { + return (long)7.7; // return (long)VespaContainerImage.maxImageSize(hostedSystem, nodeType); + } + + private long buffer() { + return sharedHost ? bufferSharedHost : bufferNonSharedHost; + } + + private long imageCount() { + return sharedHost ? imageCountForSharedHost : imageCountForNonSharedHost; + } + + } + +} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java index cd5f18db516..c42b61988e9 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java @@ -34,4 +34,7 @@ public class VespaFlavor { public NodeResources advertisedResources() { return advertisedResources; } + @Override + public String toString() { return "flavor " + name; } + } diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index a21f623fbfa..50cbe8879b5 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -13,11 +13,13 @@ vespa_define_module( lz4 xxhash zstd + ${VESPA_URING_LIB} APPS src/apps/make_fixture_macros src/apps/vespa-detect-hostname src/apps/vespa-drop-file-from-cache + src/apps/vespa-probe-io-uring src/apps/vespa-resource-limits src/apps/vespa-tsan-digest src/apps/vespa-validate-hostname diff --git a/vespalib/src/apps/vespa-probe-io-uring/.gitignore b/vespalib/src/apps/vespa-probe-io-uring/.gitignore new file mode 100644 index 00000000000..9fa20bbf9d4 --- /dev/null +++ b/vespalib/src/apps/vespa-probe-io-uring/.gitignore @@ -0,0 +1 @@ +/vespa-probe-io-uring diff --git a/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt b/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt new file mode 100644 index 00000000000..24ceda46a87 --- /dev/null +++ b/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
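With ReservedSpacePolicyImpl in play, the test calculator's local-disk overhead is no longer the constant 1 but image size times image count plus a buffer, scaled by the node's share of the flavor's disk. Plugging in the constants from the new policy class (image size truncated to 7 GB; 6 images plus a 5 GB buffer on shared hosts, 3 images plus a 3 GB buffer otherwise) with made-up flavor numbers, and simplifying away the real-vs-advertised adjustment in diskOverhead():

```java
// Back-of-the-envelope version of the disk-overhead math above; purely illustrative.
public class DiskOverheadDemo {
    public static void main(String[] args) {
        long imageSizeGb = (long) 7.7;                 // truncates to 7, as in the test policy
        long reservedShared = imageSizeGb * 6 + 5;     // shared host: 6 images + 5 GB buffer = 47
        long reservedExclusive = imageSizeGb * 3 + 3;  // exclusive host: 3 images + 3 GB buffer = 24

        double flavorDiskGb = 1000;                    // hypothetical flavor
        double nodeDiskGb = 500;                       // hypothetical allocation
        double diskShare = nodeDiskGb / flavorDiskGb;  // node's share of the host's disk

        System.out.println("overhead (shared host):    " + reservedShared * diskShare + " GB");
        System.out.println("overhead (exclusive host): " + reservedExclusive * diskShare + " GB");
    }
}
```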
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java
new file mode 100644
index 00000000000..000d08b59f8
--- /dev/null
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java
@@ -0,0 +1,50 @@
+package com.yahoo.vespa.hosted.provision.autoscale.awsnodes;
+
+import com.yahoo.config.provision.NodeType;
+
+/**
+ * Matches the internal repo implementation
+ *
+ * @author hakonhall
+ * @author musum
+ */
+public class ReservedSpacePolicyImpl {
+
+    public long getPartitionSizeInBase2Gb(NodeType nodeType, boolean sharedHost) {
+        return new PartitionSizer(nodeType, sharedHost).getPartitionSize();
+    }
+
+    private static class PartitionSizer {
+
+        private static final long imageCountForSharedHost = 6;
+        private static final long imageCountForNonSharedHost = 3;
+
+        // Add a buffer to allow a small increase in image size
+        private static final long bufferSharedHost = 5;
+        private static final long bufferNonSharedHost = 3;
+
+        private final boolean sharedHost;
+
+        PartitionSizer(NodeType nodeType, boolean sharedHost) {
+            this.sharedHost = sharedHost;
+        }
+
+        long getPartitionSize() {
+            return imageSize() * imageCount() + buffer();
+        }
+
+        private long imageSize() {
+            return (long)7.7; // return (long)VespaContainerImage.maxImageSize(hostedSystem, nodeType);
+        }
+
+        private long buffer() {
+            return sharedHost ? bufferSharedHost : bufferNonSharedHost;
+        }
+
+        private long imageCount() {
+            return sharedHost ? imageCountForSharedHost : imageCountForNonSharedHost;
+        }
+
+    }
+
+}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java
index cd5f18db516..c42b61988e9 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java
@@ -34,4 +34,7 @@ public class VespaFlavor {
 
     public NodeResources advertisedResources() { return advertisedResources; }
 
+    @Override
+    public String toString() { return "flavor " + name; }
+
 }
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index a21f623fbfa..50cbe8879b5 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -13,11 +13,13 @@ vespa_define_module(
        lz4
        xxhash
        zstd
+       ${VESPA_URING_LIB}
 
    APPS
        src/apps/make_fixture_macros
        src/apps/vespa-detect-hostname
        src/apps/vespa-drop-file-from-cache
+       src/apps/vespa-probe-io-uring
        src/apps/vespa-resource-limits
        src/apps/vespa-tsan-digest
        src/apps/vespa-validate-hostname
diff --git a/vespalib/src/apps/vespa-probe-io-uring/.gitignore b/vespalib/src/apps/vespa-probe-io-uring/.gitignore
new file mode 100644
index 00000000000..9fa20bbf9d4
--- /dev/null
+++ b/vespalib/src/apps/vespa-probe-io-uring/.gitignore
@@ -0,0 +1 @@
+/vespa-probe-io-uring
diff --git a/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt b/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt
new file mode 100644
index 00000000000..24ceda46a87
--- /dev/null
+++ b/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_vespa-probe-io-uring_app
+    SOURCES
+    probe_io_uring.cpp
+    OUTPUT_NAME vespa-probe-io-uring
+    INSTALL bin
+    DEPENDS
+    vespalib
+)
diff --git a/vespalib/src/apps/vespa-probe-io-uring/probe_io_uring.cpp b/vespalib/src/apps/vespa-probe-io-uring/probe_io_uring.cpp
new file mode 100644
index 00000000000..ef3e6ddbef2
--- /dev/null
+++ b/vespalib/src/apps/vespa-probe-io-uring/probe_io_uring.cpp
@@ -0,0 +1,95 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/config.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef VESPA_HAS_IO_URING
+
+#include <sys/utsname.h>
+#include <liburing.h>
+#include <liburing/io_uring.h>
+
+#include <string>
+#include <vector>
+
+std::vector<std::string> op_names = {
+    "IORING_OP_NOP",
+    "IORING_OP_READV",
+    "IORING_OP_WRITEV",
+    "IORING_OP_FSYNC",
+    "IORING_OP_READ_FIXED",
+    "IORING_OP_WRITE_FIXED",
+    "IORING_OP_POLL_ADD",
+    "IORING_OP_POLL_REMOVE",
+    "IORING_OP_SYNC_FILE_RANGE",
+    "IORING_OP_SENDMSG",
+    "IORING_OP_RECVMSG",
+    "IORING_OP_TIMEOUT",
+    "IORING_OP_TIMEOUT_REMOVE",
+    "IORING_OP_ACCEPT",
+    "IORING_OP_ASYNC_CANCEL",
+    "IORING_OP_LINK_TIMEOUT",
+    "IORING_OP_CONNECT",
+    "IORING_OP_FALLOCATE",
+    "IORING_OP_OPENAT",
+    "IORING_OP_CLOSE",
+    "IORING_OP_FILES_UPDATE",
+    "IORING_OP_STATX",
+    "IORING_OP_READ",
+    "IORING_OP_WRITE",
+    "IORING_OP_FADVISE",
+    "IORING_OP_MADVISE",
+    "IORING_OP_SEND",
+    "IORING_OP_RECV",
+    "IORING_OP_OPENAT2",
+    "IORING_OP_EPOLL_CTL",
+    "IORING_OP_SPLICE",
+    "IORING_OP_PROVIDE_BUFFERS",
+    "IORING_OP_REMOVE_BUFFERS",
+    "IORING_OP_TEE",
+    "IORING_OP_SHUTDOWN",
+    "IORING_OP_RENAMEAT",
+    "IORING_OP_UNLINKAT",
+    "IORING_OP_MKDIRAT",
+    "IORING_OP_SYMLINKAT",
+    "IORING_OP_LINKAT",
+    "IORING_OP_MSG_RING",
+    "IORING_OP_FSETXATTR",
+    "IORING_OP_SETXATTR",
+    "IORING_OP_FGETXATTR",
+    "IORING_OP_GETXATTR",
+    "IORING_OP_SOCKET",
+    "IORING_OP_URING_CMD",
+    "IORING_OP_SEND_ZC",
+    "IORING_OP_SENDMSG_ZC"
+};
+
+int main() {
+    fprintf(stderr, "Vespa was compiled with io_uring\n");
+    utsname host_info;
+    uname(&host_info);
+    fprintf(stderr, "kernel version: %s\n", host_info.release);
+    io_uring_probe *probe = io_uring_get_probe();
+    if (probe == nullptr) {
+        fprintf(stderr, "io_uring probe failed!\n");
+        return 1;
+    }
+    fprintf(stderr, "operation support: {\n");
+    for (size_t i = 0; i < op_names.size(); ++i) {
+        fprintf(stderr, "  %s: %s\n", op_names[i].c_str(),
+                io_uring_opcode_supported(probe, i) ? "yes" : "no");
+    }
+    fprintf(stderr, "}\n");
+    free(probe);
+    return 0;
+}
+
+#else // VESPA_HAS_IO_URING
+
+int main() {
+    fprintf(stderr, "Vespa was compiled without io_uring\n");
+    return 1;
+}
+
+#endif // VESPA_HAS_IO_URING
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
index 0f090f83c0b..3e3ae08ff64 100644
--- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
+++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
@@ -29,7 +29,7 @@ public class Reconfigurer extends AbstractComponent {
 
     private static final Logger log = java.util.logging.Logger.getLogger(Reconfigurer.class.getName());
 
-    private static final Duration TIMEOUT = Duration.ofMinutes(3);
+    private static final Duration TIMEOUT = Duration.ofMinutes(15);
 
     private final ExponentialBackoff backoff = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10));
     private final VespaZooKeeperAdmin vespaZooKeeperAdmin;
@@ -97,7 +97,7 @@ public class Reconfigurer extends AbstractComponent {
         Duration reconfigTimeout = reconfigTimeout();
         Instant end = now.plus(reconfigTimeout);
         // Loop reconfiguring since we might need to wait until another reconfiguration is finished before we can succeed
-        for (int attempt = 1; now.isBefore(end); attempt++) {
+        for (int attempt = 1; ; attempt++) {
             try {
                 Instant reconfigStarted = Instant.now();
                 vespaZooKeeperAdmin.reconfigure(connectionSpec, newServers);
@@ -110,17 +110,19 @@ public class Reconfigurer extends AbstractComponent {
                 return;
             } catch (ReconfigException e) {
                 Duration delay = backoff.delay(attempt);
-                log.log(Level.WARNING, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay +
-                        ", time left " + Duration.between(now, end) + ": " +
-                        Exceptions.toMessageString(e));
-                sleeper.sleep(delay);
-            } finally {
                 now = Instant.now();
+                if (now.isBefore(end)) {
+                    log.log(Level.WARNING, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay +
+                            ", time left " + Duration.between(now, end) + ": " + Exceptions.toMessageString(e));
+                    sleeper.sleep(delay);
+                }
+                else {
+                    log.log(Level.SEVERE, "Reconfiguration attempt " + attempt + " failed, and was failing for " +
+                            reconfigTimeout + "; giving up now: " + Exceptions.toMessageString(e));
+                    shutdownAndDie(reconfigTimeout);
+                }
             }
         }
-
-        // Reconfiguration failed
-        shutdownAndDie(reconfigTimeout);
     }
 
     private void shutdownAndDie(Duration reconfigTimeout) {