summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--flags/src/main/java/com/yahoo/vespa/flags/Flags.java7
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java36
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java118
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpReporter.java11
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java34
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java101
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java13
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java87
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java20
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java1
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java9
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java50
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java3
-rw-r--r--vespalib/CMakeLists.txt2
-rw-r--r--vespalib/src/apps/vespa-probe-io-uring/.gitignore1
-rw-r--r--vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt9
-rw-r--r--vespalib/src/apps/vespa-probe-io-uring/probe_io_uring.cpp95
-rw-r--r--zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java22
18 files changed, 322 insertions, 297 deletions
diff --git a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
index 91904deefd5..7934215c165 100644
--- a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
+++ b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java
@@ -404,13 +404,6 @@ public class Flags {
"Enable mail verification",
"Takes effect immediately");
- public static final UnboundBooleanFlag REPORT_CORES_VIA_CFG = defineFeatureFlag(
- "report-cores-via-cfg", true,
- List.of("hakonhall"), "2022-11-01", "2022-12-01",
- "If true, report core dumps to the config server instead of directly to the panic app.",
- "Takes effect on the next tick.",
- ZONE_ID, NODE_TYPE, HOSTNAME);
-
public static final UnboundStringFlag CORE_ENCRYPTION_PUBLIC_KEY_ID = defineStringFlag(
"core-encryption-public-key-id", "",
List.of("vekterli"), "2022-11-03", "2023-02-01",
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
index d99b45dbbbc..cfb2d240bfa 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollector.java
@@ -10,9 +10,7 @@ import com.yahoo.vespa.hosted.node.admin.task.util.process.CommandResult;
import java.nio.file.Path;
import java.util.Arrays;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
@@ -32,9 +30,6 @@ public class CoreCollector {
private static final Pattern FROM_PATH_PATTERN = Pattern.compile("^.* from '(?<path>.*?)'");
static final String GDB_PATH_RHEL8 = "/opt/rh/gcc-toolset-11/root/bin/gdb";
- static final Map<String, Object> JAVA_HEAP_DUMP_METADATA =
- Map.of("bin_path", "java", "backtrace", List.of("Heap dump, no backtrace available"));
-
private final ContainerOperations container;
public CoreCollector(ContainerOperations container) {
@@ -100,36 +95,7 @@ public class CoreCollector {
return List.of(result.getOutput().split("\n"));
}
- /**
- * Collects metadata about a given core dump
- * @param context context of the NodeAgent that owns the core dump
- * @param coredumpPath path to core dump file inside the container
- * @return map of relevant metadata about the core dump
- */
- Map<String, Object> collect(NodeAgentContext context, ContainerPath coredumpPath) {
- if (JAVA_HEAP_DUMP_PATTERN.matcher(coredumpPath.getFileName().toString()).find())
- return JAVA_HEAP_DUMP_METADATA;
-
- Map<String, Object> data = new HashMap<>();
- try {
- String binPath = readBinPath(context, coredumpPath);
-
- data.put("bin_path", binPath);
- if (Path.of(binPath).getFileName().toString().equals("java")) {
- data.put("backtrace_all_threads", readJstack(context, coredumpPath, binPath));
- } else {
- data.put("backtrace", readBacktrace(context, coredumpPath, binPath, false));
- data.put("backtrace_all_threads", readBacktrace(context, coredumpPath, binPath, true));
- }
- } catch (ConvergenceException e) {
- context.log(logger, Level.WARNING, "Failed to extract backtrace: " + e.getMessage());
- } catch (RuntimeException e) {
- context.log(logger, Level.WARNING, "Failed to extract backtrace", e);
- }
- return data;
- }
-
- CoreDumpMetadata collect2(NodeAgentContext context, ContainerPath coredumpPath) {
+ CoreDumpMetadata collect(NodeAgentContext context, ContainerPath coredumpPath) {
var metadata = new CoreDumpMetadata();
if (JAVA_HEAP_DUMP_PATTERN.matcher(coredumpPath.getFileName().toString()).find()) {
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
index 4a8353d92ec..94f402d5332 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandler.java
@@ -1,14 +1,10 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.node.ObjectNode;
import com.yahoo.config.provision.DockerImage;
import com.yahoo.security.KeyId;
import com.yahoo.security.SecretSharedKey;
import com.yahoo.security.SharedKeyGenerator;
-import com.yahoo.vespa.flags.BooleanFlag;
import com.yahoo.vespa.flags.FetchVector;
import com.yahoo.vespa.flags.FlagSource;
import com.yahoo.vespa.flags.Flags;
@@ -26,19 +22,16 @@ import com.yahoo.vespa.hosted.node.admin.task.util.file.FileDeleter;
import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder;
import com.yahoo.vespa.hosted.node.admin.task.util.file.FileMover;
import com.yahoo.vespa.hosted.node.admin.task.util.file.MakeDirectory;
-import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath;
import com.yahoo.vespa.hosted.node.admin.task.util.fs.ContainerPath;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UncheckedIOException;
-import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Clock;
import java.util.Comparator;
-import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -61,64 +54,49 @@ import static com.yahoo.yolean.Exceptions.uncheck;
*/
public class CoredumpHandler {
+ public static final String COREDUMP_FILENAME_PREFIX = "dump_";
+
+ private static final Logger logger = Logger.getLogger(CoredumpHandler.class.getName());
private static final Pattern HS_ERR_PATTERN = Pattern.compile("hs_err_pid[0-9]+\\.log");
private static final String PROCESSING_DIRECTORY_NAME = "processing";
- private static final String METADATA_FILE_NAME = "metadata.json";
private static final String METADATA2_FILE_NAME = "metadata2.json";
private static final String COMPRESSED_EXTENSION = ".zst";
private static final String ENCRYPTED_EXTENSION = ".enc";
- public static final String COREDUMP_FILENAME_PREFIX = "dump_";
-
- private final Logger logger = Logger.getLogger(CoredumpHandler.class.getName());
- private final ObjectMapper objectMapper = new ObjectMapper();
private final CoreCollector coreCollector;
private final Cores cores;
- private final CoredumpReporter coredumpReporter;
private final String crashPatchInContainer;
private final Path doneCoredumpsPath;
private final Metrics metrics;
private final Clock clock;
private final Supplier<String> coredumpIdSupplier;
private final SecretSharedKeySupplier secretSharedKeySupplier;
- private final BooleanFlag reportCoresViaCfgFlag;
private final StringFlag coreEncryptionPublicKeyIdFlag;
/**
* @param crashPathInContainer path inside the container where core dump are dumped
* @param doneCoredumpsPath path on host where processed core dumps are stored
*/
- public CoredumpHandler(CoreCollector coreCollector, Cores cores, CoredumpReporter coredumpReporter,
- String crashPathInContainer, Path doneCoredumpsPath, Metrics metrics,
- FlagSource flagSource) {
- this(coreCollector, cores, coredumpReporter, crashPathInContainer, doneCoredumpsPath,
- metrics, Clock.systemUTC(), () -> UUID.randomUUID().toString(), (ctx) -> Optional.empty() /*TODO*/,
- flagSource);
- }
-
- // TODO remove redundant constructor once internal callsite has been updated
- public CoredumpHandler(CoreCollector coreCollector, Cores cores, CoredumpReporter coredumpReporter,
+ public CoredumpHandler(CoreCollector coreCollector, Cores cores,
String crashPathInContainer, Path doneCoredumpsPath, Metrics metrics,
SecretSharedKeySupplier secretSharedKeySupplier, FlagSource flagSource) {
- this(coreCollector, cores, coredumpReporter, crashPathInContainer, doneCoredumpsPath,
+ this(coreCollector, cores, crashPathInContainer, doneCoredumpsPath,
metrics, Clock.systemUTC(), () -> UUID.randomUUID().toString(), secretSharedKeySupplier,
flagSource);
}
- CoredumpHandler(CoreCollector coreCollector, Cores cores, CoredumpReporter coredumpReporter,
+ CoredumpHandler(CoreCollector coreCollector, Cores cores,
String crashPathInContainer, Path doneCoredumpsPath, Metrics metrics,
Clock clock, Supplier<String> coredumpIdSupplier,
SecretSharedKeySupplier secretSharedKeySupplier, FlagSource flagSource) {
this.coreCollector = coreCollector;
this.cores = cores;
- this.coredumpReporter = coredumpReporter;
this.crashPatchInContainer = crashPathInContainer;
this.doneCoredumpsPath = doneCoredumpsPath;
this.metrics = metrics;
this.clock = clock;
this.coredumpIdSupplier = coredumpIdSupplier;
this.secretSharedKeySupplier = secretSharedKeySupplier;
- this.reportCoresViaCfgFlag = Flags.REPORT_CORES_VIA_CFG.bindTo(flagSource);
this.coreEncryptionPublicKeyIdFlag = Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.bindTo(flagSource);
}
@@ -143,13 +121,7 @@ public class CoredumpHandler {
// Check if we have already started to process a core dump or we can enqueue a new core one
getCoredumpToProcess(context, containerCrashPath, containerProcessingPath)
- .ifPresent(path -> {
- if (reportCoresViaCfgFlag.with(FetchVector.Dimension.NODE_TYPE, context.nodeType().name()).value()) {
- processAndReportSingleCoreDump2(context, path, dockerImage);
- } else {
- processAndReportSingleCoredump(context, path, nodeAttributesSupplier);
- }
- });
+ .ifPresent(path -> processAndReportSingleCoreDump(context, path, dockerImage));
}
/** @return path to directory inside processing directory that contains a core dump file to process */
@@ -212,65 +184,6 @@ public class CoredumpHandler {
return coreEncryptionPublicKeyIdFlag.with(FetchVector.Dimension.NODE_TYPE, context.nodeType().name()).value();
}
- void processAndReportSingleCoredump(NodeAgentContext context, ContainerPath coredumpDirectory, Supplier<Map<String, Object>> nodeAttributesSupplier) {
- try {
- Optional<SecretSharedKey> sharedCoreKey = Optional.of(corePublicKeyFlagValue(context))
- .filter(k -> !k.isEmpty())
- .map(KeyId::ofString)
- .flatMap(secretSharedKeySupplier::create);
- Optional<String> decryptionToken = sharedCoreKey.map(k -> k.sealedSharedKey().toTokenString());
- String metadata = getMetadata(context, coredumpDirectory, nodeAttributesSupplier, decryptionToken);
- String coredumpId = coredumpDirectory.getFileName().toString();
- coredumpReporter.reportCoredump(coredumpId, metadata);
- finishProcessing(context, coredumpDirectory, sharedCoreKey);
- context.log(logger, "Successfully reported coredump " + coredumpId);
- } catch (Exception e) {
- throw new RuntimeException("Failed to process coredump " + coredumpDirectory, e);
- }
- }
-
- /**
- * @return coredump metadata from metadata.json if present, otherwise attempts to get metadata using
- * {@link CoreCollector} and stores it to metadata.json
- */
- String getMetadata(NodeAgentContext context, ContainerPath coredumpDirectory, Supplier<Map<String, Object>> nodeAttributesSupplier, Optional<String> decryptionToken) throws IOException {
- UnixPath metadataPath = new UnixPath(coredumpDirectory.resolve(METADATA_FILE_NAME));
- if (!metadataPath.exists()) {
- ContainerPath coredumpFile = findCoredumpFileInProcessingDirectory(coredumpDirectory);
- Map<String, Object> metadata = new HashMap<>(coreCollector.collect(context, coredumpFile));
- metadata.putAll(nodeAttributesSupplier.get());
- metadata.put("coredump_path", doneCoredumpsPath
- .resolve(context.containerName().asString())
- .resolve(coredumpDirectory.getFileName().toString())
- .resolve(coredumpFile.getFileName().toString()).toString());
- decryptionToken.ifPresent(token -> metadata.put("decryption_token", token));
-
- String metadataFields = objectMapper.writeValueAsString(Map.of("fields", metadata));
- metadataPath.writeUtf8File(metadataFields);
- return metadataFields;
- } else {
- if (decryptionToken.isPresent()) {
- // Since encryption keys are single-use and generated for each core dump processing invocation,
- // we must ensure we store and report the token associated with the _latest_ (i.e. current)
- // attempt at processing the core dump. Patch and rewrite the file with a new token, if present.
- String metadataFields = metadataWithPatchedTokenValue(metadataPath, decryptionToken.get());
- metadataPath.deleteIfExists();
- metadataPath.writeUtf8File(metadataFields);
- return metadataFields;
- } else {
- return metadataPath.readUtf8File();
- }
- }
- }
-
- private String metadataWithPatchedTokenValue(UnixPath metadataPath, String decryptionToken) throws JsonProcessingException {
- var jsonRoot = objectMapper.readTree(metadataPath.readUtf8File());
- if (jsonRoot.path("fields").isObject()) {
- ((ObjectNode)jsonRoot.get("fields")).put("decryption_token", decryptionToken);
- } // else: unit testing case without real metadata
- return objectMapper.writeValueAsString(jsonRoot);
- }
-
static OutputStream maybeWrapWithEncryption(OutputStream wrappedStream, Optional<SecretSharedKey> sharedCoreKey) {
return sharedCoreKey
.map(key -> SharedKeyGenerator.makeAesGcmEncryptionCipher(key).wrapOutputStream(wrappedStream))
@@ -372,8 +285,8 @@ public class CoredumpHandler {
return clock.instant().minusSeconds(60).isAfter(fileAttributes.lastModifiedTime());
}
- void processAndReportSingleCoreDump2(NodeAgentContext context, ContainerPath coreDumpDirectory,
- Optional<DockerImage> dockerImage) {
+ void processAndReportSingleCoreDump(NodeAgentContext context, ContainerPath coreDumpDirectory,
+ Optional<DockerImage> dockerImage) {
CoreDumpMetadata metadata = gatherMetadata(context, coreDumpDirectory);
dockerImage.ifPresent(metadata::setDockerImage);
dockerImage.flatMap(DockerImage::tag).ifPresent(metadata::setVespaVersion);
@@ -390,20 +303,17 @@ public class CoredumpHandler {
finishProcessing(context, coreDumpDirectory, sharedCoreKey);
}
- private CoreDumpMetadata gatherMetadata(NodeAgentContext context, ContainerPath coreDumpDirectory) {
+ CoreDumpMetadata gatherMetadata(NodeAgentContext context, ContainerPath coreDumpDirectory) {
ContainerPath metadataPath = coreDumpDirectory.resolve(METADATA2_FILE_NAME);
Optional<ReportCoreDumpRequest> request = ReportCoreDumpRequest.load(metadataPath);
if (request.isPresent()) {
- return request.map(requestInstance -> {
- var metadata = new CoreDumpMetadata();
- requestInstance.populateMetadata(metadata, FileSystems.getDefault());
- return metadata;
- })
- .get();
+ var metadata = new CoreDumpMetadata();
+ request.get().populateMetadata(metadata, doneCoredumpsPath.getFileSystem());
+ return metadata;
}
ContainerPath coreDumpFile = findCoredumpFileInProcessingDirectory(coreDumpDirectory);
- CoreDumpMetadata metadata = coreCollector.collect2(context, coreDumpFile);
+ CoreDumpMetadata metadata = coreCollector.collect(context, coreDumpFile);
metadata.setCpuMicrocodeVersion(getMicrocodeVersion())
.setKernelVersion(System.getProperty("os.version"))
.setCoreDumpPath(doneCoredumpsPath.resolve(context.containerName().asString())
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpReporter.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpReporter.java
deleted file mode 100644
index 0e2bb724f50..00000000000
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpReporter.java
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
-
-/**
- * @author freva
- */
-public interface CoredumpReporter {
-
- /** Report a coredump with a given ID and given metadata */
- void reportCoredump(String id, String metadata);
-}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java
index ea73fd4af6f..8f251bf72c5 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoreCollectorTest.java
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
+import com.yahoo.vespa.hosted.node.admin.configserver.cores.CoreDumpMetadata;
import com.yahoo.vespa.hosted.node.admin.container.ContainerOperations;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl;
@@ -10,10 +11,8 @@ import com.yahoo.vespa.test.file.TestFileSystem;
import org.junit.jupiter.api.Test;
import java.util.List;
-import java.util.Map;
import static com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoreCollector.GDB_PATH_RHEL8;
-import static com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoreCollector.JAVA_HEAP_DUMP_METADATA;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
import static org.mockito.Mockito.mock;
@@ -23,7 +22,6 @@ import static org.mockito.Mockito.when;
* @author freva
*/
public class CoreCollectorTest {
- private final String JDK_PATH = "/path/to/jdk/java";
private final ContainerOperations docker = mock(ContainerOperations.class);
private final CoreCollector coreCollector = new CoreCollector(docker);
private final NodeAgentContext context = NodeAgentContextImpl.builder("container-123.domain.tld")
@@ -144,11 +142,10 @@ public class CoreCollectorTest {
"/usr/bin/program", "/tmp/core.1234"},
String.join("\n", GDB_BACKTRACE));
- Map<String, Object> expectedData = Map.of(
- "bin_path", TEST_BIN_PATH,
- "backtrace", GDB_BACKTRACE,
- "backtrace_all_threads", GDB_BACKTRACE);
- assertEquals(expectedData, coreCollector.collect(context, TEST_CORE_PATH));
+ var expected = new CoreDumpMetadata().setBinPath(TEST_BIN_PATH)
+ .setBacktrace(GDB_BACKTRACE)
+ .setBacktraceAllThreads(GDB_BACKTRACE);
+ assertEquals(expected, coreCollector.collect(context, TEST_CORE_PATH));
}
@Test
@@ -159,8 +156,8 @@ public class CoreCollectorTest {
mockExec(new String[]{GDB_PATH_RHEL8 + " -n -ex set print frame-arguments none -ex bt -batch /usr/bin/program /tmp/core.1234"},
"", "Failure");
- Map<String, Object> expectedData = Map.of("bin_path", TEST_BIN_PATH);
- assertEquals(expectedData, coreCollector.collect(context, TEST_CORE_PATH));
+ var expected = new CoreDumpMetadata().setBinPath(TEST_BIN_PATH);
+ assertEquals(expected, coreCollector.collect(context, TEST_CORE_PATH));
}
@Test
@@ -168,22 +165,25 @@ public class CoreCollectorTest {
mockExec(new String[]{"file", TEST_CORE_PATH.pathInContainer()},
"dump.core.5954: ELF 64-bit LSB core file x86-64, version 1 (SYSV), too many program header sections (33172)");
+ String jdkPath = "/path/to/jdk/java";
mockExec(new String[]{GDB_PATH_RHEL8, "-n", "-batch", "-core", "/tmp/core.1234"},
- "Core was generated by `" + JDK_PATH + " -Dconfig.id=default/container.11 -XX:+Pre'.");
+ "Core was generated by `" + jdkPath + " -Dconfig.id=default/container.11 -XX:+Pre'.");
String jstack = "jstack11";
- mockExec(new String[]{"jhsdb", "jstack", "--exe", JDK_PATH, "--core", "/tmp/core.1234"},
+ mockExec(new String[]{"jhsdb", "jstack", "--exe", jdkPath, "--core", "/tmp/core.1234"},
jstack);
- Map<String, Object> expectedData = Map.of(
- "bin_path", JDK_PATH,
- "backtrace_all_threads", List.of(jstack));
- assertEquals(expectedData, coreCollector.collect(context, TEST_CORE_PATH));
+ var expected = new CoreDumpMetadata().setBinPath(jdkPath)
+ .setBacktraceAllThreads(List.of(jstack));
+ assertEquals(expected, coreCollector.collect(context, TEST_CORE_PATH));
}
@Test
void metadata_for_java_heap_dump() {
- assertEquals(JAVA_HEAP_DUMP_METADATA, coreCollector.collect(context, context.paths().of("/dump_java_pid123.hprof")));
+ var expected = new CoreDumpMetadata().setBinPath("java")
+ .setBacktrace(List.of("Heap dump, no backtrace available"));
+
+ assertEquals(expected, coreCollector.collect(context, context.paths().of("/dump_java_pid123.hprof")));
}
private void mockExec(String[] cmd, String output) {
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java
index b096db2eaf4..1fd688558a0 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/maintenance/coredump/CoredumpHandlerTest.java
@@ -1,12 +1,14 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.maintenance.coredump;
+import com.yahoo.config.provision.DockerImage;
import com.yahoo.security.KeyId;
import com.yahoo.security.SealedSharedKey;
import com.yahoo.security.SecretSharedKey;
import com.yahoo.test.ManualClock;
import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.InMemoryFlagSource;
+import com.yahoo.vespa.hosted.node.admin.configserver.cores.CoreDumpMetadata;
import com.yahoo.vespa.hosted.node.admin.configserver.cores.Cores;
import com.yahoo.vespa.hosted.node.admin.container.metrics.DimensionMetrics;
import com.yahoo.vespa.hosted.node.admin.container.metrics.Metrics;
@@ -26,7 +28,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.FileTime;
import java.time.Duration;
-import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -36,9 +37,12 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import static com.yahoo.yolean.Exceptions.uncheck;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
@@ -57,7 +61,6 @@ public class CoredumpHandlerTest {
private final CoreCollector coreCollector = mock(CoreCollector.class);
private final Cores cores = mock(Cores.class);
- private final CoredumpReporter coredumpReporter = mock(CoredumpReporter.class);
private final Metrics metrics = new Metrics();
private final ManualClock clock = new ManualClock();
@SuppressWarnings("unchecked")
@@ -65,7 +68,7 @@ public class CoredumpHandlerTest {
private final SecretSharedKeySupplier secretSharedKeySupplier = mock(SecretSharedKeySupplier.class);
private final InMemoryFlagSource flagSource = new InMemoryFlagSource();
private final CoredumpHandler coredumpHandler =
- new CoredumpHandler(coreCollector, cores, coredumpReporter, containerCrashPath.pathInContainer(),
+ new CoredumpHandler(coreCollector, cores, containerCrashPath.pathInContainer(),
doneCoredumpsPath, metrics, clock, coredumpIdSupplier, secretSharedKeySupplier,
flagSource);
@@ -148,31 +151,14 @@ public class CoredumpHandlerTest {
verify(coredumpIdSupplier, times(1)).get();
}
- private static String buildExpectedMetadataString(Optional<String> decryptionToken) {
- return "{\"fields\":{" +
- "\"hostname\":\"host123.yahoo.com\"," +
- "\"kernel_version\":\"3.10.0-862.9.1.el7.x86_64\"," +
- "\"backtrace\":[\"call 1\",\"function 2\",\"something something\"]," +
- "\"vespa_version\":\"6.48.4\"," +
- "\"bin_path\":\"/bin/bash\"," +
- "\"coredump_path\":\"/home/docker/dumps/container-123/id-123/dump_core.456\"," +
- "\"docker_image\":\"vespa/ci:6.48.4\"" +
- decryptionToken.map(",\"decryption_token\":\"%s\""::formatted).orElse("") +
- "}}";
- }
-
- void do_get_metadata_test(Optional<String> oldDecryptionToken, Optional<String> newDecryptionToken) throws IOException {
- Map<String, Object> metadata = new HashMap<>();
- metadata.put("bin_path", "/bin/bash");
- metadata.put("backtrace", List.of("call 1", "function 2", "something something"));
-
- Map<String, Object> attributes = Map.of(
- "hostname", "host123.yahoo.com",
- "vespa_version", "6.48.4",
- "kernel_version", "3.10.0-862.9.1.el7.x86_64",
- "docker_image", "vespa/ci:6.48.4");
-
- String expectedMetadataStr = buildExpectedMetadataString(oldDecryptionToken);
+ @Test
+ void gather_metadata_test() throws IOException {
+ var metadata = new CoreDumpMetadata().setKernelVersion("3.10.0-862.9.1.el7.x86_64")
+ .setBacktrace(List.of("call 1", "function 2", "something something"))
+ .setVespaVersion("6.48.4")
+ .setBinPath("/bin/bash")
+ .setCoreDumpPath(context.paths().of("/home/docker/dumps/container-123/id-123/dump_core.456"))
+ .setDockerImage(DockerImage.fromString("example.com/vespa/ci:6.48.4"));
ContainerPath coredumpDirectory = context.paths().of("/var/crash/id-123");
Files.createDirectories(coredumpDirectory.pathOnHost());
@@ -180,40 +166,20 @@ public class CoredumpHandlerTest {
when(coreCollector.collect(eq(context), eq(coredumpDirectory.resolve("dump_core.456"))))
.thenReturn(metadata);
- assertEquals(expectedMetadataStr, coredumpHandler.getMetadata(context, coredumpDirectory, () -> attributes, oldDecryptionToken));
+ assertEquals(metadata, coredumpHandler.gatherMetadata(context, coredumpDirectory));
verify(coreCollector, times(1)).collect(any(), any());
- // Calling it again will read the previously generated metadata from disk and selectively
- // patch in an updated decryption token value, if one is provided.
- // This avoids having to re-run a potentially expensive collector step.
- expectedMetadataStr = buildExpectedMetadataString(newDecryptionToken);
- assertEquals(expectedMetadataStr, coredumpHandler.getMetadata(context, coredumpDirectory, () -> attributes, newDecryptionToken));
+ // On second invocation the test already exist, so times(1) is not incremented
+ assertEquals(metadata, coredumpHandler.gatherMetadata(context, coredumpDirectory));
+ doThrow(new IllegalStateException("Should not be invoked"))
+ .when(coreCollector).collect(any(), any());
verify(coreCollector, times(1)).collect(any(), any());
}
@Test
- void get_metadata_test_without_encryption() throws IOException {
- do_get_metadata_test(Optional.empty(), Optional.empty()); // No token in metadata
- }
-
- @Test
- void get_metadata_test_with_encryption() throws IOException {
- when(secretSharedKeySupplier.create(any())).thenReturn(Optional.of(makeFixedSecretSharedKey()));
- do_get_metadata_test(Optional.of("AVeryCoolToken"), Optional.of("AnEvenCoolerToken"));
- }
-
- @Test
- void get_metadata_test_without_encryption_then_with_encryption() throws IOException {
- // Edge where encryption was enabled between attempted processing runs.
- // We don't bother with testing the opposite edge case (encryption -> no encryption), since
- // in that case the core dump itself won't be encrypted so the token will be a no-op.
- do_get_metadata_test(Optional.empty(), Optional.of("TheSwaggestToken"));
- }
-
- @Test
void cant_get_metadata_if_no_core_file() {
assertThrows(IllegalStateException.class, () -> {
- coredumpHandler.getMetadata(context, context.paths().of("/fake/path"), Map::of, Optional.empty());
+ coredumpHandler.gatherMetadata(context, context.paths().of("/fake/path"));
});
}
@@ -228,31 +194,34 @@ public class CoredumpHandlerTest {
});
}
- void do_process_single_coredump_test(String expectedCoreFileName) throws IOException {
+ void do_process_single_coredump_test(String expectedCoreFileName, boolean encrypt) throws IOException {
ContainerPath coredumpDirectory = context.paths().of("/path/to/coredump/proccessing/id-123");
Files.createDirectories(coredumpDirectory);
- Files.write(coredumpDirectory.resolve("metadata.json"), "{\"test-metadata\":{}}".getBytes());
+ Files.write(coredumpDirectory.resolve("metadata2.json"), "{\"test-metadata\":{}}".getBytes());
Files.createFile(coredumpDirectory.resolve("dump_bash.core.431"));
- assertFolderContents(coredumpDirectory, "metadata.json", "dump_bash.core.431");
+ assertFolderContents(coredumpDirectory, "metadata2.json", "dump_bash.core.431");
+ CoreDumpMetadata expectedMetadata = new CoreDumpMetadata();
+ if (encrypt)
+ expectedMetadata.setDecryptionToken("XWTN195KaZecNCloqRICq9YwxwWJEKGqyMYcTgHtwaZfnXtzHzd7bPrPPR60VdxxzA");
- coredumpHandler.processAndReportSingleCoredump(context, coredumpDirectory, Map::of);
+ coredumpHandler.processAndReportSingleCoreDump(context, coredumpDirectory, Optional.empty());
verify(coreCollector, never()).collect(any(), any());
- verify(coredumpReporter, times(1)).reportCoredump(eq("id-123"), eq("{\"test-metadata\":{}}"));
+ verify(cores, times(1)).report(eq(context.hostname()), eq("id-123"), eq(expectedMetadata));
assertFalse(Files.exists(coredumpDirectory));
assertFolderContents(doneCoredumpsPath.resolve("container-123"), "id-123");
- assertFolderContents(doneCoredumpsPath.resolve("container-123").resolve("id-123"), "metadata.json", expectedCoreFileName);
+ assertFolderContents(doneCoredumpsPath.resolve("container-123").resolve("id-123"), "metadata2.json", expectedCoreFileName);
}
@Test
void process_single_coredump_test_without_encryption() throws IOException {
- do_process_single_coredump_test("dump_bash.core.431.zst");
+ do_process_single_coredump_test("dump_bash.core.431.zst", false);
}
@Test
void process_single_coredump_test_with_encryption() throws IOException {
flagSource.withStringFlag(Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.id(), "bar-key");
when(secretSharedKeySupplier.create(KeyId.ofString("bar-key"))).thenReturn(Optional.of(makeFixedSecretSharedKey()));
- do_process_single_coredump_test("dump_bash.core.431.zst.enc");
+ do_process_single_coredump_test("dump_bash.core.431.zst.enc", true);
}
// TODO fail closed instead of open
@@ -260,7 +229,7 @@ public class CoredumpHandlerTest {
void encryption_disabled_when_no_public_key_set_in_feature_flag() throws IOException {
flagSource.withStringFlag(Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.id(), ""); // empty -> not set
verify(secretSharedKeySupplier, never()).create(any());
- do_process_single_coredump_test("dump_bash.core.431.zst"); // No .enc suffix; not encrypted
+ do_process_single_coredump_test("dump_bash.core.431.zst", false); // No .enc suffix; not encrypted
}
// TODO fail closed instead of open
@@ -268,7 +237,7 @@ public class CoredumpHandlerTest {
void encryption_disabled_when_no_key_returned_for_key_id_specified_by_feature_flag() throws IOException {
flagSource.withStringFlag(Flags.CORE_ENCRYPTION_PUBLIC_KEY_ID.id(), "baz-key");
when(secretSharedKeySupplier.create(KeyId.ofString("baz-key"))).thenReturn(Optional.empty());
- do_process_single_coredump_test("dump_bash.core.431.zst"); // No .enc suffix; not encrypted
+ do_process_single_coredump_test("dump_bash.core.431.zst", false); // No .enc suffix; not encrypted
}
@Test
@@ -278,7 +247,7 @@ public class CoredumpHandlerTest {
Files.createFile(containerCrashPath.resolve("dump-2"));
Files.createFile(containerCrashPath.resolve("hs_err_pid2.log"));
Files.createDirectory(processingPath);
- Files.createFile(processingPath.resolve("metadata.json"));
+ Files.createFile(processingPath.resolve("metadata2.json"));
Files.createFile(processingPath.resolve("dump-3"));
new UnixPath(doneCoredumpsPath.resolve("container-123").resolve("dump-3-folder").resolve("dump-3"))
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
index 3d76c8e3f94..1406aaecb71 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
@@ -167,13 +167,18 @@ public class AllocatableClusterResources {
advertisedResources = systemLimits.enlargeToLegal(advertisedResources, clusterSpec, exclusive); // Ask for something legal
advertisedResources = applicationLimits.cap(advertisedResources); // Overrides other conditions, even if it will then fail
var realResources = nodeRepository.resourcesCalculator().requestToReal(advertisedResources, exclusive); // What we'll really get
+ if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec) && advertisedResources.storageType() == NodeResources.StorageType.any) {
+ // Since local disk reserves some of the storage, try to constrain to remote disk
+ advertisedResources = advertisedResources.with(NodeResources.StorageType.remote);
+ realResources = nodeRepository.resourcesCalculator().requestToReal(advertisedResources, exclusive);
+ }
if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec))
return Optional.empty();
if (anySatisfies(realResources, availableRealHostResources))
- return Optional.of(new AllocatableClusterResources(wantedResources.with(realResources),
- advertisedResources,
- wantedResources,
- clusterSpec));
+ return Optional.of(new AllocatableClusterResources(wantedResources.with(realResources),
+ advertisedResources,
+ wantedResources,
+ clusterSpec));
else
return Optional.empty();
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index fc837ee54b4..ff72d22bb39 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -32,7 +32,7 @@ public class AutoscalingTest {
fixture.loader().applyCpuLoad(0.7f, 10);
var scaledResources = fixture.tester().assertResources("Scaling up since resource usage is too high",
- 9, 1, 3.6, 8.3, 37.7,
+ 7, 1, 4.6, 11.1, 55.1,
fixture.autoscale());
fixture.deploy(Capacity.from(scaledResources));
@@ -49,7 +49,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(0.1f, 10);
fixture.tester().assertResources("Scaling cpu down since usage has gone down significantly",
- 8, 1, 1.0, 8.5, 38.5,
+ 6, 1, 1.3, 11.8, 78.6,
fixture.autoscale());
}
@@ -73,7 +73,7 @@ public class AutoscalingTest {
fixture.loader().applyLoad(new Load(0.1, 0.1, 0.1), 3);
fixture.loader().applyLoad(new Load(1.0, 1.0, 1.0), 1);
fixture.tester().assertResources("Scaling up since resource usage is too high",
- 8, 1, 5.3, 17.7, 89.4,
+ 8, 1, 5.3, 17.7, 93.6,
fixture.autoscale());
}
@@ -92,13 +92,29 @@ public class AutoscalingTest {
fixture.currentResources().advertisedResources());
}
+ @Test
+ public void initial_deployment_with_host_sharing_flag_and_too_small_min() {
+ var min = new ClusterResources(1, 1, new NodeResources(0.5, 4.0, 10, 0.1));
+ var max = new ClusterResources(1, 1, new NodeResources(2.0, 8.0, 50, 0.1));
+ var fixture = AutoscalingTester.fixture()
+ .awsSetup(false, Environment.test)
+ .clusterType(ClusterSpec.Type.container)
+ .capacity(Capacity.from(min, max))
+ .initialResources(Optional.empty())
+ .hostSharingFlag()
+ .build();
+ fixture.tester().assertResources("Initial resources at min, since flag turns on host sharing",
+ 1, 1, 0.5, 4.0, 10.0,
+ fixture.currentResources().advertisedResources());
+ }
+
/** When scaling up, disregard underutilized dimensions (memory here) */
@Test
public void test_only_autoscaling_up_quickly() {
var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
fixture.loader().applyLoad(new Load(1.0, 0.1, 1.0), 10);
fixture.tester().assertResources("Scaling up (only) since resource usage is too high",
- 8, 1, 7.1, 9.5, 89.4,
+ 7, 1, 8.2, 10.7, 99.5,
fixture.autoscale());
}
@@ -109,7 +125,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(1.0, 0.1, 1.0), 10);
fixture.tester().assertResources("Scaling cpu and disk up and memory down",
- 7, 1, 8.2, 4.0, 104.1,
+ 7, 1, 8.2, 4.0, 99.5,
fixture.autoscale());
}
@@ -119,7 +135,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(1.0, 0.1, 1.0), 10);
fixture.tester().assertResources("Scaling cpu and disk up, memory follows",
- 16, 1, 4, 8.0, 41.1,
+ 16, 1, 4, 8.0, 28.3,
fixture.autoscale());
}
@@ -130,7 +146,7 @@ public class AutoscalingTest {
fixture.loader().applyCpuLoad(0.70, 1);
fixture.loader().applyCpuLoad(0.01, 100);
fixture.tester().assertResources("Scaling up since peak resource usage is too high",
- 9, 1, 3.8, 8.3, 37.7,
+ 8, 1, 4.3, 9.5, 47.2,
fixture.autoscale());
}
@@ -141,7 +157,7 @@ public class AutoscalingTest {
fixture.loader().applyCpuLoad(0.70, 1);
fixture.loader().applyCpuLoad(0.01, 100);
fixture.tester().assertResources("Scaling up since peak resource usage is too high",
- 10, 1, 4, 8.0, 32.9,
+ 10, 1, 4, 8.0, 22.7,
fixture.autoscale());
}
@@ -180,13 +196,13 @@ public class AutoscalingTest {
fixture.loader().applyCpuLoad(0.25f, 120);
ClusterResources scaledResources = fixture.tester().assertResources("Scaling cpu up",
- 4, 1, 3.3, 13.3, 60.3,
+ 3, 1, 5, 13.3, 66.1,
fixture.autoscale());
fixture.deploy(Capacity.from(scaledResources));
fixture.deactivateRetired(Capacity.from(scaledResources));
fixture.loader().applyCpuLoad(0.1f, 120);
fixture.tester().assertResources("Scaling down since cpu usage has gone down",
- 3, 1, 2.5, 10.0, 45.3,
+ 3, 1, 2.5, 9.2, 61.1,
fixture.autoscale());
}
@@ -224,7 +240,7 @@ public class AutoscalingTest {
@Test
public void autoscaling_target_preserves_any() {
- NodeResources resources = new NodeResources(1, 10, 10, 1);
+ NodeResources resources = new NodeResources(1, 100, 100, 1);
var capacity = Capacity.from(new ClusterResources( 2, 1, resources.with(DiskSpeed.any)),
new ClusterResources( 10, 1, resources.with(DiskSpeed.any)));
var fixture = AutoscalingTester.fixture()
@@ -272,7 +288,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(0.05f, 0.05f, 0.05f), 120);
fixture.tester().assertResources("Scaling down to limit since resource usage is low",
- 4, 1, 1.8, 7.4, 10.6,
+ 4, 1, 1.8, 7.4, 23.5,
fixture.autoscale());
}
@@ -359,7 +375,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(1.0, 120);
fixture.tester().assertResources("Suggesting above capacity limit",
- 8, 1, 6.2, 7.6, 34.3,
+ 8, 1, 6.2, 7.6, 37.8,
fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min));
}
@@ -370,7 +386,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(1.0, 120);
fixture.tester().assertResources("Suggesting above capacity limit",
- 13, 1, 4, 8, 19.7,
+ 13, 1, 4, 8, 13.6,
fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min));
}
@@ -405,7 +421,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(0.9, 120);
fixture.tester().assertResources("Scaling up to 2 nodes, scaling memory and disk down at the same time",
- 10, 5, 7.7, 40.6, 40.1,
+ 10, 5, 7.7, 40.6, 47.8,
fixture.autoscale());
}
@@ -424,7 +440,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(timePassed.negated());
fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 20.0 : 10.0, t -> 1.0);
fixture.tester().assertResources("Scaling up cpu, others down, changing to 1 group is cheaper",
- 8, 1, 2.8, 36.2, 36,
+ 8, 1, 2.8, 36.2, 56.4,
fixture.autoscale());
}
@@ -444,7 +460,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(timePassed.negated());
fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 20.0 : 10.0, t -> 100.0);
fixture.tester().assertResources("Scaling down since resource usage is too high, changing to 1 group is cheaper",
- 6, 1, 1.0, 50.7, 50.4,
+ 6, 1, 1.0, 50.7, 79.0,
fixture.autoscale());
}
@@ -461,7 +477,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(1));
fixture.loader().applyMemLoad(1.0, 1000);
fixture.tester().assertResources("Increase group size to reduce memory load",
- 8, 2, 4.5, 97.1, 62.7,
+ 8, 2, 4.5, 97.1, 74.7,
fixture.autoscale());
}
@@ -478,7 +494,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(0.16, 0.02, 0.5), 120);
fixture.tester().assertResources("Scaling down memory",
- 6, 1, 3.0, 4.2, 100.8,
+ 6, 1, 3.0, 4.2, 139.9,
fixture.autoscale());
}
@@ -490,7 +506,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(0.02, 120);
fixture.tester().assertResources("Scaling down since enough time has passed",
- 4, 1, 1.0, 17.2, 80.4,
+ 3, 1, 1.0, 25.8, 147.4,
fixture.autoscale());
}
@@ -507,20 +523,20 @@ public class AutoscalingTest {
fixture.loader().applyCpuLoad(0.25, 120);
// (no read share stored)
fixture.tester().assertResources("Advice to scale up since we set aside for bcp by default",
- 5, 1, 3, 100, 100,
+ 6, 1, 3, 100, 100,
fixture.autoscale());
fixture.loader().applyCpuLoad(0.25, 120);
fixture.storeReadShare(0.25, 0.5);
fixture.tester().assertResources("Half of global share is the same as the default assumption used above",
- 5, 1, 3, 100, 100,
+ 6, 1, 3, 100, 100,
fixture.autoscale());
fixture.tester.clock().advance(Duration.ofDays(1));
fixture.loader().applyCpuLoad(0.25, 120);
fixture.storeReadShare(0.5, 0.5);
fixture.tester().assertResources("Advice to scale down since we don't need room for bcp",
- 4, 1, 3, 100, 100,
+ 5, 1, 3, 100, 100,
fixture.autoscale());
}
@@ -534,7 +550,7 @@ public class AutoscalingTest {
fixture.loader().addCpuMeasurements(0.25, 200);
fixture.tester().assertResources("Scale up since we assume we need 2x cpu for growth when no data scaling time data",
- 7, 1, 1.8, 8.9, 40.4,
+ 6, 1, 2.1, 10.6, 66.5,
fixture.autoscale());
fixture.setScalingDuration(Duration.ofMinutes(5));
@@ -543,7 +559,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.25, 200);
fixture.tester().assertResources("Scale down since observed growth is slower than scaling time",
- 7, 1, 1.5, 8.9, 40.4,
+ 5, 1, 2.2, 13.3, 83.2,
fixture.autoscale());
fixture.setScalingDuration(Duration.ofMinutes(60));
@@ -554,7 +570,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.25, 200);
fixture.tester().assertResources("Scale up since observed growth is faster than scaling time",
- 7, 1, 1.8, 8.9, 40.4,
+ 6, 1, 2.1, 10.6, 66.5,
fixture.autoscale());
}
@@ -572,7 +588,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester.assertResources("Query and write load is equal -> scale up somewhat",
- 7, 1, 2, 8.9, 40.2,
+ 7, 1, 2, 8.9, 55.5,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
@@ -581,7 +597,7 @@ public class AutoscalingTest {
fixture.loader().addCpuMeasurements(0.4, 200);
// TODO: Ackhually, we scale down here - why?
fixture.tester().assertResources("Query load is 4x write load -> scale up more",
- 7, 1, 1.8, 8.9, 40.4,
+ 6, 1, 2.1, 10.6, 66.5,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
@@ -589,7 +605,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester().assertResources("Write load is 10x query load -> scale down",
- 6, 1, 1.1, 10.6, 48.5,
+ 5, 1, 1.4, 13.3, 83.2,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
@@ -597,7 +613,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester().assertResources("Query only -> largest possible",
- 7, 1, 3.5, 8.9, 40.2,
+ 7, 1, 3.5, 8.9, 55.5,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
@@ -605,7 +621,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester().assertResources("Write only -> smallest possible",
- 4, 1, 1.1, 17.2, 80.4,
+ 4, 1, 1.1, 17.2, 110.9,
fixture.autoscale());
}
@@ -666,24 +682,27 @@ public class AutoscalingTest {
@Test
public void test_changing_exclusivity() {
+ var min = new ClusterResources( 2, 1, new NodeResources( 1, 4, 100, 1));
+ var max = new ClusterResources(20, 1, new NodeResources(100, 1000, 1000, 1));
var fixture = AutoscalingTester.fixture()
.awsProdSetup(true)
.cluster(clusterSpec(true))
+ .capacity(Capacity.from(min, max))
.initialResources(Optional.empty())
.build();
fixture.tester().assertResources("Initial deployment at minimum",
- 2, 1, 2, 4, 10,
+ 2, 1, 2, 4, 100,
fixture.currentResources().advertisedResources());
fixture.tester().deploy(fixture.applicationId(), clusterSpec(false), fixture.capacity());
fixture.tester().assertResources("With non-exclusive nodes, a better solution is " +
"50% more nodes with half the cpu",
- 3, 1, 1, 4, 10.2,
+ 3, 1, 1, 4, 145.6,
fixture.autoscale());
fixture.tester().deploy(fixture.applicationId(), clusterSpec(true), fixture.capacity());
fixture.tester().assertResources("Reverts to the initial resources",
- 2, 1, 2, 4, 10,
+ 2, 1, 2, 4, 100,
fixture.currentResources().advertisedResources());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
index ff04083ebde..bba06fb9080 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
@@ -189,14 +189,18 @@ public class Fixture {
}
public Fixture.Builder awsProdSetup(boolean allowHostSharing) {
- return this.awsHostFlavors()
- .awsResourceCalculator()
- .zone(new Zone(Cloud.builder().dynamicProvisioning(true)
- .allowHostSharing(allowHostSharing)
- .build(),
- SystemName.Public,
- Environment.prod,
- RegionName.from("aws-eu-west-1a")));
+ return awsSetup(allowHostSharing, Environment.prod);
+ }
+
+ public Fixture.Builder awsSetup(boolean allowHostSharing, Environment environment) {
+ return this.awsHostFlavors()
+ .awsResourceCalculator()
+ .zone(new Zone(Cloud.builder().dynamicProvisioning(true)
+ .allowHostSharing(allowHostSharing)
+ .build(),
+ SystemName.Public,
+ environment,
+ RegionName.from("aws-eu-west-1a")));
}
public Fixture.Builder vespaVersion(Version version) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java
index d148f6d3cc7..2ae1fe18714 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsHostResourcesCalculatorImpl.java
@@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.provision.autoscale.awsnodes;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.config.provision.Zone;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.Nodelike;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java
index 63f6d50ab2e..96fa143dc57 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/AwsResourcesCalculator.java
@@ -2,6 +2,8 @@
package com.yahoo.vespa.hosted.provision.autoscale.awsnodes;
import com.yahoo.config.provision.NodeResources;
+import com.yahoo.config.provision.NodeType;
+import com.yahoo.config.provision.Zone;
/**
* Calculations and logic on node resources common to provision-service and host-admin (at least).
@@ -10,8 +12,12 @@ import com.yahoo.config.provision.NodeResources;
*/
public class AwsResourcesCalculator {
+ private final ReservedSpacePolicyImpl reservedSpacePolicy;
private final double hostMemory = 0.6;
- private final double hostDiskOverhead = 1;
+
+ public AwsResourcesCalculator() {
+ this.reservedSpacePolicy = new ReservedSpacePolicyImpl();
+ }
/** The real resources of a parent host node in the node repository, given the real resources of the flavor. */
public NodeResources realResourcesOfParentHost(NodeResources realResourcesOfFlavor) {
@@ -52,6 +58,7 @@ public class AwsResourcesCalculator {
*/
public double diskOverhead(VespaFlavor flavor, NodeResources resources, boolean real, boolean exclusive) {
if ( flavor.realResources().storageType() != NodeResources.StorageType.local) return 0;
+ double hostDiskOverhead = reservedSpacePolicy.getPartitionSizeInBase2Gb(NodeType.host, ! exclusive);
double diskShare = resources.diskGb() /
( flavor.advertisedResources().diskGb() - ( real ? hostDiskOverhead : 0) );
return hostDiskOverhead * diskShare;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java
new file mode 100644
index 00000000000..000d08b59f8
--- /dev/null
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/ReservedSpacePolicyImpl.java
@@ -0,0 +1,50 @@
+package com.yahoo.vespa.hosted.provision.autoscale.awsnodes;
+
+import com.yahoo.config.provision.NodeType;
+
+/**
+ * Matches the internal repo implementation
+ *
+ * @author hakonhall
+ * @author musum
+ */
+public class ReservedSpacePolicyImpl {
+
+ public long getPartitionSizeInBase2Gb(NodeType nodeType, boolean sharedHost) {
+ return new PartitionSizer(nodeType, sharedHost).getPartitionSize();
+ }
+
+ private static class PartitionSizer {
+
+ private static final long imageCountForSharedHost = 6;
+ private static final long imageCountForNonSharedHost = 3;
+
+ // Add a buffer to allow a small increase in image size
+ private static final long bufferSharedHost = 5;
+ private static final long bufferNonSharedHost = 3;
+
+ private final boolean sharedHost;
+
+ PartitionSizer(NodeType nodeType, boolean sharedHost) {
+ this.sharedHost = sharedHost;
+ }
+
+ long getPartitionSize() {
+ return imageSize() * imageCount() + buffer();
+ }
+
+ private long imageSize() {
+ return (long)7.7; // return (long)VespaContainerImage.maxImageSize(hostedSystem, nodeType);
+ }
+
+ private long buffer() {
+ return sharedHost ? bufferSharedHost : bufferNonSharedHost;
+ }
+
+ private long imageCount() {
+ return sharedHost ? imageCountForSharedHost : imageCountForNonSharedHost;
+ }
+
+ }
+
+}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java
index cd5f18db516..c42b61988e9 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/awsnodes/VespaFlavor.java
@@ -34,4 +34,7 @@ public class VespaFlavor {
public NodeResources advertisedResources() { return advertisedResources; }
+ @Override
+ public String toString() { return "flavor " + name; }
+
}
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index a21f623fbfa..50cbe8879b5 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -13,11 +13,13 @@ vespa_define_module(
lz4
xxhash
zstd
+ ${VESPA_URING_LIB}
APPS
src/apps/make_fixture_macros
src/apps/vespa-detect-hostname
src/apps/vespa-drop-file-from-cache
+ src/apps/vespa-probe-io-uring
src/apps/vespa-resource-limits
src/apps/vespa-tsan-digest
src/apps/vespa-validate-hostname
diff --git a/vespalib/src/apps/vespa-probe-io-uring/.gitignore b/vespalib/src/apps/vespa-probe-io-uring/.gitignore
new file mode 100644
index 00000000000..9fa20bbf9d4
--- /dev/null
+++ b/vespalib/src/apps/vespa-probe-io-uring/.gitignore
@@ -0,0 +1 @@
+/vespa-probe-io-uring
diff --git a/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt b/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt
new file mode 100644
index 00000000000..24ceda46a87
--- /dev/null
+++ b/vespalib/src/apps/vespa-probe-io-uring/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_vespa-probe-io-uring_app
+ SOURCES
+ probe_io_uring.cpp
+ OUTPUT_NAME vespa-probe-io-uring
+ INSTALL bin
+ DEPENDS
+ vespalib
+)
diff --git a/vespalib/src/apps/vespa-probe-io-uring/probe_io_uring.cpp b/vespalib/src/apps/vespa-probe-io-uring/probe_io_uring.cpp
new file mode 100644
index 00000000000..ef3e6ddbef2
--- /dev/null
+++ b/vespalib/src/apps/vespa-probe-io-uring/probe_io_uring.cpp
@@ -0,0 +1,95 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/config.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef VESPA_HAS_IO_URING
+
+#include <sys/utsname.h>
+#include <liburing.h>
+#include <liburing/io_uring.h>
+
+#include <string>
+#include <vector>
+
+std::vector<std::string> op_names = {
+ "IORING_OP_NOP",
+ "IORING_OP_READV",
+ "IORING_OP_WRITEV",
+ "IORING_OP_FSYNC",
+ "IORING_OP_READ_FIXED",
+ "IORING_OP_WRITE_FIXED",
+ "IORING_OP_POLL_ADD",
+ "IORING_OP_POLL_REMOVE",
+ "IORING_OP_SYNC_FILE_RANGE",
+ "IORING_OP_SENDMSG",
+ "IORING_OP_RECVMSG",
+ "IORING_OP_TIMEOUT",
+ "IORING_OP_TIMEOUT_REMOVE",
+ "IORING_OP_ACCEPT",
+ "IORING_OP_ASYNC_CANCEL",
+ "IORING_OP_LINK_TIMEOUT",
+ "IORING_OP_CONNECT",
+ "IORING_OP_FALLOCATE",
+ "IORING_OP_OPENAT",
+ "IORING_OP_CLOSE",
+ "IORING_OP_FILES_UPDATE",
+ "IORING_OP_STATX",
+ "IORING_OP_READ",
+ "IORING_OP_WRITE",
+ "IORING_OP_FADVISE",
+ "IORING_OP_MADVISE",
+ "IORING_OP_SEND",
+ "IORING_OP_RECV",
+ "IORING_OP_OPENAT2",
+ "IORING_OP_EPOLL_CTL",
+ "IORING_OP_SPLICE",
+ "IORING_OP_PROVIDE_BUFFERS",
+ "IORING_OP_REMOVE_BUFFERS",
+ "IORING_OP_TEE",
+ "IORING_OP_SHUTDOWN",
+ "IORING_OP_RENAMEAT",
+ "IORING_OP_UNLINKAT",
+ "IORING_OP_MKDIRAT",
+ "IORING_OP_SYMLINKAT",
+ "IORING_OP_LINKAT",
+ "IORING_OP_MSG_RING",
+ "IORING_OP_FSETXATTR",
+ "IORING_OP_SETXATTR",
+ "IORING_OP_FGETXATTR",
+ "IORING_OP_GETXATTR",
+ "IORING_OP_SOCKET",
+ "IORING_OP_URING_CMD",
+ "IORING_OP_SEND_ZC",
+ "IORING_OP_SENDMSG_ZC"
+};
+
+int main() {
+ fprintf(stderr, "Vespa was compiled with io_uring\n");
+ utsname host_info;
+ uname(&host_info);
+ fprintf(stderr, "kernel version: %s\n", host_info.release);
+ io_uring_probe *probe = io_uring_get_probe();
+ if (probe == nullptr) {
+ fprintf(stderr, "io_uring probe failed!\n");
+ return 1;
+ }
+ fprintf(stderr, "operation support: {\n");
+ for (size_t i = 0; i < op_names.size(); ++i) {
+ fprintf(stderr, " %s: %s\n", op_names[i].c_str(),
+ io_uring_opcode_supported(probe, i) ? "yes" : "no");
+ }
+ fprintf(stderr, "}\n");
+ free(probe);
+ return 0;
+}
+
+#else // VESPA_HAS_IO_URING
+
+int main() {
+ fprintf(stderr, "Vespa was compiled without io_uring\n");
+ return 1;
+}
+
+#endif // VESPA_HAS_IO_URING
diff --git a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
index 0f090f83c0b..3e3ae08ff64 100644
--- a/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
+++ b/zookeeper-server/zookeeper-server-common/src/main/java/com/yahoo/vespa/zookeeper/Reconfigurer.java
@@ -29,7 +29,7 @@ public class Reconfigurer extends AbstractComponent {
private static final Logger log = java.util.logging.Logger.getLogger(Reconfigurer.class.getName());
- private static final Duration TIMEOUT = Duration.ofMinutes(3);
+ private static final Duration TIMEOUT = Duration.ofMinutes(15);
private final ExponentialBackoff backoff = new ExponentialBackoff(Duration.ofSeconds(1), Duration.ofSeconds(10));
private final VespaZooKeeperAdmin vespaZooKeeperAdmin;
@@ -97,7 +97,7 @@ public class Reconfigurer extends AbstractComponent {
Duration reconfigTimeout = reconfigTimeout();
Instant end = now.plus(reconfigTimeout);
// Loop reconfiguring since we might need to wait until another reconfiguration is finished before we can succeed
- for (int attempt = 1; now.isBefore(end); attempt++) {
+ for (int attempt = 1; ; attempt++) {
try {
Instant reconfigStarted = Instant.now();
vespaZooKeeperAdmin.reconfigure(connectionSpec, newServers);
@@ -110,17 +110,19 @@ public class Reconfigurer extends AbstractComponent {
return;
} catch (ReconfigException e) {
Duration delay = backoff.delay(attempt);
- log.log(Level.WARNING, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay +
- ", time left " + Duration.between(now, end) + ": " +
- Exceptions.toMessageString(e));
- sleeper.sleep(delay);
- } finally {
now = Instant.now();
+ if (now.isBefore(end)) {
+ log.log(Level.WARNING, "Reconfiguration attempt " + attempt + " failed. Retrying in " + delay +
+ ", time left " + Duration.between(now, end) + ": " + Exceptions.toMessageString(e));
+ sleeper.sleep(delay);
+ }
+ else {
+ log.log(Level.SEVERE, "Reconfiguration attempt " + attempt + " failed, and was failing for " +
+ reconfigTimeout + "; giving up now: " + Exceptions.toMessageString(e));
+ shutdownAndDie(reconfigTimeout);
+ }
}
}
-
- // Reconfiguration failed
- shutdownAndDie(reconfigTimeout);
}
private void shutdownAndDie(Duration reconfigTimeout) {