summaryrefslogtreecommitdiffstats
path: root/node-repository/src
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java22
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java33
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java29
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java9
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java124
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java19
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java53
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java3
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java61
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java3
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java15
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java41
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java17
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java33
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java4
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json2
20 files changed, 323 insertions, 159 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index 90133f7499e..98fcd3f56fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -3,9 +3,8 @@ package com.yahoo.vespa.hosted.provision.applications;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
-import com.yahoo.config.provision.NodeResources;
-import com.yahoo.vespa.hosted.provision.lb.LoadBalancer;
+import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@@ -19,11 +18,15 @@ import java.util.Optional;
*/
public class Cluster {
+ public static final int maxScalingEvents = 15;
+
private final ClusterSpec.Id id;
private final boolean exclusive;
private final ClusterResources min, max;
private final Optional<ClusterResources> suggested;
private final Optional<ClusterResources> target;
+
+ /** The maxScalingEvents last scaling events of this, sorted by increasing time (newest last) */
private final List<ScalingEvent> scalingEvents;
private final String autoscalingStatus;
@@ -45,8 +48,10 @@ public class Cluster {
this.target = Optional.empty();
else
this.target = targetResources;
- this.scalingEvents = scalingEvents;
+ this.scalingEvents = List.copyOf(scalingEvents);
this.autoscalingStatus = autoscalingStatus;
+ if (autoscalingStatus.isEmpty() && targetResources.isPresent())
+ throw new RuntimeException("Autoscaling status set empty for " + id + " even though target is " + targetResources);
}
public ClusterSpec.Id id() { return id; }
@@ -97,8 +102,10 @@ public class Cluster {
}
public Cluster with(ScalingEvent scalingEvent) {
- // NOTE: We're just storing the latest scaling event so far
- return new Cluster(id, exclusive, min, max, suggested, target, List.of(scalingEvent), autoscalingStatus);
+ List<ScalingEvent> scalingEvents = new ArrayList<>(this.scalingEvents);
+ scalingEvents.add(scalingEvent);
+ prune(scalingEvents);
+ return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents, autoscalingStatus);
}
public Cluster withAutoscalingStatus(String autoscalingStatus) {
@@ -120,4 +127,9 @@ public class Cluster {
return "cluster '" + id + "'";
}
+ private void prune(List<ScalingEvent> scalingEvents) {
+ while (scalingEvents.size() > maxScalingEvents)
+ scalingEvents.remove(0);
+ }
+
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
index b1213b2da41..9eb4b796970 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
@@ -26,20 +26,19 @@ public class AllocatableClusterResources {
private final NodeResources realResources;
private final NodeResources advertisedResources;
- private final ClusterSpec.Type clusterType;
+ private final ClusterSpec clusterSpec;
private final double fulfilment;
/** Fake allocatable resources from requested capacity */
public AllocatableClusterResources(ClusterResources requested,
- ClusterSpec.Type clusterType,
- boolean exclusive,
+ ClusterSpec clusterSpec,
NodeRepository nodeRepository) {
this.nodes = requested.nodes();
this.groups = requested.groups();
- this.realResources = nodeRepository.resourcesCalculator().requestToReal(requested.nodeResources(), exclusive);
+ this.realResources = nodeRepository.resourcesCalculator().requestToReal(requested.nodeResources(), clusterSpec.isExclusive());
this.advertisedResources = requested.nodeResources();
- this.clusterType = clusterType;
+ this.clusterSpec = clusterSpec;
this.fulfilment = 1;
}
@@ -48,19 +47,19 @@ public class AllocatableClusterResources {
this.groups = (int)nodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count();
this.realResources = averageRealResourcesOf(nodes, nodeRepository, exclusive); // Average since we average metrics over nodes
this.advertisedResources = nodes.get(0).resources();
- this.clusterType = nodes.get(0).allocation().get().membership().cluster().type();
+ this.clusterSpec = nodes.get(0).allocation().get().membership().cluster();
this.fulfilment = 1;
}
public AllocatableClusterResources(ClusterResources realResources,
NodeResources advertisedResources,
NodeResources idealResources,
- ClusterSpec.Type clusterType) {
+ ClusterSpec clusterSpec) {
this.nodes = realResources.nodes();
this.groups = realResources.groups();
this.realResources = realResources.nodeResources();
this.advertisedResources = advertisedResources;
- this.clusterType = clusterType;
+ this.clusterSpec = clusterSpec;
this.fulfilment = fulfilment(realResources.nodeResources(), idealResources);
}
@@ -88,7 +87,7 @@ public class AllocatableClusterResources {
return (int)Math.ceil((double)nodes / groups);
}
- public ClusterSpec.Type clusterType() { return clusterType; }
+ public ClusterSpec clusterSpec() { return clusterSpec; }
public double cost() { return nodes * advertisedResources.cost(); }
@@ -133,23 +132,23 @@ public class AllocatableClusterResources {
}
public static Optional<AllocatableClusterResources> from(ClusterResources wantedResources,
- boolean exclusive,
- ClusterSpec.Type clusterType,
+ ClusterSpec clusterSpec,
Limits applicationLimits,
NodeRepository nodeRepository) {
var systemLimits = new NodeResourceLimits(nodeRepository);
- if ( !exclusive && !nodeRepository.zone().getCloud().dynamicProvisioning()) {
+ boolean exclusive = clusterSpec.isExclusive();
+ if ( !clusterSpec.isExclusive() && !nodeRepository.zone().getCloud().dynamicProvisioning()) {
// We decide resources: Add overhead to what we'll request (advertised) to make sure real becomes (at least) cappedNodeResources
NodeResources advertisedResources = nodeRepository.resourcesCalculator().realToRequest(wantedResources.nodeResources(), exclusive);
- advertisedResources = systemLimits.enlargeToLegal(advertisedResources, clusterType, exclusive); // Attempt to ask for something legal
+ advertisedResources = systemLimits.enlargeToLegal(advertisedResources, clusterSpec.type(), exclusive); // Attempt to ask for something legal
advertisedResources = applicationLimits.cap(advertisedResources); // Overrides other conditions, even if it will then fail
NodeResources realResources = nodeRepository.resourcesCalculator().requestToReal(advertisedResources, exclusive); // ... thus, what we really get may change
- if ( ! systemLimits.isWithinRealLimits(realResources, clusterType)) return Optional.empty();
+ if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec.type())) return Optional.empty();
if (matchesAny(nodeRepository.flavors().getFlavors(), advertisedResources))
return Optional.of(new AllocatableClusterResources(wantedResources.with(realResources),
advertisedResources,
wantedResources.nodeResources(),
- clusterType));
+ clusterSpec));
else
return Optional.empty();
}
@@ -172,11 +171,11 @@ public class AllocatableClusterResources {
}
if ( ! between(applicationLimits.min().nodeResources(), applicationLimits.max().nodeResources(), advertisedResources)) continue;
- if ( ! systemLimits.isWithinRealLimits(realResources, clusterType)) continue;
+ if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec.type())) continue;
var candidate = new AllocatableClusterResources(wantedResources.with(realResources),
advertisedResources,
wantedResources.nodeResources(),
- clusterType);
+ clusterSpec);
if (best.isEmpty() || candidate.preferableTo(best.get()))
best = Optional.of(candidate);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
index e57011b0e4a..fb97e803a35 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
@@ -58,7 +58,7 @@ public class AllocationOptimizer {
groups,
nodeResourcesWith(nodesAdjustedForRedundancy, groupsAdjustedForRedundancy, limits, current, target));
- var allocatableResources = AllocatableClusterResources.from(next, exclusive, current.clusterType(), limits, nodeRepository);
+ var allocatableResources = AllocatableClusterResources.from(next, current.clusterSpec(), limits, nodeRepository);
if (allocatableResources.isEmpty()) continue;
if (bestAllocation.isEmpty() || allocatableResources.get().preferableTo(bestAllocation.get()))
bestAllocation = allocatableResources;
@@ -79,7 +79,7 @@ public class AllocationOptimizer {
int groupSize = nodes / groups;
- if (current.clusterType().isContent()) { // load scales with node share of content
+ if (current.clusterSpec().isStateful()) { // load scales with node share of content
// The fixed cost portion of cpu does not scale with changes to the node count
// TODO: Only for the portion of cpu consumed by queries
double cpuPerGroup = fixedCpuCostFraction * target.nodeCpu() +
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 023eb5860ee..66c6d68931c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -59,7 +59,7 @@ public class Autoscaler {
}
private Advice autoscale(Cluster cluster, List<Node> clusterNodes, Limits limits, boolean exclusive) {
- ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type();
+ ClusterSpec clusterSpec = clusterNodes.get(0).allocation().get().membership().cluster();
if ( ! stable(clusterNodes, nodeRepository))
return Advice.none("Cluster change in progress");
@@ -69,13 +69,13 @@ public class Autoscaler {
ClusterTimeseries clusterTimeseries = new ClusterTimeseries(cluster, clusterNodes, metricsDb, nodeRepository);
int measurementsPerNode = clusterTimeseries.measurementsPerNode();
- if (measurementsPerNode < minimumMeasurementsPerNode(clusterType))
+ if (measurementsPerNode < minimumMeasurementsPerNode(clusterSpec))
return Advice.none("Collecting more data before making new scaling decisions" +
": Has " + measurementsPerNode + " data points per node" +
- "(all: " + clusterTimeseries.measurementCount +
+ " (all: " + clusterTimeseries.measurementCount +
", without stale: " + clusterTimeseries.measurementCountWithoutStale +
", without out of service: " + clusterTimeseries.measurementCountWithoutStaleOutOfService +
- ", without unstable: " + clusterTimeseries.measurementCountWithoutStaleOutOfServiceUnstable);
+ ", without unstable: " + clusterTimeseries.measurementCountWithoutStaleOutOfServiceUnstable + ")");
int nodesMeasured = clusterTimeseries.nodesMeasured();
if (nodesMeasured != clusterNodes.size())
@@ -124,14 +124,14 @@ public class Autoscaler {
}
private boolean recentlyScaled(Cluster cluster, List<Node> clusterNodes) {
- Duration downscalingDelay = downscalingDelay(clusterNodes.get(0).allocation().get().membership().cluster().type());
+ Duration downscalingDelay = downscalingDelay(clusterNodes.get(0).allocation().get().membership().cluster());
return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN)
.isAfter(nodeRepository.clock().instant().minus(downscalingDelay));
}
/** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */
- static Duration scalingWindow(ClusterSpec.Type clusterType) {
- if (clusterType.isContent()) return Duration.ofHours(12);
+ static Duration scalingWindow(ClusterSpec cluster) {
+ if (cluster.isStateful()) return Duration.ofHours(12);
return Duration.ofMinutes(30);
}
@@ -140,8 +140,8 @@ public class Autoscaler {
}
/** Measurements are currently taken once a minute. See also scalingWindow */
- static int minimumMeasurementsPerNode(ClusterSpec.Type clusterType) {
- if (clusterType.isContent()) return 60;
+ static int minimumMeasurementsPerNode(ClusterSpec cluster) {
+ if (cluster.isStateful()) return 60;
return 7;
}
@@ -149,8 +149,8 @@ public class Autoscaler {
* We should wait a while before scaling down after a scaling event as a peak in usage
* indicates more peaks may arrive in the near future.
*/
- static Duration downscalingDelay(ClusterSpec.Type clusterType) {
- if (clusterType.isContent()) return Duration.ofHours(12);
+ static Duration downscalingDelay(ClusterSpec cluster) {
+ if (cluster.isStateful()) return Duration.ofHours(12);
return Duration.ofHours(1);
}
@@ -200,6 +200,13 @@ public class Autoscaler {
private static Advice scaleTo(ClusterResources target) {
return new Advice(Optional.of(target), true, "Scaling due to load changes");
}
+
+ @Override
+ public String toString() {
+ return "autoscaling advice: " +
+ (present ? (target.isPresent() ? "Scale to " + target.get() : "Don't scale") : " None");
+ }
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
index 1983162f121..769174a188e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
@@ -7,7 +7,6 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import java.time.Instant;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -33,8 +32,8 @@ public class ClusterTimeseries {
public ClusterTimeseries(Cluster cluster, List<Node> clusterNodes, MetricsDb db, NodeRepository nodeRepository) {
this.clusterNodes = clusterNodes;
- ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type();
- var timeseries = db.getNodeTimeseries(nodeRepository.clock().instant().minus(Autoscaler.scalingWindow(clusterType)),
+ ClusterSpec clusterSpec = clusterNodes.get(0).allocation().get().membership().cluster();
+ var timeseries = db.getNodeTimeseries(nodeRepository.clock().instant().minus(Autoscaler.scalingWindow(clusterSpec)),
clusterNodes.stream().map(Node::hostname).collect(Collectors.toSet()));
Map<String, Instant> startTimePerNode = metricStartTimes(cluster, clusterNodes, timeseries, nodeRepository);
@@ -61,8 +60,8 @@ public class ClusterTimeseries {
List<NodeTimeseries> nodeTimeseries,
NodeRepository nodeRepository) {
Map<String, Instant> startTimePerHost = new HashMap<>();
- if ( ! cluster.scalingEvents().isEmpty()) {
- var deployment = cluster.scalingEvents().get(cluster.scalingEvents().size() - 1);
+ if (cluster.lastScalingEvent().isPresent()) {
+ var deployment = cluster.lastScalingEvent().get();
for (Node node : clusterNodes) {
startTimePerHost.put(node.hostname(), nodeRepository.clock().instant()); // Discard all unless we can prove otherwise
var nodeGenerationMeasurements =
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
index b3632a4e82a..8e90294b4a5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
@@ -4,10 +4,12 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.google.inject.Inject;
import com.yahoo.collections.ListMap;
import com.yahoo.collections.Pair;
+import com.yahoo.component.AbstractComponent;
import com.yahoo.io.IOUtils;
import com.yahoo.vespa.defaults.Defaults;
import io.questdb.cairo.CairoConfiguration;
import io.questdb.cairo.CairoEngine;
+import io.questdb.cairo.CairoException;
import io.questdb.cairo.DefaultCairoConfiguration;
import io.questdb.cairo.TableWriter;
import io.questdb.cairo.sql.Record;
@@ -40,14 +42,14 @@ import java.util.stream.Collectors;
*
* @author bratseth
*/
-public class QuestMetricsDb implements MetricsDb {
+public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
private static final Logger log = Logger.getLogger(QuestMetricsDb.class.getName());
- private static final String tableName = "metrics";
+ private static final String table = "metrics";
private final Clock clock;
private final String dataDir;
- private final CairoEngine engine;
+ private CairoEngine engine;
private long highestTimestampAdded = 0;
@@ -63,8 +65,11 @@ public class QuestMetricsDb implements MetricsDb {
&& ! new File(Defaults.getDefaults().vespaHome()).exists())
dataDir = "data"; // We're injected, but not on a node with Vespa installed
this.dataDir = dataDir;
+ initializeDb();
+ }
- IOUtils.createDirectory(dataDir + "/" + tableName);
+ private void initializeDb() {
+ IOUtils.createDirectory(dataDir + "/" + table);
// silence Questdb's custom logging system
IOUtils.writeFile(new File(dataDir, "quest-log.conf"), new byte[0]);
@@ -73,30 +78,43 @@ public class QuestMetricsDb implements MetricsDb {
CairoConfiguration configuration = new DefaultCairoConfiguration(dataDir);
engine = new CairoEngine(configuration);
- ensureExists(tableName);
+ ensureExists(table);
}
@Override
public void add(Collection<Pair<String, MetricSnapshot>> snapshots) {
- try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), tableName)) {
- for (var snapshot : snapshots) {
- long atMillis = adjustIfRecent(snapshot.getSecond().at().toEpochMilli(), highestTimestampAdded);
- if (atMillis < highestTimestampAdded) continue; // Ignore old data
- highestTimestampAdded = atMillis;
- TableWriter.Row row = writer.newRow(atMillis * 1000); // in microseconds
- row.putStr(0, snapshot.getFirst());
- row.putFloat(2, (float)snapshot.getSecond().cpu());
- row.putFloat(3, (float)snapshot.getSecond().memory());
- row.putFloat(4, (float)snapshot.getSecond().disk());
- row.putLong(5, snapshot.getSecond().generation());
- row.putBool(6, snapshot.getSecond().inService());
- row.putBool(7, snapshot.getSecond().stable());
- row.append();
+ try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), table)) {
+ add(snapshots, writer);
+ }
+ catch (CairoException e) {
+ if (e.getMessage().contains("Cannot read offset")) {
+ // This error seems non-recoverable
+ repair(e);
+ try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), table)) {
+ add(snapshots, writer);
+ }
}
- writer.commit();
}
}
+ private void add(Collection<Pair<String, MetricSnapshot>> snapshots, TableWriter writer) {
+ for (var snapshot : snapshots) {
+ long atMillis = adjustIfRecent(snapshot.getSecond().at().toEpochMilli(), highestTimestampAdded);
+ if (atMillis < highestTimestampAdded) continue; // Ignore old data
+ highestTimestampAdded = atMillis;
+ TableWriter.Row row = writer.newRow(atMillis * 1000); // in microseconds
+ row.putStr(0, snapshot.getFirst());
+ row.putFloat(2, (float)snapshot.getSecond().cpu());
+ row.putFloat(3, (float)snapshot.getSecond().memory());
+ row.putFloat(4, (float)snapshot.getSecond().disk());
+ row.putLong(5, snapshot.getSecond().generation());
+ row.putBool(6, snapshot.getSecond().inService());
+ row.putBool(7, snapshot.getSecond().stable());
+ row.append();
+ }
+ writer.commit();
+ }
+
@Override
public List<NodeTimeseries> getNodeTimeseries(Instant startTime, Set<String> hostnames) {
try (SqlCompiler compiler = new SqlCompiler(engine)) {
@@ -118,7 +136,7 @@ public class QuestMetricsDb implements MetricsDb {
SqlExecutionContext context = newContext();
int partitions = 0;
try (SqlCompiler compiler = new SqlCompiler(engine)) {
- File tableRoot = new File(dataDir, tableName);
+ File tableRoot = new File(dataDir, table);
List<String> removeList = new ArrayList<>();
for (String dirEntry : tableRoot.list()) {
File partitionDir = new File(tableRoot, dirEntry);
@@ -133,7 +151,7 @@ public class QuestMetricsDb implements MetricsDb {
}
// Remove unless all partitions are old: Removing all partitions "will be supported in the future"
if ( removeList.size() < partitions && ! removeList.isEmpty())
- compiler.compile("alter table " + tableName + " drop partition " +
+ compiler.compile("alter table " + table + " drop partition " +
removeList.stream().map(dir -> "'" + dir + "'").collect(Collectors.joining(",")),
context);
}
@@ -143,28 +161,78 @@ public class QuestMetricsDb implements MetricsDb {
}
@Override
+ public void deconstruct() { close(); }
+
+ @Override
public void close() {
if (engine != null)
engine.close();
}
- private void ensureExists(String tableName) {
+ /**
+ * Repairs this db on corruption.
+ *
+ * @param e the exception indicating corruption
+ */
+ private void repair(Exception e) {
+ log.log(Level.WARNING, "QuestDb seems corrupted, wiping data and starting over", e);
+ IOUtils.recursiveDeleteDir(new File(dataDir));
+ initializeDb();
+ }
+
+ private void ensureExists(String table) {
SqlExecutionContext context = newContext();
- if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), tableName)) return;
+ if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { // table exists
+ ensureUpdated(table, context);
+ } else {
+ create(table, context);
+ }
+ }
+ private void ensureUpdated(String table, SqlExecutionContext context) {
try (SqlCompiler compiler = new SqlCompiler(engine)) {
- compiler.compile("create table " + tableName +
+ if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) {
+ ensureColumnExists("inService", "boolean", table, compiler, context); // TODO: Remove after December 2020
+ ensureColumnExists("stable", "boolean", table, compiler, context); // TODO: Remove after December 2020
+ }
+ } catch (SqlException e) {
+ repair(e);
+ }
+ }
+
+ private void create(String table, SqlExecutionContext context) {
+ try (SqlCompiler compiler = new SqlCompiler(engine)) {
+ compiler.compile("create table " + table +
" (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," +
" application_generation long, inService boolean, stable boolean)" +
" timestamp(at)" +
"PARTITION BY DAY;",
context);
- // We should do this if we get a version where selecting on stringhs work embedded, see below
+ // We should do this if we get a version where selecting on strings work embedded, see below
// compiler.compile("alter table " + tableName + " alter column hostname add index", context);
}
catch (SqlException e) {
- throw new IllegalStateException("Could not create Quest db table '" + tableName + "'", e);
+ throw new IllegalStateException("Could not create Quest db table '" + table + "'", e);
+ }
+ }
+
+ private void ensureColumnExists(String column, String columnType,
+ String table, SqlCompiler compiler, SqlExecutionContext context) throws SqlException {
+ if (columnNamesOf(table, compiler, context).contains(column)) return;
+ compiler.compile("alter table " + table + " add column " + column + " " + columnType, context);
+ }
+
+ private List<String> columnNamesOf(String tableName, SqlCompiler compiler, SqlExecutionContext context) throws SqlException {
+ List<String> columns = new ArrayList<>();
+ try (RecordCursorFactory factory = compiler.compile("show columns from " + tableName, context).getRecordCursorFactory()) {
+ try (RecordCursor cursor = factory.getCursor(context)) {
+ Record record = cursor.getRecord();
+ while (cursor.hasNext()) {
+ columns.add(record.getStr(0).toString());
+ }
+ }
}
+ return columns;
}
private long adjustIfRecent(long timestamp, long highestTimestampAdded) {
@@ -185,7 +253,7 @@ public class QuestMetricsDb implements MetricsDb {
DateTimeFormatter formatter = DateTimeFormatter.ISO_DATE_TIME.withZone(ZoneId.of("UTC"));
String from = formatter.format(startTime).substring(0, 19) + ".000000Z";
String to = formatter.format(clock.instant()).substring(0, 19) + ".000000Z";
- String sql = "select * from " + tableName + " where at in('" + from + "', '" + to + "');";
+ String sql = "select * from " + table + " where at in('" + from + "', '" + to + "');";
// WHERE clauses does not work:
// String sql = "select * from " + tableName + " where hostname in('host1', 'host2', 'host3');";
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index 809c54146d0..1197a01b9c7 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -14,6 +14,7 @@ import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
+import com.yahoo.vespa.orchestrator.status.ApplicationLock;
import java.time.Duration;
import java.util.List;
@@ -66,19 +67,17 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
List<Node> clusterNodes,
MaintenanceDeployment deployment) {
Application application = nodeRepository().applications().get(applicationId).orElse(new Application(applicationId));
- Optional<Cluster> cluster = application.cluster(clusterId);
- if (cluster.isEmpty()) return;
+ if (application.cluster(clusterId).isEmpty()) return;
+ Cluster cluster = application.cluster(clusterId).get();
- var advice = autoscaler.autoscale(cluster.get(), clusterNodes);
-
- application = application.with(cluster.get().withAutoscalingStatus(advice.reason()));
+ var advice = autoscaler.autoscale(cluster, clusterNodes);
+ cluster = cluster.withAutoscalingStatus(advice.reason());
if (advice.isEmpty()) {
- applications().put(application, deployment.applicationLock().get());
- }
- else if ( ! cluster.get().targetResources().equals(advice.target())) {
- applications().put(application.with(cluster.get().withTarget(advice.target())), deployment.applicationLock().get());
+ applications().put(application.with(cluster), deployment.applicationLock().get());
+ } else if (!cluster.targetResources().equals(advice.target())) {
+ applications().put(application.with(cluster.withTarget(advice.target())), deployment.applicationLock().get());
if (advice.target().isPresent()) {
- logAutoscaling(advice.target().get(), applicationId, cluster.get(), clusterNodes);
+ logAutoscaling(advice.target().get(), applicationId, cluster, clusterNodes);
deployment.activate();
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index d2dcaaeae5b..4a5c28fe0c8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.collections.Pair;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ClusterSpec;
@@ -26,10 +27,12 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Duration;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
+import java.util.Set;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
@@ -41,6 +44,7 @@ import static com.yahoo.config.provision.NodeResources.DiskSpeed.any;
*/
public class MetricsReporter extends NodeRepositoryMaintainer {
+ private final Set<Pair<Metric.Context, String>> nonZeroMetrics = new HashSet<>();
private final Metric metric;
private final Orchestrator orchestrator;
private final ServiceMonitor serviceMonitor;
@@ -261,28 +265,39 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
.forEach((lockPath, lockMetrics) -> {
Metric.Context context = getContext(Map.of("lockPath", lockPath));
- metric.set("lockAttempt.acquire", lockMetrics.getAndResetAcquireCount(), context);
- metric.set("lockAttempt.acquireFailed", lockMetrics.getAndResetAcquireFailedCount(), context);
- metric.set("lockAttempt.acquireTimedOut", lockMetrics.getAndResetAcquireTimedOutCount(), context);
- metric.set("lockAttempt.locked", lockMetrics.getAndResetAcquireSucceededCount(), context);
- metric.set("lockAttempt.release", lockMetrics.getAndResetReleaseCount(), context);
- metric.set("lockAttempt.releaseFailed", lockMetrics.getAndResetReleaseFailedCount(), context);
- metric.set("lockAttempt.reentry", lockMetrics.getAndResetReentryCount(), context);
- metric.set("lockAttempt.deadlock", lockMetrics.getAndResetDeadlockCount(), context);
- metric.set("lockAttempt.nakedRelease", lockMetrics.getAndResetNakedReleaseCount(), context);
- metric.set("lockAttempt.acquireWithoutRelease", lockMetrics.getAndResetAcquireWithoutReleaseCount(), context);
- metric.set("lockAttempt.foreignRelease", lockMetrics.getAndResetForeignReleaseCount(), context);
-
- setLockLatencyMetrics("acquire", lockMetrics.getAndResetAcquireLatencyMetrics(), context);
- setLockLatencyMetrics("locked", lockMetrics.getAndResetLockedLatencyMetrics(), context);
+ LatencyMetrics acquireLatencyMetrics = lockMetrics.getAndResetAcquireLatencyMetrics();
+ setNonZero("lockAttempt.acquireMaxActiveLatency", acquireLatencyMetrics.maxActiveLatencySeconds(), context);
+ setNonZero("lockAttempt.acquireHz", acquireLatencyMetrics.startHz(), context);
+ setNonZero("lockAttempt.acquireLoad", acquireLatencyMetrics.load(), context);
+
+ LatencyMetrics lockedLatencyMetrics = lockMetrics.getAndResetLockedLatencyMetrics();
+ setNonZero("lockAttempt.lockedLatency", lockedLatencyMetrics.maxLatencySeconds(), context);
+ setNonZero("lockAttempt.lockedLoad", lockedLatencyMetrics.load(), context);
+
+ setNonZero("lockAttempt.acquireTimedOut", lockMetrics.getAndResetAcquireTimedOutCount(), context);
+ setNonZero("lockAttempt.deadlock", lockMetrics.getAndResetDeadlockCount(), context);
+
+ // bucket for various rare errors - to reduce #metrics
+ setNonZero("lockAttempt.errors",
+ lockMetrics.getAndResetAcquireFailedCount() +
+ lockMetrics.getAndResetReleaseFailedCount() +
+ lockMetrics.getAndResetNakedReleaseCount() +
+ lockMetrics.getAndResetAcquireWithoutReleaseCount() +
+ lockMetrics.getAndResetForeignReleaseCount(),
+ context);
});
}
- private void setLockLatencyMetrics(String name, LatencyMetrics latencyMetrics, Metric.Context context) {
- metric.set("lockAttempt." + name + "Latency", latencyMetrics.latencySeconds(), context);
- metric.set("lockAttempt." + name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds(), context);
- metric.set("lockAttempt." + name + "Hz", latencyMetrics.startHz(), context);
- metric.set("lockAttempt." + name + "Load", latencyMetrics.load(), context);
+ private void setNonZero(String key, Number value, Metric.Context context) {
+ var metricKey = new Pair<>(context, key);
+ if (Double.compare(value.doubleValue(), 0.0) != 0) {
+ metric.set(key, value, context);
+ nonZeroMetrics.add(metricKey);
+ } else if (nonZeroMetrics.remove(metricKey)) {
+ // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying
+ // a non-zero 'last' from earlier periods.
+ metric.set(key, value, context);
+ }
}
private void updateDockerMetrics(NodeList nodes) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 6d571fada9e..2999655e5fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -153,7 +153,9 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Map<Node, String> nodesByFailureReason = new HashMap<>();
for (Node node : activeNodes) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
- nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+ // Allow a grace period after node re-activation
+ if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd))
+ nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
}
else if (hostSuspended(node, activeNodes)) {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
index 42e26814d41..696853b2992 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
@@ -70,7 +70,7 @@ public class CuratorDatabaseClient {
private static final Path containerImagesPath = root.append("dockerImages");
private static final Path firmwareCheckPath = root.append("firmwareCheck");
- private static final Duration defaultLockTimeout = Duration.ofMinutes(2);
+ private static final Duration defaultLockTimeout = Duration.ofMinutes(6);
private final NodeSerializer nodeSerializer;
private final CuratorDatabase db;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
index 68e11c4c995..bc164dc37e0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
@@ -203,7 +203,7 @@ class NodeAllocation {
* Such nodes will be marked retired during finalization of the list of accepted nodes.
* The conditions for this are:
*
- * This is a content or combined node. These must always be retired before being removed to allow the cluster to
+ * This is a stateful node. These must always be retired before being removed to allow the cluster to
* migrate away data.
*
* This is a container node and it is not desired due to having the wrong flavor. In this case this
@@ -218,7 +218,7 @@ class NodeAllocation {
if (candidate.allocation().get().membership().retired()) return true; // don't second-guess if already retired
if (! requestedNodes.considerRetiring()) return false;
- return cluster.type().isContent() ||
+ return cluster.isStateful() ||
(cluster.type() == ClusterSpec.Type.container && !hasCompatibleFlavor(candidate));
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
index ede6f4ef250..a6d68243160 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
@@ -29,7 +29,6 @@ import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources;
import com.yahoo.vespa.hosted.provision.autoscale.AllocationOptimizer;
import com.yahoo.vespa.hosted.provision.autoscale.Limits;
import com.yahoo.vespa.hosted.provision.autoscale.ResourceTarget;
-import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.filter.ApplicationFilter;
import com.yahoo.vespa.hosted.provision.node.filter.NodeHostFilter;
@@ -168,7 +167,7 @@ public class NodeRepositoryProvisioner implements Provisioner {
boolean firstDeployment = nodes.isEmpty();
AllocatableClusterResources currentResources =
firstDeployment // start at min, preserve current resources otherwise
- ? new AllocatableClusterResources(requested.minResources(), clusterSpec.type(), clusterSpec.isExclusive(), nodeRepository)
+ ? new AllocatableClusterResources(requested.minResources(), clusterSpec, nodeRepository)
: new AllocatableClusterResources(nodes, nodeRepository, clusterSpec.isExclusive());
return within(Limits.of(requested), clusterSpec.isExclusive(), currentResources, firstDeployment);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
index 6f1334421ef..0f9babb53aa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
@@ -51,26 +51,6 @@ public class LocksResponse extends HttpResponse {
root.setString("hostname", hostname);
root.setString("time", Instant.now().toString());
- Cursor lockPathsCursor = root.setArray("lock-paths");
- lockMetricsByPath.forEach((lockPath, lockMetrics) -> {
- Cursor lockPathCursor = lockPathsCursor.addObject();
- lockPathCursor.setString("path", lockPath);
- lockPathCursor.setLong("acquireCount", lockMetrics.getCumulativeAcquireCount());
- lockPathCursor.setLong("acquireFailedCount", lockMetrics.getCumulativeAcquireFailedCount());
- lockPathCursor.setLong("acquireTimedOutCount", lockMetrics.getCumulativeAcquireTimedOutCount());
- lockPathCursor.setLong("lockedCount", lockMetrics.getCumulativeAcquireSucceededCount());
- lockPathCursor.setLong("releaseCount", lockMetrics.getCumulativeReleaseCount());
- lockPathCursor.setLong("releaseFailedCount", lockMetrics.getCumulativeReleaseFailedCount());
- lockPathCursor.setLong("reentryCount", lockMetrics.getCumulativeReentryCount());
- lockPathCursor.setLong("deadlock", lockMetrics.getCumulativeDeadlockCount());
- lockPathCursor.setLong("nakedRelease", lockMetrics.getCumulativeNakedReleaseCount());
- lockPathCursor.setLong("acquireWithoutRelease", lockMetrics.getCumulativeAcquireWithoutReleaseCount());
- lockPathCursor.setLong("foreignRelease", lockMetrics.getCumulativeForeignReleaseCount());
-
- setLatency(lockPathCursor, "acquire", lockMetrics.getAcquireLatencyMetrics());
- setLatency(lockPathCursor, "locked", lockMetrics.getLockedLatencyMetrics());
- });
-
Cursor threadsCursor = root.setArray("threads");
for (var threadLockStats : threadLockStatsList) {
Optional<LockAttempt> ongoingLockAttempt = threadLockStats.getTopMostOngoingLockAttempt();
@@ -102,18 +82,45 @@ public class LocksResponse extends HttpResponse {
Cursor recordingsCursor = root.setArray("recordings");
historicRecordings.forEach(recording -> setRecording(recordingsCursor.addObject(), recording));
}
+
+ Cursor lockPathsCursor = root.setArray("lock-paths");
+ lockMetricsByPath.forEach((lockPath, lockMetrics) -> {
+ Cursor lockPathCursor = lockPathsCursor.addObject();
+ lockPathCursor.setString("path", lockPath);
+ setNonZeroLong(lockPathCursor, "acquireCount", lockMetrics.getCumulativeAcquireCount());
+ setNonZeroLong(lockPathCursor, "acquireFailedCount", lockMetrics.getCumulativeAcquireFailedCount());
+ setNonZeroLong(lockPathCursor, "acquireTimedOutCount", lockMetrics.getCumulativeAcquireTimedOutCount());
+ setNonZeroLong(lockPathCursor, "lockedCount", lockMetrics.getCumulativeAcquireSucceededCount());
+ setNonZeroLong(lockPathCursor, "releaseCount", lockMetrics.getCumulativeReleaseCount());
+ setNonZeroLong(lockPathCursor, "releaseFailedCount", lockMetrics.getCumulativeReleaseFailedCount());
+ setNonZeroLong(lockPathCursor, "reentryCount", lockMetrics.getCumulativeReentryCount());
+ setNonZeroLong(lockPathCursor, "deadlock", lockMetrics.getCumulativeDeadlockCount());
+ setNonZeroLong(lockPathCursor, "nakedRelease", lockMetrics.getCumulativeNakedReleaseCount());
+ setNonZeroLong(lockPathCursor, "acquireWithoutRelease", lockMetrics.getCumulativeAcquireWithoutReleaseCount());
+ setNonZeroLong(lockPathCursor, "foreignRelease", lockMetrics.getCumulativeForeignReleaseCount());
+
+ setLatency(lockPathCursor, "acquire", lockMetrics.getAcquireLatencyMetrics());
+ setLatency(lockPathCursor, "locked", lockMetrics.getLockedLatencyMetrics());
+ });
+ }
+
+ private static void setNonZeroLong(Cursor cursor, String fieldName, long value) {
+ if (value != 0) {
+ cursor.setLong(fieldName, value);
+ }
}
private static void setLatency(Cursor cursor, String name, LatencyMetrics latencyMetrics) {
- cursor.setDouble(name + "Latency", latencyMetrics.latencySeconds());
- cursor.setDouble(name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds());
- cursor.setDouble(name + "Hz", latencyMetrics.endHz());
- cursor.setDouble(name + "Load", latencyMetrics.load());
+ setNonZeroDouble(cursor, name + "Latency", latencyMetrics.latencySeconds());
+ setNonZeroDouble(cursor, name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds());
+ setNonZeroDouble(cursor, name + "Hz", latencyMetrics.endHz());
+ setNonZeroDouble(cursor, name + "Load", latencyMetrics.load());
}
- private static double roundDouble(double value, int decimalPlaces) {
- double factor = Math.pow(10, decimalPlaces);
- return Math.round(value * factor) / factor;
+ private static void setNonZeroDouble(Cursor cursor, String fieldName, double value) {
+ if (Double.compare(value, 0.0) != 0) {
+ cursor.setDouble(fieldName, value);
+ }
}
@Override
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index a821bde5b26..75f49834f97 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -74,6 +74,9 @@ public class AutoscalingTest {
tester.assertResources("Scaling down to minimum since usage has gone down significantly",
14, 1, 1.0, 30.8, 30.8,
tester.autoscale(application1, cluster1.id(), min, max).target());
+
+ var events = tester.nodeRepository().applications().get(application1).get().cluster(cluster1.id()).get().scalingEvents();
+ events.forEach(e -> System.out.println(e));
}
/** We prefer fewer nodes for container clusters as (we assume) they all use the same disk and memory */
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java
index b2c9da4d22c..ba60a6a2207 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java
@@ -117,7 +117,7 @@ public class QuestMetricsDbTest {
/** To manually test that we can read existing data */
@Ignore
@Test
- public void testReadingExistingData() {
+ public void testReadingAndAppendingToExistingData() {
String dataDir = "data/QuestMetricsDbExistingData";
if ( ! new File(dataDir).exists()) {
System.out.println("No existing data to check");
@@ -125,14 +125,21 @@ public class QuestMetricsDbTest {
}
IOUtils.createDirectory(dataDir + "/metrics");
ManualClock clock = new ManualClock("2020-10-01T00:00:00");
- clock.advance(Duration.ofSeconds(10)); // Adjust to end time of data written
+ clock.advance(Duration.ofSeconds(9)); // Adjust to last data written
QuestMetricsDb db = new QuestMetricsDb(dataDir, clock);
- List<NodeTimeseries> timeseries = db.getNodeTimeseries(clock.instant().minus(Duration.ofSeconds(10)), Set.of("host1"));
+ List<NodeTimeseries> timeseries = db.getNodeTimeseries(clock.instant().minus(Duration.ofSeconds(9)), Set.of("host1"));
assertFalse("Could read existing data", timeseries.isEmpty());
assertEquals(10, timeseries.get(0).size());
- System.out.println("Existing data:");
+ System.out.println("Existing data read:");
+ for (var snapshot : timeseries.get(0).asList())
+ System.out.println(" " + snapshot);
+
+ clock.advance(Duration.ofSeconds(1));
+ db.add(timeseries(2, Duration.ofSeconds(1), clock, "host1"));
+ System.out.println("New data written and read:");
+ timeseries = db.getNodeTimeseries(clock.instant().minus(Duration.ofSeconds(2)), Set.of("host1"));
for (var snapshot : timeseries.get(0).asList())
System.out.println(" " + snapshot);
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
index 4b14174488e..b51f653ecc0 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.Capacity;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
+import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.testutils.MockDeployer;
import org.junit.Test;
@@ -14,7 +15,6 @@ import java.time.Duration;
import java.time.Instant;
import java.util.List;
-
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -89,11 +89,11 @@ public class AutoscalingMaintainerTest {
assertTrue(tester.deployer().lastDeployTime(app1).isPresent());
assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
List<ScalingEvent> events = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().scalingEvents();
- assertEquals(1, events.size());
- assertEquals(2, events.get(0).from().nodes());
- assertEquals(4, events.get(0).to().nodes());
- assertEquals(1, events.get(0).generation());
- assertEquals(firstMaintenanceTime.toEpochMilli(), events.get(0).at().toEpochMilli());
+ assertEquals(2, events.size());
+ assertEquals(2, events.get(1).from().nodes());
+ assertEquals(4, events.get(1).to().nodes());
+ assertEquals(1, events.get(1).generation());
+ assertEquals(firstMaintenanceTime.toEpochMilli(), events.get(1).at().toEpochMilli());
// Measure overload still, since change is not applied, but metrics are discarded
tester.clock().advance(Duration.ofSeconds(1));
@@ -116,7 +116,7 @@ public class AutoscalingMaintainerTest {
tester.maintainer().maintain();
assertEquals(lastMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
events = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().scalingEvents();
- assertEquals(2, events.get(0).generation());
+ assertEquals(2, events.get(2).generation());
}
@Test
@@ -128,4 +128,31 @@ public class AutoscalingMaintainerTest {
AutoscalingMaintainer.toString(new ClusterResources(4, 2, new NodeResources(1, 2, 4, 1))));
}
+ @Test
+ public void testScalingEventRecording() {
+ ApplicationId app1 = AutoscalingMaintainerTester.makeApplicationId("app1");
+ ClusterSpec cluster1 = AutoscalingMaintainerTester.containerClusterSpec();
+ NodeResources lowResources = new NodeResources(4, 4, 10, 0.1);
+ NodeResources highResources = new NodeResources(8, 8, 20, 0.1);
+ Capacity app1Capacity = Capacity.from(new ClusterResources(2, 1, lowResources),
+ new ClusterResources(4, 2, highResources));
+ var tester = new AutoscalingMaintainerTester(new MockDeployer.ApplicationContext(app1, cluster1, app1Capacity));
+
+ // deploy
+ tester.deploy(app1, cluster1, app1Capacity);
+
+ for (int i = 0; i < 20; i++) {
+ tester.clock().advance(Duration.ofDays(1));
+
+ if (i % 2 == 0) // high load
+ tester.addMeasurements(0.9f, 0.9f, 0.9f, i, 200, app1);
+ else // low load
+ tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 200, app1);
+ tester.maintainer().maintain();
+ }
+
+ var events = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().scalingEvents();
+ assertEquals(Cluster.maxScalingEvents, events.size());
+ }
+
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index a25858c034f..3e4887b6998 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -147,25 +147,14 @@ public class MetricsReporterTest {
// Verify sum of values across dimensions, and remove these metrics to avoid checking against
// metric.values below, which is not sensitive to dimensions.
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquire", 3);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireFailed", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireTimedOut", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.locked", 3);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.release", 3);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.releaseFailed", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.reentry", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.deadlock", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.nakedRelease", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireWithoutRelease", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.foreignRelease", 0);
- metric.remove("lockAttempt.acquireLatency");
metric.remove("lockAttempt.acquireMaxActiveLatency");
metric.remove("lockAttempt.acquireHz");
metric.remove("lockAttempt.acquireLoad");
metric.remove("lockAttempt.lockedLatency");
- metric.remove("lockAttempt.lockedMaxActiveLatency");
- metric.remove("lockAttempt.lockedHz");
metric.remove("lockAttempt.lockedLoad");
+ verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireTimedOut", 0);
+ verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.deadlock", 0);
+ verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.errors", 0);
assertEquals(expectedMetrics, new TreeMap<>(metric.values));
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index d403affc292..d4dbc6f55a5 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -10,6 +10,7 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Report;
import com.yahoo.vespa.hosted.provision.node.Reports;
import org.junit.Test;
@@ -233,7 +234,7 @@ public class NodeFailerTest {
assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());
-
+
String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
tester.serviceMonitor.setHostDown(downHost1);
@@ -309,6 +310,36 @@ public class NodeFailerTest {
}
@Test
+ public void re_activate_grace_period_test() {
+ NodeFailTester tester = NodeFailTester.withTwoApplications();
+ String downNode = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
+
+ tester.serviceMonitor.setHostDown(downNode);
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+
+ tester.clock.advance(Duration.ofMinutes(75));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+
+ // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again
+ tester.nodeRepository.reactivate(downNode, Agent.system, getClass().getSimpleName());
+ tester.clock.advance(Duration.ofMinutes(30));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+
+ tester.clock.advance(Duration.ofMinutes(45));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+ }
+
+ @Test
public void node_failing_can_allocate_spare() {
var resources = new NodeResources(1, 20, 15, 1);
Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
index 86427fe30ae..3945c518a77 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
@@ -485,7 +485,7 @@ public class NodesV2ApiTest {
"{\"message\":\"Moved host2.yahoo.com to parked\"}");
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/state/ready/host2.yahoo.com",
new byte[0], Request.Method.PUT),
- 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"Cannot make parked host host2.yahoo.com allocated to tenant2.application2.instance2 as 'content/id2/0/0' available for new allocation as it is not in state [dirty]\"}");
+ 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"Cannot make parked host host2.yahoo.com allocated to tenant2.application2.instance2 as 'content/id2/0/0/stateful' available for new allocation as it is not in state [dirty]\"}");
// (... while dirty then ready works (the ready move will be initiated by node maintenance))
assertResponse(new Request("http://localhost:8080/nodes/v2/state/dirty/host2.yahoo.com",
new byte[0], Request.Method.PUT),
@@ -502,7 +502,7 @@ public class NodesV2ApiTest {
// Attempt to DELETE allocated node
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com",
new byte[0], Request.Method.DELETE),
- 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"active child node host4.yahoo.com allocated to tenant3.application3.instance3 as 'content/id3/0/0' is currently allocated and cannot be removed\"}");
+ 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"active child node host4.yahoo.com allocated to tenant3.application3.instance3 as 'content/id3/0/0/stateful' is currently allocated and cannot be removed\"}");
// PUT current restart generation with string instead of long
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com",
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json
index 7104957ba98..c3857e9c8ee 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json
@@ -4,7 +4,7 @@
"failedTenantParent": "dockerhost1.yahoo.com",
"failedTenant": "host4.yahoo.com",
"failedTenantResources": "[vcpu: 1.0, memory: 4.0 Gb, disk 100.0 Gb, bandwidth: 1.0 Gbps, storage type: local]",
- "failedTenantAllocation": "allocated to tenant3.application3.instance3 as 'content/id3/0/0'",
+ "failedTenantAllocation": "allocated to tenant3.application3.instance3 as 'content/id3/0/0/stateful'",
"hostCandidateRejectionReasons": {
"singularReasonFailures": {
"insufficientVcpu": 0,