author     Harald Musum <musum@verizonmedia.com>  2021-01-01 19:13:11 +0100
committer  Harald Musum <musum@verizonmedia.com>  2021-01-01 19:13:11 +0100
commit     9ab7a397bacbae98501137a349786d7461a8aa29 (patch)
tree       2c009df1401e94beb3acaa4e79e1dc22e7abeade /node-repository
parent     3838f731e99e2a24871b3a90eb40d18579a56e40 (diff)
parent     65df61f0c3d23ae95dbe8cb4296cc4c2d3e7fb6e (diff)
Merge branch 'master' into revert-14062-revert-14057-hmusum/upgrade-to-curator-4
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java | 33
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java | 10
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java | 36
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/ScalingEvent.java | 32
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java | 33
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java | 117
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java | 68
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricSnapshot.java | 22
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java | 7
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java | 36
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java | 6
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java | 16
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java | 128
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java | 9
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java | 71
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java | 237
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java | 53
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java | 18
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java | 110
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java | 5
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java | 11
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java | 7
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java | 9
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java | 2
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/JobControlFlags.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java | 29
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/FlavorConfigBuilder.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java | 47
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java | 18
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java | 26
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java | 3
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java | 61
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java | 2
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java | 52
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java | 16
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java | 8
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java | 4
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java | 59
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java | 116
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java | 9
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java | 2
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java | 311
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java | 17
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 33
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java | 6
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java | 4
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java | 2
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java | 2
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java | 3
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java | 12
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java | 12
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisionerTest.java | 26
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java | 4
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json | 4
61 files changed, 1427 insertions, 573 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 87a0684909c..5c635551692 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -183,7 +183,28 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
.findFirst());
}
+ /**
+ * Returns the cluster spec of the nodes in this, without any group designation
+ *
+ * @throws IllegalStateException if there are no nodes in this list or they do not all belong
+ * to the same cluster
+ */
+ public ClusterSpec clusterSpec() {
+ ensureSingleCluster();
+ if (isEmpty()) throw new IllegalStateException("No nodes");
+ return first().get().allocation().get().membership().cluster().with(Optional.empty());
+ }
+
+ /**
+ * Returns the resources of the nodes of this.
+ *
+ * NOTE: If the nodes do not all have the same values of node resources, a random pick among those node resources
+ * will be returned.
+ *
+ * @throws IllegalStateException if the nodes in this do not all belong to the same cluster
+ */
public ClusterResources toResources() {
+ ensureSingleCluster();
if (isEmpty()) return new ClusterResources(0, 0, NodeResources.unspecified());
return new ClusterResources(size(),
(int)stream().map(node -> node.allocation().get().membership().cluster().group().get())
@@ -192,6 +213,18 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
first().get().resources());
}
+ private void ensureSingleCluster() {
+ if (isEmpty()) return;
+
+ if (stream().anyMatch(node -> node.allocation().isEmpty()))
+ throw new IllegalStateException("Some nodes are not allocated to a cluster");
+
+ ClusterSpec firstNodeSpec = first().get().allocation().get().membership().cluster().with(Optional.empty());
+ if (stream().map(node -> node.allocation().get().membership().cluster().with(Optional.empty()))
+ .anyMatch(clusterSpec -> ! clusterSpec.equals(firstNodeSpec)))
+ throw new IllegalStateException("Nodes belong to multiple clusters");
+ }
+
/** Returns the nodes of this as a stream */
public Stream<Node> stream() { return asList().stream(); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index 86795767710..b0e7c0bd61b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -257,6 +257,10 @@ public class NodeRepository extends AbstractComponent {
return NodeList.copyOf(getNodes(inState));
}
+ public NodeList list(ApplicationId application, State ... inState) {
+ return NodeList.copyOf(getNodes(application, inState));
+ }
+
/** Returns a filterable list of all nodes of an application */
public NodeList list(ApplicationId application) {
return NodeList.copyOf(getNodes(application));
@@ -884,11 +888,15 @@ public class NodeRepository extends AbstractComponent {
}
public boolean canAllocateTenantNodeTo(Node host) {
+ return canAllocateTenantNodeTo(host, zone.getCloud().dynamicProvisioning());
+ }
+
+ public static boolean canAllocateTenantNodeTo(Node host, boolean dynamicProvisioning) {
if ( ! host.type().canRun(NodeType.tenant)) return false;
if (host.status().wantToRetire()) return false;
if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false;
- if (zone.getCloud().dynamicProvisioning())
+ if (dynamicProvisioning)
return EnumSet.of(State.active, State.ready, State.provisioned).contains(host.state());
else
return host.state() == State.active;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index 90133f7499e..92bc62229ed 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -3,9 +3,9 @@ package com.yahoo.vespa.hosted.provision.applications;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
-import com.yahoo.config.provision.NodeResources;
-import com.yahoo.vespa.hosted.provision.lb.LoadBalancer;
+import java.time.Instant;
+import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@@ -19,11 +19,15 @@ import java.util.Optional;
*/
public class Cluster {
+ public static final int maxScalingEvents = 15;
+
private final ClusterSpec.Id id;
private final boolean exclusive;
private final ClusterResources min, max;
private final Optional<ClusterResources> suggested;
private final Optional<ClusterResources> target;
+
+ /** The maxScalingEvents last scaling events of this, sorted by increasing time (newest last) */
private final List<ScalingEvent> scalingEvents;
private final String autoscalingStatus;
@@ -45,7 +49,7 @@ public class Cluster {
this.target = Optional.empty();
else
this.target = targetResources;
- this.scalingEvents = scalingEvents;
+ this.scalingEvents = List.copyOf(scalingEvents);
this.autoscalingStatus = autoscalingStatus;
}
@@ -96,9 +100,18 @@ public class Cluster {
return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents, autoscalingStatus);
}
+ /** Add or update (based on "at" time) a scaling event */
public Cluster with(ScalingEvent scalingEvent) {
- // NOTE: We're just storing the latest scaling event so far
- return new Cluster(id, exclusive, min, max, suggested, target, List.of(scalingEvent), autoscalingStatus);
+ List<ScalingEvent> scalingEvents = new ArrayList<>(this.scalingEvents);
+
+ int existingIndex = eventIndexAt(scalingEvent.at());
+ if (existingIndex >= 0)
+ scalingEvents.set(existingIndex, scalingEvent);
+ else
+ scalingEvents.add(scalingEvent);
+
+ prune(scalingEvents);
+ return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents, autoscalingStatus);
}
public Cluster withAutoscalingStatus(String autoscalingStatus) {
@@ -120,4 +133,17 @@ public class Cluster {
return "cluster '" + id + "'";
}
+ private void prune(List<ScalingEvent> scalingEvents) {
+ while (scalingEvents.size() > maxScalingEvents)
+ scalingEvents.remove(0);
+ }
+
+ private int eventIndexAt(Instant at) {
+ for (int i = 0; i < scalingEvents.size(); i++) {
+ if (scalingEvents.get(i).at().equals(at))
+ return i;
+ }
+ return -1;
+ }
+
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/ScalingEvent.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/ScalingEvent.java
index 68e65d10d69..745fdcd736d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/ScalingEvent.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/ScalingEvent.java
@@ -3,8 +3,10 @@ package com.yahoo.vespa.hosted.provision.applications;
import com.yahoo.config.provision.ClusterResources;
+import java.time.Duration;
import java.time.Instant;
import java.util.Objects;
+import java.util.Optional;
/**
* A recording of a change in resources for an application cluster
@@ -16,12 +18,19 @@ public class ScalingEvent {
private final ClusterResources from, to;
private final long generation;
private final Instant at;
+ private final Optional<Instant> completion;
- public ScalingEvent(ClusterResources from, ClusterResources to, long generation, Instant at) {
+ /** Do not use */
+ public ScalingEvent(ClusterResources from,
+ ClusterResources to,
+ long generation,
+ Instant at,
+ Optional<Instant> completion) {
this.from = from;
this.to = to;
this.generation = generation;
this.at = at;
+ this.completion = completion;
}
/** Returns the resources we changed from */
@@ -36,6 +45,19 @@ public class ScalingEvent {
/** Returns the time of this deployment */
public Instant at() { return at; }
+ /** Returns the instant this completed, or empty if it is not yet complete as far as we know */
+ public Optional<Instant> completion() { return completion; }
+
+ /** Returns the time this event took to completion, or empty if it's not yet complete */
+ public Optional<Duration> duration() {
+ if (completion.isEmpty()) return Optional.empty();
+ return Optional.of(Duration.between(at, completion.get()));
+ }
+
+ public ScalingEvent withCompletion(Instant completion) {
+ return new ScalingEvent(from, to, generation, at, Optional.of(completion));
+ }
+
@Override
public int hashCode() { return Objects.hash(from, to, generation, at); }
@@ -48,12 +70,18 @@ public class ScalingEvent {
if ( ! other.at.equals(this.at)) return false;
if ( ! other.from.equals(this.from)) return false;
if ( ! other.to.equals(this.to)) return false;
+ if ( ! other.completion.equals(this.completion)) return false;
return true;
}
@Override
public String toString() {
- return "scaling event from " + from + " to " + to + ", generation " + generation + " at " + at;
+ return "scaling event from " + from + " to " + to + ", generation " + generation + " at " + at +
+ (completion.isPresent() ? " completed " + completion.get() : "");
+ }
+
+ public static ScalingEvent create(ClusterResources from, ClusterResources to, long generation, Instant at) {
+ return new ScalingEvent(from, to, generation, at, Optional.empty());
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
index b1213b2da41..9eb4b796970 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
@@ -26,20 +26,19 @@ public class AllocatableClusterResources {
private final NodeResources realResources;
private final NodeResources advertisedResources;
- private final ClusterSpec.Type clusterType;
+ private final ClusterSpec clusterSpec;
private final double fulfilment;
/** Fake allocatable resources from requested capacity */
public AllocatableClusterResources(ClusterResources requested,
- ClusterSpec.Type clusterType,
- boolean exclusive,
+ ClusterSpec clusterSpec,
NodeRepository nodeRepository) {
this.nodes = requested.nodes();
this.groups = requested.groups();
- this.realResources = nodeRepository.resourcesCalculator().requestToReal(requested.nodeResources(), exclusive);
+ this.realResources = nodeRepository.resourcesCalculator().requestToReal(requested.nodeResources(), clusterSpec.isExclusive());
this.advertisedResources = requested.nodeResources();
- this.clusterType = clusterType;
+ this.clusterSpec = clusterSpec;
this.fulfilment = 1;
}
@@ -48,19 +47,19 @@ public class AllocatableClusterResources {
this.groups = (int)nodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count();
this.realResources = averageRealResourcesOf(nodes, nodeRepository, exclusive); // Average since we average metrics over nodes
this.advertisedResources = nodes.get(0).resources();
- this.clusterType = nodes.get(0).allocation().get().membership().cluster().type();
+ this.clusterSpec = nodes.get(0).allocation().get().membership().cluster();
this.fulfilment = 1;
}
public AllocatableClusterResources(ClusterResources realResources,
NodeResources advertisedResources,
NodeResources idealResources,
- ClusterSpec.Type clusterType) {
+ ClusterSpec clusterSpec) {
this.nodes = realResources.nodes();
this.groups = realResources.groups();
this.realResources = realResources.nodeResources();
this.advertisedResources = advertisedResources;
- this.clusterType = clusterType;
+ this.clusterSpec = clusterSpec;
this.fulfilment = fulfilment(realResources.nodeResources(), idealResources);
}
@@ -88,7 +87,7 @@ public class AllocatableClusterResources {
return (int)Math.ceil((double)nodes / groups);
}
- public ClusterSpec.Type clusterType() { return clusterType; }
+ public ClusterSpec clusterSpec() { return clusterSpec; }
public double cost() { return nodes * advertisedResources.cost(); }
@@ -133,23 +132,23 @@ public class AllocatableClusterResources {
}
public static Optional<AllocatableClusterResources> from(ClusterResources wantedResources,
- boolean exclusive,
- ClusterSpec.Type clusterType,
+ ClusterSpec clusterSpec,
Limits applicationLimits,
NodeRepository nodeRepository) {
var systemLimits = new NodeResourceLimits(nodeRepository);
- if ( !exclusive && !nodeRepository.zone().getCloud().dynamicProvisioning()) {
+ boolean exclusive = clusterSpec.isExclusive();
+ if ( !clusterSpec.isExclusive() && !nodeRepository.zone().getCloud().dynamicProvisioning()) {
// We decide resources: Add overhead to what we'll request (advertised) to make sure real becomes (at least) cappedNodeResources
NodeResources advertisedResources = nodeRepository.resourcesCalculator().realToRequest(wantedResources.nodeResources(), exclusive);
- advertisedResources = systemLimits.enlargeToLegal(advertisedResources, clusterType, exclusive); // Attempt to ask for something legal
+ advertisedResources = systemLimits.enlargeToLegal(advertisedResources, clusterSpec.type(), exclusive); // Attempt to ask for something legal
advertisedResources = applicationLimits.cap(advertisedResources); // Overrides other conditions, even if it will then fail
NodeResources realResources = nodeRepository.resourcesCalculator().requestToReal(advertisedResources, exclusive); // ... thus, what we really get may change
- if ( ! systemLimits.isWithinRealLimits(realResources, clusterType)) return Optional.empty();
+ if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec.type())) return Optional.empty();
if (matchesAny(nodeRepository.flavors().getFlavors(), advertisedResources))
return Optional.of(new AllocatableClusterResources(wantedResources.with(realResources),
advertisedResources,
wantedResources.nodeResources(),
- clusterType));
+ clusterSpec));
else
return Optional.empty();
}
@@ -172,11 +171,11 @@ public class AllocatableClusterResources {
}
if ( ! between(applicationLimits.min().nodeResources(), applicationLimits.max().nodeResources(), advertisedResources)) continue;
- if ( ! systemLimits.isWithinRealLimits(realResources, clusterType)) continue;
+ if ( ! systemLimits.isWithinRealLimits(realResources, clusterSpec.type())) continue;
var candidate = new AllocatableClusterResources(wantedResources.with(realResources),
advertisedResources,
wantedResources.nodeResources(),
- clusterType);
+ clusterSpec);
if (best.isEmpty() || candidate.preferableTo(best.get()))
best = Optional.of(candidate);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
index e57011b0e4a..fb97e803a35 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
@@ -58,7 +58,7 @@ public class AllocationOptimizer {
groups,
nodeResourcesWith(nodesAdjustedForRedundancy, groupsAdjustedForRedundancy, limits, current, target));
- var allocatableResources = AllocatableClusterResources.from(next, exclusive, current.clusterType(), limits, nodeRepository);
+ var allocatableResources = AllocatableClusterResources.from(next, current.clusterSpec(), limits, nodeRepository);
if (allocatableResources.isEmpty()) continue;
if (bestAllocation.isEmpty() || allocatableResources.get().preferableTo(bestAllocation.get()))
bestAllocation = allocatableResources;
@@ -79,7 +79,7 @@ public class AllocationOptimizer {
int groupSize = nodes / groups;
- if (current.clusterType().isContent()) { // load scales with node share of content
+ if (current.clusterSpec().isStateful()) { // load scales with node share of content
// The fixed cost portion of cpu does not scale with changes to the node count
// TODO: Only for the portion of cpu consumed by queries
double cpuPerGroup = fixedCpuCostFraction * target.nodeCpu() +
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index d2c943794fe..79383c056f9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -5,18 +5,18 @@ import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import java.time.Duration;
import java.time.Instant;
-import java.util.List;
import java.util.Objects;
import java.util.Optional;
/**
- * The autoscaler makes decisions about the flavor and node count that should be allocated to a cluster
- * based on observed behavior.
+ * The autoscaler gives advice about what resources should be allocated to a cluster based on observed behavior.
*
* @author bratseth
*/
@@ -44,7 +44,7 @@ public class Autoscaler {
* @param clusterNodes the list of all the active nodes in a cluster
* @return scaling advice for this cluster
*/
- public Advice suggest(Cluster cluster, List<Node> clusterNodes) {
+ public Advice suggest(Cluster cluster, NodeList clusterNodes) {
return autoscale(cluster, clusterNodes, Limits.empty(), cluster.exclusive());
}
@@ -54,30 +54,34 @@ public class Autoscaler {
* @param clusterNodes the list of all the active nodes in a cluster
* @return scaling advice for this cluster
*/
- public Advice autoscale(Cluster cluster, List<Node> clusterNodes) {
- if (cluster.minResources().equals(cluster.maxResources())) return Advice.none("Autoscaling is disabled"); // Shortcut
+ public Advice autoscale(Cluster cluster, NodeList clusterNodes) {
+ if (cluster.minResources().equals(cluster.maxResources())) return Advice.none("Autoscaling is not enabled");
return autoscale(cluster, clusterNodes, Limits.of(cluster), cluster.exclusive());
}
- private Advice autoscale(Cluster cluster, List<Node> clusterNodes, Limits limits, boolean exclusive) {
- ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type();
- if (unstable(clusterNodes, nodeRepository))
+ private Advice autoscale(Cluster cluster, NodeList clusterNodes, Limits limits, boolean exclusive) {
+ if ( ! stable(clusterNodes, nodeRepository))
return Advice.none("Cluster change in progress");
- AllocatableClusterResources currentAllocation =
- new AllocatableClusterResources(clusterNodes, nodeRepository, cluster.exclusive());
+ Duration scalingWindow = scalingWindow(clusterNodes.clusterSpec(), cluster);
+ if (scaledIn(scalingWindow, cluster))
+ return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last rescaling");
- ClusterTimeseries clusterTimeseries = new ClusterTimeseries(cluster, clusterNodes, metricsDb, nodeRepository);
+ ClusterTimeseries clusterTimeseries =
+ new ClusterTimeseries(nodeRepository.clock().instant().minus(scalingWindow), cluster, clusterNodes, metricsDb);
+ AllocatableClusterResources currentAllocation =
+ new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
int measurementsPerNode = clusterTimeseries.measurementsPerNode();
- if (measurementsPerNode < minimumMeasurementsPerNode(clusterType))
- return Advice.none("Collecting more data before making new scaling decisions" +
- ": Has " + measurementsPerNode + " data points per node");
+ if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow))
+ return Advice.none("Collecting more data before making new scaling decisions: " +
+ "Have " + measurementsPerNode + " measurements per node but require " +
+ minimumMeasurementsPerNode(scalingWindow));
int nodesMeasured = clusterTimeseries.nodesMeasured();
if (nodesMeasured != clusterNodes.size())
- return Advice.none("Collecting more data before making new scaling decisions" +
- ": Has measurements from " + nodesMeasured + " but need from " + clusterNodes.size());
+ return Advice.none("Collecting more data before making new scaling decisions: " +
+ "Have measurements from " + nodesMeasured + " but require from " + clusterNodes.size());
double cpuLoad = clusterTimeseries.averageLoad(Resource.cpu);
double memoryLoad = clusterTimeseries.averageLoad(Resource.memory);
@@ -91,9 +95,10 @@ public class Autoscaler {
return Advice.dontScale("No allocation changes are possible within configured limits");
if (similar(bestAllocation.get(), currentAllocation))
- return Advice.dontScale("Cluster is ideally scaled (within configured limits)");
- if (isDownscaling(bestAllocation.get(), currentAllocation) && recentlyScaled(cluster, clusterNodes))
- return Advice.dontScale("Waiting a while before scaling down");
+ return Advice.dontScale("Cluster is ideally scaled within configured limits");
+
+ if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(scalingWindow.multipliedBy(3), cluster))
+ return Advice.dontScale("Waiting " + scalingWindow.multipliedBy(3) + " since last rescaling before reducing resources");
return Advice.scaleTo(bestAllocation.get().toAdvertisedClusterResources());
}
@@ -120,49 +125,64 @@ public class Autoscaler {
return ! targetTotal.justNumbers().satisfies(currentTotal.justNumbers());
}
- private boolean recentlyScaled(Cluster cluster, List<Node> clusterNodes) {
- Duration downscalingDelay = downscalingDelay(clusterNodes.get(0).allocation().get().membership().cluster().type());
+ private boolean scaledIn(Duration delay, Cluster cluster) {
return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN)
- .isAfter(nodeRepository.clock().instant().minus(downscalingDelay));
+ .isAfter(nodeRepository.clock().instant().minus(delay));
}
/** The duration of the window we need to consider to make a scaling decision. See also minimumMeasurementsPerNode */
- static Duration scalingWindow(ClusterSpec.Type clusterType) {
- if (clusterType.isContent()) return Duration.ofHours(12);
- return Duration.ofMinutes(30);
- }
+ private Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) {
+ int completedEventCount = 0;
+ Duration totalDuration = Duration.ZERO;
+ for (ScalingEvent event : cluster.scalingEvents()) {
+ if (event.duration().isEmpty()) continue;
+ completedEventCount++;
+ totalDuration = totalDuration.plus(event.duration().get());
+ }
- static Duration maxScalingWindow() {
- return Duration.ofHours(12);
+ if (completedEventCount == 0) { // Use defaults
+ if (clusterSpec.isStateful()) return Duration.ofHours(12);
+ return Duration.ofMinutes(10);
+ }
+ else {
+ Duration predictedDuration = totalDuration.dividedBy(completedEventCount);
+
+ // TODO: Remove when we have reliable completion for content clusters
+ if (clusterSpec.isStateful() && predictedDuration.minus(Duration.ofHours(12)).isNegative())
+ return Duration.ofHours(12);
+
+ if (predictedDuration.minus(Duration.ofMinutes(5)).isNegative()) return Duration.ofMinutes(5); // minimum
+ return predictedDuration;
+ }
}
- /** Measurements are currently taken once a minute. See also scalingWindow */
- static int minimumMeasurementsPerNode(ClusterSpec.Type clusterType) {
- if (clusterType.isContent()) return 60;
- return 7;
+ static Duration maxScalingWindow() {
+ return Duration.ofHours(48);
}
- /**
- * We should wait a while before scaling down after a scaling event as a peak in usage
- * indicates more peaks may arrive in the near future.
- */
- static Duration downscalingDelay(ClusterSpec.Type clusterType) {
- if (clusterType.isContent()) return Duration.ofHours(12);
- return Duration.ofHours(1);
+ /** Returns the minimum measurements per node (average) we require to give autoscaling advice.*/
+ private int minimumMeasurementsPerNode(Duration scalingWindow) {
+ // Measurements are ideally taken every minute, but no guarantees
+ // (network, nodes may be down, collecting is single threaded and may take longer than 1 minute to complete).
+ // Since the metric window is 5 minutes, we won't really improve from measuring more often.
+ long minimumMeasurements = scalingWindow.toMinutes() / 5;
+ minimumMeasurements = Math.round(0.8 * minimumMeasurements); // Allow 20% metrics collection blackout
+ if (minimumMeasurements < 1) minimumMeasurements = 1;
+ return (int)minimumMeasurements;
}
- public static boolean unstable(List<Node> nodes, NodeRepository nodeRepository) {
+ public static boolean stable(NodeList nodes, NodeRepository nodeRepository) {
// The cluster is processing recent changes
if (nodes.stream().anyMatch(node -> node.status().wantToRetire() ||
node.allocation().get().membership().retired() ||
node.allocation().get().isRemovable()))
- return true;
+ return false;
// A deployment is ongoing
- if (nodeRepository.getNodes(nodes.get(0).allocation().get().owner(), Node.State.reserved).size() > 0)
- return true;
+ if (nodeRepository.getNodes(nodes.first().get().allocation().get().owner(), Node.State.reserved).size() > 0)
+ return false;
- return false;
+ return true;
}
public static class Advice {
@@ -197,6 +217,13 @@ public class Autoscaler {
private static Advice scaleTo(ClusterResources target) {
return new Advice(Optional.of(target), true, "Scaling due to load changes");
}
+
+ @Override
+ public String toString() {
+ return "autoscaling advice: " +
+ (present ? (target.isPresent() ? "Scale to " + target.get() : "Don't scale") : " None");
+ }
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
index e325e797ca5..e1a3ceca033 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
@@ -1,15 +1,13 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.autoscale;
-import com.yahoo.config.provision.ClusterSpec;
-import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import java.time.Instant;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
+import java.util.function.Predicate;
import java.util.stream.Collectors;
/**
@@ -19,64 +17,38 @@ import java.util.stream.Collectors;
*/
public class ClusterTimeseries {
- private final List<Node> clusterNodes;
+ private final NodeList clusterNodes;
- /** The measurements for all hosts in this snapshot */
- private final List<NodeTimeseries> nodeTimeseries;
+ /** The measurements for all nodes in this snapshot */
+ private final List<NodeTimeseries> allTimeseries;
- public ClusterTimeseries(Cluster cluster, List<Node> clusterNodes, MetricsDb db, NodeRepository nodeRepository) {
+ public ClusterTimeseries(Instant startTime, Cluster cluster, NodeList clusterNodes, MetricsDb db) {
this.clusterNodes = clusterNodes;
- ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type();
- var allTimeseries = db.getNodeTimeseries(nodeRepository.clock().instant().minus(Autoscaler.scalingWindow(clusterType)),
- clusterNodes.stream().map(Node::hostname).collect(Collectors.toSet()));
- Map<String, Instant> startTimePerNode = metricStartTimes(cluster, clusterNodes, allTimeseries, nodeRepository);
- nodeTimeseries = filterStale(allTimeseries, startTimePerNode);
- }
+ var timeseries = db.getNodeTimeseries(startTime, clusterNodes);
- /**
- * Returns the instant of the oldest metric to consider for each node, or an empty map if metrics from the
- * entire (max) window should be considered.
- */
- private Map<String, Instant> metricStartTimes(Cluster cluster,
- List<Node> clusterNodes,
- List<NodeTimeseries> nodeTimeseries,
- NodeRepository nodeRepository) {
- Map<String, Instant> startTimePerHost = new HashMap<>();
- if ( ! cluster.scalingEvents().isEmpty()) {
- var deployment = cluster.scalingEvents().get(cluster.scalingEvents().size() - 1);
- for (Node node : clusterNodes) {
- startTimePerHost.put(node.hostname(), nodeRepository.clock().instant()); // Discard all unless we can prove otherwise
- var nodeGenerationMeasurements =
- nodeTimeseries.stream().filter(m -> m.hostname().equals(node.hostname())).findAny();
- if (nodeGenerationMeasurements.isPresent()) {
- var firstMeasurementOfCorrectGeneration =
- nodeGenerationMeasurements.get().asList().stream()
- .filter(m -> m.generation() >= deployment.generation())
- .findFirst();
- if (firstMeasurementOfCorrectGeneration.isPresent()) {
- startTimePerHost.put(node.hostname(), firstMeasurementOfCorrectGeneration.get().at());
- }
- }
- }
- }
- return startTimePerHost;
+ if (cluster.lastScalingEvent().isPresent())
+ timeseries = filter(timeseries, snapshot -> snapshot.generation() < 0 || // Content nodes do not yet send generation
+ snapshot.generation() >= cluster.lastScalingEvent().get().generation());
+ timeseries = filter(timeseries, snapshot -> snapshot.inService() && snapshot.stable());
+
+ this.allTimeseries = timeseries;
}
/** Returns the average number of measurements per node */
public int measurementsPerNode() {
- int measurementCount = nodeTimeseries.stream().mapToInt(m -> m.size()).sum();
+ int measurementCount = allTimeseries.stream().mapToInt(m -> m.size()).sum();
return measurementCount / clusterNodes.size();
}
/** Returns the number of nodes measured in this */
public int nodesMeasured() {
- return nodeTimeseries.size();
+ return allTimeseries.size();
}
/** Returns the average load of this resource in this */
public double averageLoad(Resource resource) {
- int measurementCount = nodeTimeseries.stream().mapToInt(m -> m.size()).sum();
- double measurementSum = nodeTimeseries.stream().flatMap(m -> m.asList().stream()).mapToDouble(m -> value(resource, m)).sum();
+ int measurementCount = allTimeseries.stream().mapToInt(m -> m.size()).sum();
+ double measurementSum = allTimeseries.stream().flatMap(m -> m.asList().stream()).mapToDouble(m -> value(resource, m)).sum();
return measurementSum / measurementCount;
}
@@ -89,10 +61,8 @@ public class ClusterTimeseries {
}
}
- private List<NodeTimeseries> filterStale(List<NodeTimeseries> timeseries,
- Map<String, Instant> startTimePerHost) {
- if (startTimePerHost.isEmpty()) return timeseries; // Map is either empty or complete
- return timeseries.stream().map(m -> m.justAfter(startTimePerHost.get(m.hostname()))).collect(Collectors.toList());
+ private List<NodeTimeseries> filter(List<NodeTimeseries> timeseries, Predicate<MetricSnapshot> filter) {
+ return timeseries.stream().map(nodeTimeseries -> nodeTimeseries.filter(filter)).collect(Collectors.toList());
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricSnapshot.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricSnapshot.java
index 6f6cd862c33..d5072475cd9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricSnapshot.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricSnapshot.java
@@ -8,9 +8,8 @@ import java.time.Instant;
*
* @author bratseth
*/
-public class MetricSnapshot {
+public class MetricSnapshot implements Comparable<MetricSnapshot> {
- // TODO: Order by timestamp
private final Instant at;
private final double cpu;
@@ -18,28 +17,43 @@ public class MetricSnapshot {
private final double disk;
private final long generation;
private final boolean inService;
+ private final boolean stable;
- public MetricSnapshot(Instant at, double cpu, double memory, double disk, long generation, boolean inService) {
+ public MetricSnapshot(Instant at, double cpu, double memory, double disk, long generation,
+ boolean inService, boolean stable) {
this.at = at;
this.cpu = cpu;
this.memory = memory;
this.disk = disk;
this.generation = generation;
this.inService = inService;
+ this.stable = stable;
}
public Instant at() { return at; }
public double cpu() { return cpu; }
public double memory() { return memory; }
public double disk() { return disk; }
+
+ /** The configuration generation at the time of this measurement, or -1 if not known */
public long generation() { return generation; }
+
public boolean inService() { return inService; }
+ public boolean stable() { return stable; }
+
+ @Override
+ public int compareTo(MetricSnapshot other) {
+ return at.compareTo(other.at);
+ }
@Override
public String toString() { return "metrics at " + at + ":" +
" cpu: " + cpu +
" memory: " + memory +
" disk: " + disk +
- " generation: " + generation; }
+ " generation: " + generation +
+ " inService: " + inService +
+ " stable: " + stable;
+ }
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java
index ea4ce4b44de..68acdcc88f7 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsDb.java
@@ -2,6 +2,8 @@
package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.collections.Pair;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import java.time.Clock;
@@ -9,6 +11,7 @@ import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Set;
+import java.util.stream.Collectors;
/**
* An in-memory time-series database of node metrics.
@@ -26,6 +29,10 @@ public interface MetricsDb {
*/
List<NodeTimeseries> getNodeTimeseries(Instant startTime, Set<String> hostnames);
+ default List<NodeTimeseries> getNodeTimeseries(Instant startTime, NodeList nodes) {
+ return getNodeTimeseries(startTime, nodes.stream().map(Node::hostname).collect(Collectors.toSet()));
+ }
+
/** Must be called intermittently (as long as add is called) to gc old data */
void gc();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index 4471d267416..7c5b839edf3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -2,17 +2,24 @@
package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.collections.Pair;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.slime.ArrayTraverser;
import com.yahoo.slime.Inspector;
import com.yahoo.slime.ObjectTraverser;
import com.yahoo.slime.Slime;
import com.yahoo.slime.SlimeUtils;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.applications.Application;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
+import java.util.Optional;
/**
* Consumes a response from the metrics/v2 API and populates the fields of this with the resulting values
@@ -23,33 +30,41 @@ public class MetricsResponse {
private final Collection<Pair<String, MetricSnapshot>> nodeMetrics = new ArrayList<>();
- public MetricsResponse(String response) {
- this(SlimeUtils.jsonToSlime(response));
+ public MetricsResponse(String response, NodeList applicationNodes, NodeRepository nodeRepository) {
+ this(SlimeUtils.jsonToSlime(response), applicationNodes, nodeRepository);
}
public Collection<Pair<String, MetricSnapshot>> metrics() { return nodeMetrics; }
- private MetricsResponse(Slime response) {
+ private MetricsResponse(Slime response, NodeList applicationNodes, NodeRepository nodeRepository) {
Inspector root = response.get();
Inspector nodes = root.field("nodes");
- nodes.traverse((ArrayTraverser)(__, node) -> consumeNode(node));
+ nodes.traverse((ArrayTraverser)(__, node) -> consumeNode(node, applicationNodes, nodeRepository));
}
- private void consumeNode(Inspector node) {
+ private void consumeNode(Inspector node, NodeList applicationNodes, NodeRepository nodeRepository) {
String hostname = node.field("hostname").asString();
- consumeNodeMetrics(hostname, node.field("node"));
+ consumeNodeMetrics(hostname, node.field("node"), applicationNodes, nodeRepository);
// consumeServiceMetrics(hostname, node.field("services"));
}
- private void consumeNodeMetrics(String hostname, Inspector node) {
- long timestampSecond = node.field("timestamp").asLong();
- Map<String, Double> values = consumeMetrics(node.field("metrics"));
+ private void consumeNodeMetrics(String hostname, Inspector nodeData, NodeList applicationNodes, NodeRepository nodeRepository) {
+ Optional<Node> node = applicationNodes.stream().filter(n -> n.hostname().equals(hostname)).findAny();
+ if (node.isEmpty()) return; // Node is not part of this cluster any more
+ long timestampSecond = nodeData.field("timestamp").asLong();
+ Map<String, Double> values = consumeMetrics(nodeData.field("metrics"));
nodeMetrics.add(new Pair<>(hostname, new MetricSnapshot(Instant.ofEpochMilli(timestampSecond * 1000),
Metric.cpu.from(values),
Metric.memory.from(values),
Metric.disk.from(values),
(long)Metric.generation.from(values),
- Metric.inService.from(values) > 0)));
+ Metric.inService.from(values) > 0,
+ clusterIsStable(node.get(), applicationNodes, nodeRepository))));
+ }
+
+ private boolean clusterIsStable(Node node, NodeList applicationNodes, NodeRepository nodeRepository) {
+ ClusterSpec cluster = node.allocation().get().membership().cluster();
+ return Autoscaler.stable(applicationNodes.cluster(cluster.id()), nodeRepository);
}
private void consumeServiceMetrics(String hostname, Inspector node) {
@@ -86,6 +101,7 @@ public class MetricsResponse {
generation { // application config generation active on the node
public String metricResponseName() { return "application_generation"; }
double convertValue(double metricValue) { return (float)metricValue; } // Really a long
+ double defaultValue() { return -1.0; }
},
inService {
public String metricResponseName() { return "in_service"; }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java
index 33bdc746678..4afc876056a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java
@@ -55,9 +55,6 @@ public class MetricsV2MetricsFetcher extends AbstractComponent implements Metric
public Collection<Pair<String, MetricSnapshot>> fetchMetrics(ApplicationId application) {
NodeList applicationNodes = nodeRepository.list(application).state(Node.State.active);
- // Do not try to draw conclusions from utilization while unstable
- if (Autoscaler.unstable(applicationNodes.asList(), nodeRepository)) return Collections.emptyList();
-
Optional<Node> metricsV2Container = applicationNodes.container()
.matching(node -> expectedUp(node))
.stream()
@@ -65,8 +62,7 @@ public class MetricsV2MetricsFetcher extends AbstractComponent implements Metric
if (metricsV2Container.isEmpty()) return Collections.emptyList();
// Consumer 'autoscaling' defined in com.yahoo.vespa.model.admin.monitoring.MetricConsumer
String url = "http://" + metricsV2Container.get().hostname() + ":" + 4080 + apiPath + "?consumer=autoscaling";
- String response = httpClient.get(url);
- return new MetricsResponse(response).metrics();
+ return new MetricsResponse(httpClient.get(url), applicationNodes, nodeRepository).metrics();
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java
index 6cba3928b8f..24876609f58 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java
@@ -1,16 +1,15 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.autoscale;
-import com.yahoo.config.provision.ClusterSpec;
-
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.function.Predicate;
import java.util.stream.Collectors;
/**
- * A list of metric snapshots from a host
+ * A list of metric snapshots from a node, sorted by increasing time (newest last).
*
* @author bratseth
*/
@@ -19,10 +18,11 @@ public class NodeTimeseries {
private final String hostname;
private final List<MetricSnapshot> snapshots;
- // Note: This transfers ownership of the snapshot list to this
NodeTimeseries(String hostname, List<MetricSnapshot> snapshots) {
this.hostname = hostname;
- this.snapshots = snapshots;
+ List<MetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots);
+ Collections.sort(sortedSnapshots);
+ this.snapshots = Collections.unmodifiableList(sortedSnapshots);
}
public boolean isEmpty() { return snapshots.isEmpty(); }
@@ -31,7 +31,7 @@ public class NodeTimeseries {
public MetricSnapshot get(int index) { return snapshots.get(index); }
- public List<MetricSnapshot> asList() { return Collections.unmodifiableList(snapshots); }
+ public List<MetricSnapshot> asList() { return snapshots; }
public String hostname() { return hostname; }
@@ -41,6 +41,10 @@ public class NodeTimeseries {
return new NodeTimeseries(hostname(), list);
}
+ public NodeTimeseries filter(Predicate<MetricSnapshot> filter) {
+ return new NodeTimeseries(hostname, snapshots.stream().filter(filter).collect(Collectors.toList()));
+ }
+
public NodeTimeseries justAfter(Instant oldestTime) {
return new NodeTimeseries(hostname,
snapshots.stream()
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
index bf013810b2f..8e90294b4a5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
@@ -4,10 +4,12 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.google.inject.Inject;
import com.yahoo.collections.ListMap;
import com.yahoo.collections.Pair;
+import com.yahoo.component.AbstractComponent;
import com.yahoo.io.IOUtils;
import com.yahoo.vespa.defaults.Defaults;
import io.questdb.cairo.CairoConfiguration;
import io.questdb.cairo.CairoEngine;
+import io.questdb.cairo.CairoException;
import io.questdb.cairo.DefaultCairoConfiguration;
import io.questdb.cairo.TableWriter;
import io.questdb.cairo.sql.Record;
@@ -40,14 +42,14 @@ import java.util.stream.Collectors;
*
* @author bratseth
*/
-public class QuestMetricsDb implements MetricsDb {
+public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
private static final Logger log = Logger.getLogger(QuestMetricsDb.class.getName());
- private static final String tableName = "metrics";
+ private static final String table = "metrics";
private final Clock clock;
private final String dataDir;
- private final CairoEngine engine;
+ private CairoEngine engine;
private long highestTimestampAdded = 0;
@@ -63,8 +65,11 @@ public class QuestMetricsDb implements MetricsDb {
&& ! new File(Defaults.getDefaults().vespaHome()).exists())
dataDir = "data"; // We're injected, but not on a node with Vespa installed
this.dataDir = dataDir;
+ initializeDb();
+ }
- IOUtils.createDirectory(dataDir + "/" + tableName);
+ private void initializeDb() {
+ IOUtils.createDirectory(dataDir + "/" + table);
// silence Questdb's custom logging system
IOUtils.writeFile(new File(dataDir, "quest-log.conf"), new byte[0]);
@@ -73,28 +78,43 @@ public class QuestMetricsDb implements MetricsDb {
CairoConfiguration configuration = new DefaultCairoConfiguration(dataDir);
engine = new CairoEngine(configuration);
- ensureExists(tableName);
+ ensureExists(table);
}
@Override
public void add(Collection<Pair<String, MetricSnapshot>> snapshots) {
- try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), tableName)) {
- for (var snapshot : snapshots) {
- long atMillis = adjustIfRecent(snapshot.getSecond().at().toEpochMilli(), highestTimestampAdded);
- if (atMillis < highestTimestampAdded) continue; // Ignore old data
- highestTimestampAdded = atMillis;
- TableWriter.Row row = writer.newRow(atMillis * 1000); // in microseconds
- row.putStr(0, snapshot.getFirst());
- row.putFloat(2, (float)snapshot.getSecond().cpu());
- row.putFloat(3, (float)snapshot.getSecond().memory());
- row.putFloat(4, (float)snapshot.getSecond().disk());
- row.putLong(5, snapshot.getSecond().generation());
- row.append();
+ try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), table)) {
+ add(snapshots, writer);
+ }
+ catch (CairoException e) {
+ if (e.getMessage().contains("Cannot read offset")) {
+ // This error seems non-recoverable
+ repair(e);
+ try (TableWriter writer = engine.getWriter(newContext().getCairoSecurityContext(), table)) {
+ add(snapshots, writer);
+ }
}
- writer.commit();
}
}
+ private void add(Collection<Pair<String, MetricSnapshot>> snapshots, TableWriter writer) {
+ for (var snapshot : snapshots) {
+ long atMillis = adjustIfRecent(snapshot.getSecond().at().toEpochMilli(), highestTimestampAdded);
+ if (atMillis < highestTimestampAdded) continue; // Ignore old data
+ highestTimestampAdded = atMillis;
+ TableWriter.Row row = writer.newRow(atMillis * 1000); // in microseconds
+ row.putStr(0, snapshot.getFirst());
+ row.putFloat(2, (float)snapshot.getSecond().cpu());
+ row.putFloat(3, (float)snapshot.getSecond().memory());
+ row.putFloat(4, (float)snapshot.getSecond().disk());
+ row.putLong(5, snapshot.getSecond().generation());
+ row.putBool(6, snapshot.getSecond().inService());
+ row.putBool(7, snapshot.getSecond().stable());
+ row.append();
+ }
+ writer.commit();
+ }
+
@Override
public List<NodeTimeseries> getNodeTimeseries(Instant startTime, Set<String> hostnames) {
try (SqlCompiler compiler = new SqlCompiler(engine)) {
@@ -116,7 +136,7 @@ public class QuestMetricsDb implements MetricsDb {
SqlExecutionContext context = newContext();
int partitions = 0;
try (SqlCompiler compiler = new SqlCompiler(engine)) {
- File tableRoot = new File(dataDir, tableName);
+ File tableRoot = new File(dataDir, table);
List<String> removeList = new ArrayList<>();
for (String dirEntry : tableRoot.list()) {
File partitionDir = new File(tableRoot, dirEntry);
@@ -131,7 +151,7 @@ public class QuestMetricsDb implements MetricsDb {
}
// Remove unless all partitions are old: Removing all partitions "will be supported in the future"
if ( removeList.size() < partitions && ! removeList.isEmpty())
- compiler.compile("alter table " + tableName + " drop partition " +
+ compiler.compile("alter table " + table + " drop partition " +
removeList.stream().map(dir -> "'" + dir + "'").collect(Collectors.joining(",")),
context);
}
@@ -141,27 +161,78 @@ public class QuestMetricsDb implements MetricsDb {
}
@Override
+ public void deconstruct() { close(); }
+
+ @Override
public void close() {
if (engine != null)
engine.close();
}
- private void ensureExists(String tableName) {
+ /**
+ * Repairs this db on corruption.
+ *
+ * @param e the exception indicating corruption
+ */
+ private void repair(Exception e) {
+ log.log(Level.WARNING, "QuestDb seems corrupted, wiping data and starting over", e);
+ IOUtils.recursiveDeleteDir(new File(dataDir));
+ initializeDb();
+ }
+
+ private void ensureExists(String table) {
SqlExecutionContext context = newContext();
- if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), tableName)) return;
+ if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) { // table exists
+ ensureUpdated(table, context);
+ } else {
+ create(table, context);
+ }
+ }
+ private void ensureUpdated(String table, SqlExecutionContext context) {
try (SqlCompiler compiler = new SqlCompiler(engine)) {
- compiler.compile("create table " + tableName +
- " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float, application_generation long)" +
+ if (0 == engine.getStatus(context.getCairoSecurityContext(), new Path(), table)) {
+ ensureColumnExists("inService", "boolean", table, compiler, context); // TODO: Remove after December 2020
+ ensureColumnExists("stable", "boolean", table, compiler, context); // TODO: Remove after December 2020
+ }
+ } catch (SqlException e) {
+ repair(e);
+ }
+ }
+
+ private void create(String table, SqlExecutionContext context) {
+ try (SqlCompiler compiler = new SqlCompiler(engine)) {
+ compiler.compile("create table " + table +
+ " (hostname string, at timestamp, cpu_util float, mem_total_util float, disk_util float," +
+ " application_generation long, inService boolean, stable boolean)" +
" timestamp(at)" +
"PARTITION BY DAY;",
context);
- // We should do this if we get a version where selecting on stringhs work embedded, see below
+ // We should do this if we get a version where selecting on strings works embedded, see below
// compiler.compile("alter table " + tableName + " alter column hostname add index", context);
}
catch (SqlException e) {
- throw new IllegalStateException("Could not create Quest db table '" + tableName + "'", e);
+ throw new IllegalStateException("Could not create Quest db table '" + table + "'", e);
+ }
+ }
+
+ private void ensureColumnExists(String column, String columnType,
+ String table, SqlCompiler compiler, SqlExecutionContext context) throws SqlException {
+ if (columnNamesOf(table, compiler, context).contains(column)) return;
+ compiler.compile("alter table " + table + " add column " + column + " " + columnType, context);
+ }
+
+ private List<String> columnNamesOf(String tableName, SqlCompiler compiler, SqlExecutionContext context) throws SqlException {
+ List<String> columns = new ArrayList<>();
+ try (RecordCursorFactory factory = compiler.compile("show columns from " + tableName, context).getRecordCursorFactory()) {
+ try (RecordCursor cursor = factory.getCursor(context)) {
+ Record record = cursor.getRecord();
+ while (cursor.hasNext()) {
+ columns.add(record.getStr(0).toString());
+ }
+ }
}
+ return columns;
}
private long adjustIfRecent(long timestamp, long highestTimestampAdded) {
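Illustrative sketch (not part of the patch): ensureExists() above either creates the table with the full schema or, when it already exists, uses ensureColumnExists() to issue an "alter table ... add column" for each column missing from "show columns from <table>". A standalone version of that decision over an in-memory column list, with no QuestDB engine involved and made-up class and method names:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

class SchemaMigration {

    /** Returns the ALTER TABLE statements needed to bring the existing columns up to the wanted schema. */
    static List<String> missingColumnStatements(String table, Set<String> existing) {
        List<String> statements = new ArrayList<>();
        for (String[] column : new String[][] {{"inService", "boolean"}, {"stable", "boolean"}}) {
            if (existing.contains(column[0])) continue; // column already present
            statements.add("alter table " + table + " add column " + column[0] + " " + column[1]);
        }
        return statements;
    }

    public static void main(String[] args) {
        Set<String> existing = Set.of("hostname", "at", "cpu_util", "mem_total_util",
                                      "disk_util", "application_generation");
        System.out.println(missingColumnStatements("metrics", existing));
        // [alter table metrics add column inService boolean, alter table metrics add column stable boolean]
    }
}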
@@ -182,7 +253,7 @@ public class QuestMetricsDb implements MetricsDb {
DateTimeFormatter formatter = DateTimeFormatter.ISO_DATE_TIME.withZone(ZoneId.of("UTC"));
String from = formatter.format(startTime).substring(0, 19) + ".000000Z";
String to = formatter.format(clock.instant()).substring(0, 19) + ".000000Z";
- String sql = "select * from " + tableName + " where at in('" + from + "', '" + to + "');";
+ String sql = "select * from " + table + " where at in('" + from + "', '" + to + "');";
// WHERE clauses does not work:
// String sql = "select * from " + tableName + " where hostname in('host1', 'host2', 'host3');";
@@ -200,7 +271,8 @@ public class QuestMetricsDb implements MetricsDb {
record.getFloat(3),
record.getFloat(4),
record.getLong(5),
- true));
+ record.getBool(6),
+ record.getBool(7)));
}
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
index 5df45bbc1b1..9ac1ca2b4c1 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
@@ -95,9 +95,14 @@ public abstract class ApplicationMaintainer extends NodeRepositoryMaintainer {
}
@Override
- public void close() {
- super.close();
+ public void shutdown() {
+ super.shutdown();
this.deploymentExecutor.shutdownNow();
+ }
+
+ @Override
+ public void awaitShutdown() {
+ super.awaitShutdown();
try {
// Give deployments in progress some time to complete
this.deploymentExecutor.awaitTermination(1, TimeUnit.MINUTES);
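Illustrative sketch (not part of the patch): the change above splits the old blocking close() into a non-blocking shutdown() that signals the deployment executor and a blocking awaitShutdown() that drains it. A minimal standalone version of the same two-phase pattern over a plain ExecutorService; the Worker class is hypothetical.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

class Worker {

    private final ExecutorService executor = Executors.newSingleThreadExecutor();

    /** Phase one: stop accepting work and interrupt what is running. Must not block. */
    void shutdown() {
        executor.shutdownNow();
    }

    /** Phase two: give in-flight work some time to complete. May block. */
    void awaitShutdown() {
        try {
            executor.awaitTermination(1, TimeUnit.MINUTES);
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    public static void main(String[] args) {
        Worker worker = new Worker();
        worker.shutdown();
        worker.awaitShutdown();
    }
}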
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index 809c54146d0..c4744f6cb6a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -7,15 +7,21 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Deployer;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Applications;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler;
+import com.yahoo.vespa.hosted.provision.autoscale.MetricSnapshot;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
+import com.yahoo.vespa.hosted.provision.autoscale.NodeTimeseries;
+import com.yahoo.vespa.hosted.provision.node.History;
+import com.yahoo.vespa.orchestrator.status.ApplicationLock;
import java.time.Duration;
+import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -29,6 +35,7 @@ import java.util.stream.Collectors;
public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
private final Autoscaler autoscaler;
+ private final MetricsDb metricsDb;
private final Deployer deployer;
private final Metric metric;
@@ -39,8 +46,9 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
Duration interval) {
super(nodeRepository, interval, metric);
this.autoscaler = new Autoscaler(metricsDb, nodeRepository);
- this.metric = metric;
+ this.metricsDb = metricsDb;
this.deployer = deployer;
+ this.metric = metric;
}
@Override
@@ -57,44 +65,69 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
private void autoscale(ApplicationId application, List<Node> applicationNodes) {
try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
if ( ! deployment.isValid()) return;
- nodesByCluster(applicationNodes).forEach((clusterId, clusterNodes) -> autoscale(application, clusterId, clusterNodes, deployment));
+ nodesByCluster(applicationNodes).forEach((clusterId, clusterNodes) -> autoscale(application, clusterId, NodeList.copyOf(clusterNodes), deployment));
}
}
private void autoscale(ApplicationId applicationId,
ClusterSpec.Id clusterId,
- List<Node> clusterNodes,
+ NodeList clusterNodes,
MaintenanceDeployment deployment) {
Application application = nodeRepository().applications().get(applicationId).orElse(new Application(applicationId));
- Optional<Cluster> cluster = application.cluster(clusterId);
- if (cluster.isEmpty()) return;
-
- var advice = autoscaler.autoscale(cluster.get(), clusterNodes);
-
- application = application.with(cluster.get().withAutoscalingStatus(advice.reason()));
- if (advice.isEmpty()) {
- applications().put(application, deployment.applicationLock().get());
- }
- else if ( ! cluster.get().targetResources().equals(advice.target())) {
- applications().put(application.with(cluster.get().withTarget(advice.target())), deployment.applicationLock().get());
+ if (application.cluster(clusterId).isEmpty()) return;
+ Cluster cluster = application.cluster(clusterId).get();
+ cluster = updateCompletion(cluster, clusterNodes);
+ var advice = autoscaler.autoscale(cluster, clusterNodes);
+ cluster = cluster.withAutoscalingStatus(advice.reason());
+
+ if (advice.isPresent() && !cluster.targetResources().equals(advice.target())) { // autoscale
+ cluster = cluster.withTarget(advice.target());
+ applications().put(application.with(cluster), deployment.applicationLock().get());
if (advice.target().isPresent()) {
- logAutoscaling(advice.target().get(), applicationId, cluster.get(), clusterNodes);
+ logAutoscaling(advice.target().get(), applicationId, cluster, clusterNodes);
deployment.activate();
}
}
+ else { // store cluster update
+ applications().put(application.with(cluster), deployment.applicationLock().get());
+ }
}
private Applications applications() {
return nodeRepository().applications();
}
+ /** Check if the last scaling event for this cluster has completed and if so record it in the returned instance */
+ private Cluster updateCompletion(Cluster cluster, NodeList clusterNodes) {
+ if (cluster.lastScalingEvent().isEmpty()) return cluster;
+ var event = cluster.lastScalingEvent().get();
+ if (event.completion().isPresent()) return cluster;
+
+ // Scaling event is complete if:
+ // - 1. no nodes which were retired by this are still present (which also implies data distribution is complete)
+ if (clusterNodes.retired().stream()
+ .anyMatch(node -> node.history().hasEventAt(History.Event.Type.retired, event.at())))
+ return cluster;
+ // - 2. all nodes have switched to the right config generation
+ for (NodeTimeseries nodeTimeseries : metricsDb.getNodeTimeseries(event.at(), clusterNodes)) {
+ Optional<MetricSnapshot> firstOnNewGeneration =
+ nodeTimeseries.asList().stream()
+ .filter(snapshot -> snapshot.generation() >= event.generation()).findFirst();
+ if (firstOnNewGeneration.isEmpty()) return cluster; // Not completed
+ }
+
+ // Set the completion time to the instant we notice completion.
+ Instant completionTime = nodeRepository().clock().instant();
+ return cluster.with(event.withCompletion(completionTime));
+ }
+
private void logAutoscaling(ClusterResources target,
ApplicationId application,
Cluster cluster,
- List<Node> clusterNodes) {
- ClusterResources current = new AllocatableClusterResources(clusterNodes, nodeRepository(), cluster.exclusive()).toAdvertisedClusterResources();
- ClusterSpec.Type clusterType = clusterNodes.get(0).allocation().get().membership().cluster().type();
- log.info("Autoscaling " + application + " " + clusterType + " " + cluster.id() + ":" +
+ NodeList clusterNodes) {
+ ClusterResources current = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository(), cluster.exclusive()).toAdvertisedClusterResources();
+ log.info("Autoscaling " + application + " " + clusterNodes.clusterSpec() + ":" +
"\nfrom " + toString(current) + "\nto " + toString(target));
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
index 0195466b689..e271d5dcd11 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
@@ -2,36 +2,48 @@
package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.component.Version;
+import com.yahoo.component.Vtag;
import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ClusterMembership;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.OutOfCapacityException;
import com.yahoo.jdisc.Metric;
+import com.yahoo.lang.MutableInteger;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.flags.FlagSource;
-import com.yahoo.vespa.flags.Flags;
+import com.yahoo.vespa.flags.JacksonFlag;
import com.yahoo.vespa.flags.ListFlag;
-import com.yahoo.vespa.flags.custom.HostCapacity;
+import com.yahoo.vespa.flags.PermanentFlags;
+import com.yahoo.vespa.flags.custom.ClusterCapacity;
+import com.yahoo.vespa.flags.custom.SharedHost;
+import com.yahoo.vespa.hosted.provision.LockedNodeList;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.hosted.provision.node.IP;
import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing;
+import com.yahoo.vespa.hosted.provision.provisioning.NodeCandidate;
+import com.yahoo.vespa.hosted.provision.provisioning.NodePrioritizer;
import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceComparator;
+import com.yahoo.vespa.hosted.provision.provisioning.NodeSpec;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost;
import com.yahoo.yolean.Exceptions;
import javax.naming.NameNotFoundException;
import java.time.Duration;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.logging.Level;
@@ -48,7 +60,8 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
private static final Logger log = Logger.getLogger(DynamicProvisioningMaintainer.class.getName());
private final HostProvisioner hostProvisioner;
- private final ListFlag<HostCapacity> targetCapacityFlag;
+ private final ListFlag<ClusterCapacity> preprovisionCapacityFlag;
+ private final JacksonFlag<SharedHost> sharedHostFlag;
DynamicProvisioningMaintainer(NodeRepository nodeRepository,
Duration interval,
@@ -57,7 +70,8 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
Metric metric) {
super(nodeRepository, interval, metric);
this.hostProvisioner = hostProvisioner;
- this.targetCapacityFlag = Flags.TARGET_CAPACITY.bindTo(flagSource);
+ this.preprovisionCapacityFlag = PermanentFlags.PREPROVISION_CAPACITY.bindTo(flagSource);
+ this.sharedHostFlag = PermanentFlags.SHARED_HOST.bindTo(flagSource);
}
@Override
@@ -104,8 +118,17 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
/** Converge zone to wanted capacity */
private void convergeToCapacity(NodeList nodes) {
- List<NodeResources> capacity = targetCapacity();
- List<Node> excessHosts = provision(capacity, nodes);
+ List<Node> excessHosts;
+ try {
+ excessHosts = provision(nodes);
+ } catch (OutOfCapacityException | IllegalStateException e) {
+ log.log(Level.WARNING, "Failed to provision preprovision capacity and/or find excess hosts: " + e.getMessage());
+ return; // avoid removing excess hosts
+ } catch (RuntimeException e) {
+ log.log(Level.WARNING, "Failed to provision preprovision capacity and/or find excess hosts", e);
+ return; // avoid removing excess hosts
+ }
+
excessHosts.forEach(host -> {
try {
hostProvisioner.deprovision(host);
@@ -119,72 +142,182 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
/**
* Provision hosts to ensure there is room to allocate spare nodes.
*
- * @param advertisedSpareCapacity the advertised resources of the spare nodes
- * @param nodes list of all nodes
+ * @param nodeList list of all nodes
* @return excess hosts that can safely be deprovisioned: An excess host 1. contains no nodes allocated
* to an application, and assuming the spare nodes have been allocated, and 2. is not parked
* without wantToDeprovision (which means an operator is looking at the node).
*/
- private List<Node> provision(List<NodeResources> advertisedSpareCapacity, NodeList nodes) {
- Map<String, Node> hostsByHostname = new HashMap<>(nodes.hosts().asList().stream()
+ private List<Node> provision(NodeList nodeList) {
+ final List<Node> nodes = new ArrayList<>(provisionUntilNoDeficit(nodeList));
+
+ Map<String, Node> sharedHosts = new HashMap<>(findSharedHosts(nodeList));
+
+ int minCount = sharedHostFlag.value().getMinCount();
+ int deficit = minCount - sharedHosts.size();
+ if (deficit > 0) {
+ provisionHosts(deficit, NodeResources.unspecified())
+ .forEach(host -> {
+ sharedHosts.put(host.hostname(), host);
+ nodes.add(host);
+ });
+ }
+
+ return candidatesForRemoval(nodes).stream()
+ .sorted(Comparator.comparing(node -> node.history().events().stream()
+ .map(History.Event::at).min(Comparator.naturalOrder()).orElseGet(() -> Instant.MIN)))
+ .filter(node -> {
+ if (!sharedHosts.containsKey(node.hostname()) || sharedHosts.size() > minCount) {
+ sharedHosts.remove(node.hostname());
+ return true;
+ } else {
+ return false;
+ }
+ })
+ .collect(Collectors.toList());
+ }
+
+ private List<Node> candidatesForRemoval(List<Node> nodes) {
+ Map<String, Node> hostsByHostname = new HashMap<>(nodes.stream()
+ .filter(node -> node.type() == NodeType.host)
.filter(host -> host.state() != Node.State.parked || host.status().wantToDeprovision())
.collect(Collectors.toMap(Node::hostname, Function.identity())));
- nodes.asList().stream()
+ nodes.stream()
.filter(node -> node.allocation().isPresent())
.flatMap(node -> node.parentHostname().stream())
.distinct()
.forEach(hostsByHostname::remove);
- List<Node> excessHosts = new ArrayList<>(hostsByHostname.values());
-
- var capacity = new ArrayList<>(advertisedSpareCapacity);
- for (Iterator<NodeResources> it = capacity.iterator(); it.hasNext() && !excessHosts.isEmpty(); ) {
- NodeResources resources = it.next();
- excessHosts.stream()
- .filter(nodeRepository()::canAllocateTenantNodeTo)
- .filter(host -> nodeRepository().resourcesCalculator()
- .advertisedResourcesOf(host.flavor())
- .satisfies(resources))
- .min(Comparator.comparingInt(n -> n.flavor().cost()))
- .ifPresent(host -> {
- excessHosts.remove(host);
- it.remove();
- });
+ return List.copyOf(hostsByHostname.values());
+ }
+
+ private Map<String, Node> findSharedHosts(NodeList nodeList) {
+ return nodeList.stream()
+ .filter(node -> NodeRepository.canAllocateTenantNodeTo(node, true))
+ .filter(node -> node.reservedTo().isEmpty())
+ .filter(node -> node.exclusiveTo().isEmpty())
+ .collect(Collectors.toMap(Node::hostname, Function.identity()));
+ }
+
+ /**
+ * @return the nodes in {@code nodeList} plus all hosts provisioned, plus all preprovision capacity
+ * nodes that were allocated.
+ * @throws OutOfCapacityException if there were problems provisioning hosts; in that case the
+ * message should be sufficient (no stack trace needed)
+ * @throws IllegalStateException if there was an algorithmic problem; in that case the
+ * message should be sufficient (no stack trace needed).
+ */
+ private List<Node> provisionUntilNoDeficit(NodeList nodeList) {
+ List<ClusterCapacity> preprovisionCapacity = preprovisionCapacityFlag.value();
+
+ // Worst-case each ClusterCapacity in preprovisionCapacity will require an allocation.
+ int maxProvisions = preprovisionCapacity.size();
+
+ var nodesPlusProvisioned = new ArrayList<>(nodeList.asList());
+ for (int numProvisions = 0;; ++numProvisions) {
+ var nodesPlusProvisionedPlusAllocated = new ArrayList<>(nodesPlusProvisioned);
+ Optional<ClusterCapacity> deficit = allocatePreprovisionCapacity(preprovisionCapacity, nodesPlusProvisionedPlusAllocated);
+ if (deficit.isEmpty()) {
+ return nodesPlusProvisionedPlusAllocated;
+ }
+
+ if (numProvisions >= maxProvisions) {
+ throw new IllegalStateException("Have provisioned " + numProvisions + " times but there's still deficit: aborting");
+ }
+
+ nodesPlusProvisioned.addAll(provisionHosts(deficit.get().count(), toNodeResources(deficit.get())));
}
+ }
- // Pre-provisioning is best effort, do one host at a time
- capacity.forEach(resources -> {
- try {
- Version osVersion = nodeRepository().osVersions().targetFor(NodeType.host).orElse(Version.emptyVersion);
- List<Node> hosts = hostProvisioner.provisionHosts(nodeRepository().database().getProvisionIndexes(1),
- resources, ApplicationId.defaultId(), osVersion,
- HostSharing.shared)
- .stream()
- .map(ProvisionedHost::generateHost)
- .collect(Collectors.toList());
- nodeRepository().addNodes(hosts, Agent.DynamicProvisioningMaintainer);
- } catch (OutOfCapacityException | IllegalArgumentException | IllegalStateException e) {
- log.log(Level.WARNING, "Failed to pre-provision " + resources + ": " + e.getMessage());
- } catch (RuntimeException e) {
- log.log(Level.WARNING, "Failed to pre-provision " + resources + ", will retry in " + interval(), e);
+ private List<Node> provisionHosts(int count, NodeResources nodeResources) {
+ try {
+ Version osVersion = nodeRepository().osVersions().targetFor(NodeType.host).orElse(Version.emptyVersion);
+ List<Integer> provisionIndexes = nodeRepository().database().getProvisionIndexes(count);
+ List<Node> hosts = hostProvisioner.provisionHosts(provisionIndexes, nodeResources,
+ ApplicationId.defaultId(), osVersion, HostSharing.shared)
+ .stream()
+ .map(ProvisionedHost::generateHost)
+ .collect(Collectors.toList());
+ nodeRepository().addNodes(hosts, Agent.DynamicProvisioningMaintainer);
+ return hosts;
+ } catch (OutOfCapacityException | IllegalArgumentException | IllegalStateException e) {
+ throw new OutOfCapacityException("Failed to provision " + count + " " + nodeResources + ": " + e.getMessage());
+ } catch (RuntimeException e) {
+ throw new RuntimeException("Failed to provision " + count + " " + nodeResources + ", will retry in " + interval(), e);
+ }
+ }
+
+ /**
+ * Try to allocate the preprovision cluster capacity.
+ *
+ * @param mutableNodes represents all nodes in the node repo. As preprovision capacity is virtually
+ * allocated, the allocated nodes are added to {@code mutableNodes}
+ * @return the part of a cluster capacity it was unable to allocate, if any
+ */
+ private Optional<ClusterCapacity> allocatePreprovisionCapacity(List<ClusterCapacity> preprovisionCapacity,
+ ArrayList<Node> mutableNodes) {
+ for (int clusterIndex = 0; clusterIndex < preprovisionCapacity.size(); ++clusterIndex) {
+ ClusterCapacity clusterCapacity = preprovisionCapacity.get(clusterIndex);
+ LockedNodeList nodeList = new LockedNodeList(mutableNodes, () -> {});
+ List<Node> candidates = findCandidates(clusterCapacity, clusterIndex, nodeList);
+ int deficit = Math.max(0, clusterCapacity.count() - candidates.size());
+ if (deficit > 0) {
+ return Optional.of(clusterCapacity.withCount(deficit));
}
- });
- return excessHosts;
+ // Simulate allocating the cluster
+ mutableNodes.addAll(candidates);
+ }
+
+ return Optional.empty();
}
+ private List<Node> findCandidates(ClusterCapacity clusterCapacity, int clusterIndex, LockedNodeList nodeList) {
+ NodeResources nodeResources = toNodeResources(clusterCapacity);
+
+ // We'll allocate each ClusterCapacity as a unique cluster in a dummy application
+ ApplicationId applicationId = ApplicationId.defaultId();
+ ClusterSpec.Id clusterId = ClusterSpec.Id.from(String.valueOf(clusterIndex));
+ ClusterSpec clusterSpec = ClusterSpec.request(ClusterSpec.Type.content, clusterId)
+ // build() requires a version, even though it is not (should not be) used
+ .vespaVersion(Vtag.currentVersion)
+ .build();
+ NodeSpec nodeSpec = NodeSpec.from(clusterCapacity.count(), nodeResources, false, true);
+ int wantedGroups = 1;
+
+ NodePrioritizer prioritizer = new NodePrioritizer(nodeList, applicationId, clusterSpec, nodeSpec, wantedGroups,
+ true, nodeRepository().nameResolver(), nodeRepository().resourcesCalculator(),
+ nodeRepository().spareCount());
+ List<NodeCandidate> nodeCandidates = prioritizer.collect(List.of());
+ MutableInteger index = new MutableInteger(0);
+ return nodeCandidates
+ .stream()
+ .limit(clusterCapacity.count())
+ .map(candidate -> candidate.toNode()
+ .allocate(applicationId,
+ ClusterMembership.from(clusterSpec, index.next()),
+ nodeResources,
+ nodeRepository().clock().instant()))
+ .collect(Collectors.toList());
+ }
+
+ private static NodeResources toNodeResources(ClusterCapacity clusterCapacity) {
+ return new NodeResources(clusterCapacity.vcpu(), clusterCapacity.memoryGb(), clusterCapacity.diskGb(),
+ clusterCapacity.bandwidthGbps());
+ }
/** Reads node resources declared by target capacity flag */
private List<NodeResources> targetCapacity() {
- return targetCapacityFlag.value().stream()
- .flatMap(cap -> {
- NodeResources resources = new NodeResources(cap.getVcpu(), cap.getMemoryGb(),
- cap.getDiskGb(), 1);
- return IntStream.range(0, cap.getCount()).mapToObj(i -> resources);
- })
- .sorted(NodeResourceComparator.memoryDiskCpuOrder().reversed())
- .collect(Collectors.toList());
+ return preprovisionCapacityFlag.value().stream()
+ .flatMap(cap -> {
+ NodeResources resources = new NodeResources(cap.vcpu(), cap.memoryGb(),
+ cap.diskGb(), cap.bandwidthGbps());
+ return IntStream.range(0, cap.count()).mapToObj(i -> resources);
+ })
+ .sorted(NodeResourceComparator.memoryDiskCpuOrder().reversed())
+ .collect(Collectors.toList());
}
/** Verify DNS configuration of given nodes */
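Illustrative sketch (not part of the patch): provisionUntilNoDeficit() above repeatedly tries to virtually allocate the configured preprovision clusters, provisions hosts covering the first deficit it finds, and retries, giving up after one provisioning round per cluster. A standalone version of that fixed-point loop with cluster capacities reduced to integer node counts; class and method names are made up.

import java.util.List;
import java.util.OptionalInt;

class ProvisionLoop {

    /** Returns how many extra hosts had to be provisioned to cover all clusters. */
    static int provisionUntilNoDeficit(List<Integer> wantedPerCluster, int freeNodes) {
        int maxProvisions = wantedPerCluster.size();
        int provisioned = 0;
        for (int numProvisions = 0; ; ++numProvisions) {
            OptionalInt deficit = firstDeficit(wantedPerCluster, freeNodes + provisioned);
            if (deficit.isEmpty()) return provisioned;
            if (numProvisions >= maxProvisions)
                throw new IllegalStateException("Still a deficit after " + numProvisions + " provisionings: aborting");
            provisioned += deficit.getAsInt(); // provision hosts covering the deficit, then retry
        }
    }

    /** Simulates allocation: clusters are filled in order, returning the first unsatisfied remainder. */
    private static OptionalInt firstDeficit(List<Integer> wantedPerCluster, int available) {
        for (int wanted : wantedPerCluster) {
            if (available < wanted) return OptionalInt.of(wanted - available);
            available -= wanted;
        }
        return OptionalInt.empty();
    }

    public static void main(String[] args) {
        System.out.println(provisionUntilNoDeficit(List.of(3, 2), 4)); // 1 extra host is provisioned
    }
}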
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
index ee55e22e89c..62d042d5e8b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
@@ -177,7 +177,7 @@ class MaintenanceDeployment implements Closeable {
if ( deployment.activate().isEmpty()) return false;
log.info(agent + " redeployed " + application + " to " +
- ( verifyTarget ? this : "move " + (node.hostname() + " from " + fromHost)));
+ ( verifyTarget ? this : "move " + (node + " from " + fromHost.hostname())));
return true;
}
finally {
@@ -225,7 +225,7 @@ class MaintenanceDeployment implements Closeable {
@Override
public String toString() {
return "move " +
- ( isEmpty() ? "none" : (node.hostname() + " from " + fromHost + " to " + toHost));
+ ( isEmpty() ? "none" : (node + " from " + fromHost.hostname() + " to " + toHost.hostname()));
}
public static Move empty() { return new Move(null, null, null); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index d2dcaaeae5b..4a5c28fe0c8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.collections.Pair;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ClusterSpec;
@@ -26,10 +27,12 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Duration;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
+import java.util.Set;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
@@ -41,6 +44,7 @@ import static com.yahoo.config.provision.NodeResources.DiskSpeed.any;
*/
public class MetricsReporter extends NodeRepositoryMaintainer {
+ private final Set<Pair<Metric.Context, String>> nonZeroMetrics = new HashSet<>();
private final Metric metric;
private final Orchestrator orchestrator;
private final ServiceMonitor serviceMonitor;
@@ -261,28 +265,39 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
.forEach((lockPath, lockMetrics) -> {
Metric.Context context = getContext(Map.of("lockPath", lockPath));
- metric.set("lockAttempt.acquire", lockMetrics.getAndResetAcquireCount(), context);
- metric.set("lockAttempt.acquireFailed", lockMetrics.getAndResetAcquireFailedCount(), context);
- metric.set("lockAttempt.acquireTimedOut", lockMetrics.getAndResetAcquireTimedOutCount(), context);
- metric.set("lockAttempt.locked", lockMetrics.getAndResetAcquireSucceededCount(), context);
- metric.set("lockAttempt.release", lockMetrics.getAndResetReleaseCount(), context);
- metric.set("lockAttempt.releaseFailed", lockMetrics.getAndResetReleaseFailedCount(), context);
- metric.set("lockAttempt.reentry", lockMetrics.getAndResetReentryCount(), context);
- metric.set("lockAttempt.deadlock", lockMetrics.getAndResetDeadlockCount(), context);
- metric.set("lockAttempt.nakedRelease", lockMetrics.getAndResetNakedReleaseCount(), context);
- metric.set("lockAttempt.acquireWithoutRelease", lockMetrics.getAndResetAcquireWithoutReleaseCount(), context);
- metric.set("lockAttempt.foreignRelease", lockMetrics.getAndResetForeignReleaseCount(), context);
-
- setLockLatencyMetrics("acquire", lockMetrics.getAndResetAcquireLatencyMetrics(), context);
- setLockLatencyMetrics("locked", lockMetrics.getAndResetLockedLatencyMetrics(), context);
+ LatencyMetrics acquireLatencyMetrics = lockMetrics.getAndResetAcquireLatencyMetrics();
+ setNonZero("lockAttempt.acquireMaxActiveLatency", acquireLatencyMetrics.maxActiveLatencySeconds(), context);
+ setNonZero("lockAttempt.acquireHz", acquireLatencyMetrics.startHz(), context);
+ setNonZero("lockAttempt.acquireLoad", acquireLatencyMetrics.load(), context);
+
+ LatencyMetrics lockedLatencyMetrics = lockMetrics.getAndResetLockedLatencyMetrics();
+ setNonZero("lockAttempt.lockedLatency", lockedLatencyMetrics.maxLatencySeconds(), context);
+ setNonZero("lockAttempt.lockedLoad", lockedLatencyMetrics.load(), context);
+
+ setNonZero("lockAttempt.acquireTimedOut", lockMetrics.getAndResetAcquireTimedOutCount(), context);
+ setNonZero("lockAttempt.deadlock", lockMetrics.getAndResetDeadlockCount(), context);
+
+ // bucket for various rare errors - to reduce #metrics
+ setNonZero("lockAttempt.errors",
+ lockMetrics.getAndResetAcquireFailedCount() +
+ lockMetrics.getAndResetReleaseFailedCount() +
+ lockMetrics.getAndResetNakedReleaseCount() +
+ lockMetrics.getAndResetAcquireWithoutReleaseCount() +
+ lockMetrics.getAndResetForeignReleaseCount(),
+ context);
});
}
- private void setLockLatencyMetrics(String name, LatencyMetrics latencyMetrics, Metric.Context context) {
- metric.set("lockAttempt." + name + "Latency", latencyMetrics.latencySeconds(), context);
- metric.set("lockAttempt." + name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds(), context);
- metric.set("lockAttempt." + name + "Hz", latencyMetrics.startHz(), context);
- metric.set("lockAttempt." + name + "Load", latencyMetrics.load(), context);
+ private void setNonZero(String key, Number value, Metric.Context context) {
+ var metricKey = new Pair<>(context, key);
+ if (Double.compare(value.doubleValue(), 0.0) != 0) {
+ metric.set(key, value, context);
+ nonZeroMetrics.add(metricKey);
+ } else if (nonZeroMetrics.remove(metricKey)) {
+ // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying
+ // a non-zero 'last' from earlier periods.
+ metric.set(key, value, context);
+ }
}
private void updateDockerMetrics(NodeList nodes) {
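Illustrative sketch (not part of the patch): setNonZero() above reports a metric only while it is non-zero, plus exactly one trailing zero after it drops back, so the receiver neither keeps a stale last value nor is flooded with zeros. A standalone version with the Metric API replaced by a Map; the class name is made up.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

class NonZeroMetrics {

    private final Set<String> nonZero = new HashSet<>();
    private final Map<String, Double> emitted = new HashMap<>();

    void setNonZero(String key, double value) {
        if (Double.compare(value, 0.0) != 0) {
            emitted.put(key, value);
            nonZero.add(key);
        } else if (nonZero.remove(key)) {
            emitted.put(key, value); // reset once so the last reported value is not carried forward
        }
    }

    public static void main(String[] args) {
        NonZeroMetrics metrics = new NonZeroMetrics();
        metrics.setNonZero("lockAttempt.deadlock", 0.0); // suppressed
        metrics.setNonZero("lockAttempt.deadlock", 2.0); // emitted
        metrics.setNonZero("lockAttempt.deadlock", 0.0); // emitted once to clear
        metrics.setNonZero("lockAttempt.deadlock", 0.0); // suppressed again
        System.out.println(metrics.emitted); // {lockAttempt.deadlock=0.0}
    }
}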
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 6d571fada9e..2999655e5fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -153,7 +153,9 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Map<Node, String> nodesByFailureReason = new HashMap<>();
for (Node node : activeNodes) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
- nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+ // Allow a grace period after node re-activation
+ if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd))
+ nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
}
else if (hostSuspended(node, activeNodes)) {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
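Illustrative sketch (not part of the patch): the NodeFailer change above stops failing a long-down node if it has been (re)activated after the grace boundary, giving it time to come back up before it can be failed again. A standalone version of that check over plain Instants; class and parameter names are made up.

import java.time.Instant;
import java.util.Optional;

class DownNodeCheck {

    /** A node is a failure candidate only if it went down before the grace boundary and has not been activated after it. */
    static boolean shouldFail(Optional<Instant> downSince, Optional<Instant> activatedAt, Instant graceTimeEnd) {
        boolean downTooLong = downSince.map(at -> at.isBefore(graceTimeEnd)).orElse(false);
        boolean recentlyActivated = activatedAt.map(at -> at.isAfter(graceTimeEnd)).orElse(false);
        return downTooLong && !recentlyActivated;
    }

    public static void main(String[] args) {
        Instant graceEnd = Instant.parse("2021-01-01T12:00:00Z");
        System.out.println(shouldFail(Optional.of(Instant.parse("2021-01-01T10:00:00Z")),
                                      Optional.of(Instant.parse("2021-01-01T13:00:00Z")),
                                      graceEnd)); // false: recently re-activated
    }
}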
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java
index ff7bc9393bd..017e1264f1c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java
@@ -1,23 +1,18 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
-import com.yahoo.collections.Pair;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.NodeRepository;
-import com.yahoo.vespa.hosted.provision.autoscale.MetricSnapshot;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsFetcher;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
-import java.util.Collection;
import java.util.logging.Level;
-import java.util.stream.Collectors;
/**
- * Maintainer which keeps the node metric db up to date by periodically fetching metrics from all
- * active nodes.
+ * Maintainer which keeps the node metric db up to date by periodically fetching metrics from all active nodes.
*
* @author bratseth
*/
@@ -43,12 +38,12 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer {
int warnings = 0;
for (ApplicationId application : activeNodesByApplication().keySet()) {
try {
- metricsDb.add(filter(metricsFetcher.fetchMetrics(application)));
+ metricsDb.add(metricsFetcher.fetchMetrics(application));
}
catch (Exception e) {
// TODO: Don't warn if this only happens occasionally
if (warnings++ < maxWarningsPerInvocation)
- log.log(Level.WARNING, "Could not update metrics for " + application + ": " + Exceptions.toMessageString(e));
+ log.log(Level.WARNING, "Could not update metrics for " + application + ": " + Exceptions.toMessageString(e), e);
}
}
metricsDb.gc();
@@ -59,11 +54,4 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer {
return warnings == 0;
}
- /** Filter out uninformative snapshots before storing */
- private Collection<Pair<String, MetricSnapshot>> filter(Collection<Pair<String, MetricSnapshot>> snapshots) {
- return snapshots.stream()
- .filter(snapshot -> snapshot.getSecond().inService())
- .collect(Collectors.toList());
- }
-
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
index 214872348a3..96d10415e63 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java
@@ -3,8 +3,8 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.flags.FlagSource;
-import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.IntFlag;
+import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.History;
@@ -32,7 +32,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer {
NodeRebooter(NodeRepository nodeRepository, FlagSource flagSource, Metric metric) {
super(nodeRepository, Duration.ofMinutes(25), metric);
- this.rebootIntervalInDays = Flags.REBOOT_INTERVAL_IN_DAYS.bindTo(flagSource);
+ this.rebootIntervalInDays = PermanentFlags.REBOOT_INTERVAL_IN_DAYS.bindTo(flagSource);
this.random = new Random(nodeRepository.clock().millis()); // seed with clock for test determinism
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 9f2c6f05b00..ac33cc2441c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.google.inject.Inject;
import com.yahoo.component.AbstractComponent;
+import com.yahoo.concurrent.maintenance.Maintainer;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.HostLivenessTracker;
@@ -18,7 +19,8 @@ import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Duration;
-import java.util.Optional;
+import java.util.List;
+import java.util.concurrent.CopyOnWriteArrayList;
/**
* A component which sets up all the node repo maintenance jobs.
@@ -27,28 +29,7 @@ import java.util.Optional;
*/
public class NodeRepositoryMaintenance extends AbstractComponent {
- private final NodeFailer nodeFailer;
- private final NodeHealthTracker nodeHealthTracker;
- private final PeriodicApplicationMaintainer periodicApplicationMaintainer;
- private final OperatorChangeApplicationMaintainer operatorChangeApplicationMaintainer;
- private final ReservationExpirer reservationExpirer;
- private final InactiveExpirer inactiveExpirer;
- private final RetiredExpirer retiredExpirer;
- private final FailedExpirer failedExpirer;
- private final DirtyExpirer dirtyExpirer;
- private final ProvisionedExpirer provisionedExpirer;
- private final NodeRebooter nodeRebooter;
- private final MetricsReporter metricsReporter;
- private final InfrastructureProvisioner infrastructureProvisioner;
- private final Optional<LoadBalancerExpirer> loadBalancerExpirer;
- private final Optional<DynamicProvisioningMaintainer> dynamicProvisioningMaintainer;
- private final SpareCapacityMaintainer spareCapacityMaintainer;
- private final OsUpgradeActivator osUpgradeActivator;
- private final Rebalancer rebalancer;
- private final NodeMetricsDbMaintainer nodeMetricsDbMaintainer;
- private final AutoscalingMaintainer autoscalingMaintainer;
- private final ScalingSuggestionsMaintainer scalingSuggestionsMaintainer;
- private final SwitchRebalancer switchRebalancer;
+ private final List<Maintainer> maintainers = new CopyOnWriteArrayList<>();
@SuppressWarnings("unused")
@Inject
@@ -59,60 +40,45 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
MetricsFetcher metricsFetcher, MetricsDb metricsDb) {
DefaultTimes defaults = new DefaultTimes(zone, deployer);
- nodeFailer = new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, orchestrator, defaults.throttlePolicy, metric);
- nodeHealthTracker = new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric);
- periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, metric, nodeRepository,
- defaults.redeployMaintainerInterval, defaults.periodicRedeployInterval, flagSource);
- operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.operatorChangeRedeployInterval);
- reservationExpirer = new ReservationExpirer(nodeRepository, defaults.reservationExpiry, metric);
- retiredExpirer = new RetiredExpirer(nodeRepository, orchestrator, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry);
- inactiveExpirer = new InactiveExpirer(nodeRepository, defaults.inactiveExpiry, metric);
- failedExpirer = new FailedExpirer(nodeRepository, zone, defaults.failedExpirerInterval, metric);
- dirtyExpirer = new DirtyExpirer(nodeRepository, defaults.dirtyExpiry, metric);
- provisionedExpirer = new ProvisionedExpirer(nodeRepository, defaults.provisionedExpiry, metric);
- nodeRebooter = new NodeRebooter(nodeRepository, flagSource, metric);
- metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval);
- infrastructureProvisioner = new InfrastructureProvisioner(nodeRepository, infraDeployer, defaults.infrastructureProvisionInterval, metric);
- loadBalancerExpirer = provisionServiceProvider.getLoadBalancerService(nodeRepository).map(lbService ->
- new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService, metric));
- dynamicProvisioningMaintainer = provisionServiceProvider.getHostProvisioner().map(hostProvisioner ->
- new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric));
- spareCapacityMaintainer = new SpareCapacityMaintainer(deployer, nodeRepository, metric, defaults.spareCapacityMaintenanceInterval);
- osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval, metric);
- rebalancer = new Rebalancer(deployer, nodeRepository, metric, defaults.rebalancerInterval);
- nodeMetricsDbMaintainer = new NodeMetricsDbMaintainer(nodeRepository, metricsFetcher, metricsDb, defaults.nodeMetricsCollectionInterval, metric);
- autoscalingMaintainer = new AutoscalingMaintainer(nodeRepository, metricsDb, deployer, metric, defaults.autoscalingInterval);
- scalingSuggestionsMaintainer = new ScalingSuggestionsMaintainer(nodeRepository, metricsDb, defaults.scalingSuggestionsInterval, metric);
- switchRebalancer = new SwitchRebalancer(nodeRepository, defaults.switchRebalancerInterval, metric, deployer);
-
+ PeriodicApplicationMaintainer periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, metric, nodeRepository, defaults.redeployMaintainerInterval,
+ defaults.periodicRedeployInterval, flagSource);
+ InfrastructureProvisioner infrastructureProvisioner = new InfrastructureProvisioner(nodeRepository, infraDeployer, defaults.infrastructureProvisionInterval, metric);
+ maintainers.add(periodicApplicationMaintainer);
+ maintainers.add(infrastructureProvisioner);
+
+ maintainers.add(new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, orchestrator, defaults.throttlePolicy, metric));
+ maintainers.add(new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric));
+ maintainers.add(new OperatorChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.operatorChangeRedeployInterval));
+ maintainers.add(new ReservationExpirer(nodeRepository, defaults.reservationExpiry, metric));
+ maintainers.add(new RetiredExpirer(nodeRepository, orchestrator, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry));
+ maintainers.add(new InactiveExpirer(nodeRepository, defaults.inactiveExpiry, metric));
+ maintainers.add(new FailedExpirer(nodeRepository, zone, defaults.failedExpirerInterval, metric));
+ maintainers.add(new DirtyExpirer(nodeRepository, defaults.dirtyExpiry, metric));
+ maintainers.add(new ProvisionedExpirer(nodeRepository, defaults.provisionedExpiry, metric));
+ maintainers.add(new NodeRebooter(nodeRepository, flagSource, metric));
+ maintainers.add(new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval));
+ maintainers.add(new SpareCapacityMaintainer(deployer, nodeRepository, metric, defaults.spareCapacityMaintenanceInterval));
+ maintainers.add(new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval, metric));
+ maintainers.add(new Rebalancer(deployer, nodeRepository, metric, defaults.rebalancerInterval));
+ maintainers.add(new NodeMetricsDbMaintainer(nodeRepository, metricsFetcher, metricsDb, defaults.nodeMetricsCollectionInterval, metric));
+ maintainers.add(new AutoscalingMaintainer(nodeRepository, metricsDb, deployer, metric, defaults.autoscalingInterval));
+ maintainers.add(new ScalingSuggestionsMaintainer(nodeRepository, metricsDb, defaults.scalingSuggestionsInterval, metric));
+ maintainers.add(new SwitchRebalancer(nodeRepository, defaults.switchRebalancerInterval, metric, deployer));
+
+ provisionServiceProvider.getLoadBalancerService(nodeRepository)
+ .map(lbService -> new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService, metric))
+ .ifPresent(maintainers::add);
+ provisionServiceProvider.getHostProvisioner()
+ .map(hostProvisioner -> new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric))
+ .ifPresent(maintainers::add);
// The DuperModel is filled with infrastructure applications by the infrastructure provisioner, so explicitly run that now
infrastructureProvisioner.maintainButThrowOnException();
}
@Override
public void deconstruct() {
- nodeFailer.close();
- nodeHealthTracker.close();
- periodicApplicationMaintainer.close();
- operatorChangeApplicationMaintainer.close();
- reservationExpirer.close();
- inactiveExpirer.close();
- retiredExpirer.close();
- failedExpirer.close();
- dirtyExpirer.close();
- nodeRebooter.close();
- spareCapacityMaintainer.close();
- provisionedExpirer.close();
- metricsReporter.close();
- infrastructureProvisioner.close();
- loadBalancerExpirer.ifPresent(NodeRepositoryMaintainer::close);
- dynamicProvisioningMaintainer.ifPresent(NodeRepositoryMaintainer::close);
- osUpgradeActivator.close();
- rebalancer.close();
- nodeMetricsDbMaintainer.close();
- autoscalingMaintainer.close();
- scalingSuggestionsMaintainer.close();
- switchRebalancer.close();
+ maintainers.forEach(Maintainer::shutdown);
+ maintainers.forEach(Maintainer::awaitShutdown);
}
private static class DefaultTimes {
@@ -164,7 +130,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
operatorChangeRedeployInterval = Duration.ofMinutes(3);
// Vespa upgrade frequency is higher in CD so (de)activate OS upgrades more frequently as well
osUpgradeActivatorInterval = zone.system().isCd() ? Duration.ofSeconds(30) : Duration.ofMinutes(5);
- periodicRedeployInterval = Duration.ofMinutes(30);
+ periodicRedeployInterval = Duration.ofMinutes(60);
provisionedExpiry = Duration.ofHours(4);
rebalancerInterval = Duration.ofMinutes(120);
redeployMaintainerInterval = Duration.ofMinutes(1);
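Illustrative sketch (not part of the patch): keeping the maintainers in a single list lets deconstruct() above signal every maintainer before waiting on any of them, so shutdowns overlap instead of running one after another. A minimal version of that two-pass teardown; the Stoppable interface is hypothetical and stands in for Maintainer.

import java.util.List;

interface Stoppable {
    void shutdown();      // non-blocking: ask the component to stop
    void awaitShutdown(); // blocking: wait until it has stopped
}

class TwoPhaseShutdown {

    static void deconstruct(List<Stoppable> maintainers) {
        maintainers.forEach(Stoppable::shutdown);      // phase 1: everyone starts stopping
        maintainers.forEach(Stoppable::awaitShutdown); // phase 2: wait, with the waits overlapping
    }
}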
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
index 289d5a6742a..d20b06becaf 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
@@ -7,7 +7,7 @@ import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.flags.BooleanFlag;
import com.yahoo.vespa.flags.FetchVector;
import com.yahoo.vespa.flags.FlagSource;
-import com.yahoo.vespa.flags.Flags;
+import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
@@ -70,7 +70,7 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer {
}
private boolean shouldMaintain(ApplicationId id) {
- BooleanFlag skipMaintenanceDeployment = Flags.SKIP_MAINTENANCE_DEPLOYMENT.bindTo(flagSource)
+ BooleanFlag skipMaintenanceDeployment = PermanentFlags.SKIP_MAINTENANCE_DEPLOYMENT.bindTo(flagSource)
.with(FetchVector.Dimension.APPLICATION_ID, id.serializedForm());
return ! skipMaintenanceDeployment.value();
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
index 3546c8d8afb..7cb0270636f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
@@ -8,6 +8,7 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Applications;
@@ -51,7 +52,7 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer {
private int suggest(ApplicationId application, List<Node> applicationNodes) {
int successes = 0;
for (var cluster : nodesByCluster(applicationNodes).entrySet())
- successes += suggest(application, cluster.getKey(), cluster.getValue()) ? 1 : 0;
+ successes += suggest(application, cluster.getKey(), NodeList.copyOf(cluster.getValue())) ? 1 : 0;
return successes;
}
@@ -61,7 +62,7 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer {
private boolean suggest(ApplicationId applicationId,
ClusterSpec.Id clusterId,
- List<Node> clusterNodes) {
+ NodeList clusterNodes) {
Application application = applications().get(applicationId).orElse(new Application(applicationId));
Optional<Cluster> cluster = application.cluster(clusterId);
if (cluster.isEmpty()) return true;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
index b490cdf4c24..8c9e54a2ae4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java
@@ -33,14 +33,13 @@ public class SwitchRebalancer extends NodeMover<Move> {
@Override
protected boolean maintain() {
- if ( ! nodeRepository().isWorking()) return false;
+ if (!nodeRepository().isWorking()) return false;
+ if (!nodeRepository().zone().environment().isProduction()) return true;
+ NodeList allNodes = nodeRepository().list(); // Lockless as strong consistency is not needed
+ if (!zoneIsStable(allNodes)) return true;
- boolean success = true;
- // Using node list without holding lock as strong consistency is not needed here
- NodeList allNodes = nodeRepository().list();
- if (!zoneIsStable(allNodes)) return success;
findBestMove(allNodes).execute(false, Agent.SwitchRebalancer, deployer, metric, nodeRepository());
- return success;
+ return true;
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
index e92415d6538..3c2541bac27 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
@@ -39,6 +39,13 @@ public class History {
/** Returns this event if it is present in this history */
public Optional<Event> event(Event.Type type) { return Optional.ofNullable(events.get(type)); }
+ /** Returns true if a given event is registered in this history at the given time */
+ public boolean hasEventAt(Event.Type type, Instant time) {
+ return event(type)
+ .map(event -> event.at().equals(time))
+ .orElse(false);
+ }
+
/** Returns true if a given event is registered in this history after the given time */
public boolean hasEventAfter(Event.Type type, Instant time) {
return event(type)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
index 3979b898145..4b9b14656ca 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
@@ -52,6 +52,7 @@ public class ApplicationSerializer {
private static final String toKey = "to";
private static final String generationKey = "generation";
private static final String atKey = "at";
+ private static final String completionKey = "completion";
public static byte[] toJson(Application application) {
Slime slime = new Slime();
@@ -140,13 +141,19 @@ public class ApplicationSerializer {
toSlime(event.to(), object.setObject(toKey));
object.setLong(generationKey, event.generation());
object.setLong(atKey, event.at().toEpochMilli());
+ event.completion().ifPresent(completion -> object.setLong(completionKey, completion.toEpochMilli()));
}
private static ScalingEvent scalingEventFromSlime(Inspector inspector) {
return new ScalingEvent(clusterResourcesFromSlime(inspector.field(fromKey)),
clusterResourcesFromSlime(inspector.field(toKey)),
inspector.field(generationKey).asLong(),
- Instant.ofEpochMilli(inspector.field(atKey).asLong()));
+ Instant.ofEpochMilli(inspector.field(atKey).asLong()),
+ optionalInstant(inspector.field(completionKey)));
+ }
+
+ private static Optional<Instant> optionalInstant(Inspector inspector) {
+ return inspector.valid() ? Optional.of(Instant.ofEpochMilli(inspector.asLong())) : Optional.empty();
}
}
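Illustrative sketch (not part of the patch): the serializer change above writes the scaling event's completion time only when present and reads it back as an Optional by checking field validity, so events stored before this change still deserialize. A standalone version of that round trip with the Slime API replaced by a Map; the class name is made up.

import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

class OptionalCompletion {

    static void write(Map<String, Long> object, Optional<Instant> completion) {
        completion.ifPresent(at -> object.put("completion", at.toEpochMilli()));
    }

    static Optional<Instant> read(Map<String, Long> object) {
        return object.containsKey("completion")
               ? Optional.of(Instant.ofEpochMilli(object.get("completion")))
               : Optional.empty();
    }

    public static void main(String[] args) {
        Map<String, Long> object = new HashMap<>();
        write(object, Optional.of(Instant.ofEpochMilli(42)));
        System.out.println(read(object));          // Optional[1970-01-01T00:00:00.042Z]
        System.out.println(read(new HashMap<>())); // Optional.empty
    }
}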
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
index 42e26814d41..696853b2992 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
@@ -70,7 +70,7 @@ public class CuratorDatabaseClient {
private static final Path containerImagesPath = root.append("dockerImages");
private static final Path firmwareCheckPath = root.append("firmwareCheck");
- private static final Duration defaultLockTimeout = Duration.ofMinutes(2);
+ private static final Duration defaultLockTimeout = Duration.ofMinutes(6);
private final NodeSerializer nodeSerializer;
private final CuratorDatabase db;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/JobControlFlags.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/JobControlFlags.java
index 2a2a45186f9..79f4403e534 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/JobControlFlags.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/JobControlFlags.java
@@ -3,8 +3,8 @@ package com.yahoo.vespa.hosted.provision.persistence;
import com.yahoo.concurrent.maintenance.JobControlState;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.flags.FlagSource;
-import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.ListFlag;
+import com.yahoo.vespa.flags.PermanentFlags;
import java.util.Set;
@@ -20,7 +20,7 @@ public class JobControlFlags implements JobControlState {
public JobControlFlags(CuratorDatabaseClient curator, FlagSource flagSource) {
this.curator = curator;
- this.inactiveJobsFlag = Flags.INACTIVE_MAINTENANCE_JOBS.bindTo(flagSource);
+ this.inactiveJobsFlag = PermanentFlags.INACTIVE_MAINTENANCE_JOBS.bindTo(flagSource);
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
index 3cae4a5a5ea..2b03a5cae8c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
@@ -16,6 +16,7 @@ import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
@@ -60,6 +61,7 @@ class Activator {
* while holding the node repository lock on this application
*/
private void activateNodes(Collection<HostSpec> hosts, long generation, ApplicationTransaction transaction) {
+ Instant activationTime = nodeRepository.clock().instant(); // Use one timestamp for all activation changes
ApplicationId application = transaction.application();
Set<String> hostnames = hosts.stream().map(HostSpec::hostname).collect(Collectors.toSet());
NodeList allNodes = nodeRepository.list();
@@ -69,7 +71,7 @@ class Activator {
List<Node> reservedToActivate = updatePortsFrom(hosts, retainHostsInList(hostnames, reserved));
List<Node> oldActive = applicationNodes.state(Node.State.active).asList(); // All nodes active now
List<Node> continuedActive = retainHostsInList(hostnames, oldActive);
- List<Node> newActive = updateFrom(hosts, continuedActive); // All nodes that will be active when this is committed
+ List<Node> newActive = updateFrom(hosts, continuedActive, activationTime); // All nodes that will be active when this is committed
newActive.addAll(reservedToActivate);
if ( ! containsAll(hostnames, newActive))
throw new IllegalArgumentException("Activation of " + application + " failed. " +
@@ -84,16 +86,16 @@ class Activator {
List<Node> activeToRemove = removeHostsFromList(hostnames, oldActive);
activeToRemove = activeToRemove.stream().map(Node::unretire).collect(Collectors.toList()); // only active nodes can be retired. TODO: Move this line to deactivate
- nodeRepository.deactivate(activeToRemove, transaction);
+ nodeRepository.deactivate(activeToRemove, transaction); // TODO: Pass activation time in this call and next line
nodeRepository.activate(newActive, transaction.nested()); // activate also continued active to update node state
- rememberResourceChange(transaction, generation,
+ rememberResourceChange(transaction, generation, activationTime,
NodeList.copyOf(oldActive).not().retired(),
NodeList.copyOf(newActive).not().retired());
unreserveParentsOf(reservedToActivate);
}
- private void rememberResourceChange(ApplicationTransaction transaction, long generation,
+ private void rememberResourceChange(ApplicationTransaction transaction, long generation, Instant at,
NodeList oldNodes, NodeList newNodes) {
Optional<Application> application = nodeRepository.applications().get(transaction.application());
if (application.isEmpty()) return; // infrastructure app, hopefully :-|
@@ -102,15 +104,18 @@ class Activator {
.collect(Collectors.groupingBy(node -> node.allocation().get().membership().cluster().id()));
Application modified = application.get();
for (var clusterEntry : currentNodesByCluster.entrySet()) {
+ var cluster = modified.cluster(clusterEntry.getKey()).get();
var previousResources = oldNodes.cluster(clusterEntry.getKey()).toResources();
var currentResources = NodeList.copyOf(clusterEntry.getValue()).toResources();
- if ( ! previousResources.equals(currentResources)) {
- modified = modified.with(application.get().cluster(clusterEntry.getKey()).get()
- .with(new ScalingEvent(previousResources,
- currentResources,
- generation,
- nodeRepository.clock().instant())));
+ if ( ! previousResources.justNumbers().equals(currentResources.justNumbers())) {
+ cluster = cluster.with(ScalingEvent.create(previousResources, currentResources, generation, at));
}
+ if (cluster.targetResources().isPresent()
+ && cluster.targetResources().get().justNumbers().equals(currentResources.justNumbers())) {
+ cluster = cluster.withAutoscalingStatus("Cluster is ideally scaled within configured limits");
+ }
+ if (cluster != modified.cluster(clusterEntry.getKey()).get())
+ modified = modified.with(cluster);
}
if (modified != application.get())
@@ -202,11 +207,11 @@ class Activator {
}
/** Returns the input nodes with the changes resulting from applying the settings in hosts to the given list of nodes. */
- private List<Node> updateFrom(Collection<HostSpec> hosts, List<Node> nodes) {
+ private List<Node> updateFrom(Collection<HostSpec> hosts, List<Node> nodes, Instant at) {
List<Node> updated = new ArrayList<>();
for (Node node : nodes) {
HostSpec hostSpec = getHost(node.hostname(), hosts);
- node = hostSpec.membership().get().retired() ? node.retire(nodeRepository.clock().instant()) : node.unretire();
+ node = hostSpec.membership().get().retired() ? node.retire(at) : node.unretire();
if (! hostSpec.advertisedResources().equals(node.resources())) // A resized node
node = node.with(new Flavor(hostSpec.advertisedResources()));
Allocation allocation = node.allocation().get()
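The Activator hunk above threads a single activationTime through node retirement and ScalingEvent recording instead of reading the clock once per node. A rough sketch of that single-timestamp pattern, with Node and retire() as hypothetical stand-ins:

import java.time.Clock;
import java.time.Instant;
import java.util.List;

// Hypothetical stand-ins for Node and retire(); only the single-timestamp idea is taken from the patch.
final class ActivationTimeSketch {

    record Node(String hostname, Instant retiredAt) {
        Node retire(Instant at) { return new Node(hostname, at); }
    }

    // One clock reading is taken up front and reused, so every node retired in this
    // activation (and the scaling event recorded for it) carries the same timestamp.
    static List<Node> retireAll(List<Node> nodes, Clock clock) {
        Instant activationTime = clock.instant();
        return nodes.stream().map(node -> node.retire(activationTime)).toList();
    }

    public static void main(String[] args) {
        var retired = retireAll(List.of(new Node("host1", null), new Node("host2", null)), Clock.systemUTC());
        retired.forEach(node -> System.out.println(node.hostname() + " retired at " + node.retiredAt()));
    }
}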
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/FlavorConfigBuilder.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/FlavorConfigBuilder.java
index e04c1aa208d..54530297baa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/FlavorConfigBuilder.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/FlavorConfigBuilder.java
@@ -55,9 +55,9 @@ public class FlavorConfigBuilder {
else if (flavorName.equals("host2"))
flavorConfigBuilder.addFlavor(flavorName, 16, 24, 100, 1, Flavor.Type.BARE_METAL);
else if (flavorName.equals("host3"))
- flavorConfigBuilder.addFlavor(flavorName, 24, 64, 100, 1, Flavor.Type.BARE_METAL);
+ flavorConfigBuilder.addFlavor(flavorName, 24, 64, 100, 10, Flavor.Type.BARE_METAL);
else if (flavorName.equals("host4"))
- flavorConfigBuilder.addFlavor(flavorName, 48, 128, 1000, 1, Flavor.Type.BARE_METAL);
+ flavorConfigBuilder.addFlavor(flavorName, 48, 128, 1000, 10, Flavor.Type.BARE_METAL);
else if (flavorName.equals("devhost"))
flavorConfigBuilder.addFlavor(flavorName, 4., 80., 100, 10, Flavor.Type.BARE_METAL);
else
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index 6462fb6f19d..f02659aab5f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -122,7 +122,9 @@ public class GroupPreparer {
NodeAllocation allocation = new NodeAllocation(allNodes, application, cluster, requestedNodes,
highestIndex, nodeRepository);
NodePrioritizer prioritizer = new NodePrioritizer(
- allNodes, application, cluster, requestedNodes, wantedGroups, nodeRepository);
+ allNodes, application, cluster, requestedNodes, wantedGroups,
+ nodeRepository.zone().getCloud().dynamicProvisioning(), nodeRepository.nameResolver(),
+ nodeRepository.resourcesCalculator(), nodeRepository.spareCount());
allocation.offer(prioritizer.collect(surplusActiveNodes));
return allocation;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
index 206aa077027..0b5a04ca42c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
@@ -2,10 +2,12 @@
package com.yahoo.vespa.hosted.provision.provisioning;
import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ApplicationName;
import com.yahoo.config.provision.ApplicationTransaction;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.HostName;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.config.provision.TenantName;
import com.yahoo.config.provision.exception.LoadBalancerServiceException;
import com.yahoo.transaction.NestedTransaction;
import com.yahoo.vespa.flags.BooleanFlag;
@@ -82,7 +84,8 @@ public class LoadBalancerProvisioner {
try (var lock = db.lock(application)) {
ClusterSpec.Id clusterId = effectiveId(cluster);
List<Node> nodes = nodesOf(clusterId, application);
- provision(application, clusterId, nodes, false);
+ LoadBalancerId loadBalancerId = requireNonClashing(new LoadBalancerId(application, clusterId));
+ provision(loadBalancerId, nodes, false);
}
}
@@ -149,9 +152,30 @@ public class LoadBalancerProvisioner {
return canForwardTo;
}
+ /** Find all load balancer IDs owned by given tenant and application */
+ private List<LoadBalancerId> findLoadBalancers(TenantName tenant, ApplicationName application) {
+ return db.readLoadBalancerIds().stream()
+ .filter(id -> id.application().tenant().equals(tenant) &&
+ id.application().application().equals(application))
+ .collect(Collectors.toUnmodifiableList());
+ }
+
+ /** Require that load balancer IDs do not clash. This prevents name clashing when compacting endpoint DNS names */
+ private LoadBalancerId requireNonClashing(LoadBalancerId loadBalancerId) {
+ List<LoadBalancerId> loadBalancerIds = findLoadBalancers(loadBalancerId.application().tenant(),
+ loadBalancerId.application().application());
+ List<String> nonCompactableIds = withoutCompactableIds(loadBalancerId);
+ for (var id : loadBalancerIds) {
+ if (id.equals(loadBalancerId)) continue;
+ if (nonCompactableIds.equals(withoutCompactableIds(id))) {
+ throw new IllegalArgumentException(loadBalancerId + " clashes with " + id);
+ }
+ }
+ return loadBalancerId;
+ }
+
/** Idempotently provision a load balancer for given application and cluster */
- private void provision(ApplicationId application, ClusterSpec.Id clusterId, List<Node> nodes, boolean activate) {
- var id = new LoadBalancerId(application, clusterId);
+ private void provision(LoadBalancerId id, List<Node> nodes, boolean activate) {
var now = nodeRepository.clock().instant();
var loadBalancer = db.readLoadBalancer(id);
if (loadBalancer.isEmpty() && activate) return; // Nothing to activate as this load balancer was never prepared
@@ -171,6 +195,10 @@ public class LoadBalancerProvisioner {
db.writeLoadBalancer(newLoadBalancer);
}
+ private void provision(ApplicationId application, ClusterSpec.Id clusterId, List<Node> nodes, boolean activate) {
+ provision(new LoadBalancerId(application, clusterId), nodes, activate);
+ }
+
private LoadBalancerInstance provisionInstance(LoadBalancerId id, List<Node> nodes, boolean force) {
var reals = new LinkedHashSet<Real>();
for (var node : nodes) {
@@ -206,6 +234,18 @@ public class LoadBalancerProvisioner {
return nodes.stream().collect(Collectors.groupingBy(node -> effectiveId(node.allocation().get().membership().cluster())));
}
+ /** Returns a list of the non-compactable IDs of given load balancer */
+ private static List<String> withoutCompactableIds(LoadBalancerId id) {
+ List<String> ids = new ArrayList<>(2);
+ if (!"default".equals(id.cluster().value())) {
+ ids.add(id.cluster().value());
+ }
+ if (!id.application().instance().isDefault()) {
+ ids.add(id.application().instance().value());
+ }
+ return ids;
+ }
+
/** Find IP addresses reachable by the load balancer service */
private Set<String> reachableIpAddresses(Node node) {
Set<String> reachable = new LinkedHashSet<>(node.ipConfig().primary());
@@ -225,4 +265,5 @@ public class LoadBalancerProvisioner {
return cluster.combinedId().orElse(cluster.id());
}
+
}
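The new requireNonClashing/withoutCompactableIds logic above guards against two load balancers compacting to the same endpoint DNS name once "default" parts are dropped. A self-contained sketch of the same check, where Id is a simplified stand-in for LoadBalancerId:

import java.util.ArrayList;
import java.util.List;

// Illustrative stand-in for the clash check: two endpoints clash if, after dropping
// the compactable "default" parts, the remaining name components are identical.
final class EndpointClashSketch {

    record Id(String instance, String cluster) {}

    static List<String> withoutCompactableIds(Id id) {
        List<String> ids = new ArrayList<>(2);
        if (!"default".equals(id.cluster())) ids.add(id.cluster());
        if (!"default".equals(id.instance())) ids.add(id.instance());
        return ids;
    }

    static boolean clashes(Id a, Id b) {
        return !a.equals(b) && withoutCompactableIds(a).equals(withoutCompactableIds(b));
    }

    public static void main(String[] args) {
        // "default" instance with cluster "foo" compacts to the same name as instance "foo" with cluster "default"
        System.out.println(clashes(new Id("default", "foo"), new Id("foo", "default"))); // true
        System.out.println(clashes(new Id("default", "foo"), new Id("default", "bar"))); // false
    }
}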
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
index 68e11c4c995..bc164dc37e0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
@@ -203,7 +203,7 @@ class NodeAllocation {
* Such nodes will be marked retired during finalization of the list of accepted nodes.
* The conditions for this are:
*
- * This is a content or combined node. These must always be retired before being removed to allow the cluster to
+ * This is a stateful node. These must always be retired before being removed to allow the cluster to
* migrate away data.
*
* This is a container node and it is not desired due to having the wrong flavor. In this case this
@@ -218,7 +218,7 @@ class NodeAllocation {
if (candidate.allocation().get().membership().retired()) return true; // don't second-guess if already retired
if (! requestedNodes.considerRetiring()) return false;
- return cluster.type().isContent() ||
+ return cluster.isStateful() ||
(cluster.type() == ClusterSpec.Type.container && !hasCompatibleFlavor(candidate));
}
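The NodeAllocation change above generalizes the retirement rule from content clusters to any stateful cluster. A small illustrative sketch of that decision, with ClusterKind as a hypothetical stand-in for ClusterSpec:

// Illustrative: stateful nodes are retired (drained) before removal so data can migrate away,
// while stateless container nodes are only retired when their flavor no longer fits.
final class RetirementSketch {

    enum ClusterKind { CONTAINER, CONTENT, COMBINED }

    static boolean isStateful(ClusterKind kind) {
        return kind == ClusterKind.CONTENT || kind == ClusterKind.COMBINED;
    }

    static boolean retireBeforeRemoval(ClusterKind kind, boolean compatibleFlavor) {
        return isStateful(kind) || (kind == ClusterKind.CONTAINER && !compatibleFlavor);
    }

    public static void main(String[] args) {
        System.out.println(retireBeforeRemoval(ClusterKind.CONTENT, true));    // true: must migrate data first
        System.out.println(retireBeforeRemoval(ClusterKind.CONTAINER, true));  // false: safe to remove directly
    }
}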
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
index 14937e6afeb..460b7a821e6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
@@ -8,10 +8,10 @@ import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.LockedNodeList;
import com.yahoo.vespa.hosted.provision.Node;
-import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.Nodelike;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.IP;
+import com.yahoo.vespa.hosted.provision.persistence.NameResolver;
import com.yahoo.yolean.Exceptions;
import java.time.Instant;
@@ -25,7 +25,7 @@ import java.util.logging.Logger;
*
* @author smorgrav
*/
-abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidate> {
+public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidate> {
private static final Logger log = Logger.getLogger(NodeCandidate.class.getName());
@@ -224,8 +224,8 @@ abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidate> {
Node parent,
boolean violatesSpares,
LockedNodeList allNodes,
- NodeRepository nodeRepository) {
- return new VirtualNodeCandidate(resources, freeParentCapacity, parent, violatesSpares, true, allNodes, nodeRepository);
+ NameResolver nameResolver) {
+ return new VirtualNodeCandidate(resources, freeParentCapacity, parent, violatesSpares, true, allNodes, nameResolver);
}
public static NodeCandidate createNewExclusiveChild(Node node, Node parent) {
@@ -316,7 +316,7 @@ abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidate> {
/** Needed to construct the node */
private final LockedNodeList allNodes;
- private final NodeRepository nodeRepository;
+ private final NameResolver nameResolver;
private VirtualNodeCandidate(NodeResources resources,
NodeResources freeParentCapacity,
@@ -324,11 +324,11 @@ abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidate> {
boolean violatesSpares,
boolean exclusiveSwitch,
LockedNodeList allNodes,
- NodeRepository nodeRepository) {
+ NameResolver nameResolver) {
super(freeParentCapacity, Optional.of(parent), violatesSpares, exclusiveSwitch, false, true, false);
this.resources = resources;
this.allNodes = allNodes;
- this.nodeRepository = nodeRepository;
+ this.nameResolver = nameResolver;
}
@Override
@@ -361,7 +361,7 @@ abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidate> {
public NodeCandidate withNode() {
Optional<IP.Allocation> allocation;
try {
- allocation = parent.get().ipConfig().pool().findAllocation(allNodes, nodeRepository.nameResolver());
+ allocation = parent.get().ipConfig().pool().findAllocation(allNodes, nameResolver);
if (allocation.isEmpty()) return new InvalidNodeCandidate(resources, freeParentCapacity, parent.get(),
"No addresses available on parent host");
} catch (Exception e) {
@@ -382,7 +382,7 @@ abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidate> {
@Override
public NodeCandidate withExclusiveSwitch(boolean exclusiveSwitch) {
- return new VirtualNodeCandidate(resources, freeParentCapacity, parent.get(), violatesSpares, exclusiveSwitch, allNodes, nodeRepository);
+ return new VirtualNodeCandidate(resources, freeParentCapacity, parent.get(), violatesSpares, exclusiveSwitch, allNodes, nameResolver);
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index abfd5e021c4..b88556fbfec 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -8,6 +8,7 @@ import com.yahoo.vespa.hosted.provision.LockedNodeList;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.persistence.NameResolver;
import java.util.ArrayList;
import java.util.Collections;
@@ -34,7 +35,8 @@ public class NodePrioritizer {
private final NodeSpec requestedNodes;
private final ApplicationId application;
private final ClusterSpec clusterSpec;
- private final NodeRepository nodeRepository;
+ private final NameResolver nameResolver;
+ private final boolean dynamicProvisioning;
/** Whether node specification allows new nodes to be allocated. */
private final boolean canAllocateNew;
private final boolean canAllocateToSpareHosts;
@@ -42,19 +44,19 @@ public class NodePrioritizer {
private final int currentClusterSize;
private final Set<Node> spareHosts;
- NodePrioritizer(LockedNodeList allNodes, ApplicationId application, ClusterSpec clusterSpec, NodeSpec nodeSpec,
- int wantedGroups, NodeRepository nodeRepository) {
- boolean dynamicProvisioning = nodeRepository.zone().getCloud().dynamicProvisioning();
-
+ public NodePrioritizer(LockedNodeList allNodes, ApplicationId application, ClusterSpec clusterSpec, NodeSpec nodeSpec,
+ int wantedGroups, boolean dynamicProvisioning, NameResolver nameResolver,
+ HostResourcesCalculator hostResourcesCalculator, int spareCount) {
this.allNodes = allNodes;
- this.capacity = new HostCapacity(allNodes, nodeRepository.resourcesCalculator());
+ this.capacity = new HostCapacity(allNodes, hostResourcesCalculator);
this.requestedNodes = nodeSpec;
this.clusterSpec = clusterSpec;
this.application = application;
+ this.dynamicProvisioning = dynamicProvisioning;
this.spareHosts = dynamicProvisioning ?
capacity.findSpareHostsInDynamicallyProvisionedZones(allNodes.asList()) :
- capacity.findSpareHosts(allNodes.asList(), nodeRepository.spareCount());
- this.nodeRepository = nodeRepository;
+ capacity.findSpareHosts(allNodes.asList(), spareCount);
+ this.nameResolver = nameResolver;
NodeList nodesInCluster = allNodes.owner(application).type(clusterSpec.type()).cluster(clusterSpec.id());
NodeList nonRetiredNodesInCluster = nodesInCluster.not().retired();
@@ -81,7 +83,7 @@ public class NodePrioritizer {
}
/** Collects all node candidates for this application and returns them in the most-to-least preferred order */
- List<NodeCandidate> collect(List<Node> surplusActiveNodes) {
+ public List<NodeCandidate> collect(List<Node> surplusActiveNodes) {
addApplicationNodes();
addSurplusNodes(surplusActiveNodes);
addReadyNodes();
@@ -131,7 +133,7 @@ public class NodePrioritizer {
if ( !canAllocateNew) return;
for (Node host : allNodes) {
- if ( ! nodeRepository.canAllocateTenantNodeTo(host)) continue;
+ if ( ! NodeRepository.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue;
if (host.reservedTo().isPresent() && !host.reservedTo().get().equals(application.tenant())) continue;
if (host.reservedTo().isPresent() && application.instance().isTester()) continue;
if (host.exclusiveTo().isPresent()) continue; // Never allocate new nodes to exclusive hosts
@@ -143,7 +145,7 @@ public class NodePrioritizer {
host,
spareHosts.contains(host),
allNodes,
- nodeRepository));
+ nameResolver));
}
}
@@ -209,7 +211,7 @@ public class NodePrioritizer {
if (node.type() != NodeType.tenant || node.parentHostname().isEmpty()) return true;
Optional<Node> parent = allNodes.parentOf(node);
if (parent.isEmpty()) return false;
- return nodeRepository.canAllocateTenantNodeTo(parent.get());
+ return NodeRepository.canAllocateTenantNodeTo(parent.get(), dynamicProvisioning);
}
}
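The NodePrioritizer and NodeCandidate hunks above replace the NodeRepository dependency with only the collaborators actually used (a NameResolver, the resources calculator, a couple of plain values). A hedged sketch of that narrowing, with hypothetical names:

import java.util.Optional;

// Illustrative only: depending on a small interface and plain values instead of the whole
// repository makes the class trivial to construct with fakes in tests.
final class NarrowDependencySketch {

    interface NameResolver { Optional<String> resolve(String hostname); }

    static final class Prioritizer {
        private final NameResolver nameResolver;
        private final boolean dynamicProvisioning;
        private final int spareCount;

        Prioritizer(NameResolver nameResolver, boolean dynamicProvisioning, int spareCount) {
            this.nameResolver = nameResolver;
            this.dynamicProvisioning = dynamicProvisioning;
            this.spareCount = spareCount;
        }

        String describe(String hostname) {
            return hostname + " -> " + nameResolver.resolve(hostname).orElse("unresolved")
                   + " (dynamic=" + dynamicProvisioning + ", spares=" + spareCount + ")";
        }
    }

    public static void main(String[] args) {
        NameResolver fake = hostname -> Optional.of("10.0.0.1"); // trivial fake, no repository needed
        System.out.println(new Prioritizer(fake, true, 2).describe("host1"));
    }
}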
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
index ede6f4ef250..a6d68243160 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
@@ -29,7 +29,6 @@ import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources;
import com.yahoo.vespa.hosted.provision.autoscale.AllocationOptimizer;
import com.yahoo.vespa.hosted.provision.autoscale.Limits;
import com.yahoo.vespa.hosted.provision.autoscale.ResourceTarget;
-import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.filter.ApplicationFilter;
import com.yahoo.vespa.hosted.provision.node.filter.NodeHostFilter;
@@ -168,7 +167,7 @@ public class NodeRepositoryProvisioner implements Provisioner {
boolean firstDeployment = nodes.isEmpty();
AllocatableClusterResources currentResources =
firstDeployment // start at min, preserve current resources otherwise
- ? new AllocatableClusterResources(requested.minResources(), clusterSpec.type(), clusterSpec.isExclusive(), nodeRepository)
+ ? new AllocatableClusterResources(requested.minResources(), clusterSpec, nodeRepository)
: new AllocatableClusterResources(nodes, nodeRepository, clusterSpec.isExclusive());
return within(Limits.of(requested), clusterSpec.isExclusive(), currentResources, firstDeployment);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
index 6f1334421ef..0f9babb53aa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
@@ -51,26 +51,6 @@ public class LocksResponse extends HttpResponse {
root.setString("hostname", hostname);
root.setString("time", Instant.now().toString());
- Cursor lockPathsCursor = root.setArray("lock-paths");
- lockMetricsByPath.forEach((lockPath, lockMetrics) -> {
- Cursor lockPathCursor = lockPathsCursor.addObject();
- lockPathCursor.setString("path", lockPath);
- lockPathCursor.setLong("acquireCount", lockMetrics.getCumulativeAcquireCount());
- lockPathCursor.setLong("acquireFailedCount", lockMetrics.getCumulativeAcquireFailedCount());
- lockPathCursor.setLong("acquireTimedOutCount", lockMetrics.getCumulativeAcquireTimedOutCount());
- lockPathCursor.setLong("lockedCount", lockMetrics.getCumulativeAcquireSucceededCount());
- lockPathCursor.setLong("releaseCount", lockMetrics.getCumulativeReleaseCount());
- lockPathCursor.setLong("releaseFailedCount", lockMetrics.getCumulativeReleaseFailedCount());
- lockPathCursor.setLong("reentryCount", lockMetrics.getCumulativeReentryCount());
- lockPathCursor.setLong("deadlock", lockMetrics.getCumulativeDeadlockCount());
- lockPathCursor.setLong("nakedRelease", lockMetrics.getCumulativeNakedReleaseCount());
- lockPathCursor.setLong("acquireWithoutRelease", lockMetrics.getCumulativeAcquireWithoutReleaseCount());
- lockPathCursor.setLong("foreignRelease", lockMetrics.getCumulativeForeignReleaseCount());
-
- setLatency(lockPathCursor, "acquire", lockMetrics.getAcquireLatencyMetrics());
- setLatency(lockPathCursor, "locked", lockMetrics.getLockedLatencyMetrics());
- });
-
Cursor threadsCursor = root.setArray("threads");
for (var threadLockStats : threadLockStatsList) {
Optional<LockAttempt> ongoingLockAttempt = threadLockStats.getTopMostOngoingLockAttempt();
@@ -102,18 +82,45 @@ public class LocksResponse extends HttpResponse {
Cursor recordingsCursor = root.setArray("recordings");
historicRecordings.forEach(recording -> setRecording(recordingsCursor.addObject(), recording));
}
+
+ Cursor lockPathsCursor = root.setArray("lock-paths");
+ lockMetricsByPath.forEach((lockPath, lockMetrics) -> {
+ Cursor lockPathCursor = lockPathsCursor.addObject();
+ lockPathCursor.setString("path", lockPath);
+ setNonZeroLong(lockPathCursor, "acquireCount", lockMetrics.getCumulativeAcquireCount());
+ setNonZeroLong(lockPathCursor, "acquireFailedCount", lockMetrics.getCumulativeAcquireFailedCount());
+ setNonZeroLong(lockPathCursor, "acquireTimedOutCount", lockMetrics.getCumulativeAcquireTimedOutCount());
+ setNonZeroLong(lockPathCursor, "lockedCount", lockMetrics.getCumulativeAcquireSucceededCount());
+ setNonZeroLong(lockPathCursor, "releaseCount", lockMetrics.getCumulativeReleaseCount());
+ setNonZeroLong(lockPathCursor, "releaseFailedCount", lockMetrics.getCumulativeReleaseFailedCount());
+ setNonZeroLong(lockPathCursor, "reentryCount", lockMetrics.getCumulativeReentryCount());
+ setNonZeroLong(lockPathCursor, "deadlock", lockMetrics.getCumulativeDeadlockCount());
+ setNonZeroLong(lockPathCursor, "nakedRelease", lockMetrics.getCumulativeNakedReleaseCount());
+ setNonZeroLong(lockPathCursor, "acquireWithoutRelease", lockMetrics.getCumulativeAcquireWithoutReleaseCount());
+ setNonZeroLong(lockPathCursor, "foreignRelease", lockMetrics.getCumulativeForeignReleaseCount());
+
+ setLatency(lockPathCursor, "acquire", lockMetrics.getAcquireLatencyMetrics());
+ setLatency(lockPathCursor, "locked", lockMetrics.getLockedLatencyMetrics());
+ });
+ }
+
+ private static void setNonZeroLong(Cursor cursor, String fieldName, long value) {
+ if (value != 0) {
+ cursor.setLong(fieldName, value);
+ }
}
private static void setLatency(Cursor cursor, String name, LatencyMetrics latencyMetrics) {
- cursor.setDouble(name + "Latency", latencyMetrics.latencySeconds());
- cursor.setDouble(name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds());
- cursor.setDouble(name + "Hz", latencyMetrics.endHz());
- cursor.setDouble(name + "Load", latencyMetrics.load());
+ setNonZeroDouble(cursor, name + "Latency", latencyMetrics.latencySeconds());
+ setNonZeroDouble(cursor, name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds());
+ setNonZeroDouble(cursor, name + "Hz", latencyMetrics.endHz());
+ setNonZeroDouble(cursor, name + "Load", latencyMetrics.load());
}
- private static double roundDouble(double value, int decimalPlaces) {
- double factor = Math.pow(10, decimalPlaces);
- return Math.round(value * factor) / factor;
+ private static void setNonZeroDouble(Cursor cursor, String fieldName, double value) {
+ if (Double.compare(value, 0.0) != 0) {
+ cursor.setDouble(fieldName, value);
+ }
}
@Override
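The LocksResponse change above switches to emitting only non-zero counters and latencies, which keeps the lock-metrics response compact. A minimal sketch of that pattern, with a Map standing in for the Slime Cursor:

import java.util.LinkedHashMap;
import java.util.Map;

// Illustrative stand-in for the "only emit non-zero fields" pattern; a Map plays the role of the Cursor.
final class SparseResponseSketch {

    static void setNonZeroLong(Map<String, Object> out, String field, long value) {
        if (value != 0) out.put(field, value);
    }

    static void setNonZeroDouble(Map<String, Object> out, String field, double value) {
        if (Double.compare(value, 0.0) != 0) out.put(field, value);
    }

    public static void main(String[] args) {
        Map<String, Object> out = new LinkedHashMap<>();
        setNonZeroLong(out, "acquireCount", 42);
        setNonZeroLong(out, "deadlock", 0);           // omitted
        setNonZeroDouble(out, "acquireLatency", 0.0); // omitted
        System.out.println(out); // {acquireCount=42}
    }
}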
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java
index 9332eb79f20..0d423333ce1 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingIntegrationTest.java
@@ -59,7 +59,7 @@ public class AutoscalingIntegrationTest {
tester.nodeRepository().applications().put(application, lock);
}
var scaledResources = autoscaler.suggest(application.clusters().get(cluster1.id()),
- tester.nodeRepository().getNodes(application1));
+ tester.nodeRepository().list(application1));
assertTrue(scaledResources.isPresent());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index 5393aa7cfb8..a217d97ac27 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -52,7 +52,7 @@ public class AutoscalingTest {
assertTrue("Too few measurements -> No change", tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
tester.clock().advance(Duration.ofDays(1));
- tester.addCpuMeasurements(0.25f, 1f, 60, application1);
+ tester.addCpuMeasurements(0.25f, 1f, 120, application1);
ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high",
15, 1, 1.3, 28.6, 28.6,
tester.autoscale(application1, cluster1.id(), min, max).target());
@@ -62,7 +62,7 @@ public class AutoscalingTest {
tester.deactivateRetired(application1, cluster1, scaledResources);
- tester.clock().advance(Duration.ofDays(1));
+ tester.clock().advance(Duration.ofDays(2));
tester.addCpuMeasurements(0.8f, 1f, 3, application1);
assertTrue("Load change is large, but insufficient measurements for new config -> No change",
tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
@@ -74,6 +74,8 @@ public class AutoscalingTest {
tester.assertResources("Scaling down to minimum since usage has gone down significantly",
14, 1, 1.0, 30.8, 30.8,
tester.autoscale(application1, cluster1.id(), min, max).target());
+
+ var events = tester.nodeRepository().applications().get(application1).get().cluster(cluster1.id()).get().scalingEvents();
}
/** We prefer fewer nodes for container clusters as (we assume) they all use the same disk and memory */
@@ -117,7 +119,7 @@ public class AutoscalingTest {
tester.nodeRepository().getNodes(application1).stream()
.allMatch(n -> n.allocation().get().requestedResources().diskSpeed() == NodeResources.DiskSpeed.slow);
- tester.clock().advance(Duration.ofDays(1));
+ tester.clock().advance(Duration.ofDays(2));
tester.addCpuMeasurements(0.25f, 1f, 120, application1);
// Changing min and max from slow to any
ClusterResources min = new ClusterResources( 2, 1,
@@ -225,6 +227,40 @@ public class AutoscalingTest {
}
@Test
+ public void not_using_out_of_service_measurements() {
+ NodeResources resources = new NodeResources(3, 100, 100, 1);
+ ClusterResources min = new ClusterResources(2, 1, new NodeResources(1, 1, 1, 1));
+ ClusterResources max = new ClusterResources(5, 1, new NodeResources(100, 1000, 1000, 1));
+ AutoscalingTester tester = new AutoscalingTester(resources.withVcpu(resources.vcpu() * 2));
+
+ ApplicationId application1 = tester.applicationId("application1");
+ ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1");
+
+ // deploy
+ tester.deploy(application1, cluster1, 2, 1, resources);
+ tester.addMeasurements(0.5f, 0.6f, 0.7f, 1, false, true, 120, application1);
+ assertTrue("Not scaling up since nodes were measured while cluster was unstable",
+ tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
+ }
+
+ @Test
+ public void not_using_unstable_measurements() {
+ NodeResources resources = new NodeResources(3, 100, 100, 1);
+ ClusterResources min = new ClusterResources(2, 1, new NodeResources(1, 1, 1, 1));
+ ClusterResources max = new ClusterResources(5, 1, new NodeResources(100, 1000, 1000, 1));
+ AutoscalingTester tester = new AutoscalingTester(resources.withVcpu(resources.vcpu() * 2));
+
+ ApplicationId application1 = tester.applicationId("application1");
+ ClusterSpec cluster1 = tester.clusterSpec(ClusterSpec.Type.container, "cluster1");
+
+ // deploy
+ tester.deploy(application1, cluster1, 2, 1, resources);
+ tester.addMeasurements(0.5f, 0.6f, 0.7f, 1, true, false, 120, application1);
+ assertTrue("Not scaling up since nodes were measured while cluster was unstable",
+ tester.autoscale(application1, cluster1.id(), min, max).isEmpty());
+ }
+
+ @Test
public void test_autoscaling_group_size_1() {
NodeResources resources = new NodeResources(3, 100, 100, 1);
ClusterResources min = new ClusterResources( 2, 2, new NodeResources(1, 1, 1, 1));
@@ -272,6 +308,7 @@ public class AutoscalingTest {
// deploy
tester.deploy(application1, cluster1, 6, 2, new NodeResources(10, 100, 100, 1));
+ tester.clock().advance(Duration.ofDays(1));
tester.addMemMeasurements(1.0f, 1f, 1000, application1);
tester.assertResources("Increase group size to reduce memory load",
8, 2, 12.9, 89.3, 62.5,
@@ -290,7 +327,7 @@ public class AutoscalingTest {
// deploy
tester.deploy(application1, cluster1, 6, 1, hostResources.withVcpu(hostResources.vcpu() / 2));
- tester.clock().advance(Duration.ofDays(1));
+ tester.clock().advance(Duration.ofDays(2));
tester.addMemMeasurements(0.02f, 0.95f, 120, application1);
tester.assertResources("Scaling down",
6, 1, 2.8, 4.0, 95.0,
@@ -313,8 +350,8 @@ public class AutoscalingTest {
tester.addMemMeasurements(0.02f, 0.95f, 120, application1);
assertTrue(tester.autoscale(application1, cluster1.id(), min, max).target().isEmpty());
- // Trying the same a day later causes autoscaling
- tester.clock().advance(Duration.ofDays(1));
+ // Trying the same later causes autoscaling
+ tester.clock().advance(Duration.ofDays(2));
tester.addMemMeasurements(0.02f, 0.95f, 120, application1);
tester.assertResources("Scaling down",
6, 1, 2.8, 4.0, 95.0,
@@ -376,7 +413,7 @@ public class AutoscalingTest {
// deploy (Why 103 Gb memory? See AutoscalingTester.MockHostResourcesCalculator)
tester.deploy(application1, cluster1, 5, 1, new NodeResources(3, 103, 100, 1));
- tester.clock().advance(Duration.ofDays(1));
+ tester.clock().advance(Duration.ofDays(2));
tester.addMemMeasurements(0.9f, 0.6f, 120, application1);
ClusterResources scaledResources = tester.assertResources("Scaling up since resource usage is too high.",
8, 1, 3, 83, 34.3,
@@ -385,6 +422,7 @@ public class AutoscalingTest {
tester.deploy(application1, cluster1, scaledResources);
tester.deactivateRetired(application1, cluster1, scaledResources);
+ tester.clock().advance(Duration.ofDays(2));
tester.addMemMeasurements(0.3f, 0.6f, 1000, application1);
tester.assertResources("Scaling down since resource usage has gone down",
5, 1, 3, 83, 36,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index 3faa4c244ee..43302c4fe23 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -138,6 +138,7 @@ class AutoscalingTester {
memory,
disk,
0,
+ true,
true))));
}
}
@@ -168,12 +169,18 @@ class AutoscalingTester {
memory,
disk,
0,
+ true,
true))));
}
}
}
- public void addMeasurements(float cpu, float memory, float disk, int generation, int count, ApplicationId applicationId) {
+ public void addMeasurements(float cpu, float memory, float disk, int generation, int count, ApplicationId applicationId) {
+ addMeasurements(cpu, memory, disk, generation, true, true, count, applicationId);
+ }
+
+ public void addMeasurements(float cpu, float memory, float disk, int generation, boolean inService, boolean stable,
+ int count, ApplicationId applicationId) {
List<Node> nodes = nodeRepository().getNodes(applicationId, Node.State.active);
for (int i = 0; i < count; i++) {
clock().advance(Duration.ofMinutes(1));
@@ -183,7 +190,8 @@ class AutoscalingTester {
memory,
disk,
generation,
- true))));
+ inService,
+ stable))));
}
}
}
@@ -196,7 +204,7 @@ class AutoscalingTester {
nodeRepository().applications().put(application, lock);
}
return autoscaler.autoscale(application.clusters().get(clusterId),
- nodeRepository().getNodes(applicationId, Node.State.active));
+ nodeRepository().list(applicationId, Node.State.active));
}
public Autoscaler.Advice suggest(ApplicationId applicationId, ClusterSpec.Id clusterId,
@@ -207,7 +215,7 @@ class AutoscalingTester {
nodeRepository().applications().put(application, lock);
}
return autoscaler.suggest(application.clusters().get(clusterId),
- nodeRepository().getNodes(applicationId, Node.State.active));
+ nodeRepository().list(applicationId, Node.State.active));
}
public ClusterResources assertResources(String message,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
index dd991f15087..0a62e5b6a68 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
@@ -17,6 +17,7 @@ import java.util.ArrayList;
import java.util.List;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
public class MetricsV2MetricsFetcherTest {
@@ -70,14 +71,17 @@ public class MetricsV2MetricsFetcherTest {
assertEquals(0.15, values.get(0).getSecond().memory(), delta);
assertEquals(0.20, values.get(0).getSecond().disk(), delta);
assertEquals(3, values.get(0).getSecond().generation(), delta);
+ assertTrue(values.get(0).getSecond().stable());
}
{
+ httpClient.cannedResponse = cannedResponseForApplication2;
try (Mutex lock = tester.nodeRepository().lock(application1)) {
- tester.nodeRepository().write(tester.nodeRepository().getNodes(application1, Node.State.active)
+ tester.nodeRepository().write(tester.nodeRepository().getNodes(application2, Node.State.active)
.get(0).retire(tester.clock().instant()), lock);
}
- assertTrue("No metrics fetching while unstable", fetcher.fetchMetrics(application1).isEmpty());
+ List<Pair<String, MetricSnapshot>> values = new ArrayList<>(fetcher.fetchMetrics(application2));
+ assertFalse(values.get(0).getSecond().stable());
}
}
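The inService/stable flags added to MetricSnapshot let the fetcher keep returning metrics while the autoscaler ignores samples taken when a node is out of service or the cluster is unstable, as the test above exercises. A hedged sketch of such filtering, where Snapshot is a hypothetical stand-in for MetricSnapshot:

import java.util.List;

// Illustrative: only samples that are both in service and stable feed autoscaling decisions.
final class MeasurementFilterSketch {

    record Snapshot(double cpu, boolean inService, boolean stable) {}

    static List<Snapshot> usable(List<Snapshot> snapshots) {
        return snapshots.stream().filter(s -> s.inService() && s.stable()).toList();
    }

    public static void main(String[] args) {
        var snapshots = List.of(new Snapshot(0.9, true, true),
                                new Snapshot(0.9, false, true),   // out of service: dropped
                                new Snapshot(0.9, true, false));  // unstable: dropped
        System.out.println(usable(snapshots).size()); // 1
    }
}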
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java
index dba19d675dd..48c90125ac8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/NodeMetricsDbTest.java
@@ -40,7 +40,7 @@ public class NodeMetricsDbTest {
MetricsDb db = MetricsDb.createTestInstance(tester.nodeRepository());
Collection<Pair<String, MetricSnapshot>> values = new ArrayList<>();
for (int i = 0; i < 40; i++) {
- values.add(new Pair<>(node0, new MetricSnapshot(clock.instant(), 0.9f, 0.6f, 0.6f, 0, true)));
+ values.add(new Pair<>(node0, new MetricSnapshot(clock.instant(), 0.9f, 0.6f, 0.6f, 0, true, false)));
clock.advance(Duration.ofMinutes(120));
}
db.add(values);
@@ -50,7 +50,7 @@ public class NodeMetricsDbTest {
assertEquals(35, measurementCount(db.getNodeTimeseries(clock.instant().minus(Duration.ofHours(72)), Set.of(node0))));
db.gc();
- assertEquals( 5, measurementCount(db.getNodeTimeseries(clock.instant().minus(Duration.ofHours(72)), Set.of(node0))));
+ assertEquals(23, measurementCount(db.getNodeTimeseries(clock.instant().minus(Duration.ofHours(72)), Set.of(node0))));
}
private int measurementCount(List<NodeTimeseries> measurements) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java
index b97d5136485..1f61faed735 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDbTest.java
@@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.collections.Pair;
import com.yahoo.io.IOUtils;
import com.yahoo.test.ManualClock;
+import org.junit.Ignore;
import org.junit.Test;
import java.io.File;
@@ -16,6 +17,7 @@ import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
/**
* Tests the Quest metrics db.
@@ -34,6 +36,7 @@ public class QuestMetricsDbTest {
ManualClock clock = new ManualClock("2020-10-01T00:00:00");
QuestMetricsDb db = new QuestMetricsDb(dataDir, clock);
Instant startTime = clock.instant();
+
clock.advance(Duration.ofSeconds(1));
db.add(timeseries(1000, Duration.ofSeconds(1), clock, "host1", "host2", "host3"));
@@ -106,9 +109,56 @@ public class QuestMetricsDbTest {
assertEquals(24 * 10, db.getNodeTimeseries(startTime, Set.of("host1")).get(0).size());
db.gc();
- assertEquals(24 * 1 + dayOffset, db.getNodeTimeseries(startTime, Set.of("host1")).get(0).size());
+ assertEquals(48 * 1 + dayOffset, db.getNodeTimeseries(startTime, Set.of("host1")).get(0).size());
db.gc(); // no-op
- assertEquals(24 * 1 + dayOffset, db.getNodeTimeseries(startTime, Set.of("host1")).get(0).size());
+ assertEquals(48 * 1 + dayOffset, db.getNodeTimeseries(startTime, Set.of("host1")).get(0).size());
+ }
+
+ /** To manually test that we can read existing data */
+ @Ignore
+ @Test
+ public void testReadingAndAppendingToExistingData() {
+ String dataDir = "data/QuestMetricsDbExistingData";
+ if ( ! new File(dataDir).exists()) {
+ System.out.println("No existing data to check");
+ return;
+ }
+ IOUtils.createDirectory(dataDir + "/metrics");
+ ManualClock clock = new ManualClock("2020-10-01T00:00:00");
+ clock.advance(Duration.ofSeconds(9)); // Adjust to last data written
+ QuestMetricsDb db = new QuestMetricsDb(dataDir, clock);
+
+ List<NodeTimeseries> timeseries = db.getNodeTimeseries(clock.instant().minus(Duration.ofSeconds(9)), Set.of("host1"));
+ assertFalse("Could read existing data", timeseries.isEmpty());
+ assertEquals(10, timeseries.get(0).size());
+
+ System.out.println("Existing data read:");
+ for (var snapshot : timeseries.get(0).asList())
+ System.out.println(" " + snapshot);
+
+ clock.advance(Duration.ofSeconds(1));
+ db.add(timeseries(2, Duration.ofSeconds(1), clock, "host1"));
+ System.out.println("New data written and read:");
+ timeseries = db.getNodeTimeseries(clock.instant().minus(Duration.ofSeconds(2)), Set.of("host1"));
+ for (var snapshot : timeseries.get(0).asList())
+ System.out.println(" " + snapshot);
+ }
+
+ /** To update data for the manual test above */
+ @Ignore
+ @Test
+ public void updateExistingData() {
+ String dataDir = "data/QuestMetricsDbExistingData";
+ IOUtils.recursiveDeleteDir(new File(dataDir));
+ IOUtils.createDirectory(dataDir + "/metrics");
+ ManualClock clock = new ManualClock("2020-10-01T00:00:00");
+ QuestMetricsDb db = new QuestMetricsDb(dataDir, clock);
+ Instant startTime = clock.instant();
+ db.add(timeseries(10, Duration.ofSeconds(1), clock, "host1"));
+
+ int added = db.getNodeTimeseries(startTime, Set.of("host1")).get(0).asList().size();
+ System.out.println("Added " + added + " rows of data");
+ db.close();
}
private Collection<Pair<String, MetricSnapshot>> timeseries(int countPerHost, Duration sampleRate, ManualClock clock,
@@ -121,6 +171,7 @@ public class QuestMetricsDbTest {
i * 0.2,
i * 0.4,
i % 100,
+ true,
true)));
clock.advance(sampleRate);
}
@@ -136,8 +187,10 @@ public class QuestMetricsDbTest {
i * 0.2,
i * 0.4,
i % 100,
- true)));
+ true,
+ false)));
}
return timeseries;
}
+
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
index 4b14174488e..0c1a59c883d 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
@@ -6,6 +6,8 @@ import com.yahoo.config.provision.Capacity;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
+import com.yahoo.test.ManualClock;
+import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.testutils.MockDeployer;
import org.junit.Test;
@@ -13,9 +15,10 @@ import org.junit.Test;
import java.time.Duration;
import java.time.Instant;
import java.util.List;
-
+import java.util.Optional;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
@@ -27,7 +30,7 @@ import static org.junit.Assert.assertTrue;
public class AutoscalingMaintainerTest {
@Test
- public void testAutoscalingMaintainer() {
+ public void test_autoscaling_maintainer() {
ApplicationId app1 = AutoscalingMaintainerTester.makeApplicationId("app1");
ClusterSpec cluster1 = AutoscalingMaintainerTester.containerClusterSpec();
@@ -53,6 +56,7 @@ public class AutoscalingMaintainerTest {
new ClusterResources(10, 1, new NodeResources(6.5, 9, 20, 0.1)),
false, true));
+ tester.clock().advance(Duration.ofMinutes(10));
tester.maintainer().maintain(); // noop
assertTrue(tester.deployer().lastDeployTime(app1).isEmpty());
assertTrue(tester.deployer().lastDeployTime(app2).isEmpty());
@@ -60,6 +64,7 @@ public class AutoscalingMaintainerTest {
tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app1);
tester.addMeasurements(0.9f, 0.9f, 0.9f, 0, 500, app2);
+ tester.clock().advance(Duration.ofMinutes(10));
tester.maintainer().maintain();
assertTrue(tester.deployer().lastDeployTime(app1).isEmpty()); // since autoscaling is off
assertTrue(tester.deployer().lastDeployTime(app2).isPresent());
@@ -84,16 +89,18 @@ public class AutoscalingMaintainerTest {
// Causes autoscaling
tester.clock().advance(Duration.ofSeconds(1));
+ tester.clock().advance(Duration.ofMinutes(10));
Instant firstMaintenanceTime = tester.clock().instant();
tester.maintainer().maintain();
assertTrue(tester.deployer().lastDeployTime(app1).isPresent());
assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
List<ScalingEvent> events = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().scalingEvents();
- assertEquals(1, events.size());
- assertEquals(2, events.get(0).from().nodes());
- assertEquals(4, events.get(0).to().nodes());
- assertEquals(1, events.get(0).generation());
- assertEquals(firstMaintenanceTime.toEpochMilli(), events.get(0).at().toEpochMilli());
+ assertEquals(2, events.size());
+ assertEquals(Optional.of(firstMaintenanceTime), events.get(0).completion());
+ assertEquals(2, events.get(1).from().nodes());
+ assertEquals(4, events.get(1).to().nodes());
+ assertEquals(1, events.get(1).generation());
+ assertEquals(firstMaintenanceTime.toEpochMilli(), events.get(1).at().toEpochMilli());
// Measure overload still, since change is not applied, but metrics are discarded
tester.clock().advance(Duration.ofSeconds(1));
@@ -110,13 +117,18 @@ public class AutoscalingMaintainerTest {
assertEquals(firstMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
// Add measurement of the expected generation, leading to rescaling
- tester.clock().advance(Duration.ofHours(2));
+ // - record scaling completion
+ tester.clock().advance(Duration.ofMinutes(5));
+ tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 1, app1);
+ tester.maintainer().maintain();
+ // - measure underload
+ tester.clock().advance(Duration.ofHours(1));
tester.addMeasurements(0.1f, 0.1f, 0.1f, 1, 500, app1);
Instant lastMaintenanceTime = tester.clock().instant();
tester.maintainer().maintain();
assertEquals(lastMaintenanceTime.toEpochMilli(), tester.deployer().lastDeployTime(app1).get().toEpochMilli());
events = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().scalingEvents();
- assertEquals(2, events.get(0).generation());
+ assertEquals(2, events.get(2).generation());
}
@Test
@@ -128,4 +140,90 @@ public class AutoscalingMaintainerTest {
AutoscalingMaintainer.toString(new ClusterResources(4, 2, new NodeResources(1, 2, 4, 1))));
}
+ @Test
+ public void test_scaling_event_recording() {
+ ApplicationId app1 = AutoscalingMaintainerTester.makeApplicationId("app1");
+ ClusterSpec cluster1 = AutoscalingMaintainerTester.containerClusterSpec();
+ NodeResources lowResources = new NodeResources(4, 4, 10, 0.1);
+ NodeResources highResources = new NodeResources(8, 8, 20, 0.1);
+ Capacity app1Capacity = Capacity.from(new ClusterResources(2, 1, lowResources),
+ new ClusterResources(4, 2, highResources));
+ var tester = new AutoscalingMaintainerTester(new MockDeployer.ApplicationContext(app1, cluster1, app1Capacity));
+
+ // deploy
+ tester.deploy(app1, cluster1, app1Capacity);
+
+ for (int i = 0; i < 20; i++) {
+ // Record completion to keep scaling window at minimum
+ tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 1, app1);
+ tester.maintainer().maintain();
+
+ tester.clock().advance(Duration.ofDays(1));
+
+ if (i % 2 == 0) // high load
+ tester.addMeasurements(0.9f, 0.9f, 0.9f, i, 200, app1);
+ else // low load
+ tester.addMeasurements(0.1f, 0.1f, 0.1f, i, 200, app1);
+ tester.maintainer().maintain();
+ }
+
+ assertEquals(Cluster.maxScalingEvents, tester.cluster(app1, cluster1).scalingEvents().size());
+ }
+
+ @Test
+ public void test_autoscaling_window() {
+ ApplicationId app1 = AutoscalingMaintainerTester.makeApplicationId("app1");
+ ClusterSpec cluster1 = AutoscalingMaintainerTester.containerClusterSpec();
+ NodeResources lowResources = new NodeResources(4, 4, 10, 0.1);
+ NodeResources highResources = new NodeResources(8, 8, 20, 0.1);
+ Capacity app1Capacity = Capacity.from(new ClusterResources(2, 1, lowResources),
+ new ClusterResources(4, 2, highResources));
+ var tester = new AutoscalingMaintainerTester(new MockDeployer.ApplicationContext(app1, cluster1, app1Capacity));
+ ManualClock clock = tester.clock();
+
+ // deploy
+ tester.deploy(app1, cluster1, app1Capacity);
+
+ autoscale(false, Duration.ofMinutes( 1), Duration.ofMinutes( 5), clock, app1, cluster1, tester);
+ autoscale( true, Duration.ofMinutes(19), Duration.ofMinutes(10), clock, app1, cluster1, tester);
+ autoscale( true, Duration.ofMinutes(40), Duration.ofMinutes(20), clock, app1, cluster1, tester);
+ }
+
+ private void autoscale(boolean down, Duration completionTime, Duration expectedWindow,
+ ManualClock clock, ApplicationId application, ClusterSpec cluster,
+ AutoscalingMaintainerTester tester) {
+ long generation = tester.cluster(application, cluster).lastScalingEvent().get().generation();
+ tester.maintainer().maintain();
+ assertFalse("Not measured to be on the last generation yet",
+ tester.cluster(application, cluster).lastScalingEvent().get().completion().isPresent());
+
+ clock.advance(completionTime);
+ float load = down ? 0.1f : 1.0f;
+ tester.addMeasurements(load, load, load, generation, 200, application);
+ tester.maintainer().maintain();
+ assertEvent("Measured completion of the last scaling event, but no new autoscaling yet",
+ generation, Optional.of(clock.instant()),
+ tester.cluster(application, cluster).lastScalingEvent().get());
+ if (down)
+ clock.advance(expectedWindow.minus(completionTime).plus(expectedWindow.multipliedBy(2)));
+ else
+ clock.advance(expectedWindow.minus(completionTime));
+
+ tester.addMeasurements(load, load, load, generation, 200, application);
+ tester.maintainer().maintain();
+ assertEquals("We passed window duration so a new autoscaling is started: " +
+ tester.cluster(application, cluster).autoscalingStatus(),
+ generation + 1,
+ tester.cluster(application, cluster).lastScalingEvent().get().generation());
+ }
+
+ private void assertEvent(String explanation,
+ long expectedGeneration, Optional<Instant> expectedCompletion, ScalingEvent event) {
+ assertEquals(explanation + ". Generation", expectedGeneration, event.generation());
+ assertEquals(explanation + ". Generation " + expectedGeneration + ". Completion",
+ expectedCompletion, event.completion());
+ }
+
+
+
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java
index fadcd40ad0a..f53ae1baad2 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTester.java
@@ -14,6 +14,8 @@ import com.yahoo.config.provisioning.FlavorsConfig;
import com.yahoo.test.ManualClock;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.autoscale.MetricSnapshot;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
@@ -68,7 +70,7 @@ public class AutoscalingMaintainerTester {
return provisioningTester.deploy(application, cluster, capacity);
}
- public void addMeasurements(float cpu, float mem, float disk, int generation, int count, ApplicationId applicationId) {
+ public void addMeasurements(float cpu, float mem, float disk, long generation, int count, ApplicationId applicationId) {
List<Node> nodes = nodeRepository().getNodes(applicationId, Node.State.active);
for (int i = 0; i < count; i++) {
for (Node node : nodes)
@@ -77,10 +79,15 @@ public class AutoscalingMaintainerTester {
mem,
disk,
generation,
+ true,
true))));
}
}
+ public Cluster cluster(ApplicationId application, ClusterSpec cluster) {
+ return nodeRepository().applications().get(application).get().cluster(cluster.id()).get();
+ }
+
private FlavorsConfig flavorsConfig() {
FlavorConfigBuilder b = new FlavorConfigBuilder();
b.addFlavor("flt", 30, 30, 40, 3, Flavor.Type.BARE_METAL);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java
index 86507508e68..5dfd1193581 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java
@@ -226,7 +226,7 @@ public class CapacityCheckerTester {
return m;
}
public String toString() {
- return String.format("%s/%s/%s/%d%s", clustertype, clusterid, group, index, retired ? "/retired" : "");
+ return String.format("%s/%s/%s/%d%s%s", clustertype, clusterid, group, index, retired ? "/retired" : "", clustertype.isContent() ? "/stateful" : "");
}
}
static class OwnerModel {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
index 2833c4e11ba..c67f6657c7f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
@@ -15,9 +15,10 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.RegionName;
import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.Zone;
-import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.InMemoryFlagSource;
-import com.yahoo.vespa.flags.custom.HostCapacity;
+import com.yahoo.vespa.flags.PermanentFlags;
+import com.yahoo.vespa.flags.custom.ClusterCapacity;
+import com.yahoo.vespa.flags.custom.SharedHost;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Address;
@@ -31,7 +32,6 @@ import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver;
-import org.junit.Ignore;
import org.junit.Test;
import java.time.Duration;
@@ -43,6 +43,7 @@ import java.util.Optional;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
+import java.util.stream.IntStream;
import java.util.stream.Stream;
import static com.yahoo.vespa.hosted.provision.maintenance.DynamicProvisioningMaintainerTest.MockHostProvisioner.Behaviour;
@@ -104,7 +105,7 @@ public class DynamicProvisioningMaintainerTest {
@Test
public void does_not_deprovision_when_preprovisioning_enabled() {
var tester = new DynamicProvisioningTester().addInitialNodes();
- tester.flagSource.withListFlag(Flags.TARGET_CAPACITY.id(), List.of(new HostCapacity(1, 3, 2, 1)), HostCapacity.class);
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(), List.of(new ClusterCapacity(1, 1, 3, 2, 1.0)), ClusterCapacity.class);
Optional<Node> failedHost = tester.nodeRepository.getNode("host2");
assertTrue(failedHost.isPresent());
@@ -116,23 +117,162 @@ public class DynamicProvisioningMaintainerTest {
@Test
public void provision_deficit_and_deprovision_excess() {
var tester = new DynamicProvisioningTester().addInitialNodes();
- tester.flagSource.withListFlag(Flags.TARGET_CAPACITY.id(),
- List.of(new HostCapacity(24, 64, 100, 2),
- new HostCapacity(16, 24, 100, 1)),
- HostCapacity.class);
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(2, 48, 128, 1000, 10.0),
+ new ClusterCapacity(1, 16, 24, 100, 1.0)),
+ ClusterCapacity.class);
+
+ assertEquals(0, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(11, tester.nodeRepository.getNodes().size());
assertTrue(tester.nodeRepository.getNode("host2").isPresent());
- assertEquals(0 ,tester.hostProvisioner.provisionedHosts.size());
+ assertTrue(tester.nodeRepository.getNode("host2-1").isPresent());
+ assertTrue(tester.nodeRepository.getNode("host3").isPresent());
+ assertTrue(tester.nodeRepository.getNode("hostname100").isEmpty());
+ assertTrue(tester.nodeRepository.getNode("hostname101").isEmpty());
- // failed host2 is removed
- Optional<Node> failedHost = tester.nodeRepository.getNode("host2");
- assertTrue(failedHost.isPresent());
tester.maintainer.maintain();
- assertTrue("Failed host is deprovisioned", tester.nodeRepository.getNode(failedHost.get().hostname()).isEmpty());
- assertTrue("Host with matching resources is kept", tester.nodeRepository.getNode("host3").isPresent());
- // Two more hosts are provisioned with expected resources
- NodeResources resources = new NodeResources(24, 64, 100, 1);
- assertEquals(2, tester.provisionedHostsMatching(resources));
+ assertEquals(2, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(2, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10)));
+ List<Node> nodesAfter = tester.nodeRepository.getNodes();
+ assertEquals(11, nodesAfter.size()); // 2 removed, 2 added
+ assertTrue("Failed host 'host2' is deprovisioned", tester.nodeRepository.getNode("host2").isEmpty());
+ assertTrue("Node on deprovisioned host removed", tester.nodeRepository.getNode("host2-1").isEmpty());
+ assertTrue("Host satisfying 16-24-100-1 is kept", tester.nodeRepository.getNode("host3").isPresent());
+ assertTrue("New 48-128-1000-10 host added", tester.nodeRepository.getNode("hostname100").isPresent());
+ assertTrue("New 48-128-1000-10 host added", tester.nodeRepository.getNode("hostname101").isPresent());
+ }
+
+ @Test
+ public void preprovision_with_shared_host() {
+ var tester = new DynamicProvisioningTester().addInitialNodes();
+ // Makes provisioned hosts 48-128-1000-10
+ tester.hostProvisioner.provisionSharedHost("host4");
+
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(2, 1, 30, 20, 3.0)),
+ ClusterCapacity.class);
+
+ assertEquals(0, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(11, tester.nodeRepository.getNodes().size());
+ assertTrue(tester.nodeRepository.getNode("host2").isPresent());
+ assertTrue(tester.nodeRepository.getNode("host2-1").isPresent());
+ assertTrue(tester.nodeRepository.getNode("host3").isPresent());
+ assertTrue(tester.nodeRepository.getNode("hostname100").isEmpty());
+
+ // The first cluster will be allocated to host3 and a new host hostname100.
+ // hostname100 will be a large shared host specified above.
+ tester.maintainer.maintain();
+ verifyFirstMaintain(tester);
+
+ // The second maintain should be a no-op; otherwise the first maintain did something wrong.
+ tester.maintainer.maintain();
+ verifyFirstMaintain(tester);
+
+ // Add a second cluster equal to the first. It should fit on existing host3 and hostname100.
+
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(2, 1, 30, 20, 3.0),
+ new ClusterCapacity(2, 1, 30, 20, 3.0)),
+ ClusterCapacity.class);
+
+ tester.maintainer.maintain();
+ verifyFirstMaintain(tester);
+
+ // Change the second cluster so that it doesn't fit on host3, but does fit on hostname100;
+ // with a node count of 2 it should allocate a new shared host.
+ // The node allocation code prefers the shared hosts over host3 (at least in this test,
+ // due to skew), so host3 will be deprovisioned when hostname101 is provisioned.
+ // host3 is a 24-64-100-10 host, while hostname100 is 48-128-1000-10.
+
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(2, 1, 30, 20, 3.0),
+ new ClusterCapacity(2, 24, 64, 100, 1.0)),
+ ClusterCapacity.class);
+
+ tester.maintainer.maintain();
+
+ assertEquals(2, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(2, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10)));
+ assertEquals(10, tester.nodeRepository.getNodes().size()); // 3 removed, 2 added
+ assertTrue("preprovision capacity is prefered on shared hosts", tester.nodeRepository.getNode("host3").isEmpty());
+ assertTrue(tester.nodeRepository.getNode("hostname100").isPresent());
+ assertTrue(tester.nodeRepository.getNode("hostname101").isPresent());
+
+ // If the preprovision capacity is reduced, we should see shared hosts deprovisioned.
+
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(1, 1, 30, 20, 3.0)),
+ ClusterCapacity.class);
+
+ tester.maintainer.maintain();
+
+ assertEquals("one provisioned host has been deprovisioned, so there are 2 -> 1 provisioned hosts",
+ 1, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(1, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10)));
+ assertEquals(9, tester.nodeRepository.getNodes().size()); // 4 removed, 2 added
+ if (tester.nodeRepository.getNode("hostname100").isPresent()) {
+ assertTrue("hostname101 is superfluous and should have been deprovisioned",
+ tester.nodeRepository.getNode("hostname101").isEmpty());
+ } else {
+ assertTrue("hostname101 is required for preprovision capacity",
+ tester.nodeRepository.getNode("hostname101").isPresent());
+ }
+ }
+
+ private void verifyFirstMaintain(DynamicProvisioningTester tester) {
+ assertEquals(1, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(1, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10)));
+ assertEquals(10, tester.nodeRepository.getNodes().size()); // 2 removed, 1 added
+ assertTrue("Failed host 'host2' is deprovisioned", tester.nodeRepository.getNode("host2").isEmpty());
+ assertTrue("Node on deprovisioned host removed", tester.nodeRepository.getNode("host2-1").isEmpty());
+ assertTrue("One 1-30-20-3 node fits on host3", tester.nodeRepository.getNode("host3").isPresent());
+ assertTrue("New 48-128-1000-10 host added", tester.nodeRepository.getNode("hostname100").isPresent());
+ }
+
+ @Test
+ public void verify_min_count_of_shared_hosts() {
+ // This test verifies the effect of varying the minimum number of shared hosts
+ // (SharedHost.minCount()).
+ //
+ // addInitialNodes() adds 4 tenant hosts:
+ // host1 shared !removable # not removable because it has child nodes with an allocation
+ // host2 !shared removable # not counted as a shared host because it is failed
+ // host3 shared removable
+ // host4 shared !removable # not removable because it has child nodes with an allocation
+ //
+ // Hosts 1, 3, and 4 count as "shared hosts" with respect to the minCount lower bound.
+ // Hosts 2 and 3 are removable: host2 is always deprovisioned because it is failed, while
+ // host3 is deprovisioned as an excess host unless that is prevented by minCount.
+
+ // minCount=0: All (2) removable hosts are deprovisioned
+ assertWithMinCount(0, 0, 2);
+ // minCount=1 and minCount=2: the same happens, because 2 shared hosts (host1 and host4) remain
+ assertWithMinCount(1, 0, 2);
+ assertWithMinCount(2, 0, 2);
+ // minCount=3: since we require 3 shared hosts, host3 is not deprovisioned.
+ assertWithMinCount(3, 0, 1);
+ // minCount=4: one additional shared host must be provisioned
+ assertWithMinCount(4, 1, 1);
+ // minCount=5: two additional shared hosts must be provisioned
+ assertWithMinCount(5, 2, 1);
+ assertWithMinCount(6, 3, 1);
+ }
+
+ private void assertWithMinCount(int minCount, int provisionCount, int deprovisionCount) {
+ var tester = new DynamicProvisioningTester().addInitialNodes();
+ tester.hostProvisioner.provisionSharedHost("host4");
+
+ tester.flagSource.withJacksonFlag(PermanentFlags.SHARED_HOST.id(), new SharedHost(null, minCount), SharedHost.class);
+ tester.maintainer.maintain();
+ assertEquals(provisionCount, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(deprovisionCount, tester.hostProvisioner.deprovisionedHosts);
+
+ // Verify next maintain is a no-op
+ tester.maintainer.maintain();
+ assertEquals(provisionCount, tester.hostProvisioner.provisionedHosts.size());
+ assertEquals(deprovisionCount, tester.hostProvisioner.deprovisionedHosts);
}
@Test
@@ -145,58 +285,95 @@ public class DynamicProvisioningMaintainerTest {
assertTrue(tester.nodeRepository.getNode(host2.hostname()).isPresent());
}
- @Ignore // TODO (hakon): Enable as test of min-capacity specified in flag
@Test
- public void provision_exact_capacity() {
- var tester = new DynamicProvisioningTester(Cloud.builder().dynamicProvisioning(true).build());
- NodeResources resources1 = new NodeResources(24, 64, 100, 1);
- NodeResources resources2 = new NodeResources(16, 24, 100, 1);
- tester.flagSource.withListFlag(Flags.TARGET_CAPACITY.id(), List.of(new HostCapacity(resources1.vcpu(), resources1.memoryGb(), resources1.diskGb(), 1),
- new HostCapacity(resources2.vcpu(), resources2.memoryGb(), resources2.diskGb(), 2)),
- HostCapacity.class);
+ public void test_minimum_capacity() {
+ var tester = new DynamicProvisioningTester();
+ NodeResources resources1 = new NodeResources(24, 64, 100, 10);
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(2, resources1.vcpu(), resources1.memoryGb(), resources1.diskGb(), resources1.bandwidthGbps())),
+ ClusterCapacity.class);
tester.maintainer.maintain();
// Hosts are provisioned
- assertEquals(1, tester.provisionedHostsMatching(resources1));
- assertEquals(2, tester.provisionedHostsMatching(resources2));
+ assertEquals(2, tester.provisionedHostsMatching(resources1));
+ assertEquals(0, tester.hostProvisioner.deprovisionedHosts);
// Next maintenance run does nothing
tester.assertNodesUnchanged();
- // Target capacity is changed
- NodeResources resources3 = new NodeResources(48, 128, 1000, 1);
- tester.flagSource.withListFlag(Flags.TARGET_CAPACITY.id(), List.of(new HostCapacity(resources1.vcpu(), resources1.memoryGb(), resources1.diskGb(), 1),
- new HostCapacity(resources3.vcpu(), resources3.memoryGb(), resources3.diskGb(), 1)),
- HostCapacity.class);
+ // Pretend the shared-host flag has been set to host4's flavor
+ var sharedHostNodeResources = new NodeResources(48, 128, 1000, 10, NodeResources.DiskSpeed.fast, NodeResources.StorageType.remote);
+ tester.hostProvisioner.provisionSharedHost("host4");
- // Excess hosts are deprovisioned
- tester.maintainer.maintain();
- assertEquals(1, tester.provisionedHostsMatching(resources1));
- assertEquals(0, tester.provisionedHostsMatching(resources2));
- assertEquals(1, tester.provisionedHostsMatching(resources3));
- assertEquals(2, tester.nodeRepository.getNodes(Node.State.deprovisioned).size());
+ // Next maintenance run does nothing
+ tester.assertNodesUnchanged();
+
+ // Must be able to allocate 2 nodes with "no resource requirement"
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(2, 0, 0, 0, 0.0)),
+ ClusterCapacity.class);
+
+ // Next maintenance run does nothing
+ tester.assertNodesUnchanged();
// Activate hosts
- tester.maintainer.maintain(); // Resume provisioning of new hosts
List<Node> provisioned = tester.nodeRepository.list().state(Node.State.provisioned).asList();
tester.nodeRepository.setReady(provisioned, Agent.system, this.getClass().getSimpleName());
tester.provisioningTester.activateTenantHosts();
// Allocating nodes to a host does not result in provisioning of additional capacity
ApplicationId application = ProvisioningTester.applicationId();
+ NodeResources applicationNodeResources = new NodeResources(4, 8, 50, 0.1);
tester.provisioningTester.deploy(application,
- Capacity.from(new ClusterResources(2, 1, new NodeResources(4, 8, 50, 0.1))));
+ Capacity.from(new ClusterResources(2, 1, applicationNodeResources)));
assertEquals(2, tester.nodeRepository.list().owner(application).size());
tester.assertNodesUnchanged();
// Clearing flag does nothing
- tester.flagSource.withListFlag(Flags.TARGET_CAPACITY.id(), List.of(), HostCapacity.class);
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(), List.of(), ClusterCapacity.class);
+ tester.assertNodesUnchanged();
+
+ // Increasing the capacity provisions additional hosts
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(3, 0, 0, 0, 0.0)),
+ ClusterCapacity.class);
+ assertEquals(0, tester.provisionedHostsMatching(sharedHostNodeResources));
+ assertTrue(tester.nodeRepository.getNode("hostname102").isEmpty());
+ tester.maintainer.maintain();
+ assertEquals(1, tester.provisionedHostsMatching(sharedHostNodeResources));
+ assertTrue(tester.nodeRepository.getNode("hostname102").isPresent());
+
+ // Next maintenance run does nothing
tester.assertNodesUnchanged();
- // Capacity reduction does not remove host with children
- tester.flagSource.withListFlag(Flags.TARGET_CAPACITY.id(), List.of(new HostCapacity(resources1.vcpu(), resources1.memoryGb(), resources1.diskGb(), 1)),
- HostCapacity.class);
+ // Requiring >0 capacity does nothing as long as it fits on the 3 hosts
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(3,
+ resources1.vcpu() - applicationNodeResources.vcpu(),
+ resources1.memoryGb() - applicationNodeResources.memoryGb(),
+ resources1.diskGb() - applicationNodeResources.diskGb(),
+ resources1.bandwidthGbps() - applicationNodeResources.bandwidthGbps())),
+ ClusterCapacity.class);
tester.assertNodesUnchanged();
+
+ // But requiring a bit more in the cluster triggers provisioning of 2 more shared hosts.
+ tester.flagSource.withListFlag(PermanentFlags.PREPROVISION_CAPACITY.id(),
+ List.of(new ClusterCapacity(3,
+ resources1.vcpu() - applicationNodeResources.vcpu() + 1,
+ resources1.memoryGb() - applicationNodeResources.memoryGb() + 1,
+ resources1.diskGb() - applicationNodeResources.diskGb() + 1,
+ resources1.bandwidthGbps())),
+ ClusterCapacity.class);
+
+ assertEquals(1, tester.provisionedHostsMatching(sharedHostNodeResources));
+ assertTrue(tester.nodeRepository.getNode("hostname102").isPresent());
+ assertTrue(tester.nodeRepository.getNode("hostname103").isEmpty());
+ assertTrue(tester.nodeRepository.getNode("hostname104").isEmpty());
+ tester.maintainer.maintain();
+ assertEquals(3, tester.provisionedHostsMatching(sharedHostNodeResources));
+ assertTrue(tester.nodeRepository.getNode("hostname102").isPresent());
+ assertTrue(tester.nodeRepository.getNode("hostname103").isPresent());
+ assertTrue(tester.nodeRepository.getNode("hostname104").isPresent());
}
@Test
@@ -225,9 +402,7 @@ public class DynamicProvisioningMaintainerTest {
private static final ApplicationId proxyApp = ApplicationId.from("vespa", "proxy", "default");
private static final NodeFlavors flavors = FlavorConfigBuilder.createDummies("default", "docker", "host2", "host3", "host4");
- private final InMemoryFlagSource flagSource = new InMemoryFlagSource().withListFlag(Flags.TARGET_CAPACITY.id(),
- List.of(),
- HostCapacity.class);
+ private final InMemoryFlagSource flagSource = new InMemoryFlagSource();
private final NodeRepository nodeRepository;
private final MockHostProvisioner hostProvisioner;
@@ -260,9 +435,10 @@ public class DynamicProvisioningMaintainerTest {
List.of(createNode("host1", Optional.empty(), NodeType.host, Node.State.active, Optional.of(tenantHostApp)),
createNode("host1-1", Optional.of("host1"), NodeType.tenant, Node.State.reserved, Optional.of(tenantApp)),
createNode("host1-2", Optional.of("host1"), NodeType.tenant, Node.State.failed, Optional.empty()),
- createNode("host2", Optional.empty(), NodeType.host, Node.State.failed, Optional.of(tenantApp)),
+ createNode("host2", Optional.empty(), NodeType.host, Node.State.failed, Optional.of(tenantHostApp)),
createNode("host2-1", Optional.of("host2"), NodeType.tenant, Node.State.failed, Optional.empty()),
- createNode("host3", Optional.empty(), NodeType.host, Node.State.provisioned, Optional.empty()),
+ createNode("host3", Optional.empty(), NodeType.host, Node.State.provisioned, Optional.empty(),
+ "host3-1", "host3-2", "host3-3", "host3-4", "host3-5"),
createNode("host4", Optional.empty(), NodeType.host, Node.State.provisioned, Optional.empty()),
createNode("host4-1", Optional.of("host4"), NodeType.tenant, Node.State.reserved, Optional.of(tenantApp)),
createNode("proxyhost1", Optional.empty(), NodeType.proxyhost, Node.State.provisioned, Optional.empty()),
@@ -281,8 +457,9 @@ public class DynamicProvisioningMaintainerTest {
return nodeRepository.database().addNodesInState(List.of(node), node.state(), Agent.system).get(0);
}
- private Node createNode(String hostname, Optional<String> parentHostname, NodeType nodeType, Node.State state, Optional<ApplicationId> application) {
- Flavor flavor = nodeRepository.flavors().getFlavor(parentHostname.isPresent() ? "docker" : "host2").orElseThrow();
+ private Node createNode(String hostname, Optional<String> parentHostname, NodeType nodeType,
+ Node.State state, Optional<ApplicationId> application, String... additionalHostnames) {
+ Flavor flavor = nodeRepository.flavors().getFlavor(parentHostname.isPresent() ? "docker" : "host3").orElseThrow();
Optional<Allocation> allocation = application
.map(app -> new Allocation(
app,
@@ -290,8 +467,9 @@ public class DynamicProvisioningMaintainerTest {
flavor.resources(),
Generation.initial(),
false));
+ List<Address> addresses = Stream.of(additionalHostnames).map(Address::new).collect(Collectors.toList());
Node.Builder builder = Node.create("fake-id-" + hostname, hostname, flavor, state, nodeType)
- .ipConfigWithEmptyPool(state == Node.State.active ? Set.of("::1") : Set.of());
+ .ipConfig(new IP.Config(state == Node.State.active ? Set.of("::1") : Set.of(), Set.of(), addresses));
parentHostname.ifPresent(builder::parentHostname);
allocation.ifPresent(builder::allocation);
return builder.build();
@@ -299,7 +477,7 @@ public class DynamicProvisioningMaintainerTest {
private long provisionedHostsMatching(NodeResources resources) {
return hostProvisioner.provisionedHosts.stream()
- .filter(host -> host.nodeResources().equals(resources))
+ .filter(host -> host.generateHost().resources().compatibleWith(resources))
.count();
}
@@ -319,27 +497,35 @@ public class DynamicProvisioningMaintainerTest {
private int deprovisionedHosts = 0;
private EnumSet<Behaviour> behaviours = EnumSet.noneOf(Behaviour.class);
+ private Optional<Flavor> provisionHostFlavor = Optional.empty();
public MockHostProvisioner(NodeFlavors flavors, MockNameResolver nameResolver) {
this.flavors = flavors;
this.nameResolver = nameResolver;
}
+ public MockHostProvisioner provisionSharedHost(String flavorName) {
+ provisionHostFlavor = Optional.of(flavors.getFlavorOrThrow(flavorName));
+ return this;
+ }
+
@Override
public List<ProvisionedHost> provisionHosts(List<Integer> provisionIndexes, NodeResources resources,
ApplicationId applicationId, Version osVersion, HostSharing sharing) {
- Flavor hostFlavor = flavors.getFlavors().stream()
- .filter(f -> !f.isDocker())
- .filter(f -> f.resources().compatibleWith(resources))
- .findFirst()
- .orElseThrow(() -> new IllegalArgumentException("No host flavor found satisfying " + resources));
+ Flavor hostFlavor = provisionHostFlavor
+ .orElseGet(() -> flavors.getFlavors().stream()
+ .filter(f -> !f.isDocker())
+ .filter(f -> f.resources().compatibleWith(resources))
+ .findFirst()
+ .orElseThrow(() -> new IllegalArgumentException("No host flavor found satisfying " + resources)));
+
List<ProvisionedHost> hosts = new ArrayList<>();
for (int index : provisionIndexes) {
hosts.add(new ProvisionedHost("host" + index,
"hostname" + index,
hostFlavor,
Optional.empty(),
- List.of(new Address("nodename" + index)),
+ createAddressesForHost(hostFlavor, index),
resources,
osVersion));
}
@@ -347,6 +533,13 @@ public class DynamicProvisioningMaintainerTest {
return hosts;
}
+ private List<Address> createAddressesForHost(Flavor flavor, int hostIndex) {
+ long numAddresses = Math.max(1, Math.round(flavor.resources().bandwidthGbps()));
+ return IntStream.range(0, (int) numAddresses)
+ .mapToObj(i -> new Address("nodename" + hostIndex + "_" + i))
+ .collect(Collectors.toList());
+ }
+
@Override
public List<Node> provision(Node host, Set<Node> children) throws FatalProvisioningException {
if (behaviours.contains(Behaviour.failProvisioning)) throw new FatalProvisioningException("Failed to provision node(s)");
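
For reference, the provision/deprovision counts asserted in verify_min_count_of_shared_hosts follow directly from the fixture: 3 shared hosts (host1, host3, host4), of which only host3 is removable, plus the failed host2 which is always deprovisioned. The sketch below reproduces that arithmetic outside the patch; the class and method names are hypothetical, and only the expected numbers come from the test.

    public class SharedHostMinCountDemo {
        // Hosts are provisioned only to reach minCount.
        static int expectedProvisioned(int minCount, int sharedHosts) {
            return Math.max(0, minCount - sharedHosts);
        }

        // Removable shared hosts are dropped only while the remaining shared hosts still satisfy minCount;
        // failed hosts are always dropped.
        static int expectedDeprovisioned(int minCount, int sharedHosts, int removableSharedHosts, int failedHosts) {
            int droppedShared = Math.min(removableSharedHosts, Math.max(0, sharedHosts - minCount));
            return failedHosts + droppedShared;
        }

        public static void main(String[] args) {
            for (int minCount = 0; minCount <= 6; minCount++)
                System.out.printf("minCount=%d -> provision %d, deprovision %d%n",
                                  minCount,
                                  expectedProvisioned(minCount, 3),
                                  expectedDeprovisioned(minCount, 3, 1, 1));
        }
    }
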
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index a25858c034f..3e4887b6998 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -147,25 +147,14 @@ public class MetricsReporterTest {
// Verify sum of values across dimensions, and remove these metrics to avoid checking against
// metric.values below, which is not sensitive to dimensions.
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquire", 3);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireFailed", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireTimedOut", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.locked", 3);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.release", 3);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.releaseFailed", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.reentry", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.deadlock", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.nakedRelease", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireWithoutRelease", 0);
- verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.foreignRelease", 0);
- metric.remove("lockAttempt.acquireLatency");
metric.remove("lockAttempt.acquireMaxActiveLatency");
metric.remove("lockAttempt.acquireHz");
metric.remove("lockAttempt.acquireLoad");
metric.remove("lockAttempt.lockedLatency");
- metric.remove("lockAttempt.lockedMaxActiveLatency");
- metric.remove("lockAttempt.lockedHz");
metric.remove("lockAttempt.lockedLoad");
+ verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.acquireTimedOut", 0);
+ verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.deadlock", 0);
+ verifyAndRemoveIntegerMetricSum(metric, "lockAttempt.errors", 0);
assertEquals(expectedMetrics, new TreeMap<>(metric.values));
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index d403affc292..d4dbc6f55a5 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -10,6 +10,7 @@ import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Report;
import com.yahoo.vespa.hosted.provision.node.Reports;
import org.junit.Test;
@@ -233,7 +234,7 @@ public class NodeFailerTest {
assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());
-
+
String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
tester.serviceMonitor.setHostDown(downHost1);
@@ -309,6 +310,36 @@ public class NodeFailerTest {
}
@Test
+ public void re_activate_grace_period_test() {
+ NodeFailTester tester = NodeFailTester.withTwoApplications();
+ String downNode = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
+
+ tester.serviceMonitor.setHostDown(downNode);
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+
+ tester.clock.advance(Duration.ofMinutes(75));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+
+ // Re-activate the node. It is still down, but should not be failed out until the grace period has passed again
+ tester.nodeRepository.reactivate(downNode, Agent.system, getClass().getSimpleName());
+ tester.clock.advance(Duration.ofMinutes(30));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+
+ tester.clock.advance(Duration.ofMinutes(45));
+ tester.allNodesMakeAConfigRequestExcept();
+ tester.runMaintainers();
+ assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(Node.State.failed, tester.nodeRepository.getNode(downNode).get().state());
+ }
+
+ @Test
public void node_failing_can_allocate_spare() {
var resources = new NodeResources(1, 20, 15, 1);
Capacity capacity = Capacity.from(new ClusterResources(3, 1, resources), false, true);
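
The new re_activate_grace_period_test above encodes a timing rule: a down node is failed only after it has been down longer than the grace period, and re-activating it restarts that clock. A minimal sketch of that rule, assuming a 60-minute grace period for illustration; the assumed period and the helper below are not taken from NodeFailer.

    import java.time.Duration;
    import java.time.Instant;

    public class GracePeriodDemo {
        static final Duration GRACE = Duration.ofMinutes(60); // assumed value, between the 30 and 75 minutes probed by the test

        // The node is failable once it has been down for longer than GRACE since it was last activated.
        static boolean shouldFail(Instant downSince, Instant lastActivated, Instant now) {
            Instant effectiveDownSince = lastActivated.isAfter(downSince) ? lastActivated : downSince;
            return Duration.between(effectiveDownSince, now).compareTo(GRACE) > 0;
        }

        public static void main(String[] args) {
            Instant t0 = Instant.parse("2021-01-01T00:00:00Z");
            Instant reactivated = t0.plus(Duration.ofMinutes(75));
            System.out.println(shouldFail(t0, t0, t0.plus(Duration.ofMinutes(75))));                   // true: failed after 75 minutes down
            System.out.println(shouldFail(t0, reactivated, reactivated.plus(Duration.ofMinutes(30)))); // false: clock restarted by re-activation
            System.out.println(shouldFail(t0, reactivated, reactivated.plus(Duration.ofMinutes(75)))); // true: failed again
        }
    }
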
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java
index 722911569de..4f0b0d55742 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java
@@ -19,6 +19,7 @@ import java.util.Set;
import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
@@ -51,9 +52,8 @@ public class NodeMetricsDbMaintainerTest {
List<MetricSnapshot> allSnapshots = timeseriesList.stream()
.flatMap(timeseries -> timeseries.asList().stream())
.collect(Collectors.toList());
- assertEquals("Snapshot from the node not in service is filtered out",
- 1, allSnapshots.size());
- assertEquals(0.14, allSnapshots.get(0).cpu(), 0.000001);
+ assertTrue(allSnapshots.stream().anyMatch(snapshot -> snapshot.inService()));
+ assertTrue(allSnapshots.stream().anyMatch(snapshot -> ! snapshot.inService()));
}
private static class MockHttpClient implements MetricsV2MetricsFetcher.HttpClient {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java
index 36d088a59df..2abe5ed7ebf 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java
@@ -5,8 +5,8 @@ import com.yahoo.component.Version;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.curator.mock.MockCurator;
-import com.yahoo.vespa.flags.Flags;
import com.yahoo.vespa.flags.InMemoryFlagSource;
+import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
@@ -142,7 +142,7 @@ public class NodeRebooterTest {
}
private static ProvisioningTester createTester(Duration rebootInterval, InMemoryFlagSource flagSource) {
- flagSource = flagSource.withIntFlag(Flags.REBOOT_INTERVAL_IN_DAYS.id(), (int) rebootInterval.toDays());
+ flagSource = flagSource.withIntFlag(PermanentFlags.REBOOT_INTERVAL_IN_DAYS.id(), (int) rebootInterval.toDays());
ProvisioningTester tester = new ProvisioningTester.Builder().flagSource(flagSource).build();
tester.clock().setInstant(Instant.ofEpochMilli(1605522619000L)); // Use a fixed random seed
((MockCurator) tester.getCurator()).setZooKeeperEnsembleConnectionSpec("zk1.host:1,zk2.host:2,zk3.host:3");
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
index 55a183c8ec1..cd188bc017f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
@@ -51,7 +51,7 @@ public class PeriodicApplicationMaintainerTest {
@After
public void after() {
- this.fixture.maintainer.close();
+ this.fixture.maintainer.awaitShutdown();
}
@Test(timeout = 60_000)
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 15966a4c44b..d1fc13c9796 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -56,6 +56,7 @@ public class ScalingSuggestionsMaintainerTest {
new ClusterResources(10, 1, new NodeResources(6.5, 5, 15, 0.1)),
false, true));
+ tester.clock().advance(Duration.ofHours(13));
addMeasurements(0.90f, 0.90f, 0.90f, 0, 500, app1, tester.nodeRepository(), metricsDb);
addMeasurements(0.99f, 0.99f, 0.99f, 0, 500, app2, tester.nodeRepository(), metricsDb);
@@ -81,6 +82,7 @@ public class ScalingSuggestionsMaintainerTest {
memory,
disk,
generation,
+ true,
true))));
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
index e63f31cf304..06473e60712 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
@@ -45,7 +45,8 @@ public class ApplicationSerializerTest {
List.of(new ScalingEvent(new ClusterResources(10, 5, minResources),
new ClusterResources(12, 6, minResources),
7L,
- Instant.ofEpochMilli(12345L))),
+ Instant.ofEpochMilli(12345L),
+ Optional.of(Instant.ofEpochMilli(67890L)))),
"Autoscaling status"));
Application original = new Application(ApplicationId.from("myTenant", "myApplication", "myInstance"),
clusters);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java
index 8b9d60aeaf4..e5333ea4186 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java
@@ -79,7 +79,7 @@ public class NodeSerializerTest {
node = node.allocate(ApplicationId.from(TenantName.from("myTenant"),
ApplicationName.from("myApplication"),
InstanceName.from("myInstance")),
- ClusterMembership.from("content/myId/0/0", Vtag.currentVersion, Optional.empty()),
+ ClusterMembership.from("content/myId/0/0/stateful", Vtag.currentVersion, Optional.empty()),
requestedResources,
clock.instant());
assertEquals(1, node.history().events().size());
@@ -134,7 +134,7 @@ public class NodeSerializerTest {
" \"applicationId\" : \"myApplication\",\n" +
" \"tenantId\" : \"myTenant\",\n" +
" \"instanceId\" : \"myInstance\",\n" +
- " \"serviceId\" : \"content/myId/0/0\",\n" +
+ " \"serviceId\" : \"content/myId/0/0/stateful\",\n" +
" \"restartGeneration\" : 3,\n" +
" \"currentRestartGeneration\" : 4,\n" +
" \"removable\" : true,\n" +
@@ -167,7 +167,7 @@ public class NodeSerializerTest {
node = node.allocate(ApplicationId.from(TenantName.from("myTenant"),
ApplicationName.from("myApplication"),
InstanceName.from("myInstance")),
- ClusterMembership.from("content/myId/0/0", Vtag.currentVersion, Optional.empty()),
+ ClusterMembership.from("content/myId/0/0/stateful", Vtag.currentVersion, Optional.empty()),
node.flavor().resources(),
clock.instant());
assertEquals(1, node.history().events().size());
@@ -217,7 +217,7 @@ public class NodeSerializerTest {
node = node.allocate(ApplicationId.from(TenantName.from("myTenant"),
ApplicationName.from("myApplication"),
InstanceName.from("myInstance")),
- ClusterMembership.from("content/myId/0/0", Vtag.currentVersion, Optional.empty()),
+ ClusterMembership.from("content/myId/0/0/stateful", Vtag.currentVersion, Optional.empty()),
node.flavor().resources(),
clock.instant());
@@ -315,7 +315,7 @@ public class NodeSerializerTest {
" \"hostname\" : \"myHostname\",\n" +
" \"ipAddresses\" : [\"127.0.0.1\"],\n" +
" \"instance\": {\n" +
- " \"serviceId\": \"content/myId/0/0\",\n" +
+ " \"serviceId\": \"content/myId/0/0/stateful\",\n" +
" \"wantedVespaVersion\": \"6.42.2\"\n" +
" }\n" +
"}";
@@ -408,7 +408,7 @@ public class NodeSerializerTest {
node = node.allocate(ApplicationId.from(TenantName.from("myTenant"),
ApplicationName.from("myApplication"),
InstanceName.from("myInstance")),
- ClusterMembership.from("content/myId/0/0", Vtag.currentVersion, Optional.empty()),
+ ClusterMembership.from("content/myId/0/0/stateful", Vtag.currentVersion, Optional.empty()),
node.flavor().resources(),
clock.instant());
assertTrue(node.allocation().isPresent());
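
Throughout the serializer tests above, the serialized service id gains a fifth component: type/id/group/index is extended with a "/stateful" suffix for content clusters. A minimal sketch of what the new string carries; the parsing below is illustrative only and is not the ClusterMembership implementation.

    public class ServiceIdDemo {
        public static void main(String[] args) {
            String serviceId = "content/myId/0/0/stateful";
            String[] parts = serviceId.split("/");
            System.out.println("type=" + parts[0] + ", id=" + parts[1] + ", group=" + parts[2]
                               + ", index=" + parts[3]
                               + ", stateful=" + (parts.length > 4 && parts[4].equals("stateful")));
        }
    }
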
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java
index e046bc0a512..f368f4d139c 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DockerProvisioningTest.java
@@ -75,7 +75,6 @@ public class DockerProvisioningTest {
public void refuses_to_activate_on_non_active_host() {
ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build();
- ApplicationId zoneApplication = ProvisioningTester.applicationId();
List<Node> parents = tester.makeReadyNodes(10, new NodeResources(2, 4, 20, 2), NodeType.host, 1);
for (Node parent : parents)
tester.makeReadyVirtualDockerNodes(1, dockerResources, parent.hostname());
@@ -90,11 +89,8 @@ public class DockerProvisioningTest {
fail("Expected the allocation to fail due to parent hosts not being active yet");
} catch (OutOfCapacityException expected) { }
- // Activate the zone-app, thereby allocating the parents
- List<HostSpec> hosts = tester.prepare(zoneApplication,
- ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("zone-app")).vespaVersion(wantedVespaVersion).build(),
- Capacity.fromRequiredNodeType(NodeType.host));
- tester.activate(zoneApplication, hosts);
+ // Activate the hosts, thereby allocating the parents
+ tester.activateTenantHosts();
// Try allocating tenants again
List<HostSpec> nodes = tester.prepare(application1,
@@ -412,10 +408,6 @@ public class DockerProvisioningTest {
return nodes.asList().stream().map(Node::parentHostname).map(Optional::get).collect(Collectors.toSet());
}
- private void prepareAndActivate(ApplicationId application, int nodeCount, boolean exclusive, ProvisioningTester tester) {
- prepareAndActivate(application, nodeCount, exclusive, dockerResources, tester);
- }
-
private void prepareAndActivate(ApplicationId application, int nodeCount, boolean exclusive, NodeResources resources, ProvisioningTester tester) {
Set<HostSpec> hosts = new HashSet<>(tester.prepare(application,
ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("myContainer")).vespaVersion("6.39").exclusive(exclusive).build(),
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisionerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisionerTest.java
index c51ef7250e2..7a636a030ec 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisionerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisionerTest.java
@@ -33,6 +33,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
/**
* @author mpolden
@@ -236,6 +237,31 @@ public class LoadBalancerProvisionerTest {
assertEquals(cluster, lbs.get().get(0).id().cluster());
}
+ @Test
+ public void reject_load_balancers_with_clashing_names() {
+ ApplicationId instance1 = ApplicationId.from("t1", "a1", "default");
+ ApplicationId instance2 = ApplicationId.from("t1", "a1", "dev");
+ ApplicationId instance3 = ApplicationId.from("t1", "a1", "qrs");
+ ClusterSpec.Id devCluster = ClusterSpec.Id.from("dev");
+ ClusterSpec.Id defaultCluster = ClusterSpec.Id.from("default");
+
+ // instance1 is deployed
+ tester.activate(instance1, prepare(instance1, clusterRequest(ClusterSpec.Type.container, devCluster)));
+
+ // instance2 clashes because cluster name matches instance1
+ try {
+ prepare(instance2, clusterRequest(ClusterSpec.Type.container, defaultCluster));
+ fail("Expected exception");
+ } catch (IllegalArgumentException ignored) {
+ }
+
+ // instance2 changes cluster name and does not clash
+ tester.activate(instance2, prepare(instance2, clusterRequest(ClusterSpec.Type.container, ClusterSpec.Id.from("qrs"))));
+
+ // instance3 clashes because instance name matches instance2 cluster
+ tester.activate(instance3, prepare(instance3, clusterRequest(ClusterSpec.Type.container, defaultCluster)));
+ }
+
private void dirtyNodesOf(ApplicationId application) {
tester.nodeRepository().setDirty(tester.nodeRepository().getNodes(application), Agent.system, this.getClass().getSimpleName());
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
index 86427fe30ae..3945c518a77 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
@@ -485,7 +485,7 @@ public class NodesV2ApiTest {
"{\"message\":\"Moved host2.yahoo.com to parked\"}");
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/state/ready/host2.yahoo.com",
new byte[0], Request.Method.PUT),
- 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"Cannot make parked host host2.yahoo.com allocated to tenant2.application2.instance2 as 'content/id2/0/0' available for new allocation as it is not in state [dirty]\"}");
+ 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"Cannot make parked host host2.yahoo.com allocated to tenant2.application2.instance2 as 'content/id2/0/0/stateful' available for new allocation as it is not in state [dirty]\"}");
// (... while dirty then ready works (the ready move will be initiated by node maintenance))
assertResponse(new Request("http://localhost:8080/nodes/v2/state/dirty/host2.yahoo.com",
new byte[0], Request.Method.PUT),
@@ -502,7 +502,7 @@ public class NodesV2ApiTest {
// Attempt to DELETE allocated node
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com",
new byte[0], Request.Method.DELETE),
- 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"active child node host4.yahoo.com allocated to tenant3.application3.instance3 as 'content/id3/0/0' is currently allocated and cannot be removed\"}");
+ 400, "{\"error-code\":\"BAD_REQUEST\",\"message\":\"active child node host4.yahoo.com allocated to tenant3.application3.instance3 as 'content/id3/0/0/stateful' is currently allocated and cannot be removed\"}");
// PUT current restart generation with string instead of long
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com",
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json
index f6b2f96023a..c3857e9c8ee 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/capacity-zone.json
@@ -4,7 +4,7 @@
"failedTenantParent": "dockerhost1.yahoo.com",
"failedTenant": "host4.yahoo.com",
"failedTenantResources": "[vcpu: 1.0, memory: 4.0 Gb, disk 100.0 Gb, bandwidth: 1.0 Gbps, storage type: local]",
- "failedTenantAllocation": "allocated to tenant3.application3.instance3 as 'content/id3/0/0'",
+ "failedTenantAllocation": "allocated to tenant3.application3.instance3 as 'content/id3/0/0/stateful'",
"hostCandidateRejectionReasons": {
"singularReasonFailures": {
"insufficientVcpu": 0,
@@ -43,4 +43,4 @@
}
]
}
-} \ No newline at end of file
+}