summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2022-12-03 15:43:56 +0100
committerJon Bratseth <bratseth@gmail.com>2022-12-03 15:43:56 +0100
commit2301101b70949b96ab7e4a4d2fe5279925b59c8c (patch)
tree5ed87518637f26b5a94b95b385b9330cb511fd55 /node-repository
parentafcf1bb71cb7b87a03149d197f724cfc7603ef92 (diff)
Discard metrics right after restart
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java9
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java37
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java15
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java55
4 files changed, 80 insertions, 36 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
index 2eb57dcdd87..657b6f0fdb9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
@@ -5,7 +5,6 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import java.time.Duration;
-import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.OptionalDouble;
@@ -36,7 +35,7 @@ public class ClusterNodesTimeseries {
var timeseries = db.getNodeTimeseries(period.plus(warmupDuration.multipliedBy(4)), clusterNodes);
if (cluster.lastScalingEvent().isPresent()) {
long currentGeneration = cluster.lastScalingEvent().get().generation();
- timeseries = keepCurrentGenerationAfterWarmup(timeseries, currentGeneration);
+ timeseries = keepGenerationAfterWarmup(timeseries, currentGeneration);
}
timeseries = keep(timeseries, snapshot -> snapshot.inService() && snapshot.stable());
timeseries = keep(timeseries, snapshot -> ! snapshot.at().isBefore(db.clock().instant().minus(period)));
@@ -114,10 +113,10 @@ public class ClusterNodesTimeseries {
return timeseries.stream().map(nodeTimeseries -> nodeTimeseries.keep(filter)).collect(Collectors.toList());
}
- private static List<NodeTimeseries> keepCurrentGenerationAfterWarmup(List<NodeTimeseries> timeseries,
- long currentGeneration) {
+ private List<NodeTimeseries> keepGenerationAfterWarmup(List<NodeTimeseries> timeseries, long currentGeneration) {
return timeseries.stream()
- .map(nodeTimeseries -> nodeTimeseries.keepCurrentGenerationAfterWarmup(currentGeneration))
+ .map(nodeTimeseries -> nodeTimeseries.keepGenerationAfterWarmup(currentGeneration,
+ clusterNodes.node(nodeTimeseries.hostname())))
.collect(Collectors.toList());
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java
index 500dbf0f66f..cd75a383b87 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java
@@ -1,6 +1,9 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.autoscale;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.node.History;
+
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
@@ -79,18 +82,34 @@ public class NodeTimeseries {
.collect(Collectors.toList()));
}
- public NodeTimeseries keepCurrentGenerationAfterWarmup(long currentGeneration) {
- Optional<Instant> generationChange = generationChange(currentGeneration);
- return keep(snapshot -> isOnCurrentGenerationAfterWarmup(snapshot, currentGeneration, generationChange));
+ public NodeTimeseries keepGenerationAfterWarmup(long generation, Optional<Node> node) {
+ Optional<Instant> generationChange = generationChange(generation);
+ return keep(snapshot -> isOnGenerationAfterWarmup(snapshot, node, generation, generationChange));
+ }
+ private boolean isOnGenerationAfterWarmup(NodeMetricSnapshot snapshot,
+ Optional<Node> node,
+ long generation,
+ Optional<Instant> generationChange) {
+ if ( ! node.isPresent()) return false; // Node has been removed
+ if ( ! onAtLeastGeneration(generation, snapshot)) return false;
+ if (recentlyChangedGeneration(snapshot, generationChange)) return false;
+ if (recentlyCameUp(snapshot, node.get())) return false;
+ return true;
}
- private boolean isOnCurrentGenerationAfterWarmup(NodeMetricSnapshot snapshot,
- long currentGeneration,
- Optional<Instant> generationChange) {
+ private boolean onAtLeastGeneration(long generation, NodeMetricSnapshot snapshot) {
if (snapshot.generation() < 0) return true; // Content nodes do not yet send generation
- if (snapshot.generation() < currentGeneration) return false;
- if (generationChange.isEmpty()) return true;
- return ! snapshot.at().isBefore(generationChange.get().plus(warmupDuration));
+ return snapshot.generation() >= generation;
+ }
+
+ private boolean recentlyChangedGeneration(NodeMetricSnapshot snapshot, Optional<Instant> generationChange) {
+ if (generationChange.isEmpty()) return false;
+ return snapshot.at().isBefore(generationChange.get().plus(warmupDuration));
+ }
+
+ private boolean recentlyCameUp(NodeMetricSnapshot snapshot, Node node) {
+ Optional<History.Event> up = node.history().event(History.Event.Type.up);
+ return up.isPresent() && snapshot.at().isBefore(up.get().at().plus(warmupDuration));
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 51b59ab77eb..1b4b9f9c54b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -170,16 +170,11 @@ public class NodeFailer extends NodeRepositoryMaintainer {
* But we refuse to fail out config(host)/controller(host)
*/
private boolean failAllowedFor(NodeType nodeType) {
- switch (nodeType) {
- case tenant:
- case host:
- return true;
- case proxy:
- case proxyhost:
- return nodeRepository().nodes().list(Node.State.failed).nodeType(nodeType).isEmpty();
- default:
- return false;
- }
+ return switch (nodeType) {
+ case tenant, host -> true;
+ case proxy, proxyhost -> nodeRepository().nodes().list(Node.State.failed).nodeType(nodeType).isEmpty();
+ default -> false;
+ };
}
/**
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
index 876ff058bf7..2d05754d96e 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java
@@ -12,8 +12,12 @@ import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.Zone;
import com.yahoo.test.ManualClock;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
+import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel;
+import com.yahoo.vespa.hosted.provision.node.Agent;
+import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.hosted.provision.testutils.MockDeployer;
import org.junit.Test;
@@ -198,7 +202,7 @@ public class AutoscalingMaintainerTest {
}
@Test
- public void test_autoscaling_ignores_high_cpu_right_after_generation_change() {
+ public void test_autoscaling_ignores_measurements_during_warmup() {
ApplicationId app1 = AutoscalingMaintainerTester.makeApplicationId("app1");
ClusterSpec cluster1 = AutoscalingMaintainerTester.containerClusterSpec();
NodeResources resources = new NodeResources(4, 4, 10, 1);
@@ -207,26 +211,46 @@ public class AutoscalingMaintainerTest {
var capacity = Capacity.from(min, max);
var tester = new AutoscalingMaintainerTester(new MockDeployer.ApplicationContext(app1, cluster1, capacity));
+ // Add a scaling event
tester.deploy(app1, cluster1, capacity);
- // fast completion
- tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 1, app1);
- tester.clock().advance(Duration.ofSeconds(150));
- tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 1, app1);
- tester.clock().advance(Duration.ofSeconds(150));
- tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 1, app1);
+ tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 4, app1);
tester.maintainer().maintain();
assertEquals("Scale up: " + tester.cluster(app1, cluster1).autoscalingStatus(),
1,
tester.cluster(app1, cluster1).lastScalingEvent().get().generation());
- // fast completion, with initially overloaded cpu
- tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 1, app1);
- tester.clock().advance(Duration.ofSeconds(150));
- tester.addMeasurements(0.2f, 0.3f, 0.3f, 1, 1, app1);
+ // measurements with outdated generation are ignored -> no autoscaling
+ var duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 0, 2, app1);
tester.maintainer().maintain();
- assertEquals("No autoscaling since we ignore the (first) data point in the warup period",
+ assertEquals("Measurements with outdated generation are ignored -> no autoscaling",
1,
tester.cluster(app1, cluster1).lastScalingEvent().get().generation());
+ tester.clock().advance(duration.negated());
+
+ duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1);
+ tester.maintainer().maintain();
+ assertEquals("Measurements right after generation change are ignored -> no autoscaling",
+ 1,
+ tester.cluster(app1, cluster1).lastScalingEvent().get().generation());
+ tester.clock().advance(duration.negated());
+
+ // Add a restart event
+ tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1)));
+ tester.nodeRepository().nodes().list().owner(app1).asList().forEach(node -> recordRestart(node, tester.nodeRepository()));
+
+ duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1);
+ tester.maintainer().maintain();
+ assertEquals("Measurements right after restart are ignored -> no autoscaling",
+ 1,
+ tester.cluster(app1, cluster1).lastScalingEvent().get().generation());
+ tester.clock().advance(duration.negated());
+
+ tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1)));
+ tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1);
+ tester.maintainer().maintain();
+ assertEquals("We have valid measurements -> scale up",
+ 2,
+ tester.cluster(app1, cluster1).lastScalingEvent().get().generation());
}
@Test
@@ -300,6 +324,13 @@ public class AutoscalingMaintainerTest {
tester.cluster(application, cluster).lastScalingEvent().get().generation());
}
+ private void recordRestart(Node node, NodeRepository nodeRepository) {
+ var upEvent = new History.Event(History.Event.Type.up, Agent.system, nodeRepository.clock().instant());
+ try (var locked = nodeRepository.nodes().lockAndGetRequired(node)) {
+ nodeRepository.nodes().write(locked.node().with(locked.node().history().with(upEvent)), locked);
+ }
+ }
+
private void assertEvent(String explanation,
long expectedGeneration, Optional<Instant> expectedCompletion, ScalingEvent event) {
assertEquals(explanation + ". Generation", expectedGeneration, event.generation());