From 2301101b70949b96ab7e4a4d2fe5279925b59c8c Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Sat, 3 Dec 2022 15:43:56 +0100 Subject: Discard metrics right after restart --- .../autoscale/ClusterNodesTimeseries.java | 9 ++-- .../hosted/provision/autoscale/NodeTimeseries.java | 37 +++++++++++---- .../hosted/provision/maintenance/NodeFailer.java | 15 ++---- .../maintenance/AutoscalingMaintainerTest.java | 55 +++++++++++++++++----- 4 files changed, 80 insertions(+), 36 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java index 2eb57dcdd87..657b6f0fdb9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -5,7 +5,6 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.applications.Cluster; import java.time.Duration; -import java.time.Instant; import java.util.List; import java.util.Optional; import java.util.OptionalDouble; @@ -36,7 +35,7 @@ public class ClusterNodesTimeseries { var timeseries = db.getNodeTimeseries(period.plus(warmupDuration.multipliedBy(4)), clusterNodes); if (cluster.lastScalingEvent().isPresent()) { long currentGeneration = cluster.lastScalingEvent().get().generation(); - timeseries = keepCurrentGenerationAfterWarmup(timeseries, currentGeneration); + timeseries = keepGenerationAfterWarmup(timeseries, currentGeneration); } timeseries = keep(timeseries, snapshot -> snapshot.inService() && snapshot.stable()); timeseries = keep(timeseries, snapshot -> ! snapshot.at().isBefore(db.clock().instant().minus(period))); @@ -114,10 +113,10 @@ public class ClusterNodesTimeseries { return timeseries.stream().map(nodeTimeseries -> nodeTimeseries.keep(filter)).collect(Collectors.toList()); } - private static List keepCurrentGenerationAfterWarmup(List timeseries, - long currentGeneration) { + private List keepGenerationAfterWarmup(List timeseries, long currentGeneration) { return timeseries.stream() - .map(nodeTimeseries -> nodeTimeseries.keepCurrentGenerationAfterWarmup(currentGeneration)) + .map(nodeTimeseries -> nodeTimeseries.keepGenerationAfterWarmup(currentGeneration, + clusterNodes.node(nodeTimeseries.hostname()))) .collect(Collectors.toList()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java index 500dbf0f66f..cd75a383b87 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java @@ -1,6 +1,9 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.autoscale; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.node.History; + import java.time.Instant; import java.util.ArrayList; import java.util.Collections; @@ -79,18 +82,34 @@ public class NodeTimeseries { .collect(Collectors.toList())); } - public NodeTimeseries keepCurrentGenerationAfterWarmup(long currentGeneration) { - Optional generationChange = generationChange(currentGeneration); - return keep(snapshot -> isOnCurrentGenerationAfterWarmup(snapshot, currentGeneration, generationChange)); + public NodeTimeseries keepGenerationAfterWarmup(long generation, Optional node) { + Optional generationChange = generationChange(generation); + return keep(snapshot -> isOnGenerationAfterWarmup(snapshot, node, generation, generationChange)); + } + private boolean isOnGenerationAfterWarmup(NodeMetricSnapshot snapshot, + Optional node, + long generation, + Optional generationChange) { + if ( ! node.isPresent()) return false; // Node has been removed + if ( ! onAtLeastGeneration(generation, snapshot)) return false; + if (recentlyChangedGeneration(snapshot, generationChange)) return false; + if (recentlyCameUp(snapshot, node.get())) return false; + return true; } - private boolean isOnCurrentGenerationAfterWarmup(NodeMetricSnapshot snapshot, - long currentGeneration, - Optional generationChange) { + private boolean onAtLeastGeneration(long generation, NodeMetricSnapshot snapshot) { if (snapshot.generation() < 0) return true; // Content nodes do not yet send generation - if (snapshot.generation() < currentGeneration) return false; - if (generationChange.isEmpty()) return true; - return ! snapshot.at().isBefore(generationChange.get().plus(warmupDuration)); + return snapshot.generation() >= generation; + } + + private boolean recentlyChangedGeneration(NodeMetricSnapshot snapshot, Optional generationChange) { + if (generationChange.isEmpty()) return false; + return snapshot.at().isBefore(generationChange.get().plus(warmupDuration)); + } + + private boolean recentlyCameUp(NodeMetricSnapshot snapshot, Node node) { + Optional up = node.history().event(History.Event.Type.up); + return up.isPresent() && snapshot.at().isBefore(up.get().at().plus(warmupDuration)); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 51b59ab77eb..1b4b9f9c54b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -170,16 +170,11 @@ public class NodeFailer extends NodeRepositoryMaintainer { * But we refuse to fail out config(host)/controller(host) */ private boolean failAllowedFor(NodeType nodeType) { - switch (nodeType) { - case tenant: - case host: - return true; - case proxy: - case proxyhost: - return nodeRepository().nodes().list(Node.State.failed).nodeType(nodeType).isEmpty(); - default: - return false; - } + return switch (nodeType) { + case tenant, host -> true; + case proxy, proxyhost -> nodeRepository().nodes().list(Node.State.failed).nodeType(nodeType).isEmpty(); + default -> false; + }; } /** diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java index 876ff058bf7..2d05754d96e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainerTest.java @@ -12,8 +12,12 @@ import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.Zone; import com.yahoo.test.ManualClock; import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; +import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel; +import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; import org.junit.Test; @@ -198,7 +202,7 @@ public class AutoscalingMaintainerTest { } @Test - public void test_autoscaling_ignores_high_cpu_right_after_generation_change() { + public void test_autoscaling_ignores_measurements_during_warmup() { ApplicationId app1 = AutoscalingMaintainerTester.makeApplicationId("app1"); ClusterSpec cluster1 = AutoscalingMaintainerTester.containerClusterSpec(); NodeResources resources = new NodeResources(4, 4, 10, 1); @@ -207,26 +211,46 @@ public class AutoscalingMaintainerTest { var capacity = Capacity.from(min, max); var tester = new AutoscalingMaintainerTester(new MockDeployer.ApplicationContext(app1, cluster1, capacity)); + // Add a scaling event tester.deploy(app1, cluster1, capacity); - // fast completion - tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 1, app1); - tester.clock().advance(Duration.ofSeconds(150)); - tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 1, app1); - tester.clock().advance(Duration.ofSeconds(150)); - tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 1, app1); + tester.addMeasurements(1.0f, 0.3f, 0.3f, 0, 4, app1); tester.maintainer().maintain(); assertEquals("Scale up: " + tester.cluster(app1, cluster1).autoscalingStatus(), 1, tester.cluster(app1, cluster1).lastScalingEvent().get().generation()); - // fast completion, with initially overloaded cpu - tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 1, app1); - tester.clock().advance(Duration.ofSeconds(150)); - tester.addMeasurements(0.2f, 0.3f, 0.3f, 1, 1, app1); + // measurements with outdated generation are ignored -> no autoscaling + var duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 0, 2, app1); tester.maintainer().maintain(); - assertEquals("No autoscaling since we ignore the (first) data point in the warup period", + assertEquals("Measurements with outdated generation are ignored -> no autoscaling", 1, tester.cluster(app1, cluster1).lastScalingEvent().get().generation()); + tester.clock().advance(duration.negated()); + + duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1); + tester.maintainer().maintain(); + assertEquals("Measurements right after generation change are ignored -> no autoscaling", + 1, + tester.cluster(app1, cluster1).lastScalingEvent().get().generation()); + tester.clock().advance(duration.negated()); + + // Add a restart event + tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1))); + tester.nodeRepository().nodes().list().owner(app1).asList().forEach(node -> recordRestart(node, tester.nodeRepository())); + + duration = tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1); + tester.maintainer().maintain(); + assertEquals("Measurements right after restart are ignored -> no autoscaling", + 1, + tester.cluster(app1, cluster1).lastScalingEvent().get().generation()); + tester.clock().advance(duration.negated()); + + tester.clock().advance(ClusterModel.warmupDuration.plus(Duration.ofMinutes(1))); + tester.addMeasurements(3.0f, 0.3f, 0.3f, 1, 2, app1); + tester.maintainer().maintain(); + assertEquals("We have valid measurements -> scale up", + 2, + tester.cluster(app1, cluster1).lastScalingEvent().get().generation()); } @Test @@ -300,6 +324,13 @@ public class AutoscalingMaintainerTest { tester.cluster(application, cluster).lastScalingEvent().get().generation()); } + private void recordRestart(Node node, NodeRepository nodeRepository) { + var upEvent = new History.Event(History.Event.Type.up, Agent.system, nodeRepository.clock().instant()); + try (var locked = nodeRepository.nodes().lockAndGetRequired(node)) { + nodeRepository.nodes().write(locked.node().with(locked.node().history().with(upEvent)), locked); + } + } + private void assertEvent(String explanation, long expectedGeneration, Optional expectedCompletion, ScalingEvent event) { assertEquals(explanation + ". Generation", expectedGeneration, event.generation()); -- cgit v1.2.3