From f3953b0b098dfca3d3c101897699ec1d283fdd29 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Wed, 7 Jun 2023 15:34:46 +0200 Subject: Track deprovisioned hosts in dynamically provisioned zones --- .../maintenance/DeprovisionedExpirer.java | 31 ++++++++++++ .../hosted/provision/maintenance/Expirer.java | 5 +- .../maintenance/NodeRepositoryMaintenance.java | 4 ++ .../yahoo/vespa/hosted/provision/node/Nodes.java | 6 +-- .../provision/provisioning/NodeAllocation.java | 3 +- .../maintenance/DeprovisionedExpirerTest.java | 56 ++++++++++++++++++++++ .../maintenance/HostCapacityMaintainerTest.java | 38 ++++++++------- .../provision/restapi/responses/maintenance.json | 3 ++ 8 files changed, 120 insertions(+), 26 deletions(-) create mode 100644 node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirer.java create mode 100644 node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirerTest.java (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirer.java new file mode 100644 index 00000000000..fcfde53c0df --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirer.java @@ -0,0 +1,31 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.History; + +import java.time.Duration; +import java.util.List; + +/** + * This removes hosts from {@link com.yahoo.vespa.hosted.provision.Node.State#deprovisioned}, after a grace period. + * + * @author mpolden + */ +public class DeprovisionedExpirer extends Expirer { + + DeprovisionedExpirer(NodeRepository nodeRepository, Duration expiryTime, Metric metric) { + super(Node.State.deprovisioned, History.Event.Type.deprovisioned, nodeRepository, expiryTime, metric); + } + + @Override + protected void expire(List expired) { + if (!nodeRepository().zone().cloud().dynamicProvisioning()) return; // Never expire in statically provisioned zones + for (var node : expired) { + nodeRepository().nodes().forget(node); + } + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java index 5a77fcca85c..a8929cf9d22 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java @@ -12,8 +12,7 @@ import java.util.List; import java.util.logging.Logger; /** - * Superclass of expiry tasks which moves nodes from some state to the dirty state. - * These jobs runs at least every 25 minutes. + * Base class for maintenance of nodes that linger in a particular state too long. * * @author bratseth */ @@ -61,6 +60,6 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } /** Implement this callback to take action to expire these nodes */ - protected abstract void expire(List node); + protected abstract void expire(List expired); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index c7d5efe376b..f6391a7d475 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -60,6 +60,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { maintainers.add(new AutoscalingMaintainer(nodeRepository, deployer, metric, defaults.autoscalingInterval)); maintainers.add(new ScalingSuggestionsMaintainer(nodeRepository, defaults.scalingSuggestionsInterval, metric)); maintainers.add(new SwitchRebalancer(nodeRepository, defaults.switchRebalancerInterval, metric, deployer)); + maintainers.add(new DeprovisionedExpirer(nodeRepository, defaults.deprovisionedExpiry, metric)); provisionServiceProvider.getLoadBalancerService() .map(lbService -> new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService, metric)) @@ -121,6 +122,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration switchRebalancerInterval; private final Duration hostRetirerInterval; private final Duration hostFlavorUpgraderInterval; + private final Duration deprovisionedExpiry; private final NodeFailer.ThrottlePolicy throttlePolicy; @@ -155,6 +157,8 @@ public class NodeRepositoryMaintenance extends AbstractComponent { throttlePolicy = NodeFailer.ThrottlePolicy.hosted; hostRetirerInterval = Duration.ofMinutes(30); hostFlavorUpgraderInterval = Duration.ofMinutes(30); + // CD (de)provisions hosts frequently. Expire deprovisioned ones earlier + deprovisionedExpiry = isCdZone ? Duration.ofDays(1) : Duration.ofDays(30); if (zone.environment().isProduction() && ! isCdZone) { inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 29327aaf93a..101ccc80251 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -503,11 +503,7 @@ public class Nodes { db.removeNodes(removed, transaction); } else { removed = removeChildren(node, force, transaction); - if (zone.cloud().dynamicProvisioning()) { - db.removeNodes(List.of(node), transaction); - } else { - move(node.hostname(), Node.State.deprovisioned, Agent.system, false, Optional.empty(), transaction); - } + move(node.hostname(), Node.State.deprovisioned, Agent.system, false, Optional.empty(), transaction); removed.add(node); } transaction.commit(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index 4ef315b9322..c473ae83186 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -353,6 +353,7 @@ class NodeAllocation { // Infrastructure hosts have fixed indices, starting at 1 Set currentIndices = allNodes.nodeType(hostType) + .not().state(Node.State.deprovisioned) .hostnames() .stream() // TODO(mpolden): Use cluster index instead of parsing hostname, once all @@ -366,7 +367,7 @@ class NodeAllocation { indices.add(i); } } - // Ignore our own index as we should never try to provision ourself. This can happen in the following scenario: + // Ignore our own index as we should never try to provision ourselves. This can happen in the following scenario: // - cfg1 has been deprovisioned // - cfg2 has triggered provisioning of a new cfg1 // - cfg1 is starting and redeploys its infrastructure application during bootstrap. A deficit is detected diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirerTest.java new file mode 100644 index 00000000000..4100fe39eca --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DeprovisionedExpirerTest.java @@ -0,0 +1,56 @@ +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.NodeFlavors; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; +import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; +import com.yahoo.vespa.hosted.provision.testutils.MockHostProvisioner; +import org.junit.jupiter.api.Test; + +import java.time.Duration; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +/** + * @author mpolden + */ +class DeprovisionedExpirerTest { + + private final NodeFlavors flavors = FlavorConfigBuilder.createDummies("host"); + private final ProvisioningTester tester = new ProvisioningTester.Builder().dynamicProvisioning() + .flavors(flavors.getFlavors()) + .hostProvisioner(new MockHostProvisioner(flavors.getFlavors())) + .build(); + private final DeprovisionedExpirer expirer = new DeprovisionedExpirer(tester.nodeRepository(), Duration.ofDays(30), + new TestMetric()); + + @Test + public void maintain() { + tester.makeReadyHosts(1, new NodeResources(2,4,8,1)) + .activateTenantHosts(); + NodeList hosts = tester.nodeRepository().nodes().list().state(Node.State.active); + assertEquals(1, hosts.size()); + + // Remove host + String hostname = hosts.first().get().hostname(); + tester.nodeRepository().nodes().park(hostname, false, Agent.system, getClass().getSimpleName()); + tester.nodeRepository().nodes().removeRecursively(hostname); + assertSame(Node.State.deprovisioned, tester.node(hostname).state()); + + // Host is not removed until expiry passes + assertExpiredAfter(Duration.ZERO, false, hostname); + assertExpiredAfter(Duration.ofDays(15), false, hostname); + assertExpiredAfter(Duration.ofDays(15), true, hostname); + } + + private void assertExpiredAfter(Duration duration, boolean expired, String hostname) { + tester.clock().advance(duration); + expirer.maintain(); + assertEquals(expired, tester.nodeRepository().nodes().node(hostname).isEmpty()); + } + +} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java index cfa76419262..ead94663807 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java @@ -83,7 +83,7 @@ public class HostCapacityMaintainerTest { assertTrue(tester.nodeRepository.nodes().node("host3").isPresent()); tester.maintain(); - assertTrue(tester.nodeRepository.nodes().node("host2").isEmpty()); + assertSame(State.deprovisioned, tester.nodeRepository.nodes().node("host2").get().state()); } @Test @@ -94,7 +94,7 @@ public class HostCapacityMaintainerTest { assertTrue(failedHost.isPresent()); tester.maintain(); - assertTrue("Failed host is deprovisioned", tester.nodeRepository.nodes().node(failedHost.get().hostname()).isEmpty()); + assertSame("Failed host is deprovisioned", State.deprovisioned, tester.nodeRepository.nodes().node(failedHost.get().hostname()).get().state()); assertEquals(1, tester.hostProvisioner.deprovisionedHosts()); } @@ -118,9 +118,9 @@ public class HostCapacityMaintainerTest { assertEquals(2, tester.hostProvisioner.provisionedHosts().size()); assertEquals(2, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10))); - NodeList nodesAfter = tester.nodeRepository.nodes().list(); + NodeList nodesAfter = tester.nodeRepository.nodes().list().not().state(State.deprovisioned); assertEquals(9, nodesAfter.size()); // 2 removed, 2 added - assertTrue("Failed host 'host2' is deprovisioned", tester.nodeRepository.nodes().node("host2").isEmpty()); + assertSame("Failed host 'host2' is deprovisioned", State.deprovisioned, tester.nodeRepository.nodes().node("host2").get().state()); assertTrue("Node on deprovisioned host removed", tester.nodeRepository.nodes().node("host2-1").isEmpty()); assertTrue("Host satisfying 16-24-100-1 is kept", tester.nodeRepository.nodes().node("host3").isPresent()); assertTrue("New 48-128-1000-10 host added", tester.nodeRepository.nodes().node("host100").isPresent()); @@ -177,8 +177,8 @@ public class HostCapacityMaintainerTest { assertEquals(2, tester.hostProvisioner.provisionedHosts().size()); assertEquals(2, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10))); - assertEquals(8, tester.nodeRepository.nodes().list().size()); // 3 removed, 2 added - assertTrue("preprovision capacity is prefered on shared hosts", tester.nodeRepository.nodes().node("host3").isEmpty()); + assertEquals(8, tester.nodeRepository.nodes().list().not().state(State.deprovisioned).size()); // 3 removed, 2 added + assertSame("preprovision capacity is prefered on shared hosts", State.deprovisioned, tester.nodeRepository.nodes().node("host3").get().state()); assertTrue(tester.nodeRepository.nodes().node("host100").isPresent()); assertTrue(tester.nodeRepository.nodes().node("host101").isPresent()); @@ -193,13 +193,13 @@ public class HostCapacityMaintainerTest { assertEquals("one provisioned host has been deprovisioned, so there are 2 -> 1 provisioned hosts", 1, tester.hostProvisioner.provisionedHosts().size()); assertEquals(1, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10))); - assertEquals(7, tester.nodeRepository.nodes().list().size()); // 4 removed, 2 added + assertEquals(7, tester.nodeRepository.nodes().list().not().state(State.deprovisioned).size()); // 4 removed, 2 added if (tester.nodeRepository.nodes().node("host100").isPresent()) { - assertTrue("host101 is superfluous and should have been deprovisioned", - tester.nodeRepository.nodes().node("host101").isEmpty()); + assertSame("host101 is superfluous and should have been deprovisioned", State.deprovisioned, + tester.nodeRepository.nodes().node("host101").get().state()); } else { assertTrue("host101 is required for preprovision capacity", - tester.nodeRepository.nodes().node("host101").isPresent()); + tester.nodeRepository.nodes().node("host101").isPresent()); } } @@ -207,8 +207,8 @@ public class HostCapacityMaintainerTest { private void verifyFirstMaintain(DynamicProvisioningTester tester) { assertEquals(1, tester.hostProvisioner.provisionedHosts().size()); assertEquals(1, tester.provisionedHostsMatching(new NodeResources(48, 128, 1000, 10))); - assertEquals(8, tester.nodeRepository.nodes().list().size()); // 2 removed, 1 added - assertTrue("Failed host 'host2' is deprovisioned", tester.nodeRepository.nodes().node("host2").isEmpty()); + assertEquals(8, tester.nodeRepository.nodes().list().not().state(State.deprovisioned).size()); // 2 removed, 1 added + assertSame("Failed host 'host2' is deprovisioned", State.deprovisioned, tester.nodeRepository.nodes().node("host2").get().state()); assertTrue("Node on deprovisioned host removed", tester.nodeRepository.nodes().node("host2-1").isEmpty()); assertTrue("One 1-30-20-3 node fits on host3", tester.nodeRepository.nodes().node("host3").isPresent()); assertTrue("New 48-128-1000-10 host added", tester.nodeRepository.nodes().node("host100").isPresent()); @@ -390,7 +390,7 @@ public class HostCapacityMaintainerTest { tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); // Expected number of hosts and children are provisioned - NodeList allNodes = tester.nodeRepository().nodes().list(); + NodeList allNodes = tester.nodeRepository().nodes().list().not().state(State.deprovisioned); NodeList configHosts = allNodes.nodeType(hostType); NodeList configNodes = allNodes.nodeType(hostType.childNodeType()); assertEquals(3, configHosts.size()); @@ -427,7 +427,7 @@ public class HostCapacityMaintainerTest { // Host and child is removed dynamicProvisioningTester.maintain(); - allNodes = tester.nodeRepository().nodes().list(); + allNodes = tester.nodeRepository().nodes().list().not().state(State.deprovisioned); assertEquals(2, allNodes.nodeType(hostType).size()); assertEquals(2, allNodes.nodeType(hostType.childNodeType()).size()); @@ -585,7 +585,7 @@ public class HostCapacityMaintainerTest { // Host and children can now be removed. tester.provisioningTester.activateTenantHosts(); tester.maintain(); - assertEquals(List.of(), tester.nodeRepository.nodes().list().asList()); + assertEquals(List.of(), tester.nodeRepository.nodes().list().not().state(State.deprovisioned).asList()); } @Test @@ -617,7 +617,11 @@ public class HostCapacityMaintainerTest { // Host and children can now be removed. tester.maintain(); for (var node : List.of(host4, host41, host42, host43)) { - assertTrue(node.hostname() + " removed", tester.nodeRepository.nodes().node(node.hostname()).isEmpty()); + if (node.type().isHost()) { + assertSame(node.hostname() + " moved to deprovisioned", State.deprovisioned, tester.nodeRepository.nodes().node(node.hostname()).get().state()); + } else { + assertTrue(node.hostname() + " removed", tester.nodeRepository.nodes().node(node.hostname()).isEmpty()); + } } } @@ -651,7 +655,7 @@ public class HostCapacityMaintainerTest { private void assertCfghost3IsDeprovisioned(DynamicProvisioningTester tester) { assertEquals(4, tester.nodeRepository.nodes().list(Node.State.active).size()); assertEquals(2, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.confighost).size()); - assertTrue(tester.nodeRepository.nodes().node("cfghost3").isEmpty()); + assertSame(State.deprovisioned, tester.nodeRepository.nodes().node("cfghost3").get().state()); } private static class DynamicProvisioningTester { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json index 4c8c5d80018..f39a086e97a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/maintenance.json @@ -3,6 +3,9 @@ { "name": "AutoscalingMaintainer" }, + { + "name": "DeprovisionedExpirer" + }, { "name": "DirtyExpirer" }, -- cgit v1.2.3