diff options
author | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-11-02 21:33:27 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-11-02 21:33:27 +0100 |
commit | d939ca00c4c4acc07374c7d862f892fe702cc328 (patch) | |
tree | 465746d6a6b4854ad6c52a8fe56c9a9b54ddc7f5 /node-repository/src/main/java | |
parent | 2a348d61213778f11c762de5f3570d8174f9f294 (diff) |
Split DynamicProvisioningMaintainer into HostCapacityMaintainer and HostDeprovisioner
Diffstat (limited to 'node-repository/src/main/java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java (renamed from node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java) | 80 | ||||
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java | 57 | ||||
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java | 5 |
3 files changed, 101 insertions, 41 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java index 245bce1b9e8..df04698e0b4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java @@ -37,6 +37,7 @@ import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Comparator; +import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -50,19 +51,19 @@ import java.util.stream.Collectors; * @author freva * @author mpolden */ -public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { +public class HostCapacityMaintainer extends NodeRepositoryMaintainer { - private static final Logger log = Logger.getLogger(DynamicProvisioningMaintainer.class.getName()); + private static final Logger log = Logger.getLogger(HostCapacityMaintainer.class.getName()); private final HostProvisioner hostProvisioner; private final ListFlag<ClusterCapacity> preprovisionCapacityFlag; private final JacksonFlag<SharedHost> sharedHostFlag; - DynamicProvisioningMaintainer(NodeRepository nodeRepository, - Duration interval, - HostProvisioner hostProvisioner, - FlagSource flagSource, - Metric metric) { + HostCapacityMaintainer(NodeRepository nodeRepository, + Duration interval, + HostProvisioner hostProvisioner, + FlagSource flagSource, + Metric metric) { super(nodeRepository, interval, metric); this.hostProvisioner = hostProvisioner; this.preprovisionCapacityFlag = PermanentFlags.PREPROVISION_CAPACITY.bindTo(flagSource); @@ -72,42 +73,46 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { @Override protected double maintain() { NodeList nodes = nodeRepository().nodes().list(); - convergeToCapacity(nodes); - return 1.0; - } - - /** Converge zone to wanted capacity */ - private void convergeToCapacity(NodeList nodes) { List<Node> excessHosts; try { excessHosts = provision(nodes); } catch (NodeAllocationException | IllegalStateException e) { log.log(Level.WARNING, "Failed to allocate preprovisioned capacity and/or find excess hosts: " + e.getMessage()); - return; // avoid removing excess hosts + return 0; // avoid removing excess hosts } catch (RuntimeException e) { log.log(Level.WARNING, "Failed to allocate preprovisioned capacity and/or find excess hosts", e); - return; // avoid removing excess hosts + return 0; // avoid removing excess hosts } - excessHosts.forEach(host -> { - Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10)); - if (optionalMutex.isEmpty()) return; - try (NodeMutex mutex = optionalMutex.get()) { - if (host.state() != mutex.node().state()) return; - host = mutex.node(); - // First mark the host as wantToDeprovision so that if hostProvisioner fails, this host - // * won't get new nodes allocated to it - // * will be selected as excess on next iteration of this maintainer - nodeRepository().nodes().deprovision(host.hostname(), Agent.DynamicProvisioningMaintainer, nodeRepository().clock().instant()); - hostProvisioner.deprovision(host); - nodeRepository().nodes().removeRecursively(host, true); - } catch (UncheckedTimeoutException e) { - log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + - ": Failed to get lock on node, will retry later"); - } catch (RuntimeException e) { - log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + ", will retry in " + interval(), e); + markForRemoval(excessHosts); + return 1; + } + + private void markForRemoval(List<Node> excessHosts) { + if (excessHosts.isEmpty()) return; + + try (var lock = nodeRepository().nodes().lockUnallocated()) { + NodeList nodes = nodeRepository().nodes().list(); // Reread nodes under lock + for (Node host : excessHosts) { + Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10)); + if (optionalMutex.isEmpty()) continue; + try (NodeMutex mutex = optionalMutex.get()) { + host = mutex.node(); + if (!canRemoveHost(host)) continue; + if (!nodes.childrenOf(host).stream().allMatch(HostCapacityMaintainer::canDeprovision)) + continue; + + // Retire the host to parked if possible, otherwise move it straight to parked + if (EnumSet.of(Node.State.reserved, Node.State.active, Node.State.inactive).contains(host.state())) { + Node retiredHost = host.withWantToRetire(true, true, Agent.DynamicProvisioningMaintainer, nodeRepository().clock().instant()); + nodeRepository().nodes().write(retiredHost, mutex); + } else nodeRepository().nodes().park(host.hostname(), true, Agent.DynamicProvisioningMaintainer, "Parked for removal"); + } catch (UncheckedTimeoutException e) { + log.log(Level.WARNING, "Failed to mark " + host.hostname() + + " for deprovisioning: Failed to get lock on node, will retry later"); + } } - }); + } } /** @@ -153,7 +158,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { } } for (var node : nodes) { - if (node.parentHostname().isPresent() && !canRemoveNode(node)) { + if (node.parentHostname().isPresent() && !canDeprovision(node)) { removableHostsByHostname.remove(node.parentHostname().get()); } } @@ -169,12 +174,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { }; } - private static boolean canRemoveNode(Node node) { - if (node.type().isHost()) throw new IllegalArgumentException("Node " + node + " is not a child"); - return node.allocation().isEmpty() || canDeprovision(node); - } - - private static boolean canDeprovision(Node node) { + static boolean canDeprovision(Node node) { return node.status().wantToDeprovision() && (node.state() == Node.State.parked || node.state() == Node.State.failed); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java new file mode 100644 index 00000000000..0d9df067f3f --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java @@ -0,0 +1,57 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; + +import java.time.Duration; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * @author freva + */ +public class HostDeprovisioner extends NodeRepositoryMaintainer { + + private static final Logger log = Logger.getLogger(HostDeprovisioner.class.getName()); + + private final HostProvisioner hostProvisioner; + + HostDeprovisioner(NodeRepository nodeRepository, Duration interval, Metric metric, HostProvisioner hostProvisioner) { + super(nodeRepository, interval, metric); + this.hostProvisioner = hostProvisioner; + } + + @Override + protected double maintain() { + NodeList allNodes = nodeRepository().nodes().list(); + NodeList hosts = allNodes.parents().matching(HostCapacityMaintainer::canDeprovision); + + int failures = 0; + for (Node host : hosts) { + // This shouldn't be possible since failed, parked, and wantToDeprovision should be recursive + if (!allNodes.childrenOf(host).stream().allMatch(HostCapacityMaintainer::canDeprovision)) + continue; + + try { + // Technically we should do this under application lock, but + // * HostProvisioner::deprovision may take some time since we are waiting for request(s) against + // the cloud provider + // * Because the application lock is shared between all hosts of the same type we want to avoid + // holding it over longer periods + // * We are about to remove these hosts anyway, so only reason we'd want to hold the lock is + // if we want to support aborting deprovision if operator manually intervenes + hostProvisioner.deprovision(host); + nodeRepository().nodes().removeRecursively(host, true); + } catch (RuntimeException e) { + failures++; + log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + ", will retry in " + interval(), e); + } + } + return asSuccessFactor(hosts.size(), failures); + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index acc5dd66f7a..9436fcc150e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -70,7 +70,8 @@ public class NodeRepositoryMaintenance extends AbstractComponent { .ifPresent(maintainers::add); provisionServiceProvider.getHostProvisioner() .map(hostProvisioner -> List.of( - new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric), + new HostCapacityMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric), + new HostDeprovisioner(nodeRepository, defaults.hostDeprovisionerInterval, metric, hostProvisioner), new HostResumeProvisioner(nodeRepository, defaults.hostResumeProvisionerInterval, metric, hostProvisioner), new HostRetirer(nodeRepository, defaults.hostRetirerInterval, metric, hostProvisioner), new DiskReplacer(nodeRepository, defaults.diskReplacerInterval, metric, hostProvisioner))) @@ -113,6 +114,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration infrastructureProvisionInterval; private final Duration loadBalancerExpirerInterval; private final Duration dynamicProvisionerInterval; + private final Duration hostDeprovisionerInterval; private final Duration hostResumeProvisionerInterval; private final Duration diskReplacerInterval; private final Duration osUpgradeActivatorInterval; @@ -128,6 +130,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { DefaultTimes(Zone zone, Deployer deployer) { autoscalingInterval = Duration.ofMinutes(5); dynamicProvisionerInterval = Duration.ofMinutes(3); + hostDeprovisionerInterval = Duration.ofMinutes(3); hostResumeProvisionerInterval = Duration.ofMinutes(3); diskReplacerInterval = Duration.ofMinutes(3); failedExpirerInterval = Duration.ofMinutes(10); |