summary | refs | log | tree | commit | diff | stats
path: root/node-repository/src/main/java
diff options
context:
space:
mode:
author: Valerij Fredriksen <valerijf@yahooinc.com> 2022-11-02 21:33:27 +0100
committer: Valerij Fredriksen <valerijf@yahooinc.com> 2022-11-02 21:33:27 +0100
commit: d939ca00c4c4acc07374c7d862f892fe702cc328 (patch)
tree: 465746d6a6b4854ad6c52a8fe56c9a9b54ddc7f5 /node-repository/src/main/java
parent: 2a348d61213778f11c762de5f3570d8174f9f294 (diff)
Split DynamicProvisioningMaintainer into HostCapacityMaintainer and HostDeprovisioner
Diffstat (limited to 'node-repository/src/main/java')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java (renamed from node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java) 80
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java 57
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java 5
3 files changed, 101 insertions, 41 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
index 245bce1b9e8..df04698e0b4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
@@ -37,6 +37,7 @@ import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Comparator;
+import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -50,19 +51,19 @@ import java.util.stream.Collectors;
* @author freva
* @author mpolden
*/
-public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
+public class HostCapacityMaintainer extends NodeRepositoryMaintainer {
- private static final Logger log = Logger.getLogger(DynamicProvisioningMaintainer.class.getName());
+ private static final Logger log = Logger.getLogger(HostCapacityMaintainer.class.getName());
private final HostProvisioner hostProvisioner;
private final ListFlag<ClusterCapacity> preprovisionCapacityFlag;
private final JacksonFlag<SharedHost> sharedHostFlag;
- DynamicProvisioningMaintainer(NodeRepository nodeRepository,
- Duration interval,
- HostProvisioner hostProvisioner,
- FlagSource flagSource,
- Metric metric) {
+ HostCapacityMaintainer(NodeRepository nodeRepository,
+ Duration interval,
+ HostProvisioner hostProvisioner,
+ FlagSource flagSource,
+ Metric metric) {
super(nodeRepository, interval, metric);
this.hostProvisioner = hostProvisioner;
this.preprovisionCapacityFlag = PermanentFlags.PREPROVISION_CAPACITY.bindTo(flagSource);
@@ -72,42 +73,46 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
@Override
protected double maintain() {
NodeList nodes = nodeRepository().nodes().list();
- convergeToCapacity(nodes);
- return 1.0;
- }
-
- /** Converge zone to wanted capacity */
- private void convergeToCapacity(NodeList nodes) {
List<Node> excessHosts;
try {
excessHosts = provision(nodes);
} catch (NodeAllocationException | IllegalStateException e) {
log.log(Level.WARNING, "Failed to allocate preprovisioned capacity and/or find excess hosts: " + e.getMessage());
- return; // avoid removing excess hosts
+ return 0; // avoid removing excess hosts
} catch (RuntimeException e) {
log.log(Level.WARNING, "Failed to allocate preprovisioned capacity and/or find excess hosts", e);
- return; // avoid removing excess hosts
+ return 0; // avoid removing excess hosts
}
- excessHosts.forEach(host -> {
- Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10));
- if (optionalMutex.isEmpty()) return;
- try (NodeMutex mutex = optionalMutex.get()) {
- if (host.state() != mutex.node().state()) return;
- host = mutex.node();
- // First mark the host as wantToDeprovision so that if hostProvisioner fails, this host
- // * won't get new nodes allocated to it
- // * will be selected as excess on next iteration of this maintainer
- nodeRepository().nodes().deprovision(host.hostname(), Agent.DynamicProvisioningMaintainer, nodeRepository().clock().instant());
- hostProvisioner.deprovision(host);
- nodeRepository().nodes().removeRecursively(host, true);
- } catch (UncheckedTimeoutException e) {
- log.log(Level.WARNING, "Failed to deprovision " + host.hostname() +
- ": Failed to get lock on node, will retry later");
- } catch (RuntimeException e) {
- log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + ", will retry in " + interval(), e);
+ markForRemoval(excessHosts);
+ return 1;
+ }
+
+ private void markForRemoval(List<Node> excessHosts) {
+ if (excessHosts.isEmpty()) return;
+
+ try (var lock = nodeRepository().nodes().lockUnallocated()) {
+ NodeList nodes = nodeRepository().nodes().list(); // Reread nodes under lock
+ for (Node host : excessHosts) {
+ Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10));
+ if (optionalMutex.isEmpty()) continue;
+ try (NodeMutex mutex = optionalMutex.get()) {
+ host = mutex.node();
+ if (!canRemoveHost(host)) continue;
+ if (!nodes.childrenOf(host).stream().allMatch(HostCapacityMaintainer::canDeprovision))
+ continue;
+
+ // Retire the host to parked if possible, otherwise move it straight to parked
+ if (EnumSet.of(Node.State.reserved, Node.State.active, Node.State.inactive).contains(host.state())) {
+ Node retiredHost = host.withWantToRetire(true, true, Agent.DynamicProvisioningMaintainer, nodeRepository().clock().instant());
+ nodeRepository().nodes().write(retiredHost, mutex);
+ } else nodeRepository().nodes().park(host.hostname(), true, Agent.DynamicProvisioningMaintainer, "Parked for removal");
+ } catch (UncheckedTimeoutException e) {
+ log.log(Level.WARNING, "Failed to mark " + host.hostname() +
+ " for deprovisioning: Failed to get lock on node, will retry later");
+ }
}
- });
+ }
}
/**
@@ -153,7 +158,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
}
}
for (var node : nodes) {
- if (node.parentHostname().isPresent() && !canRemoveNode(node)) {
+ if (node.parentHostname().isPresent() && !canDeprovision(node)) {
removableHostsByHostname.remove(node.parentHostname().get());
}
}
@@ -169,12 +174,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
};
}
- private static boolean canRemoveNode(Node node) {
- if (node.type().isHost()) throw new IllegalArgumentException("Node " + node + " is not a child");
- return node.allocation().isEmpty() || canDeprovision(node);
- }
-
- private static boolean canDeprovision(Node node) {
+ static boolean canDeprovision(Node node) {
return node.status().wantToDeprovision() && (node.state() == Node.State.parked ||
node.state() == Node.State.failed);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java
new file mode 100644
index 00000000000..0d9df067f3f
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostDeprovisioner.java
@@ -0,0 +1,57 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.maintenance;
+
+import com.yahoo.jdisc.Metric;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner;
+
+import java.time.Duration;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * @author freva
+ */
+public class HostDeprovisioner extends NodeRepositoryMaintainer {
+
+ private static final Logger log = Logger.getLogger(HostDeprovisioner.class.getName());
+
+ private final HostProvisioner hostProvisioner;
+
+ HostDeprovisioner(NodeRepository nodeRepository, Duration interval, Metric metric, HostProvisioner hostProvisioner) {
+ super(nodeRepository, interval, metric);
+ this.hostProvisioner = hostProvisioner;
+ }
+
+ @Override
+ protected double maintain() {
+ NodeList allNodes = nodeRepository().nodes().list();
+ NodeList hosts = allNodes.parents().matching(HostCapacityMaintainer::canDeprovision);
+
+ int failures = 0;
+ for (Node host : hosts) {
+ // This shouldn't be possible since failed, parked, and wantToDeprovision should be recursive
+ if (!allNodes.childrenOf(host).stream().allMatch(HostCapacityMaintainer::canDeprovision))
+ continue;
+
+ try {
+ // Technically we should do this under application lock, but
+ // * HostProvisioner::deprovision may take some time since we are waiting for request(s) against
+ // the cloud provider
+ // * Because the application lock is shared between all hosts of the same type we want to avoid
+ // holding it over longer periods
+ // * We are about to remove these hosts anyway, so only reason we'd want to hold the lock is
+ // if we want to support aborting deprovision if operator manually intervenes
+ hostProvisioner.deprovision(host);
+ nodeRepository().nodes().removeRecursively(host, true);
+ } catch (RuntimeException e) {
+ failures++;
+ log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + ", will retry in " + interval(), e);
+ }
+ }
+ return asSuccessFactor(hosts.size(), failures);
+ }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index acc5dd66f7a..9436fcc150e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -70,7 +70,8 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
.ifPresent(maintainers::add);
provisionServiceProvider.getHostProvisioner()
.map(hostProvisioner -> List.of(
- new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric),
+ new HostCapacityMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric),
+ new HostDeprovisioner(nodeRepository, defaults.hostDeprovisionerInterval, metric, hostProvisioner),
new HostResumeProvisioner(nodeRepository, defaults.hostResumeProvisionerInterval, metric, hostProvisioner),
new HostRetirer(nodeRepository, defaults.hostRetirerInterval, metric, hostProvisioner),
new DiskReplacer(nodeRepository, defaults.diskReplacerInterval, metric, hostProvisioner)))
@@ -113,6 +114,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
private final Duration infrastructureProvisionInterval;
private final Duration loadBalancerExpirerInterval;
private final Duration dynamicProvisionerInterval;
+ private final Duration hostDeprovisionerInterval;
private final Duration hostResumeProvisionerInterval;
private final Duration diskReplacerInterval;
private final Duration osUpgradeActivatorInterval;
@@ -128,6 +130,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
DefaultTimes(Zone zone, Deployer deployer) {
autoscalingInterval = Duration.ofMinutes(5);
dynamicProvisionerInterval = Duration.ofMinutes(3);
+ hostDeprovisionerInterval = Duration.ofMinutes(3);
hostResumeProvisionerInterval = Duration.ofMinutes(3);
diskReplacerInterval = Duration.ofMinutes(3);
failedExpirerInterval = Duration.ofMinutes(10);