aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
author Valerij Fredriksen <valerijf@yahooinc.com> 2022-11-03 11:02:16 +0100
committer Valerij Fredriksen <valerijf@yahooinc.com> 2022-11-03 11:02:16 +0100
commit 0271c3ee04792a062b75b43ad29c93c0f3811c22 (patch)
tree d4131914287c7196e0491cc9ff82c2d864f93643 /node-repository
parent 01b99953932ce2629ee3e71264789a3044d17b65 (diff)
Take application lock, then unallocatedLock
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java 43
1 files changed, 25 insertions, 18 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
index 8778b6a0fcf..7310fe63736 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java
@@ -12,6 +12,7 @@ import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
import com.yahoo.lang.MutableInteger;
+import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.flags.FlagSource;
import com.yahoo.vespa.flags.JacksonFlag;
import com.yahoo.vespa.flags.ListFlag;
@@ -84,35 +85,41 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer {
return 0; // avoid removing excess hosts
}
- markForRemoval(excessHosts);
- return 1;
+ return markForRemoval(excessHosts);
}
- private void markForRemoval(List<Node> excessHosts) {
- if (excessHosts.isEmpty()) return;
+ private double markForRemoval(List<Node> excessHosts) {
+ if (excessHosts.isEmpty()) return 1;
- try (var lock = nodeRepository().nodes().lockUnallocated()) {
- NodeList nodes = nodeRepository().nodes().list(); // Reread nodes under lock
- for (Node host : excessHosts) {
- Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10));
- if (optionalMutex.isEmpty()) continue;
- try (NodeMutex mutex = optionalMutex.get()) {
- host = mutex.node();
- if (!canRemoveHost(host)) continue;
- if (!nodes.childrenOf(host).stream().allMatch(HostCapacityMaintainer::canDeprovision))
- continue;
+ int attempts = 0, success = 0;
+ for (List<Node> typeExcessHosts : excessHosts.stream().collect(Collectors.groupingBy(Node::type)).values()) {
+ attempts++;
+ // All nodes in the list are hosts of the same type, so they use the same lock regardless of their allocation
+ Optional<NodeMutex> appMutex = nodeRepository().nodes().lockAndGet(typeExcessHosts.get(0), Duration.ofSeconds(10));
+ if (appMutex.isEmpty()) continue;
+ try (Mutex lock = appMutex.get();
+ Mutex unallocatedLock = nodeRepository().nodes().lockUnallocated()) {
+ // Re-read all nodes under lock and compute the candidates for removal. The actual nodes we want
+ // to mark for removal is the intersection with typeExcessHosts
+ List<Node> toMarkForRemoval = candidatesForRemoval(nodeRepository().nodes().list().asList()).stream()
+ .filter(typeExcessHosts::contains)
+ .toList();
+ for (Node host : toMarkForRemoval) {
+ attempts++;
// Retire the host to parked if possible, otherwise move it straight to parked
if (EnumSet.of(Node.State.reserved, Node.State.active, Node.State.inactive).contains(host.state())) {
Node retiredHost = host.withWantToRetire(true, true, Agent.HostCapacityMaintainer, nodeRepository().clock().instant());
- nodeRepository().nodes().write(retiredHost, mutex);
+ nodeRepository().nodes().write(retiredHost, lock);
} else nodeRepository().nodes().park(host.hostname(), true, Agent.HostCapacityMaintainer, "Parked for removal");
- } catch (UncheckedTimeoutException e) {
- log.log(Level.WARNING, "Failed to mark " + host.hostname() +
- " for deprovisioning: Failed to get lock on node, will retry later");
+ success++;
}
+ } catch (UncheckedTimeoutException e) {
+ log.log(Level.WARNING, "Failed to mark excess hosts for deprovisioning: Failed to get lock, will retry later");
}
+ success++;
}
+ return asSuccessFactor(attempts, attempts - success);
}
/**