diff options
author | jonmv <venstad@gmail.com> | 2023-05-25 12:11:59 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2023-05-26 10:38:45 +0200 |
commit | f61419e264e1cf69c0891f00ee745b39754cf914 (patch) | |
tree | 2c5d0e13a9d2e9ff538cbe543b3e973698db07a0 /node-repository | |
parent | 4d5ec01f49319a76183bd7cce4682b0365cf39a4 (diff) |
Keep empty hosts around based on TTL, in dynamic zones
Diffstat (limited to 'node-repository')
2 files changed, 74 insertions, 33 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java index 28320743964..9369897efa3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainer.java @@ -34,15 +34,19 @@ import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; -import java.util.Comparator; import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; -import java.util.stream.Collectors; + +import static java.util.Comparator.comparing; +import static java.util.Comparator.naturalOrder; +import static java.util.stream.Collectors.groupingBy; +import static java.util.stream.Collectors.toSet; /** * @author freva @@ -67,10 +71,9 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer { @Override protected double maintain() { - NodeList nodes = nodeRepository().nodes().list(); - List<Node> excessHosts; + List<Node> provisionedSnapshot; try { - excessHosts = provision(nodes); + provisionedSnapshot = provision(nodeRepository().nodes().list()); } catch (NodeAllocationException | IllegalStateException e) { log.log(Level.WARNING, "Failed to allocate preprovisioned capacity and/or find excess hosts: " + e.getMessage()); return 0; // avoid removing excess hosts @@ -79,33 +82,68 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer { return 0; // avoid removing excess hosts } - return markForRemoval(excessHosts); + return markForRemoval(provisionedSnapshot); } - private double markForRemoval(List<Node> excessHosts) { - if (excessHosts.isEmpty()) return 1; + private double markForRemoval(List<Node> provisionedSnapshot) { + // Group nodes by parent; no parent means it's a host. + Map<Optional<String>, List<Node>> nodesByParent = provisionedSnapshot.stream().collect(groupingBy(Node::parentHostname)); + + // Find all hosts that we once thought were empty (first clouse), or whose children are now all removable (second clause). + List<Node> emptyHosts = nodesByParent.get(Optional.<String>empty()).stream() + .filter(host -> host.hostEmptyAt().isPresent() + || nodesByParent.getOrDefault(Optional.of(host.hostname()), List.of()) + .stream().allMatch(HostCapacityMaintainer::canDeprovision)) + .toList(); + + if (emptyHosts.isEmpty()) return 1; int attempts = 0, success = 0; - for (List<Node> typeExcessHosts : excessHosts.stream().collect(Collectors.groupingBy(Node::type)).values()) { + for (Set<Node> typeEmptyHosts : emptyHosts.stream().collect(groupingBy(Node::type, toSet())).values()) { attempts++; // All nodes in the list are hosts of the same type, so they use the same lock regardless of their allocation - Optional<NodeMutex> appMutex = nodeRepository().nodes().lockAndGet(typeExcessHosts.get(0), Duration.ofSeconds(10)); + Optional<NodeMutex> appMutex = nodeRepository().nodes().lockAndGet(typeEmptyHosts.iterator().next(), Duration.ofSeconds(10)); if (appMutex.isEmpty()) continue; try (Mutex lock = appMutex.get(); Mutex unallocatedLock = nodeRepository().nodes().lockUnallocated()) { // Re-read all nodes under lock and compute the candidates for removal. The actual nodes we want - // to mark for removal is the intersection with typeExcessHosts - List<Node> toMarkForRemoval = candidatesForRemoval(nodeRepository().nodes().list().asList()).stream() - .filter(typeExcessHosts::contains) - .toList(); + // to mark for removal is the intersection with typeEmptyHosts, which excludes the preprovisioned hosts. + Map<Optional<String>, List<Node>> currentNodesByParent = nodeRepository().nodes().list().stream().collect(groupingBy(Node::parentHostname)); + List<Node> candidateHosts = new ArrayList<>(currentNodesByParent.get(Optional.<String>empty())); + candidateHosts.retainAll(typeEmptyHosts); - for (Node host : toMarkForRemoval) { + for (Node host : candidateHosts) { attempts++; - // Retire the host to parked if possible, otherwise move it straight to parked - if (EnumSet.of(Node.State.reserved, Node.State.active, Node.State.inactive).contains(host.state())) { - Node retiredHost = host.withWantToRetire(true, true, Agent.HostCapacityMaintainer, nodeRepository().clock().instant()); - nodeRepository().nodes().write(retiredHost, lock); - } else nodeRepository().nodes().park(host.hostname(), true, Agent.HostCapacityMaintainer, "Parked for removal"); + + // Any hosts that are no longer empty should be marked as such, and excluded from removal. + if (currentNodesByParent.getOrDefault(Optional.of(host.hostname()), List.of()) + .stream().anyMatch(n -> ! canDeprovision(n))) { + if (host.hostEmptyAt().isPresent()) { + nodeRepository().nodes().write(host.withHostEmptyAt(null), lock); + } + } + // If the host is still empty, we can mark it as empty now, or mark it for removal if it has already expired. + else { + Instant now = clock().instant(); + Node emptyHost = host.hostEmptyAt().isPresent() ? host : host.withHostEmptyAt(now); + boolean expired = ! now.isBefore(emptyHost.hostEmptyAt().get().plus(host.hostTTL().orElse(Duration.ZERO))); + + if (expired && canRemoveHost(emptyHost)) { + // Retire the host to parked if possible, otherwise move it straight to parked. + if (EnumSet.of(Node.State.reserved, Node.State.active, Node.State.inactive).contains(host.state())) { + emptyHost = emptyHost.withWantToRetire(true, true, Agent.HostCapacityMaintainer, nodeRepository().clock().instant()); + nodeRepository().nodes().write(emptyHost, lock); + } + else { + if (emptyHost != host) nodeRepository().nodes().write(emptyHost, lock); + nodeRepository().nodes().park(host.hostname(), true, Agent.HostCapacityMaintainer, "Parked for removal"); + } + } + else { + if (emptyHost != host) nodeRepository().nodes().write(emptyHost, lock); + } + } + success++; } } catch (UncheckedTimeoutException e) { @@ -116,22 +154,20 @@ public class HostCapacityMaintainer extends NodeRepositoryMaintainer { return asSuccessFactorDeviation(attempts, attempts - success); } - /** - * Provision hosts to ensure there is room to allocate spare nodes. - * - * @param nodeList list of all nodes - * @return excess hosts that can safely be deprovisioned: An excess host 1. contains no nodes allocated - * to an application, and assuming the spare nodes have been allocated, and 2. is not parked - * without wantToDeprovision (which means an operator is looking at the node). - */ private List<Node> provision(NodeList nodeList) { - var nodes = new ArrayList<>(provisionUntilNoDeficit(nodeList)); - return candidatesForRemoval(nodes).stream() - .sorted(Comparator.comparing(node -> node.history().events().stream() - .map(History.Event::at).min(Comparator.naturalOrder()).orElse(Instant.MIN))) - .toList(); + return provisionUntilNoDeficit(nodeList).stream() + .sorted(comparing(node -> node.history().events().stream() + .map(History.Event::at) + .min(naturalOrder()) + .orElse(Instant.MIN))) + .toList(); } + /** + * Excess hosts that can safely be deprovisioned: An excess host 1. contains no nodes allocated + * to an application, and assuming the spare nodes have been allocated, and 2. is not parked + * without wantToDeprovision (which means an operator is looking at the node). + */ private static List<Node> candidatesForRemoval(List<Node> nodes) { Map<String, Node> removableHostsByHostname = new HashMap<>(); for (var node : nodes) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java index 7f5bb79b20c..5dd7665eedd 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostCapacityMaintainerTest.java @@ -219,6 +219,11 @@ public class HostCapacityMaintainerTest { } @Test + public void respects_host_TTL() { + fail(); + } + + @Test public void test_minimum_capacity() { var tester = new DynamicProvisioningTester(); NodeResources resources1 = new NodeResources(24, 64, 100, 10); |