summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormgimle <michael@gimle.io>2019-07-15 11:46:20 +0200
committermgimle <michael@gimle.io>2019-07-15 11:58:25 +0200
commit2a51bf3566624e069ac14e3f072e23b31907d78f (patch)
treea80e47ee4bebb1bf011bdd3df5cab1170b6f1fbd
parent16e667deccea2540454a13beb23492a2d222e3d6 (diff)
Separated out the capacity checking part of the capacity report maintainer.
Made aws zones excluded from the capacity calculation because the metric is nonsensical for an automatically scaling zone. Fixed some missing renames.
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java528
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java414
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java6
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java (renamed from node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainerTest.java)36
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java (renamed from node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainerTester.java)13
5 files changed, 568 insertions, 429 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
new file mode 100644
index 00000000000..02fcd494875
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityChecker.java
@@ -0,0 +1,528 @@
+package com.yahoo.vespa.hosted.provision.maintenance;
+
+import com.yahoo.config.provision.NodeResources;
+import com.yahoo.config.provision.NodeType;
+import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Allocation;
+
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+public class CapacityChecker {
+ private List<Node> hosts;
+ Map<String, Node> nodeMap;
+ private Map<Node, List<Node>> nodeChildren;
+ private Map<Node, AllocationResources> availableResources;
+
+ public AllocationHistory allocationHistory = null;
+
+ public CapacityChecker(NodeRepository nodeRepository) {
+ this.hosts = getHosts(nodeRepository);
+ List<Node> tenants = getTenants(nodeRepository, hosts);
+ nodeMap = constructHostnameToNodeMap(hosts);
+ this.nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap);
+ this.availableResources = constructAvailableResourcesMap(hosts, nodeChildren);
+ }
+
+ public List<Node> getHosts() {
+ return hosts;
+ }
+
+ public Optional<HostFailurePath> worstCaseHostLossLeadingToFailure() {
+ Map<Node, Integer> timesNodeCanBeRemoved = computeMaximalRepeatedRemovals(hosts, nodeChildren, availableResources);
+ return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved, hosts, nodeChildren, availableResources);
+ }
+
+ protected List<Node> findOvercommittedHosts() {
+ return findOvercommittedNodes(availableResources);
+ }
+
+ public List<Node> nodesFromHostnames(List<String> hostnames) {
+ List<Node> nodes = hostnames.stream()
+ .filter(h -> nodeMap.containsKey(h))
+ .map(h -> nodeMap.get(h))
+ .collect(Collectors.toList());
+ if (nodes.size() != hostnames.size()) {
+ List<String> notFoundNodes = hostnames.stream()
+ .filter(h -> !nodes.stream()
+ .map(Node::hostname).collect(Collectors.toSet()).contains(h))
+ .collect(Collectors.toList());
+ throw new IllegalArgumentException(String.format("Host(s) not found: [ %s ]",
+ String.join(", ", notFoundNodes)));
+ }
+
+ return nodes;
+ }
+
+ public Optional<HostFailurePath> findHostRemovalFailure(List<Node> hostsToRemove) {
+ var removal = findHostRemovalFailure(hostsToRemove, hosts, nodeChildren, availableResources);
+ if (removal.isEmpty()) return Optional.empty();
+ HostFailurePath failurePath = new HostFailurePath();
+ failurePath.hostsCausingFailure = hostsToRemove;
+ failurePath.failureReason = removal.get();
+ return Optional.of(failurePath);
+ }
+
+ // We only care about nodes in one of these states.
+ private static Node.State[] relevantNodeStates = {
+ Node.State.active,
+ Node.State.inactive,
+ Node.State.dirty,
+ Node.State.provisioned,
+ Node.State.ready,
+ Node.State.reserved
+ };
+
+ private List<Node> getHosts(NodeRepository nodeRepository) {
+ return nodeRepository.getNodes(NodeType.host, relevantNodeStates);
+ }
+
+ private List<Node> getTenants(NodeRepository nodeRepository, List<Node> hosts) {
+ var parentNames = hosts.stream().map(Node::hostname).collect(Collectors.toSet());
+ return nodeRepository.getNodes(NodeType.tenant, relevantNodeStates).stream()
+ .filter(t -> parentNames.contains(t.parentHostname().orElse("")))
+ .collect(Collectors.toList());
+ }
+
+ private Optional<HostFailurePath> greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts,
+ Map<Node, List<Node>> nodeChildren,
+ Map<Node, AllocationResources> availableResources) {
+ if (hosts.size() == 0) return Optional.empty();
+
+ List<Node> parentRemovalPriorityList = heuristic.entrySet().stream()
+ .sorted(Comparator.comparingInt(Map.Entry::getValue))
+ .map(Map.Entry::getKey)
+ .collect(Collectors.toList());
+
+ for (int i = 1; i <= parentRemovalPriorityList.size(); i++) {
+ List<Node> hostsToRemove = parentRemovalPriorityList.subList(0, i);
+ var hostRemovalFailure = findHostRemovalFailure(hostsToRemove, hosts, nodeChildren, availableResources);
+ if (hostRemovalFailure.isPresent()) {
+ HostFailurePath failurePath = new HostFailurePath();
+ failurePath.hostsCausingFailure = hostsToRemove;
+ failurePath.failureReason = hostRemovalFailure.get();
+ return Optional.of(failurePath);
+ }
+ }
+
+ throw new IllegalStateException("No path to failure found. This should be impossible!");
+ }
+
+ private Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) {
+ return nodes.stream().collect(Collectors.toMap(Node::hostname, n -> n));
+ }
+
+ private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
+ Map<Node, List<Node>> nodeChildren = tenants.stream()
+ .filter(n -> n.parentHostname().isPresent())
+ .filter(n -> hostnameToNode.containsKey(n.parentHostname().get()))
+ .collect(Collectors.groupingBy(
+ n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
+
+ for (var host : hosts) nodeChildren.putIfAbsent(host, List.of());
+
+ return nodeChildren;
+ }
+
+ private Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) {
+ Map<Node, AllocationResources> availableResources = new HashMap<>();
+ for (var host : hosts) {
+ NodeResources hostResources = host.flavor().resources();
+ int occupiedIps = 0;
+ Set<String> ipPool = host.ipAddressPool().asSet();
+ for (var child : nodeChildren.get(host)) {
+ hostResources = hostResources.subtract(child.flavor().resources().withDiskSpeed(NodeResources.DiskSpeed.any));
+ occupiedIps += child.ipAddresses().stream().filter(ipPool::contains).count();
+ }
+ availableResources.put(host, new AllocationResources(hostResources, host.ipAddressPool().asSet().size() - occupiedIps));
+ }
+
+ return availableResources;
+ }
+
+ /**
+ * Computes a heuristic for each host, with a lower score indicating a higher perceived likelihood that removing
+ * the host causes an unrecoverable state
+ */
+ private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren,
+ Map<Node, AllocationResources> availableResources) {
+ Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(
+ Function.identity(),
+ _x -> Integer.MAX_VALUE
+ ));
+ for (Node host : hosts) {
+ List<Node> children = nodeChildren.get(host);
+ if (children.size() == 0) continue;
+ Map<Node, AllocationResources> resourceMap = new HashMap<>(availableResources);
+ Map<Node, List<Allocation>> containedAllocations = collateAllocations(nodeChildren);
+
+ int timesHostCanBeRemoved = 0;
+ Optional<Node> unallocatedNode;
+ while (timesHostCanBeRemoved < 1000) { // Arbritrary upper bound
+ unallocatedNode = tryAllocateNodes(nodeChildren.get(host), hosts, resourceMap, containedAllocations);
+ if (unallocatedNode.isEmpty()) {
+ timesHostCanBeRemoved++;
+ } else break;
+ }
+ timesNodeCanBeRemoved.put(host, timesHostCanBeRemoved);
+ }
+
+ return timesNodeCanBeRemoved;
+ }
+
+ private List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) {
+ List<Node> overcommittedNodes = new ArrayList<>();
+ for (var entry : availableResources.entrySet()) {
+ var resources = entry.getValue().nodeResources;
+ if (resources.vcpu() < 0 || resources.memoryGb() < 0 || resources.diskGb() < 0) {
+ overcommittedNodes.add(entry.getKey());
+ }
+ }
+ return overcommittedNodes;
+ }
+
+ private Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) {
+ return nodeChildren.entrySet().stream().collect(Collectors.toMap(
+ Map.Entry::getKey,
+ e -> e.getValue().stream()
+ .map(Node::allocation).flatMap(Optional::stream)
+ .collect(Collectors.toList())
+ ));
+ }
+
+ /**
+ * Tests whether it's possible to remove the provided hosts.
+ * Does not mutate any input variable.
+ * @return Empty optional if removal is possible, information on what caused the failure otherwise
+ */
+ private Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts,
+ Map<Node, List<Node>> nodechildren,
+ Map<Node, AllocationResources> availableResources) {
+ var containedAllocations = collateAllocations(nodechildren);
+ var resourceMap = new HashMap<>(availableResources);
+ List<Node> validAllocationTargets = allHosts.stream()
+ .filter(h -> !hostsToRemove.contains(h))
+ .collect(Collectors.toList());
+ if (validAllocationTargets.size() == 0) {
+ return Optional.of(HostRemovalFailure.none());
+ }
+
+ allocationHistory = new AllocationHistory();
+ for (var host : hostsToRemove) {
+ Optional<Node> unallocatedNode = tryAllocateNodes(nodechildren.get(host),
+ validAllocationTargets, resourceMap, containedAllocations, true);
+
+ if (unallocatedNode.isPresent()) {
+ AllocationFailureReasonList failures = collateAllocationFailures(unallocatedNode.get(),
+ validAllocationTargets, resourceMap, containedAllocations);
+ return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(), failures));
+ }
+ }
+ return Optional.empty();
+ }
+
+ /**
+ * Attempts to allocate the listed nodes to a new host, mutating availableResources and containedAllocations,
+ * optionally returning the first node to fail, if one does.
+ * */
+ private Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts,
+ Map<Node, AllocationResources> availableResources,
+ Map<Node, List<Allocation>> containedAllocations) {
+ return tryAllocateNodes(nodes, hosts, availableResources, containedAllocations, false);
+ }
+ private Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts,
+ Map<Node, AllocationResources> availableResources,
+ Map<Node, List<Allocation>> containedAllocations, boolean withHistory) {
+ for (var node : nodes) {
+ var newParent = tryAllocateNode(node, hosts, availableResources, containedAllocations);
+ if (newParent.isEmpty()) {
+ if (withHistory) allocationHistory.addEntry(node, null, 0);
+ return Optional.of(node);
+ }
+ if (withHistory) {
+ long eligibleParents =
+ hosts.stream().filter(h ->
+ !violatesParentHostPolicy(node, h, containedAllocations)
+ && availableResources.get(h).satisfies(AllocationResources.from(node.flavor().resources()))).count();
+ allocationHistory.addEntry(node, newParent.get(), eligibleParents + 1);
+ }
+ }
+ return Optional.empty();
+ }
+
+ /**
+ * @return The parent to which the node was allocated, if it was successfully allocated.
+ */
+ private Optional<Node> tryAllocateNode(Node node, List<Node> hosts,
+ Map<Node, AllocationResources> availableResources,
+ Map<Node, List<Allocation>> containedAllocations) {
+ AllocationResources requiredNodeResources = AllocationResources.from(node.flavor().resources());
+ for (var host : hosts) {
+ var availableHostResources = availableResources.get(host);
+ if (violatesParentHostPolicy(node, host, containedAllocations)) {
+ continue;
+ }
+ if (availableHostResources.satisfies(requiredNodeResources)) {
+ availableResources.put(host, availableHostResources.subtract(requiredNodeResources));
+ if (node.allocation().isPresent()) {
+ containedAllocations.get(host).add(node.allocation().get());
+ }
+ return Optional.of(host);
+ }
+ }
+
+ return Optional.empty();
+ }
+
+ private static boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) {
+ if (node.allocation().isEmpty()) return false;
+ Allocation nodeAllocation = node.allocation().get();
+ for (var allocation : containedAllocations.get(host)) {
+ if (allocation.membership().cluster().equalsIgnoringGroupAndVespaVersion(nodeAllocation.membership().cluster())
+ && allocation.owner().equals(nodeAllocation.owner())) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts,
+ Map<Node, AllocationResources> availableResources,
+ Map<Node, List<Allocation>> containedAllocations) {
+ List<AllocationFailureReason> allocationFailureReasons = new ArrayList<>();
+ for (var host : hosts) {
+ AllocationFailureReason reason = new AllocationFailureReason(host);
+ var availableHostResources = availableResources.get(host);
+ reason.violatesParentHostPolicy = violatesParentHostPolicy(node, host, containedAllocations);
+
+ NodeResources l = availableHostResources.nodeResources;
+ NodeResources r = node.flavor().resources();
+ if (l.vcpu() < r.vcpu()) { reason.insufficientVcpu = true; }
+ if (l.memoryGb() < r.memoryGb()) { reason.insufficientMemoryGb = true; }
+ if (l.diskGb() < r.diskGb()) { reason.insufficientDiskGb = true; }
+ if (r.diskSpeed() != NodeResources.DiskSpeed.any && r.diskSpeed() != l.diskSpeed())
+ { reason.incompatibleDiskSpeed = true; }
+ if (availableHostResources.availableIPs < 1) { reason.insufficientAvailableIPs = true; }
+
+ allocationFailureReasons.add(reason);
+ }
+
+ return new AllocationFailureReasonList(allocationFailureReasons);
+ }
+
+ /**
+ * Contains the list of hosts that, upon being removed, caused an unrecoverable state,
+ * as well as the specific host and tenant which caused it.
+ */
+ public static class HostFailurePath {
+ public List<Node> hostsCausingFailure;
+ public HostRemovalFailure failureReason;
+ }
+
+ /**
+ * Data class used for detailing why removing the given tenant from the given host was unsuccessful.
+ * A failure might not be caused by failing to allocate a specific tenant, in which case the fields
+ * will be empty.
+ */
+ public static class HostRemovalFailure {
+ public Optional<Node> host;
+ public Optional<Node> tenant;
+ public AllocationFailureReasonList failureReasons;
+
+ public static HostRemovalFailure none() {
+ return new HostRemovalFailure(
+ Optional.empty(),
+ Optional.empty(),
+ new AllocationFailureReasonList(List.of()));
+ }
+
+ public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) {
+ return new HostRemovalFailure(
+ Optional.of(host),
+ Optional.of(tenant),
+ failureReasons);
+ }
+
+ private HostRemovalFailure(Optional<Node> host, Optional<Node> tenant, AllocationFailureReasonList failureReasons) {
+ this.host = host;
+ this.tenant = tenant;
+ this.failureReasons = failureReasons;
+ }
+
+ @Override
+ public String toString() {
+ if (host.isEmpty() || tenant.isEmpty()) return "No removal candidates exists.";
+ return String.format(
+ "Failure to remove host %s" +
+ "\n\tNo new host found for tenant %s:" +
+ "\n\t\tSingular Reasons: %s" +
+ "\n\t\tTotal Reasons: %s",
+ this.host.get().hostname(),
+ this.tenant.get().hostname(),
+ this.failureReasons.singularReasonFailures().toString(),
+ this.failureReasons.toString()
+ );
+ }
+ }
+
+ /**
+ * Used to describe the resources required for a tenant, and available to a host.
+ */
+ private static class AllocationResources {
+ NodeResources nodeResources;
+ int availableIPs;
+
+ public static AllocationResources from(NodeResources nodeResources) {
+ return new AllocationResources(nodeResources, 1);
+ }
+
+ public AllocationResources(NodeResources nodeResources, int availableIPs) {
+ this.nodeResources = nodeResources;
+ this.availableIPs = availableIPs;
+ }
+
+ public boolean satisfies(AllocationResources other) {
+ if (!this.nodeResources.satisfies(other.nodeResources)) return false;
+ return this.availableIPs >= other.availableIPs;
+ }
+
+ public AllocationResources subtract(AllocationResources other) {
+ return new AllocationResources(this.nodeResources.subtract(other.nodeResources), this.availableIPs - other.availableIPs);
+ }
+ }
+
+ /**
+ * Keeps track of the reason why a host rejected an allocation.
+ */
+ private static class AllocationFailureReason {
+ Node host;
+ public AllocationFailureReason (Node host) {
+ this.host = host;
+ }
+ public boolean insufficientVcpu = false;
+ public boolean insufficientMemoryGb = false;
+ public boolean insufficientDiskGb = false;
+ public boolean incompatibleDiskSpeed = false;
+ public boolean insufficientAvailableIPs = false;
+ public boolean violatesParentHostPolicy = false;
+
+ public int numberOfReasons() {
+ int n = 0;
+ if (insufficientVcpu) n++;
+ if (insufficientMemoryGb) n++;
+ if (insufficientDiskGb) n++;
+ if (incompatibleDiskSpeed) n++;
+ if (insufficientAvailableIPs) n++;
+ if (violatesParentHostPolicy) n++;
+ return n;
+ }
+
+ @Override
+ public String toString() {
+ List<String> reasons = new ArrayList<>();
+ if (insufficientVcpu) reasons.add("insufficientVcpu");
+ if (insufficientMemoryGb) reasons.add("insufficientMemoryGb");
+ if (insufficientDiskGb) reasons.add("insufficientDiskGb");
+ if (incompatibleDiskSpeed) reasons.add("incompatibleDiskSpeed");
+ if (insufficientAvailableIPs) reasons.add("insufficientAvailableIPs");
+ if (violatesParentHostPolicy) reasons.add("violatesParentHostPolicy");
+
+ return String.format("[%s]", String.join(", ", reasons));
+ }
+ }
+
+ /**
+ * Provides convenient methods for tallying failures.
+ */
+ public static class AllocationFailureReasonList {
+ private List<AllocationFailureReason> allocationFailureReasons;
+ public AllocationFailureReasonList(List<AllocationFailureReason> allocationFailureReasons) {
+ this.allocationFailureReasons = allocationFailureReasons;
+ }
+
+ public long insufficientVcpu() { return allocationFailureReasons.stream().filter(r -> r.insufficientVcpu).count(); }
+ public long insufficientMemoryGb() { return allocationFailureReasons.stream().filter(r -> r.insufficientMemoryGb).count(); }
+ public long insufficientDiskGb() { return allocationFailureReasons.stream().filter(r -> r.insufficientDiskGb).count(); }
+ public long incompatibleDiskSpeed() { return allocationFailureReasons.stream().filter(r -> r.incompatibleDiskSpeed).count(); }
+ public long insufficientAvailableIps() { return allocationFailureReasons.stream().filter(r -> r.insufficientAvailableIPs).count(); }
+ public long violatesParentHostPolicy() { return allocationFailureReasons.stream().filter(r -> r.violatesParentHostPolicy).count(); }
+
+ public AllocationFailureReasonList singularReasonFailures() {
+ return new AllocationFailureReasonList(allocationFailureReasons.stream()
+ .filter(reason -> reason.numberOfReasons() == 1).collect(Collectors.toList()));
+ }
+ public AllocationFailureReasonList multipleReasonFailures() {
+ return new AllocationFailureReasonList(allocationFailureReasons.stream()
+ .filter(reason -> reason.numberOfReasons() > 1).collect(Collectors.toList()));
+ }
+ public long size() {
+ return allocationFailureReasons.size();
+ }
+ @Override
+ public String toString() {
+ return String.format("CPU (%3d), Memory (%3d), Disk size (%3d), Disk speed (%3d), IP (%3d), Parent-Host Policy (%3d)",
+ insufficientVcpu(), insufficientMemoryGb(), insufficientDiskGb(),
+ incompatibleDiskSpeed(), insufficientAvailableIps(), violatesParentHostPolicy());
+ }
+ }
+
+ public static class AllocationHistory {
+ public static class Entry {
+ public Node tenant;
+ public Node newParent;
+ public long eligibleParents;
+
+ public Entry(Node tenant, Node newParent, long eligibleParents) {
+ this.tenant = tenant;
+ this.newParent = newParent;
+ this.eligibleParents = eligibleParents;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%-20s %-65s -> %15s [%3d valid]",
+ tenant.hostname().replaceFirst("\\..+", ""),
+ tenant.flavor().resources(),
+ newParent == null ? "x" : newParent.hostname().replaceFirst("\\..+", ""),
+ this.eligibleParents
+ );
+ }
+ }
+
+ public List<Entry> historyEntries;
+
+ public AllocationHistory() {
+ this.historyEntries = new ArrayList<>();
+ }
+
+ public void addEntry(Node tenant, Node newParent, long eligibleParents) {
+ this.historyEntries.add(new Entry(tenant, newParent, eligibleParents));
+ }
+
+ public Set<String> oldParents() {
+ Set<String> oldParents = new HashSet<>();
+ for (var entry : historyEntries)
+ entry.tenant.parentHostname().ifPresent(oldParents::add);
+ return oldParents;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder out = new StringBuilder();
+
+ String currentParent = "";
+ for (var entry : historyEntries) {
+ String parentName = entry.tenant.parentHostname().orElseThrow();
+ if (!parentName.equals(currentParent)) {
+ currentParent = parentName;
+ out.append(parentName).append("\n");
+ }
+ out.append(entry.toString()).append("\n");
+ }
+
+ return out.toString();
+ }
+ }
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java
index 44d43081ef2..3c47e418b94 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainer.java
@@ -1,23 +1,15 @@
package com.yahoo.vespa.hosted.provision.maintenance;
-import com.yahoo.config.provision.NodeResources;
-import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
import com.yahoo.log.LogLevel;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import java.time.Duration;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
import java.util.logging.Logger;
import java.util.stream.Collectors;
-import com.yahoo.vespa.hosted.provision.node.Allocation;
-
import java.util.*;
-import java.util.function.Function;
/**
* Performs analysis on the node repository to produce metrics that pertain to the capacity of the node repository.
@@ -29,7 +21,6 @@ import java.util.function.Function;
* @author mgimle
*/
public class CapacityReportMaintainer extends Maintainer {
-
private final Metric metric;
private final NodeRepository nodeRepository;
private static final Logger log = Logger.getLogger(CapacityReportMaintainer.class.getName());
@@ -44,403 +35,20 @@ public class CapacityReportMaintainer extends Maintainer {
@Override
protected void maintain() {
- metric.set("overcommittedHosts", countOvercommittedHosts(), null);
-
- Optional<HostFailurePath> failurePath = worstCaseHostLossLeadingToFailure();
- if (failurePath.isPresent()) {
- int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
- metric.set("spareHostCapacity", worstCaseHostLoss - 1, null);
- }
- }
-
- protected Optional<HostFailurePath> worstCaseHostLossLeadingToFailure() {
- List<Node> hosts = getHosts();
- List<Node> tenants = getTenants(hosts);
- Map<String, Node> nodeMap = constructHostnameToNodeMap(hosts);
- Map<Node, List<Node>> nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap);
- Map<Node, AllocationResources> availableResources = constructAvailableResourcesMap(hosts, nodeChildren);
-
- Map<Node, Integer> timesNodeCanBeRemoved = computeMaximalRepeatedRemovals(hosts, nodeChildren, availableResources);
- return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved, hosts, nodeChildren, availableResources);
- }
-
- // We only care about nodes in one of these states.
- private Node.State[] relevantNodeStates = {
- Node.State.active,
- Node.State.inactive,
- Node.State.dirty,
- Node.State.provisioned,
- Node.State.ready,
- Node.State.reserved
- };
-
- private List<Node> getHosts() {
- return nodeRepository.getNodes(NodeType.host, relevantNodeStates);
- }
-
- private List<Node> getTenants(List<Node> hosts) {
- var parentNames = hosts.stream().map(Node::hostname).collect(Collectors.toSet());
- return nodeRepository.getNodes(NodeType.tenant, relevantNodeStates).stream()
- .filter(t -> parentNames.contains(t.parentHostname().orElse("")))
- .collect(Collectors.toList());
- }
-
- private Optional<HostFailurePath> greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts,
- Map<Node, List<Node>> nodeChildren,
- Map<Node, AllocationResources> availableResources) {
- if (hosts.size() == 0) return Optional.empty();
- List<Node> parentRemovalPriorityList = heuristic.entrySet().stream()
- .sorted(Comparator.comparingInt(Map.Entry::getValue))
- .map(Map.Entry::getKey)
- .collect(Collectors.toList());
- for (int i = 1; i <= parentRemovalPriorityList.size(); i++) {
- List<Node> hostsToRemove = parentRemovalPriorityList.subList(0, i);
- var hostRemovalFailure = findHostRemovalFailure(hostsToRemove, hosts, nodeChildren, availableResources);
- if (hostRemovalFailure.isPresent()) {
- HostFailurePath failurePath = new HostFailurePath();
- failurePath.hostsCausingFailure = hostsToRemove;
- failurePath.failureReason = hostRemovalFailure.get();
- return Optional.of(failurePath);
+ if (!nodeRepository.zone().cloud().value().equals("aws")) {
+ CapacityChecker capacityChecker = new CapacityChecker(this.nodeRepository);
+ List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts();
+ if (overcommittedHosts.size() != 0) {
+ log.log(LogLevel.WARNING, String.format("%d nodes are overcommitted! [ %s ]", overcommittedHosts.size(),
+ overcommittedHosts.stream().map(Node::hostname).collect(Collectors.joining(", "))));
}
- }
-
- throw new IllegalStateException("No path to failure found. This should be impossible!");
- }
-
- protected int countOvercommittedHosts() {
- List<Node> hosts = getHosts();
- List<Node> tenants = getTenants(hosts);
- var nodeMap = constructHostnameToNodeMap(hosts);
- var nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap);
- var availableResources = constructAvailableResourcesMap(hosts, nodeChildren);
-
- List<Node> overcommittedNodes = findOvercommittedNodes(availableResources);
- if (overcommittedNodes.size() != 0) {
- log.log(LogLevel.WARNING, String.format("%d nodes are overcommitted! [ %s ]", overcommittedNodes.size(),
- overcommittedNodes.stream().map(Node::hostname).collect(Collectors.joining(", "))));
- }
- return overcommittedNodes.size();
- }
-
- private Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) {
- return nodes.stream().collect(Collectors.toMap(Node::hostname, n -> n));
- }
-
- private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
- Map<Node, List<Node>> nodeChildren = tenants.stream()
- .filter(n -> n.parentHostname().isPresent())
- .filter(n -> hostnameToNode.containsKey(n.parentHostname().get()))
- .collect(Collectors.groupingBy(
- n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
-
- for (var host : hosts) nodeChildren.putIfAbsent(host, List.of());
-
- return nodeChildren;
- }
-
- private Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) {
- Map<Node, AllocationResources> availableResources = new HashMap<>();
- for (var host : hosts) {
- NodeResources hostResources = host.flavor().resources();
- int occupiedIps = 0;
- Set<String> ipPool = host.ipAddressPool().asSet();
- for (var child : nodeChildren.get(host)) {
- hostResources = hostResources.subtract(child.flavor().resources());
- occupiedIps += child.ipAddresses().stream().filter(ipPool::contains).count();
- }
- availableResources.put(host, new AllocationResources(hostResources, host.ipAddressPool().asSet().size() - occupiedIps));
- }
-
- return availableResources;
- }
-
- /**
- * Computes a heuristic for each host, with a lower score indicating a higher perceived likelihood that removing
- * the host causes an unrecoverable state
- */
- private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren,
- Map<Node, AllocationResources> availableResources) {
- Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(
- Function.identity(),
- _x -> Integer.MAX_VALUE
- ));
- for (Node host : hosts) {
- List<Node> children = nodeChildren.get(host);
- if (children.size() == 0) continue;
- Map<Node, AllocationResources> resourceMap = new HashMap<>(availableResources);
- Map<Node, List<Allocation>> containedAllocations = collateAllocations(nodeChildren);
-
- int timesHostCanBeRemoved = 0;
- Optional<Node> unallocatedTenant;
- while (timesHostCanBeRemoved < 1000) { // Arbritrary upper bound
- unallocatedTenant = tryAllocateNodes(nodeChildren.get(host), hosts, resourceMap, containedAllocations);
- if (unallocatedTenant.isEmpty()) {
- timesHostCanBeRemoved++;
- } else break;
- }
- timesNodeCanBeRemoved.put(host, timesHostCanBeRemoved);
- }
-
- return timesNodeCanBeRemoved;
- }
-
- private List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) {
- List<Node> overcommittedNodes = new ArrayList<>();
- for (var entry : availableResources.entrySet()) {
- var resources = entry.getValue().nodeResources;
- if (resources.vcpu() < 0 || resources.memoryGb() < 0 || resources.diskGb() < 0) {
- overcommittedNodes.add(entry.getKey());
- }
- }
- return overcommittedNodes;
- }
-
- private Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) {
- return nodeChildren.entrySet().stream().collect(Collectors.toMap(
- Map.Entry::getKey,
- e -> e.getValue().stream()
- .map(Node::allocation).flatMap(Optional::stream)
- .collect(Collectors.toList())
- ));
- }
-
- /**
- * Tests whether it's possible to remove the provided hosts.
- * Does not mutate any input variable.
- * @return Empty optional if removal is possible, information on what caused the failure otherwise
- */
- private Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts,
- Map<Node, List<Node>> nodechildren,
- Map<Node, AllocationResources> availableResources) {
- var containedAllocations = collateAllocations(nodechildren);
- var resourceMap = new HashMap<>(availableResources);
- List<Node> validAllocationTargets = allHosts.stream()
- .filter(h -> !hostsToRemove.contains(h))
- .collect(Collectors.toList());
- if (validAllocationTargets.size() == 0) {
- return Optional.of(HostRemovalFailure.none());
- }
-
- for (var host : hostsToRemove) {
- Optional<Node> unallocatedNode = tryAllocateNodes(nodechildren.get(host),
- validAllocationTargets, resourceMap, containedAllocations);
-
- if (unallocatedNode.isPresent()) {
- AllocationFailureReasonList failures = collateAllocationFailures(unallocatedNode.get(),
- validAllocationTargets, resourceMap, containedAllocations);
- return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(), failures));
- }
- }
- return Optional.empty();
- }
+ metric.set("overcommittedHosts", overcommittedHosts.size(), null);
- /**
- * Attempts to allocate the listed nodes to a new host, mutating availableResources and containedAllocations,
- * optionally returning the first node to fail, if one does.
- * */
- private Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts,
- Map<Node, AllocationResources> availableResources,
- Map<Node, List<Allocation>> containedAllocations) {
- for (var node : nodes) {
- if (!tryAllocateNode(node, hosts, availableResources, containedAllocations)) {
- return Optional.of(node);
+ Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure();
+ if (failurePath.isPresent()) {
+ int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
+ metric.set("spareHostCapacity", worstCaseHostLoss - 1, null);
}
}
- return Optional.empty();
- }
-
- private boolean tryAllocateNode(Node node, List<Node> hosts,
- Map<Node, AllocationResources> availableResources,
- Map<Node, List<Allocation>> containedAllocations) {
- AllocationResources requiredNodeResources = AllocationResources.from(node.flavor().resources());
- for (var host : hosts) {
- var availableHostResources = availableResources.get(host);
- if (violatesParentHostPolicy(node, host, containedAllocations)) {
- continue;
- }
- if (availableHostResources.satisfies(requiredNodeResources)) {
- availableResources.put(host, availableHostResources.subtract(requiredNodeResources));
- if (node.allocation().isPresent()) {
- containedAllocations.get(host).add(node.allocation().get());
- }
- return true;
- }
- }
-
- return false;
- }
-
- private boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) {
- if (node.allocation().isEmpty()) return false;
- Allocation nodeAllocation = node.allocation().get();
- for (var allocation : containedAllocations.get(host)) {
- if (allocation.membership().cluster().equalsIgnoringGroupAndVespaVersion(nodeAllocation.membership().cluster())
- && allocation.owner().equals(nodeAllocation.owner())) {
- return true;
- }
- }
- return false;
- }
-
- private AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts,
- Map<Node, AllocationResources> availableResources,
- Map<Node, List<Allocation>> containedAllocations) {
- List<AllocationFailureReason> allocationFailureReasons = new ArrayList<>();
- for (var host : hosts) {
- AllocationFailureReason reason = new AllocationFailureReason(host);
- var availableHostResources = availableResources.get(host);
- reason.violatesParentHostPolicy = violatesParentHostPolicy(node, host, containedAllocations);
-
- NodeResources l = availableHostResources.nodeResources;
- NodeResources r = node.flavor().resources();
- if (l.vcpu() < r.vcpu()) { reason.insufficientVcpu = true; }
- if (l.memoryGb() < r.memoryGb()) { reason.insufficientMemoryGb = true; }
- if (l.diskGb() < r.diskGb()) { reason.insufficientDiskGb = true; }
- if (r.diskSpeed() != NodeResources.DiskSpeed.any && r.diskSpeed() != l.diskSpeed())
- { reason.incompatibleDiskSpeed = true; }
- if (availableHostResources.availableIPs < 1) { reason.insufficientAvailableIPs = true; }
-
- allocationFailureReasons.add(reason);
- }
-
- return new AllocationFailureReasonList(allocationFailureReasons);
- }
-
- /**
- * Contains the list of hosts that, upon being removed, caused an unrecoverable state,
- * as well as the specific host and tenant which caused it.
- */
- public static class HostFailurePath {
- List<Node> hostsCausingFailure;
- HostRemovalFailure failureReason;
- }
-
- /**
- * Data class used for detailing why removing the given tenant from the given host was unsuccessful.
- * A failure might not be caused by failing to allocate a specific tenant, in which case the fields
- * will be empty.
- */
- public static class HostRemovalFailure {
- Optional<Node> host;
- Optional<Node> tenant;
- AllocationFailureReasonList failureReasons;
- public static HostRemovalFailure none() {
- return new HostRemovalFailure(
- Optional.empty(),
- Optional.empty(),
- new AllocationFailureReasonList(List.of()));
- }
- public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) {
- return new HostRemovalFailure(
- Optional.of(host),
- Optional.of(tenant),
- failureReasons);
- }
- private HostRemovalFailure(Optional<Node> host, Optional<Node> tenant, AllocationFailureReasonList failureReasons) {
- this.host = host;
- this.tenant = tenant;
- this.failureReasons = failureReasons;
- }
- }
-
- /**
- * Used to describe the resources required for a tenant, and available to a host.
- */
- private static class AllocationResources {
- NodeResources nodeResources;
- int availableIPs;
-
- public static AllocationResources from(NodeResources nodeResources) {
- return new AllocationResources(nodeResources, 1);
- }
-
- public AllocationResources(NodeResources nodeResources, int availableIPs) {
- this.nodeResources = nodeResources;
- this.availableIPs = availableIPs;
- }
-
- public boolean satisfies(AllocationResources other) {
- if (!this.nodeResources.satisfies(other.nodeResources)) return false;
- return this.availableIPs >= other.availableIPs;
- }
-
- public AllocationResources subtract(AllocationResources other) {
- return new AllocationResources(this.nodeResources.subtract(other.nodeResources), this.availableIPs - other.availableIPs);
- }
- }
-
- /**
- * Keeps track of the reason why a host rejected an allocation.
- */
- private class AllocationFailureReason {
- Node host;
- public AllocationFailureReason (Node host) {
- this.host = host;
- }
- public boolean insufficientVcpu = false;
- public boolean insufficientMemoryGb = false;
- public boolean insufficientDiskGb = false;
- public boolean incompatibleDiskSpeed = false;
- public boolean insufficientAvailableIPs = false;
- public boolean violatesParentHostPolicy = false;
-
- public int numberOfReasons() {
- int n = 0;
- if (insufficientVcpu) n++;
- if (insufficientMemoryGb) n++;
- if (insufficientDiskGb) n++;
- if (incompatibleDiskSpeed) n++;
- if (insufficientAvailableIPs) n++;
- if (violatesParentHostPolicy) n++;
- return n;
- }
-
- @Override
- public String toString() {
- List<String> reasons = new ArrayList<>();
- if (insufficientVcpu) reasons.add("insufficientVcpu");
- if (insufficientMemoryGb) reasons.add("insufficientMemoryGb");
- if (insufficientDiskGb) reasons.add("insufficientDiskGb");
- if (incompatibleDiskSpeed) reasons.add("incompatibleDiskSpeed");
- if (insufficientAvailableIPs) reasons.add("insufficientAvailableIPs");
- if (violatesParentHostPolicy) reasons.add("violatesParentHostPolicy");
-
- return String.format("[%s]", String.join(", ", reasons));
- }
- }
-
- /**
- * Provides convenient methods for tallying failures.
- */
- public static class AllocationFailureReasonList {
- private List<AllocationFailureReason> allocationFailureReasons;
- public AllocationFailureReasonList(List<AllocationFailureReason> allocationFailureReasons) {
- this.allocationFailureReasons = allocationFailureReasons;
- }
-
- long insufficientVcpu() { return allocationFailureReasons.stream().filter(r -> r.insufficientVcpu).count(); }
- long insufficientMemoryGb() { return allocationFailureReasons.stream().filter(r -> r.insufficientMemoryGb).count(); }
- long insufficientDiskGb() { return allocationFailureReasons.stream().filter(r -> r.insufficientDiskGb).count(); }
- long incompatibleDiskSpeed() { return allocationFailureReasons.stream().filter(r -> r.incompatibleDiskSpeed).count(); }
- long insufficientAvailableIps() { return allocationFailureReasons.stream().filter(r -> r.insufficientAvailableIPs).count(); }
- long violatesParentHostPolicy() { return allocationFailureReasons.stream().filter(r -> r.violatesParentHostPolicy).count(); }
-
- public AllocationFailureReasonList singularReasonFailures() {
- return new AllocationFailureReasonList(allocationFailureReasons.stream()
- .filter(reason -> reason.numberOfReasons() == 1).collect(Collectors.toList()));
- }
- public AllocationFailureReasonList multipleReasonFailures() {
- return new AllocationFailureReasonList(allocationFailureReasons.stream()
- .filter(reason -> reason.numberOfReasons() > 1).collect(Collectors.toList()));
- }
- public long size() {
- return allocationFailureReasons.size();
- }
- @Override
- public String toString() {
- return String.format("CPU (%3d), Memory (%3d), Disk size (%3d), Disk speed (%3d), IP (%3d), Parent-Host Policy (%3d)",
- insufficientVcpu(), insufficientMemoryGb(), insufficientDiskGb(),
- incompatibleDiskSpeed(), insufficientAvailableIps(), violatesParentHostPolicy());
- }
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index f661977d933..bb1ff637f08 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -82,7 +82,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
new HostProvisionMaintainer(nodeRepository, durationFromEnv("host_provisioner_interval").orElse(defaults.hostProvisionerInterval), hostProvisioner, flagSource));
hostDeprovisionMaintainer = provisionServiceProvider.getHostProvisioner().map(hostProvisioner ->
new HostDeprovisionMaintainer(nodeRepository, durationFromEnv("host_deprovisioner_interval").orElse(defaults.hostDeprovisionerInterval), hostProvisioner, flagSource));
- capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, durationFromEnv("alert_interval").orElse(defaults.nodeAlerterInterval));
+ capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, durationFromEnv("capacity_report_interval").orElse(defaults.capacityReportInterval));
// The DuperModel is filled with infrastructure applications by the infrastructure provisioner, so explicitly run that now
infrastructureProvisioner.maintain();
@@ -143,7 +143,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
private final Duration dirtyExpiry;
private final Duration provisionedExpiry;
private final Duration rebootInterval;
- private final Duration nodeAlerterInterval;
+ private final Duration capacityReportInterval;
private final Duration metricsInterval;
private final Duration retiredInterval;
private final Duration infrastructureProvisionInterval;
@@ -162,7 +162,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
failedExpirerInterval = Duration.ofMinutes(10);
provisionedExpiry = Duration.ofHours(4);
rebootInterval = Duration.ofDays(30);
- nodeAlerterInterval = Duration.ofHours(1);
+ capacityReportInterval = Duration.ofHours(1);
metricsInterval = Duration.ofMinutes(1);
infrastructureProvisionInterval = Duration.ofMinutes(1);
throttlePolicy = NodeFailer.ThrottlePolicy.hosted;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java
index a486f8619c5..793789def2c 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTest.java
@@ -8,20 +8,19 @@ import org.junit.Test;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.*;
+
import static org.junit.Assert.fail;
import static org.junit.Assert.*;
/**
* @author mgimle
*/
-public class CapacityReportMaintainerTest {
- private CapacityReportMaintainerTester tester;
- private CapacityReportMaintainer capacityReporter;
+public class CapacityCheckerTest {
+ private CapacityCheckerTester tester;
@Before
public void setup() {
- tester = new CapacityReportMaintainerTester();
- capacityReporter = tester.makeCapacityReportMaintainer();
+ tester = new CapacityCheckerTester();
}
@Test
@@ -30,8 +29,11 @@ public class CapacityReportMaintainerTest {
tester.cleanRepository();
tester.restoreNodeRepositoryFromJsonFile(Paths.get(path));
- var failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ var failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
if (failurePath.isPresent()) {
+// System.out.println( tester.capacityChecker.allocationHistory );
+// System.out.println( failurePath.get().failureReason );
+ System.out.println("Worst case host loss : " + failurePath.get().hostsCausingFailure.size());
assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.get().hostsCausingFailure));
} else fail();
}
@@ -41,7 +43,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(7, 4,
10, new NodeResources(-1, 10, 100), 10,
0, new NodeResources(1, 10, 100), 10);
- int overcommittedHosts = capacityReporter.countOvercommittedHosts();
+ int overcommittedHosts = tester.capacityChecker.findOvercommittedHosts().size();
assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), overcommittedHosts);
}
@@ -50,14 +52,14 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 1,
0, new NodeResources(1, 10, 100), 10,
0, new NodeResources(1, 10, 100), 10);
- var failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ var failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertFalse("Computing worst case host loss with no hosts should return an empty optional.", failurePath.isPresent());
// Odd edge case that should never be able to occur in prod
tester.createNodes(1, 10,
10, new NodeResources(10, 1000, 10000), 100,
1, new NodeResources(10, 1000, 10000), 100);
- failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
assertTrue("Computing worst case host loss if all hosts have to be removed should result in an non-empty failureReason with empty nodes.",
failurePath.get().failureReason.tenant.isEmpty() && failurePath.get().failureReason.host.isEmpty());
@@ -66,7 +68,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(3, 30,
10, new NodeResources(0, 0, 10000), 1000,
0, new NodeResources(0, 0, 0), 0);
- failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
@@ -81,7 +83,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 10,
10, new NodeResources(10, 1000, 10000), 1,
10, new NodeResources(10, 1000, 10000), 1);
- var failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ var failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
@@ -96,7 +98,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 10,
10, new NodeResources(1, 100, 1000), 100,
10, new NodeResources(0, 100, 1000), 100);
- var failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ var failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
@@ -107,7 +109,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 10,
10, new NodeResources(10, 1, 1000), 100,
10, new NodeResources(10, 0, 1000), 100);
- failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
@@ -118,7 +120,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 10,
10, new NodeResources(10, 100, 10), 100,
10, new NodeResources(10, 100, 0), 100);
- failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
@@ -130,7 +132,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 10, List.of(new NodeResources(1, 10, 100)),
10, new NodeResources(0, 0, 0), 100,
10, new NodeResources(10, 1000, 10000, NodeResources.DiskSpeed.slow), 100);
- failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
@@ -146,7 +148,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 1,
10, new NodeResources(1, 100, 1000), 100,
10, new NodeResources(10, 1000, 10000), 100);
- var failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ var failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
@@ -157,7 +159,7 @@ public class CapacityReportMaintainerTest {
tester.createNodes(1, 2,
10, new NodeResources(10, 100, 1000), 1,
0, new NodeResources(0, 0, 0), 0);
- failurePath = capacityReporter.worstCaseHostLossLeadingToFailure();
+ failurePath = tester.capacityChecker.worstCaseHostLossLeadingToFailure();
assertTrue(failurePath.isPresent());
if (failurePath.get().failureReason.tenant.isPresent()) {
var failureReasons = failurePath.get().failureReason.failureReasons;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainerTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java
index ccea4691f10..f5fd0e0526d 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityReportMaintainerTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/CapacityCheckerTester.java
@@ -20,7 +20,6 @@ import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.time.Duration;
import java.time.Instant;
import java.util.*;
import java.util.stream.Collectors;
@@ -29,22 +28,23 @@ import java.util.stream.IntStream;
/**
* @author mgimle
*/
-public class CapacityReportMaintainerTester {
+public class CapacityCheckerTester {
public static final Zone zone = new Zone(Environment.prod, RegionName.from("us-east"));
// Components with state
public final ManualClock clock = new ManualClock();
public final NodeRepository nodeRepository;
+ public CapacityChecker capacityChecker;
- CapacityReportMaintainerTester() {
+ CapacityCheckerTester() {
Curator curator = new MockCurator();
NodeFlavors f = new NodeFlavors(new FlavorConfigBuilder().build());
nodeRepository = new NodeRepository(f, curator, clock, zone, new MockNameResolver().mockAnyLookup(),
DockerImage.fromString("docker-registry.domain.tld:8080/dist/vespa"), true);
}
- CapacityReportMaintainer makeCapacityReportMaintainer() {
- return new CapacityReportMaintainer(nodeRepository, new MetricsReporterTest.TestMetric(), Duration.ofDays(1));
+ private void updateCapacityChecker() {
+ this.capacityChecker = new CapacityChecker(this.nodeRepository);
}
List<NodeModel> createDistinctChildren(int amount, List<NodeResources> childResources) {
@@ -167,9 +167,9 @@ public class CapacityReportMaintainerTester {
nodes.addAll(createEmptyHosts(numHosts, numEmptyHosts, emptyHostExcessCapacity, emptyHostExcessIps));
nodeRepository.addNodes(nodes);
+ updateCapacityChecker();
}
-
NodeResources containingNodeResources(List<NodeResources> resources, NodeResources excessCapacity) {
NodeResources usedByChildren = resources.stream()
.reduce(new NodeResources(0, 0, 0), NodeResources::add);
@@ -278,6 +278,7 @@ public class CapacityReportMaintainerTester {
}
nodeRepository.addNodes(nodes);
+ updateCapacityChecker();
}
void cleanRepository() {