summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java142
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java122
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java14
3 files changed, 171 insertions, 107 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java
index 3d4e0a32128..1e26c359f36 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java
@@ -1,5 +1,6 @@
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
@@ -19,6 +20,16 @@ import com.yahoo.vespa.hosted.provision.node.Allocation;
import java.util.*;
import java.util.function.Function;
+/**
+ * Performs analysis on the node repository to produce metrics that can be used for alerts if the repository is in an
+ * undesirable state.
+ * These metrics include:
+ * Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's
+ * unable to find a new home for orphaned tenants.
+ * Overcommitted hosts, which tracks if there are any hosts whose capacity is less than the sum of its children's.
+ *
+ * @author mgimle
+ */
public class NodeAlerter extends Maintainer {
private final Metric metric;
@@ -30,26 +41,31 @@ public class NodeAlerter extends Maintainer {
Duration interval) {
super(nodeRepository, interval);
this.nodeRepository = nodeRepository;
- this.metric = metric;
+ this.metric = Objects.requireNonNull(metric);
}
@Override
protected void maintain() {
- if (metric != null) {
- int worstCaseHostLoss = worstCaseHostLossLeadingToFailure().hostsCausingFailure.size();
+ Optional<HostFailurePath> failurePath = worstCaseHostLossLeadingToFailure();
+ if (failurePath.isPresent()) {
+ int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
metric.set("hostedVespa.spareHostCapacity", worstCaseHostLoss - 1, null);
- metric.set("hostedVespa.overcommittedNodes", countOvercommittedNodes(), null);
+ metric.set("hostedVespa.overcommittedHosts", countOvercommittedHosts(), null);
}
}
- class HostFailurePath {
+ /**
+ * Contains the list of hosts that, upon being removed, caused an unrecoverable state,
+ * as well as the specific host and tenant which caused it.
+ */
+ public static class HostFailurePath {
List<Node> hostsCausingFailure;
HostRemovalFailure failureReason;
}
- HostFailurePath worstCaseHostLossLeadingToFailure() {
- List<Node> tenants = nodeRepository.getNodes(NodeType.tenant);
- List<Node> hosts = nodeRepository.getNodes(NodeType.host);
+ public Optional<HostFailurePath> worstCaseHostLossLeadingToFailure() {
+ List<Node> hosts = filterHosts(nodeRepository.getNodes(NodeType.host));
+ List<Node> tenants = filterTenants(nodeRepository.getNodes(NodeType.tenant), hosts);
Map<String, Node> nodeMap = constructHostnameToNodeMap(hosts);
Map<Node, List<Node>> nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap);
Map<Node, AllocationResources> availableResources = constructAvailableResourcesMap(hosts, nodeChildren);
@@ -58,10 +74,26 @@ public class NodeAlerter extends Maintainer {
return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved, hosts, nodeChildren, availableResources);
}
- HostFailurePath greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts,
+ private List<Node> filterTenants(List<Node> tenants, List<Node> hosts) {
+ var parentNames = hosts.stream().map(Node::hostname).collect(Collectors.toSet());
+ return tenants.stream()
+ .filter((t -> t.flavor().getType() != Flavor.Type.BARE_METAL
+ && t.state() != Node.State.failed
+ && t.state() != Node.State.parked))
+ .filter(t -> parentNames.contains(t.parentHostname().orElse("")))
+ .collect(Collectors.toList());
+ }
+
+ private List<Node> filterHosts(List<Node> hosts) {
+ return hosts.stream()
+ .filter(h -> h.state() != Node.State.failed && h.state() != Node.State.parked)
+ .collect(Collectors.toList());
+ }
+
+ private Optional<HostFailurePath> greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts,
Map<Node, List<Node>> nodeChildren,
Map<Node, AllocationResources> availableResources) {
- if (hosts.size() == 0) return null;
+ if (hosts.size() == 0) return Optional.empty();
List<Node> parentRemovalPriorityList = heuristic.entrySet().stream()
.sorted(Comparator.comparingInt(Map.Entry::getValue))
.map(Map.Entry::getKey)
@@ -73,16 +105,16 @@ public class NodeAlerter extends Maintainer {
HostFailurePath failurePath = new HostFailurePath();
failurePath.hostsCausingFailure = hostsToRemove;
failurePath.failureReason = hostRemovalFailure.get();
- return failurePath;
+ return Optional.of(failurePath);
}
}
throw new IllegalStateException("No path to failure found. This should be impossible!");
}
- int countOvercommittedNodes() {
- List<Node> tenants = nodeRepository.getNodes(NodeType.tenant);
- List<Node> hosts = nodeRepository.getNodes(NodeType.host);
+ int countOvercommittedHosts() {
+ List<Node> hosts = filterHosts(nodeRepository.getNodes(NodeType.host));
+ List<Node> tenants = filterTenants(nodeRepository.getNodes(NodeType.tenant), hosts);
var nodeMap = constructHostnameToNodeMap(hosts);
var nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap);
var availableResources = constructAvailableResourcesMap(hosts, nodeChildren);
@@ -95,26 +127,25 @@ public class NodeAlerter extends Maintainer {
return overcommittedNodes.size();
}
- Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) {
+ private Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) {
return nodes.stream().collect(Collectors.toMap(Node::hostname, n -> n));
}
- Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
- Map<Node, List<Node>> nodeChildren = tenants.stream()
+ private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
+ Map<Optional<Node>, List<Node>> possibleNodeChildren = tenants.stream()
.filter(n -> n.parentHostname().isPresent())
.collect(Collectors.groupingBy(
- n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
- if (nodeChildren.get(null) != null) {
- List<Node> hasNullParents = nodeChildren.get(null);
- throw new IllegalStateException("Hostname(s) not present in hostname map : " +
- hasNullParents.stream().map(n -> n.parentHostname().orElseThrow()).collect(Collectors.joining(", ")));
- }
+ n -> Optional.ofNullable(hostnameToNode.get(n.parentHostname().orElseThrow()))));
+ possibleNodeChildren.remove(Optional.<Node>empty());
+ Map<Node, List<Node>> nodeChildren = possibleNodeChildren.entrySet().stream()
+ .collect(Collectors.toMap(e -> e.getKey().orElseThrow(), Map.Entry::getValue));
+
for (var host : hosts) nodeChildren.putIfAbsent(host, List.of());
return nodeChildren;
}
- Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) {
+ private Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) {
Map<Node, AllocationResources> availableResources = new HashMap<>();
for (var host : hosts) {
NodeResources hostResources = host.flavor().resources();
@@ -134,7 +165,7 @@ public class NodeAlerter extends Maintainer {
* Computes a heuristic for each host, with a lower score indicating a higher perceived likelihood that removing
* the host causes an unrecoverable state
*/
- Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren,
+ private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren,
Map<Node, AllocationResources> availableResources) {
Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(
Function.identity(),
@@ -158,7 +189,7 @@ public class NodeAlerter extends Maintainer {
return timesNodeCanBeRemoved;
}
- List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) {
+ private List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) {
List<Node> overcommittedNodes = new ArrayList<>();
for (var entry : availableResources.entrySet()) {
var resources = entry.getValue().nodeResources;
@@ -169,7 +200,7 @@ public class NodeAlerter extends Maintainer {
return overcommittedNodes;
}
- Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) {
+ private Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) {
return nodeChildren.entrySet().stream().collect(Collectors.toMap(
Map.Entry::getKey,
e -> e.getValue().stream()
@@ -178,18 +209,30 @@ public class NodeAlerter extends Maintainer {
));
}
- class HostRemovalFailure {
+ /**
+ * Data class used for detailing why removing the given tenant from the given host was unsuccessful.
+ * A failure might not be caused by failing to allocate a specific tenant, in which case the fields
+ * will be empty.
+ */
+ public static class HostRemovalFailure {
Optional<Node> host;
Optional<Node> tenant;
AllocationFailureReasonList failureReasons;
- public HostRemovalFailure() {
- this.host = Optional.empty();
- this.tenant = Optional.empty();
- this.failureReasons = new AllocationFailureReasonList(List.of());
+ public static HostRemovalFailure none() {
+ return new HostRemovalFailure(
+ Optional.empty(),
+ Optional.empty(),
+ new AllocationFailureReasonList(List.of()));
+ }
+ public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) {
+ return new HostRemovalFailure(
+ Optional.of(host),
+ Optional.of(tenant),
+ failureReasons);
}
- public HostRemovalFailure(Node host, Node tenant, AllocationFailureReasonList failureReasons) {
- this.host = Optional.of(host);
- this.tenant = Optional.of(tenant);
+ private HostRemovalFailure(Optional<Node> host, Optional<Node> tenant, AllocationFailureReasonList failureReasons) {
+ this.host = host;
+ this.tenant = tenant;
this.failureReasons = failureReasons;
}
}
@@ -199,7 +242,7 @@ public class NodeAlerter extends Maintainer {
* Does not mutate any input variable.
* @return Empty optional if removal is possible, information on what caused the failure otherwise
*/
- Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts,
+ private Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts,
Map<Node, List<Node>> nodechildren,
Map<Node, AllocationResources> availableResources) {
var containedAllocations = collateAllocations(nodechildren);
@@ -208,7 +251,7 @@ public class NodeAlerter extends Maintainer {
.filter(h -> !hostsToRemove.contains(h))
.collect(Collectors.toList());
if (validAllocationTargets.size() == 0) {
- return Optional.of(new HostRemovalFailure());
+ return Optional.of(HostRemovalFailure.none());
}
for (var host : hostsToRemove) {
@@ -216,8 +259,9 @@ public class NodeAlerter extends Maintainer {
validAllocationTargets, resourceMap, containedAllocations);
if (unallocatedNode.isPresent()) {
- return Optional.of(new HostRemovalFailure(host, unallocatedNode.get(),
- collateAllocationFailures(unallocatedNode.get(), validAllocationTargets, resourceMap, containedAllocations)));
+ return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(),
+ collateAllocationFailures(unallocatedNode.get(), validAllocationTargets,
+ resourceMap, containedAllocations)));
}
}
return Optional.empty();
@@ -227,7 +271,7 @@ public class NodeAlerter extends Maintainer {
* Attempts to allocate the listed nodes to a new host, mutating availableResources and containedAllocations,
* optionally returning the first node to fail, if one does.
* */
- Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts,
+ private Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts,
Map<Node, AllocationResources> availableResources,
Map<Node, List<Allocation>> containedAllocations) {
for (var node : nodes) {
@@ -238,7 +282,7 @@ public class NodeAlerter extends Maintainer {
return Optional.empty();
}
- boolean tryAllocateNode(Node node, List<Node> hosts,
+ private boolean tryAllocateNode(Node node, List<Node> hosts,
Map<Node, AllocationResources> availableResources,
Map<Node, List<Allocation>> containedAllocations) {
AllocationResources requiredNodeResources = AllocationResources.from(node.flavor().resources());
@@ -259,6 +303,9 @@ public class NodeAlerter extends Maintainer {
return false;
}
+ /**
+ * Used to describe the resources required for a tenant, and available to a host.
+ */
private static class AllocationResources {
NodeResources nodeResources;
int availableIPs;
@@ -282,7 +329,7 @@ public class NodeAlerter extends Maintainer {
}
}
- boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) {
+ private boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) {
if (node.allocation().isEmpty()) return false;
Allocation nodeAllocation = node.allocation().get();
for (var allocation : containedAllocations.get(host)) {
@@ -294,6 +341,9 @@ public class NodeAlerter extends Maintainer {
return false;
}
+ /**
+ * Keeps track of the reason why a host rejected an allocation.
+ */
private class AllocationFailureReason {
Node host;
public AllocationFailureReason (Node host) {
@@ -331,7 +381,10 @@ public class NodeAlerter extends Maintainer {
}
}
- class AllocationFailureReasonList {
+ /**
+ * Provides convenient methods for tallying failures.
+ */
+ public static class AllocationFailureReasonList {
private List<AllocationFailureReason> allocationFailureReasons;
public AllocationFailureReasonList(List<AllocationFailureReason> allocationFailureReasons) {
this.allocationFailureReasons = allocationFailureReasons;
@@ -344,7 +397,6 @@ public class NodeAlerter extends Maintainer {
long insufficientAvailableIps() { return allocationFailureReasons.stream().filter(r -> r.insufficientAvailableIPs).count(); }
long violatesParentHostPolicy() { return allocationFailureReasons.stream().filter(r -> r.violatesParentHostPolicy).count(); }
-
public AllocationFailureReasonList singularReasonFailures() {
return new AllocationFailureReasonList(allocationFailureReasons.stream()
.filter(reason -> reason.numberOfReasons() == 1).collect(Collectors.toList()));
@@ -364,7 +416,7 @@ public class NodeAlerter extends Maintainer {
}
}
- AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts,
+ private AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts,
Map<Node, AllocationResources> availableResources,
Map<Node, List<Allocation>> containedAllocations) {
List<AllocationFailureReason> allocationFailureReasons = new ArrayList<>();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java
index 281b289e992..2ed9de1df37 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java
@@ -8,7 +8,6 @@ import org.junit.Test;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.*;
-import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
import static org.junit.Assert.*;
@@ -31,37 +30,76 @@ public class NodeAlerterTest {
tester.cleanRepository();
tester.restoreNodeRepositoryFromJsonFile(Paths.get(path));
- NodeAlerter.HostFailurePath failurePath = alerter.worstCaseHostLossLeadingToFailure();
- if (failurePath != null) {
- assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.hostsCausingFailure));
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ if (failurePath.isPresent()) {
+ assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.get().hostsCausingFailure));
} else fail();
}
@Test
- public void testWithGeneratedData() {
+ public void testOvercommittedHosts() {
+ tester.createNodes(7, 4,
+ 10, new NodeResources(-1, 10, 100), 10,
+ 0, new NodeResources(1, 10, 100), 10);
+ int overcommittedHosts = alerter.countOvercommittedHosts();
+ assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), overcommittedHosts);
+ }
+
+ @Test
+ public void testEdgeCaseFailurePaths() {
tester.createNodes(1, 1,
0, new NodeResources(1, 10, 100), 10,
0, new NodeResources(1, 10, 100), 10);
var failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNull("Computing worst case host loss with no hosts should return null.", failurePath);
+ assertFalse("Computing worst case host loss with no hosts should return an empty optional.", failurePath.isPresent());
// Odd edge case that should never be able to occur in prod
tester.createNodes(1, 10,
10, new NodeResources(10, 1000, 10000), 100,
1, new NodeResources(10, 1000, 10000), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
+ assertTrue(failurePath.isPresent());
assertTrue("Computing worst case host loss if all hosts have to be removed should result in an non-empty failureReason with empty nodes.",
- failurePath.failureReason.tenant.isEmpty() && failurePath.failureReason.host.isEmpty());
- assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), failurePath.hostsCausingFailure.size());
+ failurePath.get().failureReason.tenant.isEmpty() && failurePath.get().failureReason.host.isEmpty());
+ assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), failurePath.get().hostsCausingFailure.size());
+
+ tester.createNodes(3, 30,
+ 10, new NodeResources(0, 0, 10000), 1000,
+ 0, new NodeResources(0, 0, 0), 0);
+ failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
+ assertEquals("When there are multiple lacking resources, all failures are multipleReasonFailures",
+ failureReasons.size(), failureReasons.multipleReasonFailures().size());
+ assertEquals(0, failureReasons.singularReasonFailures().size());
+ } else fail();
+ }
+
+ @Test
+ public void testIpFailurePaths() {
+ tester.createNodes(1, 10,
+ 10, new NodeResources(10, 1000, 10000), 1,
+ 10, new NodeResources(10, 1000, 10000), 1);
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
+ assertEquals("All failures should be due to hosts having a lack of available ip addresses.",
+ failureReasons.singularReasonFailures().insufficientAvailableIps(), failureReasons.size());
+ } else fail();
+
+ }
+ @Test
+ public void testNodeResourceFailurePaths() {
tester.createNodes(1, 10,
10, new NodeResources(1, 100, 1000), 100,
10, new NodeResources(0, 100, 1000), 100);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All failures should be due to hosts lacking cpu cores.",
failureReasons.singularReasonFailures().insufficientVcpu(), failureReasons.size());
} else fail();
@@ -70,9 +108,9 @@ public class NodeAlerterTest {
10, new NodeResources(10, 1, 1000), 100,
10, new NodeResources(10, 0, 1000), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All failures should be due to hosts lacking memory.",
failureReasons.singularReasonFailures().insufficientMemoryGb(), failureReasons.size());
} else fail();
@@ -81,9 +119,9 @@ public class NodeAlerterTest {
10, new NodeResources(10, 100, 10), 100,
10, new NodeResources(10, 100, 0), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All failures should be due to hosts lacking disk space.",
failureReasons.singularReasonFailures().insufficientDiskGb(), failureReasons.size());
} else fail();
@@ -93,31 +131,25 @@ public class NodeAlerterTest {
10, new NodeResources(0, 0, 0), 100,
10, new NodeResources(10, 1000, 10000, NodeResources.DiskSpeed.slow), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All empty hosts should be invalid due to having incompatible disk speed.",
failureReasons.singularReasonFailures().incompatibleDiskSpeed(), emptyHostsWithSlowDisk);
} else fail();
- tester.createNodes(1, 10,
- 10, new NodeResources(10, 1000, 10000), 1,
- 10, new NodeResources(10, 1000, 10000), 1);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
- assertEquals("All failures should be due to hosts having a lack of available ip addresses.",
- failureReasons.singularReasonFailures().insufficientAvailableIps(), failureReasons.size());
- } else fail();
+ }
+
+ @Test
+ public void testParentHostPolicyIntegrityFailurePaths() {
tester.createNodes(1, 1,
10, new NodeResources(1, 100, 1000), 100,
10, new NodeResources(10, 1000, 10000), 100);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("With only one type of tenant, all failures should be due to violation of the parent host policy.",
failureReasons.singularReasonFailures().violatesParentHostPolicy(), failureReasons.size());
} else fail();
@@ -126,25 +158,13 @@ public class NodeAlerterTest {
10, new NodeResources(10, 100, 1000), 1,
0, new NodeResources(0, 0, 0), 0);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertNotEquals("Fewer distinct children than hosts should result in some parent host policy violations.",
failureReasons.size(), failureReasons.singularReasonFailures().violatesParentHostPolicy());
assertNotEquals(0, failureReasons.singularReasonFailures().violatesParentHostPolicy());
} else fail();
-
- tester.createNodes(3, 30,
- 10, new NodeResources(0, 0, 10000), 1000,
- 0, new NodeResources(0, 0, 0), 0);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
- assertEquals("When there are multiple lacking resources, all failures are multipleReasonFailures",
- failureReasons.size(), failureReasons.multipleReasonFailures().size());
- assertEquals(0, failureReasons.singularReasonFailures().size());
- } else fail();
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java
index 0e05f8a1934..fd8e0561fb8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java
@@ -44,7 +44,7 @@ public class NodeAlerterTester {
}
NodeAlerter makeNodeAlerter() {
- return new NodeAlerter(nodeRepository, null, Duration.ofDays(1));
+ return new NodeAlerter(nodeRepository, new MetricsReporterTest.TestMetric(), Duration.ofDays(1));
}
List<NodeModel> createDistinctChildren(int amount, List<NodeResources> childResources) {
@@ -273,15 +273,6 @@ public class NodeAlerterTester {
List<Node> nodes = new ArrayList<>();
for (var nmod : nmods) {
if (nmod.type != NodeType.host && nmod.type != NodeType.tenant) continue;
- if (nmod.type == NodeType.tenant && nmod.environment == Flavor.Type.BARE_METAL) continue;
- NodeModel parent = nmods.stream()
- .filter(n -> n.hostname.equals(nmod.parentHostname.orElse("")))
- .findAny().orElse(null);
- if ((nmod.state == Node.State.failed || nmod.state == Node.State.parked)
- || (parent != null &&
- (parent.state == Node.State.failed || parent.state == Node.State.parked))) {
- continue;
- }
nodes.add(createNodeFromModel(nmod));
}
@@ -291,8 +282,9 @@ public class NodeAlerterTester {
void cleanRepository() {
nodeRepository.getNodes(NodeType.host).forEach(n -> nodeRepository.removeRecursively(n, true));
+ nodeRepository.getNodes().forEach(n -> nodeRepository.removeRecursively(n, true));
if (nodeRepository.getNodes().size() != 0) {
- throw new IllegalStateException("Removing all nodes didn't remove all nodes! [" + nodeRepository.getNodes().size() + "]");
+ throw new IllegalStateException("Cleaning repository didn't remove all nodes! [" + nodeRepository.getNodes().size() + "]");
}
}
}