summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authormgimle <michael@gimle.io>2019-07-05 14:13:50 +0200
committermgimle <michael@gimle.io>2019-07-08 16:47:32 +0200
commitf3e18b139b78eb7c953afcf78896573531e1daa3 (patch)
tree234d75a4d63033b42b8198c17fd5c41955dade8e /node-repository
parentecd4a99b03618b483af7b4c5a51cebc68547f059 (diff)
Added more comments, made private members private,
made static inner class candidates static, moved filtering of uninteresting hosts and tenants to the alerter from the test case generator, added a test for overcommitted nodes, logically segmented the generated test cases, removed explicit null checks in favour of Optionals, changed some inner class initialization behaviour.
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java142
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java122
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java14
3 files changed, 171 insertions, 107 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java
index 3d4e0a32128..1e26c359f36 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java
@@ -1,5 +1,6 @@
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
@@ -19,6 +20,16 @@ import com.yahoo.vespa.hosted.provision.node.Allocation;
import java.util.*;
import java.util.function.Function;
+/**
+ * Performs analysis on the node repository to produce metrics that can be used for alerts if the repository is in an
+ * undesirable state.
+ * These metrics include:
+ * Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's
+ * unable to find a new home for orphaned tenants.
+ * Overcommitted hosts, which tracks if there are any hosts whose capacity is less than the sum of its children's.
+ *
+ * @author mgimle
+ */
public class NodeAlerter extends Maintainer {
private final Metric metric;
@@ -30,26 +41,31 @@ public class NodeAlerter extends Maintainer {
Duration interval) {
super(nodeRepository, interval);
this.nodeRepository = nodeRepository;
- this.metric = metric;
+ this.metric = Objects.requireNonNull(metric);
}
@Override
protected void maintain() {
- if (metric != null) {
- int worstCaseHostLoss = worstCaseHostLossLeadingToFailure().hostsCausingFailure.size();
+ Optional<HostFailurePath> failurePath = worstCaseHostLossLeadingToFailure();
+ if (failurePath.isPresent()) {
+ int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size();
metric.set("hostedVespa.spareHostCapacity", worstCaseHostLoss - 1, null);
- metric.set("hostedVespa.overcommittedNodes", countOvercommittedNodes(), null);
+ metric.set("hostedVespa.overcommittedHosts", countOvercommittedHosts(), null);
}
}
- class HostFailurePath {
+ /**
+ * Contains the list of hosts that, upon being removed, caused an unrecoverable state,
+ * as well as the specific host and tenant which caused it.
+ */
+ public static class HostFailurePath {
List<Node> hostsCausingFailure;
HostRemovalFailure failureReason;
}
- HostFailurePath worstCaseHostLossLeadingToFailure() {
- List<Node> tenants = nodeRepository.getNodes(NodeType.tenant);
- List<Node> hosts = nodeRepository.getNodes(NodeType.host);
+ public Optional<HostFailurePath> worstCaseHostLossLeadingToFailure() {
+ List<Node> hosts = filterHosts(nodeRepository.getNodes(NodeType.host));
+ List<Node> tenants = filterTenants(nodeRepository.getNodes(NodeType.tenant), hosts);
Map<String, Node> nodeMap = constructHostnameToNodeMap(hosts);
Map<Node, List<Node>> nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap);
Map<Node, AllocationResources> availableResources = constructAvailableResourcesMap(hosts, nodeChildren);
@@ -58,10 +74,26 @@ public class NodeAlerter extends Maintainer {
return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved, hosts, nodeChildren, availableResources);
}
- HostFailurePath greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts,
+ private List<Node> filterTenants(List<Node> tenants, List<Node> hosts) {
+ var parentNames = hosts.stream().map(Node::hostname).collect(Collectors.toSet());
+ return tenants.stream()
+ .filter((t -> t.flavor().getType() != Flavor.Type.BARE_METAL
+ && t.state() != Node.State.failed
+ && t.state() != Node.State.parked))
+ .filter(t -> parentNames.contains(t.parentHostname().orElse("")))
+ .collect(Collectors.toList());
+ }
+
+ private List<Node> filterHosts(List<Node> hosts) {
+ return hosts.stream()
+ .filter(h -> h.state() != Node.State.failed && h.state() != Node.State.parked)
+ .collect(Collectors.toList());
+ }
+
+ private Optional<HostFailurePath> greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts,
Map<Node, List<Node>> nodeChildren,
Map<Node, AllocationResources> availableResources) {
- if (hosts.size() == 0) return null;
+ if (hosts.size() == 0) return Optional.empty();
List<Node> parentRemovalPriorityList = heuristic.entrySet().stream()
.sorted(Comparator.comparingInt(Map.Entry::getValue))
.map(Map.Entry::getKey)
@@ -73,16 +105,16 @@ public class NodeAlerter extends Maintainer {
HostFailurePath failurePath = new HostFailurePath();
failurePath.hostsCausingFailure = hostsToRemove;
failurePath.failureReason = hostRemovalFailure.get();
- return failurePath;
+ return Optional.of(failurePath);
}
}
throw new IllegalStateException("No path to failure found. This should be impossible!");
}
- int countOvercommittedNodes() {
- List<Node> tenants = nodeRepository.getNodes(NodeType.tenant);
- List<Node> hosts = nodeRepository.getNodes(NodeType.host);
+ int countOvercommittedHosts() {
+ List<Node> hosts = filterHosts(nodeRepository.getNodes(NodeType.host));
+ List<Node> tenants = filterTenants(nodeRepository.getNodes(NodeType.tenant), hosts);
var nodeMap = constructHostnameToNodeMap(hosts);
var nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap);
var availableResources = constructAvailableResourcesMap(hosts, nodeChildren);
@@ -95,26 +127,25 @@ public class NodeAlerter extends Maintainer {
return overcommittedNodes.size();
}
- Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) {
+ private Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) {
return nodes.stream().collect(Collectors.toMap(Node::hostname, n -> n));
}
- Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
- Map<Node, List<Node>> nodeChildren = tenants.stream()
+ private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) {
+ Map<Optional<Node>, List<Node>> possibleNodeChildren = tenants.stream()
.filter(n -> n.parentHostname().isPresent())
.collect(Collectors.groupingBy(
- n -> hostnameToNode.get(n.parentHostname().orElseThrow())));
- if (nodeChildren.get(null) != null) {
- List<Node> hasNullParents = nodeChildren.get(null);
- throw new IllegalStateException("Hostname(s) not present in hostname map : " +
- hasNullParents.stream().map(n -> n.parentHostname().orElseThrow()).collect(Collectors.joining(", ")));
- }
+ n -> Optional.ofNullable(hostnameToNode.get(n.parentHostname().orElseThrow()))));
+ possibleNodeChildren.remove(Optional.<Node>empty());
+ Map<Node, List<Node>> nodeChildren = possibleNodeChildren.entrySet().stream()
+ .collect(Collectors.toMap(e -> e.getKey().orElseThrow(), Map.Entry::getValue));
+
for (var host : hosts) nodeChildren.putIfAbsent(host, List.of());
return nodeChildren;
}
- Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) {
+ private Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) {
Map<Node, AllocationResources> availableResources = new HashMap<>();
for (var host : hosts) {
NodeResources hostResources = host.flavor().resources();
@@ -134,7 +165,7 @@ public class NodeAlerter extends Maintainer {
* Computes a heuristic for each host, with a lower score indicating a higher perceived likelihood that removing
* the host causes an unrecoverable state
*/
- Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren,
+ private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren,
Map<Node, AllocationResources> availableResources) {
Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap(
Function.identity(),
@@ -158,7 +189,7 @@ public class NodeAlerter extends Maintainer {
return timesNodeCanBeRemoved;
}
- List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) {
+ private List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) {
List<Node> overcommittedNodes = new ArrayList<>();
for (var entry : availableResources.entrySet()) {
var resources = entry.getValue().nodeResources;
@@ -169,7 +200,7 @@ public class NodeAlerter extends Maintainer {
return overcommittedNodes;
}
- Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) {
+ private Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) {
return nodeChildren.entrySet().stream().collect(Collectors.toMap(
Map.Entry::getKey,
e -> e.getValue().stream()
@@ -178,18 +209,30 @@ public class NodeAlerter extends Maintainer {
));
}
- class HostRemovalFailure {
+ /**
+ * Data class used for detailing why removing the given tenant from the given host was unsuccessful.
+ * A failure might not be caused by failing to allocate a specific tenant, in which case the fields
+ * will be empty.
+ */
+ public static class HostRemovalFailure {
Optional<Node> host;
Optional<Node> tenant;
AllocationFailureReasonList failureReasons;
- public HostRemovalFailure() {
- this.host = Optional.empty();
- this.tenant = Optional.empty();
- this.failureReasons = new AllocationFailureReasonList(List.of());
+ public static HostRemovalFailure none() {
+ return new HostRemovalFailure(
+ Optional.empty(),
+ Optional.empty(),
+ new AllocationFailureReasonList(List.of()));
+ }
+ public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) {
+ return new HostRemovalFailure(
+ Optional.of(host),
+ Optional.of(tenant),
+ failureReasons);
}
- public HostRemovalFailure(Node host, Node tenant, AllocationFailureReasonList failureReasons) {
- this.host = Optional.of(host);
- this.tenant = Optional.of(tenant);
+ private HostRemovalFailure(Optional<Node> host, Optional<Node> tenant, AllocationFailureReasonList failureReasons) {
+ this.host = host;
+ this.tenant = tenant;
this.failureReasons = failureReasons;
}
}
@@ -199,7 +242,7 @@ public class NodeAlerter extends Maintainer {
* Does not mutate any input variable.
* @return Empty optional if removal is possible, information on what caused the failure otherwise
*/
- Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts,
+ private Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts,
Map<Node, List<Node>> nodechildren,
Map<Node, AllocationResources> availableResources) {
var containedAllocations = collateAllocations(nodechildren);
@@ -208,7 +251,7 @@ public class NodeAlerter extends Maintainer {
.filter(h -> !hostsToRemove.contains(h))
.collect(Collectors.toList());
if (validAllocationTargets.size() == 0) {
- return Optional.of(new HostRemovalFailure());
+ return Optional.of(HostRemovalFailure.none());
}
for (var host : hostsToRemove) {
@@ -216,8 +259,9 @@ public class NodeAlerter extends Maintainer {
validAllocationTargets, resourceMap, containedAllocations);
if (unallocatedNode.isPresent()) {
- return Optional.of(new HostRemovalFailure(host, unallocatedNode.get(),
- collateAllocationFailures(unallocatedNode.get(), validAllocationTargets, resourceMap, containedAllocations)));
+ return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(),
+ collateAllocationFailures(unallocatedNode.get(), validAllocationTargets,
+ resourceMap, containedAllocations)));
}
}
return Optional.empty();
@@ -227,7 +271,7 @@ public class NodeAlerter extends Maintainer {
* Attempts to allocate the listed nodes to a new host, mutating availableResources and containedAllocations,
* optionally returning the first node to fail, if one does.
* */
- Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts,
+ private Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts,
Map<Node, AllocationResources> availableResources,
Map<Node, List<Allocation>> containedAllocations) {
for (var node : nodes) {
@@ -238,7 +282,7 @@ public class NodeAlerter extends Maintainer {
return Optional.empty();
}
- boolean tryAllocateNode(Node node, List<Node> hosts,
+ private boolean tryAllocateNode(Node node, List<Node> hosts,
Map<Node, AllocationResources> availableResources,
Map<Node, List<Allocation>> containedAllocations) {
AllocationResources requiredNodeResources = AllocationResources.from(node.flavor().resources());
@@ -259,6 +303,9 @@ public class NodeAlerter extends Maintainer {
return false;
}
+ /**
+ * Used to describe the resources required for a tenant, and available to a host.
+ */
private static class AllocationResources {
NodeResources nodeResources;
int availableIPs;
@@ -282,7 +329,7 @@ public class NodeAlerter extends Maintainer {
}
}
- boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) {
+ private boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) {
if (node.allocation().isEmpty()) return false;
Allocation nodeAllocation = node.allocation().get();
for (var allocation : containedAllocations.get(host)) {
@@ -294,6 +341,9 @@ public class NodeAlerter extends Maintainer {
return false;
}
+ /**
+ * Keeps track of the reason why a host rejected an allocation.
+ */
private class AllocationFailureReason {
Node host;
public AllocationFailureReason (Node host) {
@@ -331,7 +381,10 @@ public class NodeAlerter extends Maintainer {
}
}
- class AllocationFailureReasonList {
+ /**
+ * Provides convenient methods for tallying failures.
+ */
+ public static class AllocationFailureReasonList {
private List<AllocationFailureReason> allocationFailureReasons;
public AllocationFailureReasonList(List<AllocationFailureReason> allocationFailureReasons) {
this.allocationFailureReasons = allocationFailureReasons;
@@ -344,7 +397,6 @@ public class NodeAlerter extends Maintainer {
long insufficientAvailableIps() { return allocationFailureReasons.stream().filter(r -> r.insufficientAvailableIPs).count(); }
long violatesParentHostPolicy() { return allocationFailureReasons.stream().filter(r -> r.violatesParentHostPolicy).count(); }
-
public AllocationFailureReasonList singularReasonFailures() {
return new AllocationFailureReasonList(allocationFailureReasons.stream()
.filter(reason -> reason.numberOfReasons() == 1).collect(Collectors.toList()));
@@ -364,7 +416,7 @@ public class NodeAlerter extends Maintainer {
}
}
- AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts,
+ private AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts,
Map<Node, AllocationResources> availableResources,
Map<Node, List<Allocation>> containedAllocations) {
List<AllocationFailureReason> allocationFailureReasons = new ArrayList<>();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java
index 281b289e992..2ed9de1df37 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java
@@ -8,7 +8,6 @@ import org.junit.Test;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.*;
-import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
import static org.junit.Assert.*;
@@ -31,37 +30,76 @@ public class NodeAlerterTest {
tester.cleanRepository();
tester.restoreNodeRepositoryFromJsonFile(Paths.get(path));
- NodeAlerter.HostFailurePath failurePath = alerter.worstCaseHostLossLeadingToFailure();
- if (failurePath != null) {
- assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.hostsCausingFailure));
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ if (failurePath.isPresent()) {
+ assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.get().hostsCausingFailure));
} else fail();
}
@Test
- public void testWithGeneratedData() {
+ public void testOvercommittedHosts() {
+ tester.createNodes(7, 4,
+ 10, new NodeResources(-1, 10, 100), 10,
+ 0, new NodeResources(1, 10, 100), 10);
+ int overcommittedHosts = alerter.countOvercommittedHosts();
+ assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), overcommittedHosts);
+ }
+
+ @Test
+ public void testEdgeCaseFailurePaths() {
tester.createNodes(1, 1,
0, new NodeResources(1, 10, 100), 10,
0, new NodeResources(1, 10, 100), 10);
var failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNull("Computing worst case host loss with no hosts should return null.", failurePath);
+ assertFalse("Computing worst case host loss with no hosts should return an empty optional.", failurePath.isPresent());
// Odd edge case that should never be able to occur in prod
tester.createNodes(1, 10,
10, new NodeResources(10, 1000, 10000), 100,
1, new NodeResources(10, 1000, 10000), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
+ assertTrue(failurePath.isPresent());
assertTrue("Computing worst case host loss if all hosts have to be removed should result in an non-empty failureReason with empty nodes.",
- failurePath.failureReason.tenant.isEmpty() && failurePath.failureReason.host.isEmpty());
- assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), failurePath.hostsCausingFailure.size());
+ failurePath.get().failureReason.tenant.isEmpty() && failurePath.get().failureReason.host.isEmpty());
+ assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), failurePath.get().hostsCausingFailure.size());
+
+ tester.createNodes(3, 30,
+ 10, new NodeResources(0, 0, 10000), 1000,
+ 0, new NodeResources(0, 0, 0), 0);
+ failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
+ assertEquals("When there are multiple lacking resources, all failures are multipleReasonFailures",
+ failureReasons.size(), failureReasons.multipleReasonFailures().size());
+ assertEquals(0, failureReasons.singularReasonFailures().size());
+ } else fail();
+ }
+
+ @Test
+ public void testIpFailurePaths() {
+ tester.createNodes(1, 10,
+ 10, new NodeResources(10, 1000, 10000), 1,
+ 10, new NodeResources(10, 1000, 10000), 1);
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
+ assertEquals("All failures should be due to hosts having a lack of available ip addresses.",
+ failureReasons.singularReasonFailures().insufficientAvailableIps(), failureReasons.size());
+ } else fail();
+
+ }
+ @Test
+ public void testNodeResourceFailurePaths() {
tester.createNodes(1, 10,
10, new NodeResources(1, 100, 1000), 100,
10, new NodeResources(0, 100, 1000), 100);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All failures should be due to hosts lacking cpu cores.",
failureReasons.singularReasonFailures().insufficientVcpu(), failureReasons.size());
} else fail();
@@ -70,9 +108,9 @@ public class NodeAlerterTest {
10, new NodeResources(10, 1, 1000), 100,
10, new NodeResources(10, 0, 1000), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All failures should be due to hosts lacking memory.",
failureReasons.singularReasonFailures().insufficientMemoryGb(), failureReasons.size());
} else fail();
@@ -81,9 +119,9 @@ public class NodeAlerterTest {
10, new NodeResources(10, 100, 10), 100,
10, new NodeResources(10, 100, 0), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All failures should be due to hosts lacking disk space.",
failureReasons.singularReasonFailures().insufficientDiskGb(), failureReasons.size());
} else fail();
@@ -93,31 +131,25 @@ public class NodeAlerterTest {
10, new NodeResources(0, 0, 0), 100,
10, new NodeResources(10, 1000, 10000, NodeResources.DiskSpeed.slow), 100);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("All empty hosts should be invalid due to having incompatible disk speed.",
failureReasons.singularReasonFailures().incompatibleDiskSpeed(), emptyHostsWithSlowDisk);
} else fail();
- tester.createNodes(1, 10,
- 10, new NodeResources(10, 1000, 10000), 1,
- 10, new NodeResources(10, 1000, 10000), 1);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
- assertEquals("All failures should be due to hosts having a lack of available ip addresses.",
- failureReasons.singularReasonFailures().insufficientAvailableIps(), failureReasons.size());
- } else fail();
+ }
+
+ @Test
+ public void testParentHostPolicyIntegrityFailurePaths() {
tester.createNodes(1, 1,
10, new NodeResources(1, 100, 1000), 100,
10, new NodeResources(10, 1000, 10000), 100);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ var failurePath = alerter.worstCaseHostLossLeadingToFailure();
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertEquals("With only one type of tenant, all failures should be due to violation of the parent host policy.",
failureReasons.singularReasonFailures().violatesParentHostPolicy(), failureReasons.size());
} else fail();
@@ -126,25 +158,13 @@ public class NodeAlerterTest {
10, new NodeResources(10, 100, 1000), 1,
0, new NodeResources(0, 0, 0), 0);
failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
+ assertTrue(failurePath.isPresent());
+ if (failurePath.get().failureReason.tenant.isPresent()) {
+ var failureReasons = failurePath.get().failureReason.failureReasons;
assertNotEquals("Fewer distinct children than hosts should result in some parent host policy violations.",
failureReasons.size(), failureReasons.singularReasonFailures().violatesParentHostPolicy());
assertNotEquals(0, failureReasons.singularReasonFailures().violatesParentHostPolicy());
} else fail();
-
- tester.createNodes(3, 30,
- 10, new NodeResources(0, 0, 10000), 1000,
- 0, new NodeResources(0, 0, 0), 0);
- failurePath = alerter.worstCaseHostLossLeadingToFailure();
- assertNotNull(failurePath);
- if (failurePath.failureReason.tenant.isPresent()) {
- var failureReasons = failurePath.failureReason.failureReasons;
- assertEquals("When there are multiple lacking resources, all failures are multipleReasonFailures",
- failureReasons.size(), failureReasons.multipleReasonFailures().size());
- assertEquals(0, failureReasons.singularReasonFailures().size());
- } else fail();
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java
index 0e05f8a1934..fd8e0561fb8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java
@@ -44,7 +44,7 @@ public class NodeAlerterTester {
}
NodeAlerter makeNodeAlerter() {
- return new NodeAlerter(nodeRepository, null, Duration.ofDays(1));
+ return new NodeAlerter(nodeRepository, new MetricsReporterTest.TestMetric(), Duration.ofDays(1));
}
List<NodeModel> createDistinctChildren(int amount, List<NodeResources> childResources) {
@@ -273,15 +273,6 @@ public class NodeAlerterTester {
List<Node> nodes = new ArrayList<>();
for (var nmod : nmods) {
if (nmod.type != NodeType.host && nmod.type != NodeType.tenant) continue;
- if (nmod.type == NodeType.tenant && nmod.environment == Flavor.Type.BARE_METAL) continue;
- NodeModel parent = nmods.stream()
- .filter(n -> n.hostname.equals(nmod.parentHostname.orElse("")))
- .findAny().orElse(null);
- if ((nmod.state == Node.State.failed || nmod.state == Node.State.parked)
- || (parent != null &&
- (parent.state == Node.State.failed || parent.state == Node.State.parked))) {
- continue;
- }
nodes.add(createNodeFromModel(nmod));
}
@@ -291,8 +282,9 @@ public class NodeAlerterTester {
void cleanRepository() {
nodeRepository.getNodes(NodeType.host).forEach(n -> nodeRepository.removeRecursively(n, true));
+ nodeRepository.getNodes().forEach(n -> nodeRepository.removeRecursively(n, true));
if (nodeRepository.getNodes().size() != 0) {
- throw new IllegalStateException("Removing all nodes didn't remove all nodes! [" + nodeRepository.getNodes().size() + "]");
+ throw new IllegalStateException("Cleaning repository didn't remove all nodes! [" + nodeRepository.getNodes().size() + "]");
}
}
}