diff options
author | mgimle <michael@gimle.io> | 2019-07-05 14:13:50 +0200 |
---|---|---|
committer | mgimle <michael@gimle.io> | 2019-07-08 16:47:32 +0200 |
commit | f3e18b139b78eb7c953afcf78896573531e1daa3 (patch) | |
tree | 234d75a4d63033b42b8198c17fd5c41955dade8e /node-repository | |
parent | ecd4a99b03618b483af7b4c5a51cebc68547f059 (diff) |
Added more comments, made private members private,
made static inner class candidates static, moved filtering of uninteresting
hosts and tenants to the alerter from the test case generator, added
a test for overcommitted nodes, logically segmented the generated test cases,
removed explicit null checks in favour of Optionals, changed some inner
class initialization behaviour.
Diffstat (limited to 'node-repository')
3 files changed, 171 insertions, 107 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java index 3d4e0a32128..1e26c359f36 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerter.java @@ -1,5 +1,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; +import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; @@ -19,6 +20,16 @@ import com.yahoo.vespa.hosted.provision.node.Allocation; import java.util.*; import java.util.function.Function; +/** + * Performs analysis on the node repository to produce metrics that can be used for alerts if the repository is in an + * undesirable state. + * These metrics include: + * Spare host capacity, or how many hosts the repository can stand to lose without ending up in a situation where it's + * unable to find a new home for orphaned tenants. + * Overcommitted hosts, which tracks if there are any hosts whose capacity is less than the sum of its children's. + * + * @author mgimle + */ public class NodeAlerter extends Maintainer { private final Metric metric; @@ -30,26 +41,31 @@ public class NodeAlerter extends Maintainer { Duration interval) { super(nodeRepository, interval); this.nodeRepository = nodeRepository; - this.metric = metric; + this.metric = Objects.requireNonNull(metric); } @Override protected void maintain() { - if (metric != null) { - int worstCaseHostLoss = worstCaseHostLossLeadingToFailure().hostsCausingFailure.size(); + Optional<HostFailurePath> failurePath = worstCaseHostLossLeadingToFailure(); + if (failurePath.isPresent()) { + int worstCaseHostLoss = failurePath.get().hostsCausingFailure.size(); metric.set("hostedVespa.spareHostCapacity", worstCaseHostLoss - 1, null); - metric.set("hostedVespa.overcommittedNodes", countOvercommittedNodes(), null); + metric.set("hostedVespa.overcommittedHosts", countOvercommittedHosts(), null); } } - class HostFailurePath { + /** + * Contains the list of hosts that, upon being removed, caused an unrecoverable state, + * as well as the specific host and tenant which caused it. + */ + public static class HostFailurePath { List<Node> hostsCausingFailure; HostRemovalFailure failureReason; } - HostFailurePath worstCaseHostLossLeadingToFailure() { - List<Node> tenants = nodeRepository.getNodes(NodeType.tenant); - List<Node> hosts = nodeRepository.getNodes(NodeType.host); + public Optional<HostFailurePath> worstCaseHostLossLeadingToFailure() { + List<Node> hosts = filterHosts(nodeRepository.getNodes(NodeType.host)); + List<Node> tenants = filterTenants(nodeRepository.getNodes(NodeType.tenant), hosts); Map<String, Node> nodeMap = constructHostnameToNodeMap(hosts); Map<Node, List<Node>> nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap); Map<Node, AllocationResources> availableResources = constructAvailableResourcesMap(hosts, nodeChildren); @@ -58,10 +74,26 @@ public class NodeAlerter extends Maintainer { return greedyHeuristicFindFailurePath(timesNodeCanBeRemoved, hosts, nodeChildren, availableResources); } - HostFailurePath greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts, + private List<Node> filterTenants(List<Node> tenants, List<Node> hosts) { + var parentNames = hosts.stream().map(Node::hostname).collect(Collectors.toSet()); + return tenants.stream() + .filter((t -> t.flavor().getType() != Flavor.Type.BARE_METAL + && t.state() != Node.State.failed + && t.state() != Node.State.parked)) + .filter(t -> parentNames.contains(t.parentHostname().orElse(""))) + .collect(Collectors.toList()); + } + + private List<Node> filterHosts(List<Node> hosts) { + return hosts.stream() + .filter(h -> h.state() != Node.State.failed && h.state() != Node.State.parked) + .collect(Collectors.toList()); + } + + private Optional<HostFailurePath> greedyHeuristicFindFailurePath(Map<Node, Integer> heuristic, List<Node> hosts, Map<Node, List<Node>> nodeChildren, Map<Node, AllocationResources> availableResources) { - if (hosts.size() == 0) return null; + if (hosts.size() == 0) return Optional.empty(); List<Node> parentRemovalPriorityList = heuristic.entrySet().stream() .sorted(Comparator.comparingInt(Map.Entry::getValue)) .map(Map.Entry::getKey) @@ -73,16 +105,16 @@ public class NodeAlerter extends Maintainer { HostFailurePath failurePath = new HostFailurePath(); failurePath.hostsCausingFailure = hostsToRemove; failurePath.failureReason = hostRemovalFailure.get(); - return failurePath; + return Optional.of(failurePath); } } throw new IllegalStateException("No path to failure found. This should be impossible!"); } - int countOvercommittedNodes() { - List<Node> tenants = nodeRepository.getNodes(NodeType.tenant); - List<Node> hosts = nodeRepository.getNodes(NodeType.host); + int countOvercommittedHosts() { + List<Node> hosts = filterHosts(nodeRepository.getNodes(NodeType.host)); + List<Node> tenants = filterTenants(nodeRepository.getNodes(NodeType.tenant), hosts); var nodeMap = constructHostnameToNodeMap(hosts); var nodeChildren = constructNodeChildrenMap(tenants, hosts, nodeMap); var availableResources = constructAvailableResourcesMap(hosts, nodeChildren); @@ -95,26 +127,25 @@ public class NodeAlerter extends Maintainer { return overcommittedNodes.size(); } - Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) { + private Map<String, Node> constructHostnameToNodeMap(List<Node> nodes) { return nodes.stream().collect(Collectors.toMap(Node::hostname, n -> n)); } - Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) { - Map<Node, List<Node>> nodeChildren = tenants.stream() + private Map<Node, List<Node>> constructNodeChildrenMap(List<Node> tenants, List<Node> hosts, Map<String, Node> hostnameToNode) { + Map<Optional<Node>, List<Node>> possibleNodeChildren = tenants.stream() .filter(n -> n.parentHostname().isPresent()) .collect(Collectors.groupingBy( - n -> hostnameToNode.get(n.parentHostname().orElseThrow()))); - if (nodeChildren.get(null) != null) { - List<Node> hasNullParents = nodeChildren.get(null); - throw new IllegalStateException("Hostname(s) not present in hostname map : " + - hasNullParents.stream().map(n -> n.parentHostname().orElseThrow()).collect(Collectors.joining(", "))); - } + n -> Optional.ofNullable(hostnameToNode.get(n.parentHostname().orElseThrow())))); + possibleNodeChildren.remove(Optional.<Node>empty()); + Map<Node, List<Node>> nodeChildren = possibleNodeChildren.entrySet().stream() + .collect(Collectors.toMap(e -> e.getKey().orElseThrow(), Map.Entry::getValue)); + for (var host : hosts) nodeChildren.putIfAbsent(host, List.of()); return nodeChildren; } - Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) { + private Map<Node, AllocationResources> constructAvailableResourcesMap(List<Node> hosts, Map<Node, List<Node>> nodeChildren) { Map<Node, AllocationResources> availableResources = new HashMap<>(); for (var host : hosts) { NodeResources hostResources = host.flavor().resources(); @@ -134,7 +165,7 @@ public class NodeAlerter extends Maintainer { * Computes a heuristic for each host, with a lower score indicating a higher perceived likelihood that removing * the host causes an unrecoverable state */ - Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren, + private Map<Node, Integer> computeMaximalRepeatedRemovals(List<Node> hosts, Map<Node, List<Node>> nodeChildren, Map<Node, AllocationResources> availableResources) { Map<Node, Integer> timesNodeCanBeRemoved = hosts.stream().collect(Collectors.toMap( Function.identity(), @@ -158,7 +189,7 @@ public class NodeAlerter extends Maintainer { return timesNodeCanBeRemoved; } - List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) { + private List<Node> findOvercommittedNodes(Map<Node, AllocationResources> availableResources) { List<Node> overcommittedNodes = new ArrayList<>(); for (var entry : availableResources.entrySet()) { var resources = entry.getValue().nodeResources; @@ -169,7 +200,7 @@ public class NodeAlerter extends Maintainer { return overcommittedNodes; } - Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) { + private Map<Node, List<Allocation>> collateAllocations(Map<Node, List<Node>> nodeChildren) { return nodeChildren.entrySet().stream().collect(Collectors.toMap( Map.Entry::getKey, e -> e.getValue().stream() @@ -178,18 +209,30 @@ public class NodeAlerter extends Maintainer { )); } - class HostRemovalFailure { + /** + * Data class used for detailing why removing the given tenant from the given host was unsuccessful. + * A failure might not be caused by failing to allocate a specific tenant, in which case the fields + * will be empty. + */ + public static class HostRemovalFailure { Optional<Node> host; Optional<Node> tenant; AllocationFailureReasonList failureReasons; - public HostRemovalFailure() { - this.host = Optional.empty(); - this.tenant = Optional.empty(); - this.failureReasons = new AllocationFailureReasonList(List.of()); + public static HostRemovalFailure none() { + return new HostRemovalFailure( + Optional.empty(), + Optional.empty(), + new AllocationFailureReasonList(List.of())); + } + public static HostRemovalFailure create(Node host, Node tenant, AllocationFailureReasonList failureReasons) { + return new HostRemovalFailure( + Optional.of(host), + Optional.of(tenant), + failureReasons); } - public HostRemovalFailure(Node host, Node tenant, AllocationFailureReasonList failureReasons) { - this.host = Optional.of(host); - this.tenant = Optional.of(tenant); + private HostRemovalFailure(Optional<Node> host, Optional<Node> tenant, AllocationFailureReasonList failureReasons) { + this.host = host; + this.tenant = tenant; this.failureReasons = failureReasons; } } @@ -199,7 +242,7 @@ public class NodeAlerter extends Maintainer { * Does not mutate any input variable. * @return Empty optional if removal is possible, information on what caused the failure otherwise */ - Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts, + private Optional<HostRemovalFailure> findHostRemovalFailure(List<Node> hostsToRemove, List<Node> allHosts, Map<Node, List<Node>> nodechildren, Map<Node, AllocationResources> availableResources) { var containedAllocations = collateAllocations(nodechildren); @@ -208,7 +251,7 @@ public class NodeAlerter extends Maintainer { .filter(h -> !hostsToRemove.contains(h)) .collect(Collectors.toList()); if (validAllocationTargets.size() == 0) { - return Optional.of(new HostRemovalFailure()); + return Optional.of(HostRemovalFailure.none()); } for (var host : hostsToRemove) { @@ -216,8 +259,9 @@ public class NodeAlerter extends Maintainer { validAllocationTargets, resourceMap, containedAllocations); if (unallocatedNode.isPresent()) { - return Optional.of(new HostRemovalFailure(host, unallocatedNode.get(), - collateAllocationFailures(unallocatedNode.get(), validAllocationTargets, resourceMap, containedAllocations))); + return Optional.of(HostRemovalFailure.create(host, unallocatedNode.get(), + collateAllocationFailures(unallocatedNode.get(), validAllocationTargets, + resourceMap, containedAllocations))); } } return Optional.empty(); @@ -227,7 +271,7 @@ public class NodeAlerter extends Maintainer { * Attempts to allocate the listed nodes to a new host, mutating availableResources and containedAllocations, * optionally returning the first node to fail, if one does. * */ - Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts, + private Optional<Node> tryAllocateNodes(List<Node> nodes, List<Node> hosts, Map<Node, AllocationResources> availableResources, Map<Node, List<Allocation>> containedAllocations) { for (var node : nodes) { @@ -238,7 +282,7 @@ public class NodeAlerter extends Maintainer { return Optional.empty(); } - boolean tryAllocateNode(Node node, List<Node> hosts, + private boolean tryAllocateNode(Node node, List<Node> hosts, Map<Node, AllocationResources> availableResources, Map<Node, List<Allocation>> containedAllocations) { AllocationResources requiredNodeResources = AllocationResources.from(node.flavor().resources()); @@ -259,6 +303,9 @@ public class NodeAlerter extends Maintainer { return false; } + /** + * Used to describe the resources required for a tenant, and available to a host. + */ private static class AllocationResources { NodeResources nodeResources; int availableIPs; @@ -282,7 +329,7 @@ public class NodeAlerter extends Maintainer { } } - boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) { + private boolean violatesParentHostPolicy(Node node, Node host, Map<Node, List<Allocation>> containedAllocations) { if (node.allocation().isEmpty()) return false; Allocation nodeAllocation = node.allocation().get(); for (var allocation : containedAllocations.get(host)) { @@ -294,6 +341,9 @@ public class NodeAlerter extends Maintainer { return false; } + /** + * Keeps track of the reason why a host rejected an allocation. + */ private class AllocationFailureReason { Node host; public AllocationFailureReason (Node host) { @@ -331,7 +381,10 @@ public class NodeAlerter extends Maintainer { } } - class AllocationFailureReasonList { + /** + * Provides convenient methods for tallying failures. + */ + public static class AllocationFailureReasonList { private List<AllocationFailureReason> allocationFailureReasons; public AllocationFailureReasonList(List<AllocationFailureReason> allocationFailureReasons) { this.allocationFailureReasons = allocationFailureReasons; @@ -344,7 +397,6 @@ public class NodeAlerter extends Maintainer { long insufficientAvailableIps() { return allocationFailureReasons.stream().filter(r -> r.insufficientAvailableIPs).count(); } long violatesParentHostPolicy() { return allocationFailureReasons.stream().filter(r -> r.violatesParentHostPolicy).count(); } - public AllocationFailureReasonList singularReasonFailures() { return new AllocationFailureReasonList(allocationFailureReasons.stream() .filter(reason -> reason.numberOfReasons() == 1).collect(Collectors.toList())); @@ -364,7 +416,7 @@ public class NodeAlerter extends Maintainer { } } - AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts, + private AllocationFailureReasonList collateAllocationFailures(Node node, List<Node> hosts, Map<Node, AllocationResources> availableResources, Map<Node, List<Allocation>> containedAllocations) { List<AllocationFailureReason> allocationFailureReasons = new ArrayList<>(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java index 281b289e992..2ed9de1df37 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTest.java @@ -8,7 +8,6 @@ import org.junit.Test; import java.io.IOException; import java.nio.file.Paths; import java.util.*; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.fail; import static org.junit.Assert.*; @@ -31,37 +30,76 @@ public class NodeAlerterTest { tester.cleanRepository(); tester.restoreNodeRepositoryFromJsonFile(Paths.get(path)); - NodeAlerter.HostFailurePath failurePath = alerter.worstCaseHostLossLeadingToFailure(); - if (failurePath != null) { - assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.hostsCausingFailure)); + var failurePath = alerter.worstCaseHostLossLeadingToFailure(); + if (failurePath.isPresent()) { + assertTrue(tester.nodeRepository.getNodes(NodeType.host).containsAll(failurePath.get().hostsCausingFailure)); } else fail(); } @Test - public void testWithGeneratedData() { + public void testOvercommittedHosts() { + tester.createNodes(7, 4, + 10, new NodeResources(-1, 10, 100), 10, + 0, new NodeResources(1, 10, 100), 10); + int overcommittedHosts = alerter.countOvercommittedHosts(); + assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), overcommittedHosts); + } + + @Test + public void testEdgeCaseFailurePaths() { tester.createNodes(1, 1, 0, new NodeResources(1, 10, 100), 10, 0, new NodeResources(1, 10, 100), 10); var failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNull("Computing worst case host loss with no hosts should return null.", failurePath); + assertFalse("Computing worst case host loss with no hosts should return an empty optional.", failurePath.isPresent()); // Odd edge case that should never be able to occur in prod tester.createNodes(1, 10, 10, new NodeResources(10, 1000, 10000), 100, 1, new NodeResources(10, 1000, 10000), 100); failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); + assertTrue(failurePath.isPresent()); assertTrue("Computing worst case host loss if all hosts have to be removed should result in an non-empty failureReason with empty nodes.", - failurePath.failureReason.tenant.isEmpty() && failurePath.failureReason.host.isEmpty()); - assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), failurePath.hostsCausingFailure.size()); + failurePath.get().failureReason.tenant.isEmpty() && failurePath.get().failureReason.host.isEmpty()); + assertEquals(tester.nodeRepository.getNodes(NodeType.host).size(), failurePath.get().hostsCausingFailure.size()); + + tester.createNodes(3, 30, + 10, new NodeResources(0, 0, 10000), 1000, + 0, new NodeResources(0, 0, 0), 0); + failurePath = alerter.worstCaseHostLossLeadingToFailure(); + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; + assertEquals("When there are multiple lacking resources, all failures are multipleReasonFailures", + failureReasons.size(), failureReasons.multipleReasonFailures().size()); + assertEquals(0, failureReasons.singularReasonFailures().size()); + } else fail(); + } + + @Test + public void testIpFailurePaths() { + tester.createNodes(1, 10, + 10, new NodeResources(10, 1000, 10000), 1, + 10, new NodeResources(10, 1000, 10000), 1); + var failurePath = alerter.worstCaseHostLossLeadingToFailure(); + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; + assertEquals("All failures should be due to hosts having a lack of available ip addresses.", + failureReasons.singularReasonFailures().insufficientAvailableIps(), failureReasons.size()); + } else fail(); + + } + @Test + public void testNodeResourceFailurePaths() { tester.createNodes(1, 10, 10, new NodeResources(1, 100, 1000), 100, 10, new NodeResources(0, 100, 1000), 100); - failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; + var failurePath = alerter.worstCaseHostLossLeadingToFailure(); + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; assertEquals("All failures should be due to hosts lacking cpu cores.", failureReasons.singularReasonFailures().insufficientVcpu(), failureReasons.size()); } else fail(); @@ -70,9 +108,9 @@ public class NodeAlerterTest { 10, new NodeResources(10, 1, 1000), 100, 10, new NodeResources(10, 0, 1000), 100); failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; assertEquals("All failures should be due to hosts lacking memory.", failureReasons.singularReasonFailures().insufficientMemoryGb(), failureReasons.size()); } else fail(); @@ -81,9 +119,9 @@ public class NodeAlerterTest { 10, new NodeResources(10, 100, 10), 100, 10, new NodeResources(10, 100, 0), 100); failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; assertEquals("All failures should be due to hosts lacking disk space.", failureReasons.singularReasonFailures().insufficientDiskGb(), failureReasons.size()); } else fail(); @@ -93,31 +131,25 @@ public class NodeAlerterTest { 10, new NodeResources(0, 0, 0), 100, 10, new NodeResources(10, 1000, 10000, NodeResources.DiskSpeed.slow), 100); failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; assertEquals("All empty hosts should be invalid due to having incompatible disk speed.", failureReasons.singularReasonFailures().incompatibleDiskSpeed(), emptyHostsWithSlowDisk); } else fail(); - tester.createNodes(1, 10, - 10, new NodeResources(10, 1000, 10000), 1, - 10, new NodeResources(10, 1000, 10000), 1); - failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; - assertEquals("All failures should be due to hosts having a lack of available ip addresses.", - failureReasons.singularReasonFailures().insufficientAvailableIps(), failureReasons.size()); - } else fail(); + } + + @Test + public void testParentHostPolicyIntegrityFailurePaths() { tester.createNodes(1, 1, 10, new NodeResources(1, 100, 1000), 100, 10, new NodeResources(10, 1000, 10000), 100); - failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; + var failurePath = alerter.worstCaseHostLossLeadingToFailure(); + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; assertEquals("With only one type of tenant, all failures should be due to violation of the parent host policy.", failureReasons.singularReasonFailures().violatesParentHostPolicy(), failureReasons.size()); } else fail(); @@ -126,25 +158,13 @@ public class NodeAlerterTest { 10, new NodeResources(10, 100, 1000), 1, 0, new NodeResources(0, 0, 0), 0); failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; + assertTrue(failurePath.isPresent()); + if (failurePath.get().failureReason.tenant.isPresent()) { + var failureReasons = failurePath.get().failureReason.failureReasons; assertNotEquals("Fewer distinct children than hosts should result in some parent host policy violations.", failureReasons.size(), failureReasons.singularReasonFailures().violatesParentHostPolicy()); assertNotEquals(0, failureReasons.singularReasonFailures().violatesParentHostPolicy()); } else fail(); - - tester.createNodes(3, 30, - 10, new NodeResources(0, 0, 10000), 1000, - 0, new NodeResources(0, 0, 0), 0); - failurePath = alerter.worstCaseHostLossLeadingToFailure(); - assertNotNull(failurePath); - if (failurePath.failureReason.tenant.isPresent()) { - var failureReasons = failurePath.failureReason.failureReasons; - assertEquals("When there are multiple lacking resources, all failures are multipleReasonFailures", - failureReasons.size(), failureReasons.multipleReasonFailures().size()); - assertEquals(0, failureReasons.singularReasonFailures().size()); - } else fail(); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java index 0e05f8a1934..fd8e0561fb8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeAlerterTester.java @@ -44,7 +44,7 @@ public class NodeAlerterTester { } NodeAlerter makeNodeAlerter() { - return new NodeAlerter(nodeRepository, null, Duration.ofDays(1)); + return new NodeAlerter(nodeRepository, new MetricsReporterTest.TestMetric(), Duration.ofDays(1)); } List<NodeModel> createDistinctChildren(int amount, List<NodeResources> childResources) { @@ -273,15 +273,6 @@ public class NodeAlerterTester { List<Node> nodes = new ArrayList<>(); for (var nmod : nmods) { if (nmod.type != NodeType.host && nmod.type != NodeType.tenant) continue; - if (nmod.type == NodeType.tenant && nmod.environment == Flavor.Type.BARE_METAL) continue; - NodeModel parent = nmods.stream() - .filter(n -> n.hostname.equals(nmod.parentHostname.orElse(""))) - .findAny().orElse(null); - if ((nmod.state == Node.State.failed || nmod.state == Node.State.parked) - || (parent != null && - (parent.state == Node.State.failed || parent.state == Node.State.parked))) { - continue; - } nodes.add(createNodeFromModel(nmod)); } @@ -291,8 +282,9 @@ public class NodeAlerterTester { void cleanRepository() { nodeRepository.getNodes(NodeType.host).forEach(n -> nodeRepository.removeRecursively(n, true)); + nodeRepository.getNodes().forEach(n -> nodeRepository.removeRecursively(n, true)); if (nodeRepository.getNodes().size() != 0) { - throw new IllegalStateException("Removing all nodes didn't remove all nodes! [" + nodeRepository.getNodes().size() + "]"); + throw new IllegalStateException("Cleaning repository didn't remove all nodes! [" + nodeRepository.getNodes().size() + "]"); } } } |