diff options
author | freva <valerijf@yahoo-inc.com> | 2017-02-27 14:33:20 +0100 |
---|---|---|
committer | freva <valerijf@yahoo-inc.com> | 2017-02-27 14:33:20 +0100 |
commit | 12f1b6946e3ef2ffc907aecf29309692432d0541 (patch) | |
tree | e7f87be0be11d3b150153e1ada455ba9e2c79a28 /node-repository | |
parent | 692a32d760cf003c7440f760587f9cb5b4430118 (diff) |
Allow NodeFailer to fail multiple hosts
Diffstat (limited to 'node-repository')
2 files changed, 50 insertions, 93 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index b0cee37c5ec..f65b471dc20 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -150,13 +150,13 @@ public class NodeFailer extends Maintainer { } /** - * We can attempt to fail any number of *tenant* nodes because the operation will not be effected unless - * the node is replaced. + * We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected + * unless the node is replaced. * However, nodes of other types are not replaced (because all of the type are used by a single application), * so we only allow one to be in failed at any point in time to protect against runaway failing. */ private boolean failAllowedFor(NodeType nodeType) { - if (nodeType == NodeType.tenant) return true; + if (nodeType == NodeType.tenant || nodeType == NodeType.host) return true; return nodeRepository().getNodes(nodeType, Node.State.failed).size() == 0; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index 0251e60e70b..165965adede 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -3,18 +3,21 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Status; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException; import org.junit.Test; import java.time.Duration; +import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -207,18 +210,13 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); } // Select the first host that has two active nodes - String downHost1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream() - .collect(Collectors.groupingBy(Node::parentHostname)) - .entrySet().stream() - .filter(entry -> entry.getValue().size() == 2) - .map(Map.Entry::getKey) - .findFirst().get().get(); + String downHost1 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2); tester.serviceMonitor.setHostDown(downHost1); // nothing happens the first 45 minutes @@ -226,135 +224,77 @@ public class NodeFailerTest { tester.failer.run(); tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); - assertEquals( 0, tester.deployer.redeployments); + assertEquals(0, tester.deployer.redeployments); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); } tester.clock.advance(Duration.ofMinutes(30)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); - assertEquals( 3, tester.deployer.redeployments); + assertEquals(2 + 1, tester.deployer.redeployments); assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(10, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 10, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); // Now lets fail an active tenant node - String downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0).hostname(); - tester.serviceMonitor.setHostDown(downTenant1); + Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0); + tester.serviceMonitor.setHostDown(downTenant1.hostname()); // nothing happens the first 45 minutes for (int minutes = 0; minutes < 45; minutes += 5 ) { tester.failer.run(); tester.clock.advance(Duration.ofMinutes(5)); tester.allNodesMakeAConfigRequestExcept(); - assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size()); } tester.clock.advance(Duration.ofMinutes(30)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); - assertEquals( 4, tester.deployer.redeployments); + assertEquals(3 + 1, tester.deployer.redeployments); assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - // Lets fail another host, but now nothing should happen as we have already failed a host - String downHost2 = tester.nodeRepository.getNodes(NodeType.host).get(0).hostname(); + // Lets fail another host, make sure it is not the same where downTenant1 is a child + String downHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get()); tester.serviceMonitor.setHostDown(downHost2); - - // Nothing happens - for (int minutes = 0; minutes < 90; minutes += 5 ) { - tester.failer.run(); - tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); - assertEquals( 4, tester.deployer.redeployments); - assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - } - } - - @Test - public void testFailingDockerHostNoReplacement() { - // app2 requires 5 nodes - NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(5); - - // For a day all nodes work so nothing happens - for (int minutes = 0; minutes < 24 * 60; minutes += 5 ) { - tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); - tester.failer.run(); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - } - - - // Select the first host that has two active nodes - String downHost1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream() - .collect(Collectors.groupingBy(Node::parentHostname)) - .entrySet().stream() - .filter(entry -> entry.getValue().size() == 2) - .map(Map.Entry::getKey) - .findFirst().get().get(); - tester.serviceMonitor.setHostDown(downHost1); - - // nothing happens the first 45 minutes - for (int minutes = 0; minutes < 45; minutes += 5 ) { - tester.failer.run(); - tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); - assertEquals( 0, tester.deployer.redeployments); - assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); - assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - } - - tester.clock.advance(Duration.ofMinutes(30)); + tester.failer.run(); + tester.clock.advance(Duration.ofMinutes(90)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); - // The node used by app1 should've been redeployed, while the host and node used by app2 should stay - assertEquals( 1, tester.deployer.redeployments); - assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(5 + 2, tester.deployer.redeployments); + assertEquals(7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(6, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - - // Now lets fail an active tenant node, this should work as normal - String downTenant1 = tester.nodeRepository.getNodes(NodeFailTester.app1).get(0).hostname(); - tester.serviceMonitor.setHostDown(downTenant1); - - // nothing happens the first 45 minutes - for (int minutes = 0; minutes < 45; minutes += 5 ) { - tester.failer.run(); - tester.clock.advance(Duration.ofMinutes(5)); - tester.allNodesMakeAConfigRequestExcept(); - assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - } - tester.clock.advance(Duration.ofMinutes(30)); + // We have only 5 hosts remaining, so if we fail another host, we should only be able to redeploy app1's + // node, while app2's should remain + String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get()); + tester.serviceMonitor.setHostDown(downHost3); + tester.failer.run(); + tester.clock.advance(Duration.ofMinutes(90)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); - assertEquals( 2, tester.deployer.redeployments); - assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); + assertEquals(6 + 2, tester.deployer.redeployments); + assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); + assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size()); - assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); } - @Test public void testFailingProxyNodes() { NodeFailTester tester = NodeFailTester.withProxyApplication(); @@ -410,4 +350,21 @@ public class NodeFailerTest { assertTrue(downHosts.contains(failedHost2)); } + /** + * Selects the first parent host that: + * - has exactly n nodes in state 'active' + * - is not present in the 'except' array + */ + private String selectFirstParentHostWithNActiveNodesExcept(NodeRepository nodeRepository, int n, String... except) { + Set<String> exceptSet = Arrays.stream(except).collect(Collectors.toSet()); + return nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream() + .collect(Collectors.groupingBy(Node::parentHostname)) + .entrySet().stream() + .filter(entry -> entry.getValue().size() == n) + .map(Map.Entry::getKey) + .flatMap(parentHost -> Stream.of(parentHost.get())) + .filter(node -> ! exceptSet.contains(node)) + .findFirst().get(); + } + } |