summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author freva <valerijf@yahoo-inc.com> 2017-02-27 14:33:20 +0100
committer freva <valerijf@yahoo-inc.com> 2017-02-27 14:33:20 +0100
commit 12f1b6946e3ef2ffc907aecf29309692432d0541 (patch)
tree e7f87be0be11d3b150153e1ada455ba9e2c79a28 /node-repository
parent 692a32d760cf003c7440f760587f9cb5b4430118 (diff)
Allow NodeFailer to fail multiple hosts
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 6
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java 137
2 files changed, 50 insertions, 93 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index b0cee37c5ec..f65b471dc20 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -150,13 +150,13 @@ public class NodeFailer extends Maintainer {
}
/**
- * We can attempt to fail any number of *tenant* nodes because the operation will not be effected unless
- * the node is replaced.
+ * We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected
+ * unless the node is replaced.
* However, nodes of other types are not replaced (because all of the type are used by a single application),
* so we only allow one to be in failed at any point in time to protect against runaway failing.
*/
private boolean failAllowedFor(NodeType nodeType) {
- if (nodeType == NodeType.tenant) return true;
+ if (nodeType == NodeType.tenant || nodeType == NodeType.host) return true;
return nodeRepository().getNodes(nodeType, Node.State.failed).size() == 0;
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 0251e60e70b..165965adede 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -3,18 +3,21 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Status;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException;
import org.junit.Test;
import java.time.Duration;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -207,18 +210,13 @@ public class NodeFailerTest {
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
+ assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
// Select the first host that has two active nodes
- String downHost1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream()
- .collect(Collectors.groupingBy(Node::parentHostname))
- .entrySet().stream()
- .filter(entry -> entry.getValue().size() == 2)
- .map(Map.Entry::getKey)
- .findFirst().get().get();
+ String downHost1 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
tester.serviceMonitor.setHostDown(downHost1);
// nothing happens the first 45 minutes
@@ -226,135 +224,77 @@ public class NodeFailerTest {
tester.failer.run();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
- assertEquals( 0, tester.deployer.redeployments);
+ assertEquals(0, tester.deployer.redeployments);
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
+ assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
tester.clock.advance(Duration.ofMinutes(30));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
- assertEquals( 3, tester.deployer.redeployments);
+ assertEquals(2 + 1, tester.deployer.redeployments);
assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
+ assertEquals(10, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 10, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
// Now lets fail an active tenant node
- String downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0).hostname();
- tester.serviceMonitor.setHostDown(downTenant1);
+ Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0);
+ tester.serviceMonitor.setHostDown(downTenant1.hostname());
// nothing happens the first 45 minutes
for (int minutes = 0; minutes < 45; minutes += 5 ) {
tester.failer.run();
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
- assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size());
}
tester.clock.advance(Duration.ofMinutes(30));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
- assertEquals( 4, tester.deployer.redeployments);
+ assertEquals(3 + 1, tester.deployer.redeployments);
assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
+ assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- // Lets fail another host, but now nothing should happen as we have already failed a host
- String downHost2 = tester.nodeRepository.getNodes(NodeType.host).get(0).hostname();
+ // Lets fail another host, make sure it is not the same where downTenant1 is a child
+ String downHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
tester.serviceMonitor.setHostDown(downHost2);
-
- // Nothing happens
- for (int minutes = 0; minutes < 90; minutes += 5 ) {
- tester.failer.run();
- tester.clock.advance(Duration.ofMinutes(5));
- tester.allNodesMakeAConfigRequestExcept();
- assertEquals( 4, tester.deployer.redeployments);
- assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
- assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- }
- }
-
- @Test
- public void testFailingDockerHostNoReplacement() {
- // app2 requires 5 nodes
- NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(5);
-
- // For a day all nodes work so nothing happens
- for (int minutes = 0; minutes < 24 * 60; minutes += 5 ) {
- tester.clock.advance(Duration.ofMinutes(5));
- tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
- assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
- assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- }
-
-
- // Select the first host that has two active nodes
- String downHost1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream()
- .collect(Collectors.groupingBy(Node::parentHostname))
- .entrySet().stream()
- .filter(entry -> entry.getValue().size() == 2)
- .map(Map.Entry::getKey)
- .findFirst().get().get();
- tester.serviceMonitor.setHostDown(downHost1);
-
- // nothing happens the first 45 minutes
- for (int minutes = 0; minutes < 45; minutes += 5 ) {
- tester.failer.run();
- tester.clock.advance(Duration.ofMinutes(5));
- tester.allNodesMakeAConfigRequestExcept();
- assertEquals( 0, tester.deployer.redeployments);
- assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
- assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- }
-
- tester.clock.advance(Duration.ofMinutes(30));
+ tester.failer.run();
+ tester.clock.advance(Duration.ofMinutes(90));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
- // The node used by app1 should've been redeployed, while the host and node used by app2 should stay
- assertEquals( 1, tester.deployer.redeployments);
- assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(5 + 2, tester.deployer.redeployments);
+ assertEquals(7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
+ assertEquals(6, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
-
- // Now lets fail an active tenant node, this should work as normal
- String downTenant1 = tester.nodeRepository.getNodes(NodeFailTester.app1).get(0).hostname();
- tester.serviceMonitor.setHostDown(downTenant1);
-
- // nothing happens the first 45 minutes
- for (int minutes = 0; minutes < 45; minutes += 5 ) {
- tester.failer.run();
- tester.clock.advance(Duration.ofMinutes(5));
- tester.allNodesMakeAConfigRequestExcept();
- assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- }
- tester.clock.advance(Duration.ofMinutes(30));
+ // We have only 5 hosts remaining, so if we fail another host, we should only be able to redeploy app1's
+ // node, while app2's should remain
+ String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
+ tester.serviceMonitor.setHostDown(downHost3);
+ tester.failer.run();
+ tester.clock.advance(Duration.ofMinutes(90));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
- assertEquals( 2, tester.deployer.redeployments);
- assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
+ assertEquals(6 + 2, tester.deployer.redeployments);
+ assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
+ assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
- assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
-
@Test
public void testFailingProxyNodes() {
NodeFailTester tester = NodeFailTester.withProxyApplication();
@@ -410,4 +350,21 @@ public class NodeFailerTest {
assertTrue(downHosts.contains(failedHost2));
}
+ /**
+ * Selects the first parent host that:
+ * - has exactly n nodes in state 'active'
+ * - is not present in the 'except' array
+ */
+ private String selectFirstParentHostWithNActiveNodesExcept(NodeRepository nodeRepository, int n, String... except) {
+ Set<String> exceptSet = Arrays.stream(except).collect(Collectors.toSet());
+ return nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream()
+ .collect(Collectors.groupingBy(Node::parentHostname))
+ .entrySet().stream()
+ .filter(entry -> entry.getValue().size() == n)
+ .map(Map.Entry::getKey)
+ .flatMap(parentHost -> Stream.of(parentHost.get()))
+ .filter(node -> ! exceptSet.contains(node))
+ .findFirst().get();
+ }
+
}