From 41d8c9da0b06d68666adeb1ab2b3bf7d772d162d Mon Sep 17 00:00:00 2001
From: Valerij Fredriksen
Date: Thu, 9 May 2019 13:25:06 +0200
Subject: Disallow failing config/controller(hosts)

---
 .../hosted/provision/maintenance/NodeFailer.java   | 18 ++++++--
 .../provision/maintenance/NodeFailTester.java      | 20 ++++----
 .../provision/maintenance/NodeFailerTest.java      | 53 ++++++++++------------
 3 files changed, 46 insertions(+), 45 deletions(-)

diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 52eb9ba95df..f0d9f9f314f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -209,7 +209,7 @@ public class NodeFailer extends Maintainer {
                 nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
             } else {
                 Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
-                if (hostNode.type() == NodeType.host) {
+                if (hostNode.type().isDockerHost()) {
                     List<String> failureReports = reasonsToFailParentHost(hostNode);
                     if (failureReports.size() > 0) {
                         if (hostNode.equals(node)) {
@@ -291,12 +291,20 @@ public class NodeFailer extends Maintainer {
     /**
      * We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected
      * unless the node is replaced.
-     * However, nodes of other types are not replaced (because all of the type are used by a single application),
-     * so we only allow one to be in failed at any point in time to protect against runaway failing.
+     * We can also attempt to fail a single proxy(host), as there should be enough redundancy to handle that,
+     * but we refuse to fail config(host)/controller(host) nodes.
      */
     private boolean failAllowedFor(NodeType nodeType) {
-        if (nodeType == NodeType.tenant || nodeType == NodeType.host) return true;
-        return nodeRepository().getNodes(nodeType, Node.State.failed).size() == 0;
+        switch (nodeType) {
+            case tenant:
+            case host:
+                return true;
+            case proxy:
+            case proxyhost:
+                return nodeRepository().getNodes(nodeType, Node.State.failed).size() == 0;
+            default:
+                return false;
+        }
     }
 
     /**
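
Review note, not part of the patch: the policy in failAllowedFor is easiest to read as a pure function of the node type and the number of already-failed nodes of that type. A minimal, self-contained sketch, assuming the enum below mirrors the relevant com.yahoo.config.provision.NodeType constants and with failedCountOfType standing in for the nodeRepository().getNodes(nodeType, Node.State.failed).size() lookup:

    // Standalone illustration of the new fail policy; not the patched class itself.
    enum NodeType { tenant, host, proxy, proxyhost, config, confighost, controller, controllerhost }

    class FailPolicySketch {
        // tenant/host: always allowed (failing only takes effect if the node is replaced);
        // proxy(host): at most one in failed at a time; config/controller(host): never.
        static boolean failAllowedFor(NodeType nodeType, int failedCountOfType) {
            switch (nodeType) {
                case tenant:
                case host:
                    return true;
                case proxy:
                case proxyhost:
                    return failedCountOfType == 0;
                default:
                    return false;
            }
        }

        public static void main(String[] args) {
            System.out.println(failAllowedFor(NodeType.proxy, 0));      // true: no proxy failed yet
            System.out.println(failAllowedFor(NodeType.proxy, 1));      // false: one proxy already in failed
            System.out.println(failAllowedFor(NodeType.confighost, 0)); // false: config hosts are never auto-failed
        }
    }
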
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 5e76ebff9b2..2438b5e1893 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -10,10 +10,10 @@ import com.yahoo.config.provision.ClusterSpec;
 import com.yahoo.config.provision.DockerImage;
 import com.yahoo.config.provision.Environment;
 import com.yahoo.config.provision.Flavor;
-import com.yahoo.config.provision.NodeResources;
 import com.yahoo.config.provision.HostSpec;
 import com.yahoo.config.provision.InstanceName;
 import com.yahoo.config.provision.NodeFlavors;
+import com.yahoo.config.provision.NodeResources;
 import com.yahoo.config.provision.NodeType;
 import com.yahoo.config.provision.RegionName;
 import com.yahoo.config.provision.TenantName;
@@ -48,6 +48,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
+import java.util.Set;
 
 import static org.junit.Assert.assertEquals;
 
@@ -158,22 +159,21 @@ public class NodeFailTester {
         return tester;
     }
 
-    public static NodeFailTester withProxyApplication() {
+    public static NodeFailTester withInfraApplication(NodeType nodeType, int count) {
         NodeFailTester tester = new NodeFailTester();
-
-        tester.createReadyNodes(16, NodeType.proxy);
+        tester.createReadyNodes(count, nodeType);
 
         // Create application
-        Capacity allProxies = Capacity.fromRequiredNodeType(NodeType.proxy);
+        Capacity allNodes = Capacity.fromRequiredNodeType(nodeType);
         ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test"), Version.fromString("6.42"),
-                                                      false, Collections.emptySet());
-        tester.activate(app1, clusterApp1, allProxies);
-        assertEquals(16, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
+                                                      false, Set.of());
+        tester.activate(app1, clusterApp1, allNodes);
+        assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
 
-        Map<ApplicationId, MockDeployer.ApplicationContext> apps = new HashMap<>();
-        apps.put(app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allProxies, 1));
+        Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
+                app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allNodes, 1));
         tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
         tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
         tester.metric = new MetricsReporterTest.TestMetric();
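
Review note, not part of the patch: call sites migrate mechanically from the proxy-specific helper to the parameterized one. A hypothetical before/after in the same package as NodeFailTester; the node counts are illustrative (the old helper hard-coded 16 proxies):

    import com.yahoo.config.provision.NodeType;

    class WithInfraApplicationUsage {
        void examples() {
            // Before: always a 16-node proxy application
            // NodeFailTester tester = NodeFailTester.withProxyApplication();

            // After: the infrastructure node type and node count are explicit
            NodeFailTester proxies     = NodeFailTester.withInfraApplication(NodeType.proxy, 16);
            NodeFailTester configHosts = NodeFailTester.withInfraApplication(NodeType.confighost, 3);
        }
    }
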
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 965ce990c64..1d988b6283a 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -16,8 +16,6 @@ import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
@@ -90,10 +88,9 @@ public class NodeFailerTest {
         tester.failer.run();
 
         // The first (and the only) ready node and the 1st active node that was allowed to fail should be failed
-        Map<Node.State, List<String>> expectedHostnamesByState1Iter = new HashMap<>();
-        expectedHostnamesByState1Iter.put(Node.State.failed,
-                Arrays.asList(hostnamesByState.get(Node.State.active).get(0), hostnamesByState.get(Node.State.ready).get(0)));
-        expectedHostnamesByState1Iter.put(Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2));
+        Map<Node.State, List<String>> expectedHostnamesByState1Iter = Map.of(
+                Node.State.failed, List.of(hostnamesByState.get(Node.State.active).get(0), hostnamesByState.get(Node.State.ready).get(0)),
+                Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2));
         Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
                 .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
         assertEquals(expectedHostnamesByState1Iter, hostnamesByState1Iter);
@@ -108,7 +105,7 @@ public class NodeFailerTest {
         // All of the children should be failed now
         Set<Node.State> childStates2Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
                 .map(Node::state).collect(Collectors.toSet());
-        assertEquals(Collections.singleton(Node.State.failed), childStates2Iter);
+        assertEquals(Set.of(Node.State.failed), childStates2Iter);
 
         // The host itself is still active as it too must be allowed to suspend
         assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithHwFailure).get().state());
@@ -483,7 +480,16 @@ public class NodeFailerTest {
 
     @Test
     public void failing_proxy_nodes() {
-        NodeFailTester tester = NodeFailTester.withProxyApplication();
+        test_infra_application_fail(NodeType.proxy, 10, 1);
+    }
+
+    @Test
+    public void failing_config_hosts() {
+        test_infra_application_fail(NodeType.confighost, 3, 0);
+    }
+
+    private void test_infra_application_fail(NodeType nodeType, int count, int expectedFailCount) {
+        NodeFailTester tester = NodeFailTester.withInfraApplication(nodeType, count);
 
         // For a day all nodes work so nothing happens
         for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
             tester.failer.run();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();
-            assertEquals(16, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
+            assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
         }
 
-        Set<String> downHosts = new HashSet<>();
-        downHosts.add("host4");
-        downHosts.add("host5");
+        Set<String> downHosts = Set.of("host2", "host3");
 
         for (String downHost : downHosts)
             tester.serviceMonitor.setHostDown(downHost);
 
         // nothing happens the first 45 minutes
         for (int minutes = 0; minutes < 45; minutes += 5) {
             tester.failer.run();
             tester.clock.advance(Duration.ofMinutes(5));
             tester.allNodesMakeAConfigRequestExcept();
             assertEquals( 0, tester.deployer.redeployments);
-            assertEquals(16, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
-            assertEquals( 0, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).size());
+            assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
         }
 
         tester.clock.advance(Duration.ofMinutes(60));
         tester.failer.run();
 
         // one down host should now be failed, but not two as we are only allowed to fail one proxy
-        assertEquals( 1, tester.deployer.redeployments);
-        assertEquals(15, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
-        assertEquals( 1, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).size());
-        String failedHost1 = tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).get(0).hostname();
-        assertTrue(downHosts.contains(failedHost1));
+        assertEquals(expectedFailCount, tester.deployer.redeployments);
+        assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
+        assertEquals(expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.failed).size());
+        tester.nodeRepository.getNodes(nodeType, Node.State.failed)
+                .forEach(node -> assertTrue(downHosts.contains(node.hostname())));
 
         // trying to fail again will still not fail the other down host
         tester.clock.advance(Duration.ofMinutes(60));
         tester.failer.run();
-        assertEquals(15, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
-
-        // The first down host is removed, which causes the second one to be moved to failed
-        tester.nodeRepository.removeRecursively(failedHost1);
-        tester.failer.run();
-        assertEquals( 2, tester.deployer.redeployments);
-        assertEquals(14, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
-        assertEquals( 1, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).size());
-        String failedHost2 = tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).get(0).hostname();
-        assertFalse(failedHost1.equals(failedHost2));
-        assertTrue(downHosts.contains(failedHost2));
+        assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
     }
 
     @Test
-- 
cgit v1.2.3
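
Review note, not part of the patch: the two new test cases pin down both sides of the policy: proxies may have one node failed at a time, while config hosts may have none. A self-contained sketch of the arithmetic the final assertions encode, with the (nodeType, count, expectedFailCount) triples taken from the tests above:

    public class InfraFailExpectations {
        public static void main(String[] args) {
            // Mirrors the final assertions: active == count - expectedFailCount, failed == expectedFailCount
            printExpectation("proxy", 10, 1);      // one proxy may be failed at a time
            printExpectation("confighost", 3, 0);  // config hosts are never auto-failed
        }

        static void printExpectation(String nodeType, int count, int expectedFailCount) {
            System.out.printf("%s: expect %d active, %d failed of %d total%n",
                              nodeType, count - expectedFailCount, expectedFailCount, count);
        }
    }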