author    Valerij Fredriksen <valerijf@verizonmedia.com>  2019-05-09 13:25:06 +0200
committer Valerij Fredriksen <valerijf@verizonmedia.com>  2019-05-09 13:25:06 +0200
commit    41d8c9da0b06d68666adeb1ab2b3bf7d772d162d (patch)
tree      7d50c5f87268766080982122753ca07aabb4d3bb /node-repository
parent    68ce1d964984d11d50aed462751d0d9c7b80fb51 (diff)
Disallow failing config/controller(hosts)
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java      18
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java  20
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java   53
3 files changed, 46 insertions, 45 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 52eb9ba95df..f0d9f9f314f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -209,7 +209,7 @@ public class NodeFailer extends Maintainer {
nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
} else {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
- if (hostNode.type() == NodeType.host) {
+ if (hostNode.type().isDockerHost()) {
List<String> failureReports = reasonsToFailParentHost(hostNode);
if (failureReports.size() > 0) {
if (hostNode.equals(node)) {
@@ -291,12 +291,20 @@ public class NodeFailer extends Maintainer {
/**
* We can attempt to fail any number of *tenant* and *host* nodes because the operation will not be effected
* unless the node is replaced.
- * However, nodes of other types are not replaced (because all of the type are used by a single application),
- * so we only allow one to be in failed at any point in time to protect against runaway failing.
+ * We can also attempt to fail a single proxy(host) as there should be enough redundancy to handle that.
+ * But we refuse to fail config(host)/controller(host) nodes.
*/
private boolean failAllowedFor(NodeType nodeType) {
- if (nodeType == NodeType.tenant || nodeType == NodeType.host) return true;
- return nodeRepository().getNodes(nodeType, Node.State.failed).size() == 0;
+ switch (nodeType) {
+ case tenant:
+ case host:
+ return true;
+ case proxy:
+ case proxyhost:
+ return nodeRepository().getNodes(nodeType, Node.State.failed).size() == 0;
+ default:
+ return false;
+ }
}
/**
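The switch above is the heart of the change: node types that are replaced on failure (tenant, host) may always be failed, proxy and proxyhost are limited to one failed node at a time, and config/controller (hosts) are never failed automatically. Below is a minimal, self-contained sketch of that policy, where countAlreadyFailed is a hypothetical stand-in for the nodeRepository().getNodes(nodeType, Node.State.failed).size() lookup used in the real method:

    import java.util.Map;
    import java.util.function.ToIntFunction;

    public class FailPolicySketch {

        // Mirrors the node types referenced in the hunk above.
        enum NodeType { tenant, host, proxy, proxyhost, config, confighost, controller, controllerhost }

        // tenant/host: always allowed (failing only takes effect once the node is replaced).
        // proxy/proxyhost: allowed only while no node of that type is already failed.
        // config/confighost and controller/controllerhost: never allowed.
        static boolean failAllowedFor(NodeType nodeType, ToIntFunction<NodeType> countAlreadyFailed) {
            switch (nodeType) {
                case tenant:
                case host:
                    return true;
                case proxy:
                case proxyhost:
                    return countAlreadyFailed.applyAsInt(nodeType) == 0;
                default:
                    return false;
            }
        }

        public static void main(String[] args) {
            // Pretend one proxy is already in State.failed, nothing else.
            Map<NodeType, Integer> failed = Map.of(NodeType.proxy, 1);
            ToIntFunction<NodeType> count = type -> failed.getOrDefault(type, 0);

            System.out.println(failAllowedFor(NodeType.tenant, count));      // true
            System.out.println(failAllowedFor(NodeType.proxy, count));       // false (one already failed)
            System.out.println(failAllowedFor(NodeType.proxyhost, count));   // true (none failed yet)
            System.out.println(failAllowedFor(NodeType.confighost, count));  // false (never allowed)
        }
    }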
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 5e76ebff9b2..2438b5e1893 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -10,10 +10,10 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.DockerImage;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.Flavor;
-import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.HostSpec;
import com.yahoo.config.provision.InstanceName;
import com.yahoo.config.provision.NodeFlavors;
+import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.RegionName;
import com.yahoo.config.provision.TenantName;
@@ -48,6 +48,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.Set;
import static org.junit.Assert.assertEquals;
@@ -158,22 +159,21 @@ public class NodeFailTester {
return tester;
}
- public static NodeFailTester withProxyApplication() {
+ public static NodeFailTester withInfraApplication(NodeType nodeType, int count) {
NodeFailTester tester = new NodeFailTester();
-
- tester.createReadyNodes(16, NodeType.proxy);
+ tester.createReadyNodes(count, nodeType);
// Create application
- Capacity allProxies = Capacity.fromRequiredNodeType(NodeType.proxy);
+ Capacity allNodes = Capacity.fromRequiredNodeType(nodeType);
ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container,
ClusterSpec.Id.from("test"),
Version.fromString("6.42"),
- false, Collections.emptySet());
- tester.activate(app1, clusterApp1, allProxies);
- assertEquals(16, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
+ false, Set.of());
+ tester.activate(app1, clusterApp1, allNodes);
+ assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
- Map<ApplicationId, MockDeployer.ApplicationContext> apps = new HashMap<>();
- apps.put(app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allProxies, 1));
+ Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(
+ app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allNodes, 1));
tester.deployer = new MockDeployer(tester.provisioner, tester.clock(), apps);
tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
tester.metric = new MetricsReporterTest.TestMetric();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 965ce990c64..1d988b6283a 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -16,8 +16,6 @@ import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -90,10 +88,9 @@ public class NodeFailerTest {
tester.failer.run();
// The first (and the only) ready node and the 1st active node that was allowed to fail should be failed
- Map<Node.State, List<String>> expectedHostnamesByState1Iter = new HashMap<>();
- expectedHostnamesByState1Iter.put(Node.State.failed,
- Arrays.asList(hostnamesByState.get(Node.State.active).get(0), hostnamesByState.get(Node.State.ready).get(0)));
- expectedHostnamesByState1Iter.put(Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2));
+ Map<Node.State, List<String>> expectedHostnamesByState1Iter = Map.of(
+ Node.State.failed, List.of(hostnamesByState.get(Node.State.active).get(0), hostnamesByState.get(Node.State.ready).get(0)),
+ Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2));
Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
.collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
assertEquals(expectedHostnamesByState1Iter, hostnamesByState1Iter);
@@ -108,7 +105,7 @@ public class NodeFailerTest {
// All of the children should be failed now
Set<Node.State> childStates2Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
.map(Node::state).collect(Collectors.toSet());
- assertEquals(Collections.singleton(Node.State.failed), childStates2Iter);
+ assertEquals(Set.of(Node.State.failed), childStates2Iter);
// The host itself is still active as it too must be allowed to suspend
assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithHwFailure).get().state());
@@ -483,7 +480,16 @@ public class NodeFailerTest {
@Test
public void failing_proxy_nodes() {
- NodeFailTester tester = NodeFailTester.withProxyApplication();
+ test_infra_application_fail(NodeType.proxy, 10, 1);
+ }
+
+ @Test
+ public void failing_config_hosts() {
+ test_infra_application_fail(NodeType.confighost, 3, 0);
+ }
+
+ private void test_infra_application_fail(NodeType nodeType, int count, int expectedFailCount) {
+ NodeFailTester tester = NodeFailTester.withInfraApplication(nodeType, count);
// For a day all nodes work so nothing happens
for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
@@ -491,12 +497,10 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
- assertEquals(16, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
+ assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
}
- Set<String> downHosts = new HashSet<>();
- downHosts.add("host4");
- downHosts.add("host5");
+ Set<String> downHosts = Set.of("host2", "host3");
for (String downHost : downHosts)
tester.serviceMonitor.setHostDown(downHost);
@@ -506,34 +510,23 @@ public class NodeFailerTest {
tester.clock.advance(Duration.ofMinutes(5));
tester.allNodesMakeAConfigRequestExcept();
assertEquals( 0, tester.deployer.redeployments);
- assertEquals(16, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
- assertEquals( 0, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).size());
+ assertEquals(count, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
}
tester.clock.advance(Duration.ofMinutes(60));
tester.failer.run();
// one down host should now be failed, but not two as we are only allowed to fail one proxy
- assertEquals( 1, tester.deployer.redeployments);
- assertEquals(15, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
- assertEquals( 1, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).size());
- String failedHost1 = tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).get(0).hostname();
- assertTrue(downHosts.contains(failedHost1));
+ assertEquals(expectedFailCount, tester.deployer.redeployments);
+ assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
+ assertEquals(expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.failed).size());
+ tester.nodeRepository.getNodes(nodeType, Node.State.failed)
+ .forEach(node -> assertTrue(downHosts.contains(node.hostname())));
// trying to fail again will still not fail the other down host
tester.clock.advance(Duration.ofMinutes(60));
tester.failer.run();
- assertEquals(15, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
-
- // The first down host is removed, which causes the second one to be moved to failed
- tester.nodeRepository.removeRecursively(failedHost1);
- tester.failer.run();
- assertEquals( 2, tester.deployer.redeployments);
- assertEquals(14, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.active).size());
- assertEquals( 1, tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).size());
- String failedHost2 = tester.nodeRepository.getNodes(NodeType.proxy, Node.State.failed).get(0).hostname();
- assertFalse(failedHost1.equals(failedHost2));
- assertTrue(downHosts.contains(failedHost2));
+ assertEquals(count - expectedFailCount, tester.nodeRepository.getNodes(nodeType, Node.State.active).size());
}
@Test
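Since the new helper is parameterized on node type, the other protected type named in the commit message could be covered the same way. A hypothetical follow-up test, not part of this commit and assuming a NodeType.controllerhost value exists on this branch:

    @Test
    public void failing_controller_hosts() {
        // Hypothetical: controller hosts should behave like config hosts and never be failed automatically.
        test_infra_application_fail(NodeType.controllerhost, 3, 0);
    }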