summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2017-08-21 09:46:29 +0200
committerGitHub <noreply@github.com>2017-08-21 09:46:29 +0200
commit3a78992a83949fd4d085badc6f2dc241afb5560b (patch)
tree9aa12d11d1f9f5e1590ac650c30d4a271bd6b8f8 /node-repository
parent9c911e14aa16f5165a46880ebe112e3a8162277a (diff)
parent868459925a76e9b1105831629841da8c50985a8b (diff)
Merge pull request #3154 from vespa-engine/freva/FailedExpired-park-nodes-with-hw-failure
FailedExpirer move to parked if HW failure
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java33
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java120
2 files changed, 113 insertions, 40 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index d61f06be744..a86ba955a9a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -3,24 +3,31 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.Flavor;
+import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.Zone;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import java.time.Clock;
import java.time.Duration;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
/**
- * This moves nodes from failed back to dirty if
+ * This moves expired failed nodes:
* <ul>
- * <li>No hardware failure is known to be detected on the node
- * <li>The node has failed less than 5 times OR the environment is dev, test or perf OR system is CI or CD,
+ * <li>To parked: If the node has known hardware failure, docker hosts are moved to parked only when all its
+ * children are already in parked
+ * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf OR system is CD,
* as those environments have no protection against users running bogus applications, so
* we cannot use the node failure count to conclude the node has a failure.
+ * <li>Otherwise the node will remain in failed
* </ul>
* Failed nodes are typically given a long expiry time to enable us to manually moved them back to
* active to recover data in cases where the node was failed accidentally.
@@ -36,6 +43,7 @@ import java.util.List;
*/
public class FailedExpirer extends Expirer {
+    private static final Logger log = Logger.getLogger(FailedExpirer.class.getName());
private final NodeRepository nodeRepository;
private final Zone zone;
@@ -50,9 +58,22 @@ public class FailedExpirer extends Expirer {
protected void expire(List<Node> expired) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node recycleCandidate : expired) {
- if (recycleCandidate.status().hardwareFailureDescription().isPresent()) continue;
- if (failCountIndicatesHwFail(zone, recycleCandidate) && recycleCandidate.status().failCount() >= 5) continue;
- nodesToRecycle.add(recycleCandidate);
+ if (recycleCandidate.status().hardwareFailureDescription().isPresent()) {
+ List<String> nonParkedChildren = recycleCandidate.type() != NodeType.host ? Collections.emptyList() :
+ nodeRepository.getChildNodes(recycleCandidate.hostname()).stream()
+ .filter(node -> node.state() != Node.State.parked)
+ .map(Node::hostname)
+ .collect(Collectors.toList());
+
+ if (nonParkedChildren.isEmpty()) {
+ nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure on node");
+ } else {
+ log.info(String.format("Expired failed node %s with HW fail is not parked because some of its children" +
+ " (%s) are not yet parked", recycleCandidate.hostname(), String.join(", ", nonParkedChildren)));
+ }
+ } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) {
+ nodesToRecycle.add(recycleCandidate);
+ }
}
nodeRepository.setDirty(nodesToRecycle);
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 028282bea13..3ccacb3ff02 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -32,8 +32,11 @@ import org.junit.Test;
import java.time.Duration;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.Optional;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import static org.junit.Assert.assertEquals;
@@ -43,50 +46,102 @@ import static org.junit.Assert.assertEquals;
public class FailedExpirerTest {
private final Curator curator = new MockCurator();
+ private final ManualClock clock = new ManualClock();
+ private FailedExpirer failedExpirer;
@Test
public void ensure_failed_nodes_are_deallocated_in_prod() throws InterruptedException {
- NodeRepository nodeRepository = failureScenarioIn(SystemName.main, Environment.prod, "default");
+ failureScenarioIn(SystemName.main, Environment.prod, "default");
+ clock.advance(Duration.ofDays(5));
+ failedExpirer.run();
- assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
- assertEquals("node3", nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).get(0).hostname());
+ assertNodeHostnames(Node.State.failed, "node1");
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
}
@Test
public void ensure_failed_nodes_are_deallocated_in_dev() throws InterruptedException {
- NodeRepository nodeRepository = failureScenarioIn(SystemName.main, Environment.dev, "default");
+ failureScenarioIn(SystemName.main, Environment.dev, "default");
+ clock.advance(Duration.ofDays(5));
+ failedExpirer.run();
- assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
- assertEquals("node2", nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
+ assertNodeHostnames(Node.State.dirty, "node1");
}
@Test
public void ensure_failed_nodes_are_deallocated_in_cd() throws InterruptedException {
- NodeRepository nodeRepository = failureScenarioIn(SystemName.cd, Environment.prod, "default");
+ failureScenarioIn(SystemName.cd, Environment.prod, "default");
+ clock.advance(Duration.ofDays(5));
+ failedExpirer.run();
- assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
- assertEquals("node2", nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
+ assertNodeHostnames(Node.State.dirty, "node1");
}
@Test
public void ensure_failed_docker_nodes_are_deallocated() throws InterruptedException {
- NodeRepository nodeRepository = failureScenarioIn(SystemName.main, Environment.prod, "docker");
+ failureScenarioIn(SystemName.main, Environment.prod, "docker");
+ clock.advance(Duration.ofDays(5));
+ failedExpirer.run();
+
+ assertNodeHostnames(Node.State.parked, "node2", "node3");
+ assertNodeHostnames(Node.State.dirty, "node1");
+ }
+
+ @Test
+ public void ensure_parked_docker_host() throws InterruptedException {
+ failureScenarioIn(SystemName.main, Environment.prod, "docker");
+
+ failNode("parent2");
+ setHWFailureForNode("parent2");
+
+ clock.advance(Duration.ofDays(5));
+ failedExpirer.run(); // Run twice because parent can only be parked after the child
+ failedExpirer.run();
+
+ assertNodeHostnames(Node.State.parked, "parent2", "node2", "node3");
+ }
+
+ @Test
+ public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() throws InterruptedException {
+ failureScenarioIn(SystemName.cd, Environment.prod, "docker");
- assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
- assertEquals("node2", nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());
+ failNode("parent1");
+ setHWFailureForNode("parent1");
+ clock.advance(Duration.ofDays(2));
+ failNode("node4");
+ failNode("node5");
+ clock.advance(Duration.ofDays(3));
+
+ failedExpirer.run(); // Run twice because parent can only be parked after the child
+ failedExpirer.run();
+
+ assertNodeHostnames(Node.State.failed, "parent1", "node4", "node5");
+ }
+
+ private void assertNodeHostnames(Node.State state, String... hostnames) {
+ assertEquals(Stream.of(hostnames).collect(Collectors.toSet()),
+ failedExpirer.nodeRepository().getNodes(state).stream().map(Node::hostname).collect(Collectors.toSet()));
}
- private NodeRepository failureScenarioIn(SystemName system, Environment environment, String flavorName) {
- ManualClock clock = new ManualClock();
+ private void setHWFailureForNode(String hostname) {
+        Node node = failedExpirer.nodeRepository().getNode(hostname).get();
+        node = node.with(node.status().withHardwareFailureDescription(Optional.of("memory_mcelog")));
+        failedExpirer.nodeRepository().write(node);
+ }
+
+ private void failNode(String hostname) {
+ failedExpirer.nodeRepository().fail(hostname, Agent.system, "Failing to unit test");
+ }
+
+ private void failureScenarioIn(SystemName system, Environment environment, String flavorName) {
NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default", flavorName);
NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, clock, Zone.defaultZone(),
new MockNameResolver().mockAnyLookup(),
new DockerImage("docker-registry.domain.tld:8080/dist/vespa"));
NodeRepositoryProvisioner provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, Zone.defaultZone(), clock, (x,y) -> {});
+ failedExpirer = new FailedExpirer(nodeRepository, new Zone(system, environment, RegionName.from("us-west-1")), clock, Duration.ofDays(4), new JobControl(nodeRepository.database()));
Flavor defaultFlavor = nodeFlavors.getFlavorOrThrow("default");
List<Node> hostNodes = new ArrayList<>(3);
@@ -107,20 +162,25 @@ public class FailedExpirerTest {
// Set node1 to have failed 4 times before
Node node1 = nodeRepository.getNode("node1").get();
- node1 = node1.with(node1.status().withIncreasedFailCount());
- node1 = node1.with(node1.status().withIncreasedFailCount());
- node1 = node1.with(node1.status().withIncreasedFailCount());
- node1 = node1.with(node1.status().withIncreasedFailCount());
+ node1 = node1.with(node1.status().setFailCount(4));
nodeRepository.write(node1);
// Set node2 to have a detected hardware failure
- Node node2 = nodeRepository.getNode("node2").get();
- node2 = node2.with(node2.status().withHardwareFailureDescription(Optional.of("memory_mcelog")));
- nodeRepository.write(node2);
+ setHWFailureForNode("node2");
+
+ // Set node3 to have failed 8 times before and have a HW failure
+ Node node3 = nodeRepository.getNode("node3").get();
+        node3 = node3.with(node3.status().setFailCount(8));
+ nodeRepository.write(node3);
+ setHWFailureForNode("node3");
// Allocate the nodes
List<Node> provisioned = nodeRepository.getNodes(NodeType.tenant, Node.State.provisioned);
nodeRepository.setReady(nodeRepository.setDirty(provisioned));
+ nodeRepository.addNodes(Arrays.asList(
+ nodeRepository.createNode("node4", "node4", parentHost1, flavor, NodeType.tenant),
+ nodeRepository.createNode("node5", "node5", parentHost1, flavor, NodeType.tenant)));
+
ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"), ApplicationName.from("bar"), InstanceName.from("fuz"));
ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"), Version.fromString("6.42"));
provisioner.prepare(applicationId, cluster, Capacity.fromNodeCount(3, flavorName), 1, null);
@@ -130,15 +190,7 @@ public class FailedExpirerTest {
assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
// Fail the nodes
- nodeRepository.fail("node1", Agent.system, "Failing to unit test");
- nodeRepository.fail("node2", Agent.system, "Failing to unit test");
- nodeRepository.fail("node3", Agent.system, "Failing to unit test");
+ nodes.forEach(node -> failNode(node.hostname()));
assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-
- // Failure times out
- clock.advance(Duration.ofDays(5));
- new FailedExpirer(nodeRepository, new Zone(system, environment, RegionName.from("us-west-1")), clock, Duration.ofDays(4), new JobControl(nodeRepository.database())).run();
-
- return nodeRepository;
}
}