author | Valerij Fredriksen <freva@users.noreply.github.com> | 2017-08-21 09:46:29 +0200
committer | GitHub <noreply@github.com> | 2017-08-21 09:46:29 +0200
commit | 3a78992a83949fd4d085badc6f2dc241afb5560b (patch)
tree | 9aa12d11d1f9f5e1590ac650c30d4a271bd6b8f8 /node-repository
parent | 9c911e14aa16f5165a46880ebe112e3a8162277a (diff)
parent | 868459925a76e9b1105831629841da8c50985a8b (diff)
Merge pull request #3154 from vespa-engine/freva/FailedExpired-park-nodes-with-hw-failure
FailedExpirer move to parked if HW failure
Diffstat (limited to 'node-repository')
2 files changed, 113 insertions(+), 40 deletions(-)
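In short, the patch below changes FailedExpirer.expire() to route each expired failed node to one of three outcomes: parked (known hardware failure; docker hosts wait until all their children are parked), dirty (recyclable), or remaining in failed. A minimal, self-contained sketch of that decision flow follows; the Node record, State enum, decide helper and failCountIndicatesHwFail flag are simplified stand-ins for illustration, not the real node-repository API:

```java
import java.util.List;
import java.util.stream.Collectors;

public class FailedExpiryDecisionSketch {

    enum State { failed, parked, dirty }

    // Simplified stand-in for com.yahoo.vespa.hosted.provision.Node
    record Node(String hostname, String parentHostname, boolean hwFailure, int failCount, State state) {}

    // Mirrors the shape of the patched expire() loop: park on hardware failure
    // (a host only once every child is parked), recycle to dirty when the fail
    // count proves nothing, otherwise leave the node in failed.
    static State decide(Node node, List<Node> children, boolean failCountIndicatesHwFail) {
        if (node.hwFailure()) {
            List<String> nonParkedChildren = children.stream()
                    .filter(child -> child.state() != State.parked)
                    .map(Node::hostname)
                    .collect(Collectors.toList());
            // A host with unparked children stays failed until a later run
            return nonParkedChildren.isEmpty() ? State.parked : State.failed;
        }
        if (!failCountIndicatesHwFail || node.failCount() < 5) return State.dirty;
        return State.failed; // five or more failures in prod suggest bad hardware
    }

    public static void main(String[] args) {
        Node child = new Node("node2", "parent1", true, 0, State.failed);
        Node host = new Node("parent1", null, true, 0, State.failed);

        // First expirer pass: the child is parked, the host is not (child not yet parked)
        System.out.println(decide(child, List.of(), true));           // parked
        System.out.println(decide(host, List.of(child), true));       // failed

        // Second pass: the child is now parked, so the host can be parked too
        Node parkedChild = new Node("node2", "parent1", true, 0, State.parked);
        System.out.println(decide(host, List.of(parkedChild), true)); // parked
    }
}
```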
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index d61f06be744..a86ba955a9a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -3,24 +3,31 @@ package com.yahoo.vespa.hosted.provision.maintenance;
 
 import com.yahoo.config.provision.Environment;
 import com.yahoo.config.provision.Flavor;
+import com.yahoo.config.provision.NodeType;
 import com.yahoo.config.provision.SystemName;
 import com.yahoo.config.provision.Zone;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Agent;
 import com.yahoo.vespa.hosted.provision.node.History;
 
 import java.time.Clock;
 import java.time.Duration;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
 
 /**
- * This moves nodes from failed back to dirty if
+ * This moves expired failed nodes:
  * <ul>
- * <li>No hardware failure is known to be detected on the node
- * <li>The node has failed less than 5 times OR the environment is dev, test or perf OR system is CI or CD,
+ * <li>To parked: If the node has known hardware failure, docker hosts are moved to parked only when all its
+ * children are already in parked
+ * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf OR system is CD,
  * as those environments have no protection against users running bogus applications, so
  * we cannot use the node failure count to conclude the node has a failure.
+ * <li>Otherwise the node will remain in failed
  * </ul>
  * Failed nodes are typically given a long expiry time to enable us to manually moved them back to
  * active to recover data in cases where the node was failed accidentally.
@@ -36,6 +43,7 @@ import java.util.List;
  */
 public class FailedExpirer extends Expirer {
 
+    private static final Logger log = Logger.getLogger(NodeRetirer.class.getName());
     private final NodeRepository nodeRepository;
     private final Zone zone;
 
@@ -50,9 +58,22 @@ public class FailedExpirer extends Expirer {
     protected void expire(List<Node> expired) {
         List<Node> nodesToRecycle = new ArrayList<>();
         for (Node recycleCandidate : expired) {
-            if (recycleCandidate.status().hardwareFailureDescription().isPresent()) continue;
-            if (failCountIndicatesHwFail(zone, recycleCandidate) && recycleCandidate.status().failCount() >= 5) continue;
-            nodesToRecycle.add(recycleCandidate);
+            if (recycleCandidate.status().hardwareFailureDescription().isPresent()) {
+                List<String> nonParkedChildren = recycleCandidate.type() != NodeType.host ? Collections.emptyList() :
+                        nodeRepository.getChildNodes(recycleCandidate.hostname()).stream()
+                                .filter(node -> node.state() != Node.State.parked)
+                                .map(Node::hostname)
+                                .collect(Collectors.toList());
+
+                if (nonParkedChildren.isEmpty()) {
+                    nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure on node");
+                } else {
+                    log.info(String.format("Expired failed node %s with HW fail is not parked because some of its children" +
+                            " (%s) are not yet parked", recycleCandidate.hostname(), String.join(", ", nonParkedChildren)));
+                }
+            } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) {
+                nodesToRecycle.add(recycleCandidate);
+            }
         }
         nodeRepository.setDirty(nodesToRecycle);
     }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 028282bea13..3ccacb3ff02 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -32,8 +32,11 @@ import org.junit.Test;
 
 import java.time.Duration;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Optional;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import static org.junit.Assert.assertEquals;
 
@@ -43,50 +46,102 @@ import static org.junit.Assert.assertEquals;
 public class FailedExpirerTest {
 
     private final Curator curator = new MockCurator();
+    private final ManualClock clock = new ManualClock();
+    private FailedExpirer failedExpirer;
 
     @Test
     public void ensure_failed_nodes_are_deallocated_in_prod() throws InterruptedException {
-        NodeRepository nodeRepository = failureScenarioIn(SystemName.main, Environment.prod, "default");
+        failureScenarioIn(SystemName.main, Environment.prod, "default");
+        clock.advance(Duration.ofDays(5));
+        failedExpirer.run();
 
-        assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-        assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
-        assertEquals("node3", nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).get(0).hostname());
+        assertNodeHostnames(Node.State.failed, "node1");
+        assertNodeHostnames(Node.State.parked, "node2", "node3");
     }
 
     @Test
     public void ensure_failed_nodes_are_deallocated_in_dev() throws InterruptedException {
-        NodeRepository nodeRepository = failureScenarioIn(SystemName.main, Environment.dev, "default");
+        failureScenarioIn(SystemName.main, Environment.dev, "default");
+        clock.advance(Duration.ofDays(5));
+        failedExpirer.run();
 
-        assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-        assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
-        assertEquals("node2", nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());
+        assertNodeHostnames(Node.State.parked, "node2", "node3");
+        assertNodeHostnames(Node.State.dirty, "node1");
    }
 
     @Test
     public void ensure_failed_nodes_are_deallocated_in_cd() throws InterruptedException {
-        NodeRepository nodeRepository = failureScenarioIn(SystemName.cd, Environment.prod, "default");
+        failureScenarioIn(SystemName.cd, Environment.prod, "default");
+        clock.advance(Duration.ofDays(5));
+        failedExpirer.run();
 
-        assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-        assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
-        assertEquals("node2", nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());
+        assertNodeHostnames(Node.State.parked, "node2", "node3");
+        assertNodeHostnames(Node.State.dirty, "node1");
     }
 
     @Test
     public void ensure_failed_docker_nodes_are_deallocated() throws InterruptedException {
-        NodeRepository nodeRepository = failureScenarioIn(SystemName.main, Environment.prod, "docker");
+        failureScenarioIn(SystemName.main, Environment.prod, "docker");
+        clock.advance(Duration.ofDays(5));
+        failedExpirer.run();
+
+        assertNodeHostnames(Node.State.parked, "node2", "node3");
+        assertNodeHostnames(Node.State.dirty, "node1");
+    }
+
+    @Test
+    public void ensure_parked_docker_host() throws InterruptedException {
+        failureScenarioIn(SystemName.main, Environment.prod, "docker");
+
+        failNode("parent2");
+        setHWFailureForNode("parent2");
+
+        clock.advance(Duration.ofDays(5));
+        failedExpirer.run(); // Run twice because parent can only be parked after the child
+        failedExpirer.run();
+
+        assertNodeHostnames(Node.State.parked, "parent2", "node2", "node3");
+    }
+
+    @Test
+    public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() throws InterruptedException {
+        failureScenarioIn(SystemName.cd, Environment.prod, "docker");
 
-        assertEquals(1, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-        assertEquals(2, nodeRepository.getNodes(NodeType.tenant, Node.State.dirty).size());
-        assertEquals("node2", nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());
+        failNode("parent1");
+        setHWFailureForNode("parent1");
+        clock.advance(Duration.ofDays(2));
+        failNode("node4");
+        failNode("node5");
+        clock.advance(Duration.ofDays(3));
+
+        failedExpirer.run(); // Run twice because parent can only be parked after the child
+        failedExpirer.run();
+
+        assertNodeHostnames(Node.State.failed, "parent1", "node4", "node5");
+    }
+
+    private void assertNodeHostnames(Node.State state, String... hostnames) {
+        assertEquals(Stream.of(hostnames).collect(Collectors.toSet()),
+                failedExpirer.nodeRepository().getNodes(state).stream().map(Node::hostname).collect(Collectors.toSet()));
     }
 
-    private NodeRepository failureScenarioIn(SystemName system, Environment environment, String flavorName) {
-        ManualClock clock = new ManualClock();
+    private void setHWFailureForNode(String hostname) {
+        Node node2 = failedExpirer.nodeRepository().getNode(hostname).get();
+        node2 = node2.with(node2.status().withHardwareFailureDescription(Optional.of("memory_mcelog")));
+        failedExpirer.nodeRepository().write(node2);
+    }
+
+    private void failNode(String hostname) {
+        failedExpirer.nodeRepository().fail(hostname, Agent.system, "Failing to unit test");
+    }
+
+    private void failureScenarioIn(SystemName system, Environment environment, String flavorName) {
         NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default", flavorName);
         NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, clock, Zone.defaultZone(),
                 new MockNameResolver().mockAnyLookup(),
                 new DockerImage("docker-registry.domain.tld:8080/dist/vespa"));
         NodeRepositoryProvisioner provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, Zone.defaultZone(), clock, (x,y) -> {});
+        failedExpirer = new FailedExpirer(nodeRepository, new Zone(system, environment, RegionName.from("us-west-1")), clock,
+                Duration.ofDays(4), new JobControl(nodeRepository.database()));
 
         Flavor defaultFlavor = nodeFlavors.getFlavorOrThrow("default");
         List<Node> hostNodes = new ArrayList<>(3);
@@ -107,20 +162,25 @@
 
         // Set node1 to have failed 4 times before
         Node node1 = nodeRepository.getNode("node1").get();
-        node1 = node1.with(node1.status().withIncreasedFailCount());
-        node1 = node1.with(node1.status().withIncreasedFailCount());
-        node1 = node1.with(node1.status().withIncreasedFailCount());
-        node1 = node1.with(node1.status().withIncreasedFailCount());
+        node1 = node1.with(node1.status().setFailCount(4));
         nodeRepository.write(node1);
 
         // Set node2 to have a detected hardware failure
-        Node node2 = nodeRepository.getNode("node2").get();
-        node2 = node2.with(node2.status().withHardwareFailureDescription(Optional.of("memory_mcelog")));
-        nodeRepository.write(node2);
+        setHWFailureForNode("node2");
+
+        // Set node3 to have failed 8 times before and have a HW failure
+        Node node3 = nodeRepository.getNode("node3").get();
+        node3 = node1.with(node3.status().setFailCount(8));
+        nodeRepository.write(node3);
+        setHWFailureForNode("node3");
 
         // Allocate the nodes
         List<Node> provisioned = nodeRepository.getNodes(NodeType.tenant, Node.State.provisioned);
         nodeRepository.setReady(nodeRepository.setDirty(provisioned));
+        nodeRepository.addNodes(Arrays.asList(
+                nodeRepository.createNode("node4", "node4", parentHost1, flavor, NodeType.tenant),
+                nodeRepository.createNode("node5", "node5", parentHost1, flavor, NodeType.tenant)));
+
         ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"), ApplicationName.from("bar"), InstanceName.from("fuz"));
         ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"), Version.fromString("6.42"));
         provisioner.prepare(applicationId, cluster, Capacity.fromNodeCount(3, flavorName), 1, null);
@@ -130,15 +190,7 @@
         assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
 
         // Fail the nodes
-        nodeRepository.fail("node1", Agent.system, "Failing to unit test");
-        nodeRepository.fail("node2", Agent.system, "Failing to unit test");
-        nodeRepository.fail("node3", Agent.system, "Failing to unit test");
+        nodes.forEach(node -> failNode(node.hostname()));
         assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-
-        // Failure times out
-        clock.advance(Duration.ofDays(5));
-        new FailedExpirer(nodeRepository, new Zone(system, environment, RegionName.from("us-west-1")), clock, Duration.ofDays(4), new JobControl(nodeRepository.database())).run();
-
-        return nodeRepository;
     }
 }
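A note on the two-pass behavior the tests document with "Run twice because parent can only be parked after the child": one expirer run parks the HW-failed children, and only a later run, finding no unparked children left, parks the docker host itself. Until then the host simply remains in failed, which is what ensure_failed_docker_host_is_not_parked_unless_all_children_are asserts: parent1 has a hardware failure and has expired, but node4 and node5 were failed only three days before the runs and have not yet expired, so they still block parent1 from being parked across both runs.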