Diffstat (limited to 'node-repository')
3 files changed, 313 insertions, 156 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 1e0202d4735..031d56e3164 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -1,6 +1,7 @@
 // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.hosted.provision.maintenance;
 
+import com.yahoo.config.provision.ClusterSpec;
 import com.yahoo.config.provision.Environment;
 import com.yahoo.config.provision.Flavor;
 import com.yahoo.config.provision.NodeType;
@@ -21,65 +22,108 @@ import java.util.stream.Collectors;
 
 /**
  * This moves expired failed nodes:
  * <ul>
- * <li>To parked: If the node has known hardware failure, docker hosts are moved to parked only when all its
+ * <li>To parked: If the node has a known hardware failure, Docker hosts are moved to parked only when all of their
  *     children are already in parked
- * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf OR system is CD,
- *     as those environments have no protection against users running bogus applications, so
+ * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf.
+ *     Those environments have no protection against users running bogus applications, so
  *     we cannot use the node failure count to conclude the node has a failure.
  * <li>Otherwise the node will remain in failed
  * </ul>
- * Failed nodes are typically given a long expiry time to enable us to manually moved them back to
+ * Failed content nodes are given a long expiry time to enable us to manually move them back to
  * active to recover data in cases where the node was failed accidentally.
  * <p>
+ * Failed container (Vespa, not Docker) nodes are expired early as there's no data to potentially recover.
+ * </p>
+ * <p>
  * The purpose of the automatic recycling to dirty + fail count is that nodes which were moved
  * to failed due to some undetected hardware failure will end up being failed again.
  * When that has happened enough they will not be recycled.
  * <p>
- * The Chef recipe running locally on the node may set the hardwareFailureDescription to avoid the node
+ * The Chef recipe running locally on the node may set hardwareFailureDescription to avoid the node
 * being automatically recycled in cases where an error has been positively detected.
 *
 * @author bratseth
+ * @author mpolden
 */
-public class FailedExpirer extends Expirer {
+public class FailedExpirer extends Maintainer {
 
     private static final Logger log = Logger.getLogger(NodeRetirer.class.getName());
+
+    private static final Duration defaultExpiry = Duration.ofDays(4); // Grace period to allow recovery of data
+    private static final Duration containerExpiry = Duration.ofHours(1); // Stateless nodes, no data to recover
+    private static final int maxAllowedFailures = 5; // Stop recycling nodes after this number of failures
+
     private final NodeRepository nodeRepository;
     private final Zone zone;
+    private final Clock clock;
 
-    public FailedExpirer(NodeRepository nodeRepository, Zone zone, Clock clock,
-                         Duration failTimeout, JobControl jobControl) {
-        super(Node.State.failed, History.Event.Type.failed, nodeRepository, clock, failTimeout, jobControl);
+    public FailedExpirer(NodeRepository nodeRepository, Zone zone, Clock clock, Duration interval,
+                         JobControl jobControl) {
+        super(nodeRepository, interval, jobControl);
         this.nodeRepository = nodeRepository;
         this.zone = zone;
+        this.clock = clock;
     }
 
     @Override
-    protected void expire(List<Node> expired) {
+    protected void maintain() {
+        List<Node> containerNodes = getExpiredNodes(containerExpiry)
+                .stream()
+                .filter(node -> node.allocation().isPresent() &&
+                                node.allocation().get().membership().cluster().type() == ClusterSpec.Type.container)
+                .collect(Collectors.toList());
+        List<Node> remainingNodes = getExpiredNodes(defaultExpiry);
+        remainingNodes.removeAll(containerNodes);
+        recycle(containerNodes);
+        recycle(remainingNodes);
+    }
+
+    /** Get failed nodes that have expired according to the given expiry */
+    private List<Node> getExpiredNodes(Duration expiry) {
+        return nodeRepository.getNodes(Node.State.failed).stream()
+                .filter(node -> node.history().event(History.Event.Type.failed)
+                        .map(event -> event.at().plus(expiry).isBefore(clock.instant()))
+                        .orElse(false))
+                .collect(Collectors.toList());
+    }
+
+    /** Move eligible nodes to dirty. This may be a subset of the given nodes */
+    private void recycle(List<Node> nodes) {
         List<Node> nodesToRecycle = new ArrayList<>();
-        for (Node recycleCandidate : expired) {
-            if (recycleCandidate.status().hardwareFailureDescription().isPresent() || recycleCandidate.status().hardwareDivergence().isPresent()) {
-                List<String> nonParkedChildren = recycleCandidate.type() != NodeType.host ? Collections.emptyList() :
-                        nodeRepository.getChildNodes(recycleCandidate.hostname()).stream()
+        for (Node candidate : nodes) {
+            if (hasHardwareIssue(candidate)) {
+                List<String> unparkedChildren = candidate.type() != NodeType.host ? Collections.emptyList() :
+                        nodeRepository.getChildNodes(candidate.hostname()).stream()
                                 .filter(node -> node.state() != Node.State.parked)
                                 .map(Node::hostname)
                                 .collect(Collectors.toList());
-                if (nonParkedChildren.isEmpty()) {
-                    nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure/divergence on node");
+                if (unparkedChildren.isEmpty()) {
+                    nodeRepository.park(candidate.hostname(), Agent.system,
+                                        "Parked by FailedExpirer due to hardware issue");
                 } else {
-                    log.info(String.format("Expired failed node %s with HW failure/divergence is not parked because some of its children" +
-                                           " (%s) are not yet parked", recycleCandidate.hostname(), String.join(", ", nonParkedChildren)));
+                    log.info(String.format("Expired failed node %s with hardware issue was not parked because of " +
                                           "unparked children: %s", candidate.hostname(),
+                                           String.join(", ", unparkedChildren)));
                 }
-            } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) {
-                nodesToRecycle.add(recycleCandidate);
+            } else if (!failCountIndicatesHardwareIssue(candidate)) {
+                nodesToRecycle.add(candidate);
             }
         }
         nodeRepository.setDirty(nodesToRecycle);
     }
 
-    private boolean failCountIndicatesHwFail(Zone zone, Node node) {
+    /** Returns whether the current node fail count should be used as an indicator of a hardware issue */
+    private boolean failCountIndicatesHardwareIssue(Node node) {
         if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER) return false;
-        return zone.environment() == Environment.prod || zone.environment() == Environment.staging;
+        return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) &&
+               node.status().failCount() >= maxAllowedFailures;
+    }
+
+    /** Returns whether the node has any kind of hardware issue */
+    private static boolean hasHardwareIssue(Node node) {
+        return node.status().hardwareFailureDescription().isPresent() ||
+               node.status().hardwareDivergence().isPresent();
     }
 }
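Editor's note: the expiry rule introduced above is two-tier: container nodes expire after one hour, everything else after four days. The following is a minimal, self-contained sketch of just that decision; FailedNode, isContainer and failedAt are hypothetical stand-ins introduced for illustration (the real expirer reads the cluster type from the node's allocation and the timestamp from its failed History event).

    import java.time.Clock;
    import java.time.Duration;
    import java.time.Instant;

    // Toy model of the two-tier expiry rule sketched from the diff above.
    // FailedNode is NOT a node-repository type; it only carries the two
    // facts the decision needs.
    class ExpirySketch {

        static final Duration DEFAULT_EXPIRY = Duration.ofDays(4);    // time to recover data
        static final Duration CONTAINER_EXPIRY = Duration.ofHours(1); // stateless, nothing to recover

        static class FailedNode {
            final boolean isContainer; // allocated to a container (stateless) cluster?
            final Instant failedAt;    // timestamp of the node's failed event
            FailedNode(boolean isContainer, Instant failedAt) {
                this.isContainer = isContainer;
                this.failedAt = failedAt;
            }
        }

        /** Expired when the failed event is strictly older than the applicable expiry */
        static boolean hasExpired(FailedNode node, Clock clock) {
            Duration expiry = node.isContainer ? CONTAINER_EXPIRY : DEFAULT_EXPIRY;
            return node.failedAt.plus(expiry).isBefore(clock.instant());
        }
    }

Splitting expired nodes this way lets stateless container nodes cycle back toward the ready pool quickly, while content nodes keep the long window for manual data recovery.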
durationFromEnv("provisioned_expiry").orElse(defaults.provisionedExpiry), jobControl); nodeRebooter = new NodeRebooter(nodeRepository, clock, durationFromEnv("reboot_interval").orElse(defaults.rebootInterval), jobControl); @@ -134,7 +134,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration reservationExpiry; private final Duration inactiveExpiry; private final Duration retiredExpiry; - private final Duration failedExpiry; + private final Duration failedExpirerInterval; private final Duration dirtyExpiry; private final Duration provisionedExpiry; private final Duration rebootInterval; @@ -156,7 +156,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy retiredExpiry = Duration.ofDays(4); // enough time to migrate data retiredEarlyInterval = Duration.ofMinutes(29); - failedExpiry = Duration.ofDays(4); // enough time to recover data even if it happens friday night + failedExpirerInterval = Duration.ofMinutes(10); dirtyExpiry = Duration.ofHours(2); // enough time to clean the node provisionedExpiry = Duration.ofHours(4); rebootInterval = Duration.ofDays(30); @@ -174,7 +174,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { inactiveExpiry = Duration.ofSeconds(2); // support interactive wipe start over retiredExpiry = Duration.ofMinutes(1); retiredEarlyInterval = Duration.ofMinutes(5); - failedExpiry = Duration.ofMinutes(10); + failedExpirerInterval = Duration.ofMinutes(10); dirtyExpiry = Duration.ofMinutes(30); provisionedExpiry = Duration.ofHours(4); rebootInterval = Duration.ofDays(30); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java index 51991a844d7..720c5b05443 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java @@ -9,6 +9,7 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.DockerImage; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.Flavor; +import com.yahoo.config.provision.HostSpec; import com.yahoo.config.provision.InstanceName; import com.yahoo.config.provision.NodeFlavors; import com.yahoo.config.provision.NodeType; @@ -18,7 +19,6 @@ import com.yahoo.config.provision.TenantName; import com.yahoo.config.provision.Zone; import com.yahoo.test.ManualClock; import com.yahoo.transaction.NestedTransaction; -import com.yahoo.vespa.curator.Curator; import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.Node; @@ -26,15 +26,15 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; -import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; import org.junit.Test; import java.time.Duration; -import java.util.ArrayList; -import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Optional; +import java.util.Set; 
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 51991a844d7..720c5b05443 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -9,6 +9,7 @@ import com.yahoo.config.provision.ClusterSpec;
 import com.yahoo.config.provision.DockerImage;
 import com.yahoo.config.provision.Environment;
 import com.yahoo.config.provision.Flavor;
+import com.yahoo.config.provision.HostSpec;
 import com.yahoo.config.provision.InstanceName;
 import com.yahoo.config.provision.NodeFlavors;
 import com.yahoo.config.provision.NodeType;
@@ -18,7 +19,6 @@ import com.yahoo.config.provision.TenantName;
 import com.yahoo.config.provision.Zone;
 import com.yahoo.test.ManualClock;
 import com.yahoo.transaction.NestedTransaction;
-import com.yahoo.vespa.curator.Curator;
 import com.yahoo.vespa.curator.mock.MockCurator;
 import com.yahoo.vespa.curator.transaction.CuratorTransaction;
 import com.yahoo.vespa.hosted.provision.Node;
@@ -26,15 +26,15 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.node.Agent;
 import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
 import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner;
-import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
 import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver;
 import org.junit.Test;
 
 import java.time.Duration;
-import java.util.ArrayList;
-import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -42,155 +42,268 @@ import static org.junit.Assert.assertEquals;
 
 /**
  * @author bratseth
+ * @author mpolden
 */
 public class FailedExpirerTest {
 
-    private final Curator curator = new MockCurator();
-    private final ManualClock clock = new ManualClock();
-    private FailedExpirer failedExpirer;
-
     @Test
-    public void ensure_failed_nodes_are_deallocated_in_prod() throws InterruptedException {
-        failureScenarioIn(SystemName.main, Environment.prod, "default");
-        clock.advance(Duration.ofDays(5));
-        failedExpirer.run();
-
-        assertNodeHostnames(Node.State.failed, "node1");
-        assertNodeHostnames(Node.State.parked, "node2", "node3");
+    public void ensure_failed_nodes_are_deallocated_in_prod() {
+        FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+                .withNode("node1")
+                .withNode("node2")
+                .withNode("node3")
+                .setReady("node1", "node2", "node3")
+                .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+                .failNode(4, "node1")
+                .failWithHardwareFailure("node2", "node3");
+
+        scenario.clock().advance(Duration.ofDays(3));
+        scenario.expirer().run();
+        scenario.assertNodesIn(Node.State.failed, "node1", "node2", "node3"); // None moved yet
+
+        scenario.clock().advance(Duration.ofDays(2));
+        scenario.expirer().run();
+        scenario.assertNodesIn(Node.State.failed, "node1");
+        scenario.assertNodesIn(Node.State.parked, "node2", "node3");
     }
 
     @Test
-    public void ensure_failed_nodes_are_deallocated_in_dev() throws InterruptedException {
-        failureScenarioIn(SystemName.main, Environment.dev, "default");
-        clock.advance(Duration.ofDays(5));
-        failedExpirer.run();
-
-        assertNodeHostnames(Node.State.parked, "node2", "node3");
-        assertNodeHostnames(Node.State.dirty, "node1");
+    public void ensure_failed_nodes_are_deallocated_in_dev() {
+        FailureScenario scenario = new FailureScenario(SystemName.main, Environment.dev)
+                .withNode("node1")
+                .withNode("node2")
+                .withNode("node3")
+                .setReady("node1", "node2", "node3")
+                .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+                .failNode(4, "node1")
+                .failWithHardwareFailure("node2", "node3");
+
+        scenario.clock().advance(Duration.ofDays(5));
+        scenario.expirer().run();
+
+        scenario.assertNodesIn(Node.State.parked, "node2", "node3");
+        scenario.assertNodesIn(Node.State.dirty, "node1");
     }
 
     @Test
-    public void ensure_failed_nodes_are_deallocated_in_cd() throws InterruptedException {
-        failureScenarioIn(SystemName.cd, Environment.prod, "default");
-        clock.advance(Duration.ofDays(5));
-        failedExpirer.run();
-
-        assertNodeHostnames(Node.State.failed, "node1");
-        assertNodeHostnames(Node.State.parked, "node2", "node3");
+    public void ensure_failed_nodes_are_deallocated_in_cd() {
+        FailureScenario scenario = new FailureScenario(SystemName.cd, Environment.prod)
+                .withNode("node1")
+                .withNode("node2")
+                .withNode("node3")
+                .setReady("node1", "node2", "node3")
+                .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+                .failNode(4, "node1")
+                .failWithHardwareFailure("node2", "node3");
+
+        scenario.clock().advance(Duration.ofDays(5));
+        scenario.expirer().run();
+
+        scenario.assertNodesIn(Node.State.failed, "node1");
+        scenario.assertNodesIn(Node.State.parked, "node2", "node3");
     }
 
     @Test
-    public void ensure_failed_docker_nodes_are_deallocated() throws InterruptedException {
-        failureScenarioIn(SystemName.main, Environment.prod, "docker");
-        clock.advance(Duration.ofDays(5));
-        failedExpirer.run();
-
-        assertNodeHostnames(Node.State.parked, "node2", "node3");
-        assertNodeHostnames(Node.State.dirty, "node1");
+    public void ensure_failed_docker_nodes_are_deallocated() {
+        FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3")
+                .setReady("node1", "node2", "node3")
+                .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
+                .failNode(4, "node1")
+                .failWithHardwareFailure("node2", "node3");
+
+        scenario.clock().advance(Duration.ofDays(5));
+        scenario.expirer().run();
+
+        scenario.assertNodesIn(Node.State.parked, "node2", "node3");
+        scenario.assertNodesIn(Node.State.dirty, "node1");
     }
 
     @Test
-    public void ensure_parked_docker_host() throws InterruptedException {
-        failureScenarioIn(SystemName.main, Environment.prod, "docker");
-
-        failNode("parent2");
-        setHWFailureForNode("parent2");
-
-        clock.advance(Duration.ofDays(5));
-        failedExpirer.run(); // Run twice because parent can only be parked after the child
-        failedExpirer.run();
-
-        assertNodeHostnames(Node.State.parked, "parent2", "node2", "node3");
+    public void ensure_parked_docker_host() {
+        FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3")
+                .setReady("node1", "node2", "node3")
+                .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
+                .failNode(8, "node3")
+                .failWithHardwareFailure("node2", "node3")
+                .failWithHardwareFailure("parent2");
+
+        scenario.clock().advance(Duration.ofDays(5));
+        scenario.expirer().run(); // Run twice because parent can only be parked after the child
+        scenario.expirer().run();
+        scenario.assertNodesIn(Node.State.parked, "parent2", "node2", "node3");
     }
 
     @Test
-    public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() throws InterruptedException {
-        failureScenarioIn(SystemName.cd, Environment.prod, "docker");
-
-        failNode("parent1");
-        setHWFailureForNode("parent1");
-        clock.advance(Duration.ofDays(2));
-        failNode("node4");
-        failNode("node5");
-        clock.advance(Duration.ofDays(3));
-
-        failedExpirer.run(); // Run twice because parent can only be parked after the child
-        failedExpirer.run();
-
-        assertNodeHostnames(Node.State.failed, "parent1", "node4", "node5");
+    public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() {
+        FailureScenario scenario = new FailureScenario(SystemName.cd, Environment.prod)
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
+                .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node4", "parent1")
+                .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node5", "parent1")
+                .setReady("node1", "node2", "node3")
+                .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
+                .failWithHardwareFailure("parent1");
+
+        scenario.clock().advance(Duration.ofDays(2));
+        scenario.failNode(1, "node4", "node5");
+        scenario.clock().advance(Duration.ofDays(3));
+
+        scenario.expirer().run(); // Run twice because parent can only be parked after the child
+        scenario.expirer().run();
+
+        scenario.assertNodesIn(Node.State.failed, "parent1", "node4", "node5");
     }
 
-    private void assertNodeHostnames(Node.State state, String... hostnames) {
-        assertEquals(Stream.of(hostnames).collect(Collectors.toSet()),
-                     failedExpirer.nodeRepository().getNodes(state).stream().map(Node::hostname).collect(Collectors.toSet()));
-    }
-
-    private void setHWFailureForNode(String hostname) {
-        Node node2 = failedExpirer.nodeRepository().getNode(hostname).get();
-        node2 = node2.with(node2.status().withHardwareFailureDescription(Optional.of("memory_mcelog")));
-        failedExpirer.nodeRepository().write(node2);
+    @Test
+    public void ensure_container_nodes_are_recycled_early() {
+        FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+                .withNode("node1")
+                .withNode("node2")
+                .withNode("node3")
+                .withNode("node4")
+                .withNode("node5")
+                .withNode("node6")
+                .setReady("node1", "node2", "node3", "node4", "node5", "node6")
+                .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+                .allocate(ClusterSpec.Type.container, "node4", "node5", "node6");
+
+        // Vespa container fails
+        scenario.failNode(1, "node4");
+
+        // 30 minutes pass, nothing happens
+        scenario.clock().advance(Duration.ofMinutes(30));
+        scenario.expirer().run();
+        scenario.assertNodesIn(Node.State.dirty);
+
+        // Recycles container when more than 1 hour passes
+        scenario.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+        scenario.expirer().run();
+        scenario.assertNodesIn(Node.State.dirty, "node4");
     }
 
-    private void failNode(String hostname) {
-        failedExpirer.nodeRepository().fail(hostname, Agent.system, "Failing to unit test");
+    private static class FailureScenario {
+
+        private static final NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default", "docker");
+        public static final Flavor defaultFlavor = nodeFlavors.getFlavorOrThrow("default");
+        public static final Flavor dockerFlavor = nodeFlavors.getFlavorOrThrow("docker");
+
+        private final MockCurator curator = new MockCurator();
+        private final ManualClock clock = new ManualClock();
+        private final ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"),
+                                                                       ApplicationName.from("bar"),
+                                                                       InstanceName.from("default"));
+
+        private final NodeRepository nodeRepository;
+        private final NodeRepositoryProvisioner provisioner;
+        private final FailedExpirer expirer;
+
+        public FailureScenario(SystemName system, Environment environment) {
+            Zone zone = new Zone(system, environment, RegionName.defaultName());
+            this.nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone,
+                                                     new MockNameResolver().mockAnyLookup(),
+                                                     new DockerImage("docker-image"));
+            this.provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, Zone.defaultZone(), clock,
+                                                             (x, y) -> {});
+            this.expirer = new FailedExpirer(nodeRepository, zone, clock, Duration.ofMinutes(30),
+                                             new JobControl(nodeRepository.database()));
+        }
+
+        public ManualClock clock() {
+            return clock;
+        }
+
+        public FailedExpirer expirer() {
+            return expirer;
+        }
+
+        public Node get(String hostname) {
+            return nodeRepository.getNode(hostname)
+                                 .orElseThrow(() -> new IllegalArgumentException("No such node: " + hostname));
+        }
+
+        public FailureScenario withNode(NodeType type, Flavor flavor, String hostname, String parentHostname) {
+            nodeRepository.addNodes(Collections.singletonList(
+                    nodeRepository.createNode(UUID.randomUUID().toString(), hostname,
+                                              Optional.ofNullable(parentHostname), flavor, type)
+            ));
+            return this;
+        }
+
+        public FailureScenario withNode(NodeType type, Flavor flavor, String hostname) {
+            return withNode(type, flavor, hostname, null);
+        }
+
+        public FailureScenario withNode(String hostname) {
+            return withNode(NodeType.tenant, defaultFlavor, hostname, null);
+        }
+
+        public FailureScenario failNode(int times, String... hostname) {
+            Stream.of(hostname).forEach(h -> {
+                Node node = get(h);
+                nodeRepository.write(node.with(node.status().setFailCount(times)));
+                nodeRepository.fail(h, Agent.system, "Failed by unit test");
+            });
+            return this;
+        }
+
+        public FailureScenario failWithHardwareFailure(String... hostname) {
+            Stream.of(hostname).forEach(h -> {
+                Node node = get(h);
+                nodeRepository.write(node.with(node.status().withHardwareFailureDescription(
+                        Optional.of("memory_mcelog"))));
+                nodeRepository.fail(h, Agent.system, "Failed by unit test");
+            });
+            return this;
+        }
+
+        public FailureScenario setReady(String... hostname) {
+            List<Node> nodes = Stream.of(hostname)
+                                     .map(this::get)
+                                     .collect(Collectors.toList());
+            nodeRepository.setReady(nodeRepository.setDirty(nodes));
+            return this;
+        }
+
+        public FailureScenario allocate(ClusterSpec.Type clusterType, String... hostname) {
+            return allocate(clusterType, defaultFlavor, hostname);
+        }
+
+        public FailureScenario allocate(ClusterSpec.Type clusterType, Flavor flavor, String... hostname) {
+            Set<HostSpec> hosts = Stream.of(hostname)
+                                        .map(h -> new HostSpec(h, Optional.empty()))
+                                        .collect(Collectors.toSet());
+            ClusterSpec clusterSpec = ClusterSpec.request(clusterType, ClusterSpec.Id.from("test"),
+                                                          Version.fromString("6.42"));
+            provisioner.prepare(applicationId, clusterSpec, Capacity.fromNodeCount(hostname.length, flavor.name()),
+                                1, null);
+            NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(curator));
+            provisioner.activate(transaction, applicationId, hosts);
+            transaction.commit();
+            return this;
+        }
+
+        public void assertNodesIn(Node.State state, String... hostnames) {
+            assertEquals(Stream.of(hostnames).collect(Collectors.toSet()),
+                         nodeRepository.getNodes(state).stream()
+                                       .map(Node::hostname)
+                                       .collect(Collectors.toSet()));
+        }
     }
 
-    private void failureScenarioIn(SystemName system, Environment environment, String flavorName) {
-        NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default", flavorName);
-        NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, clock, Zone.defaultZone(),
-                                                           new MockNameResolver().mockAnyLookup(),
-                                                           new DockerImage("docker-registry.domain.tld:8080/dist/vespa"));
-        NodeRepositoryProvisioner provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, Zone.defaultZone(), clock, (x,y) -> {});
-        failedExpirer = new FailedExpirer(nodeRepository, new Zone(system, environment, RegionName.from("us-west-1")), clock, Duration.ofDays(4), new JobControl(nodeRepository.database()));
-
-        Flavor defaultFlavor = nodeFlavors.getFlavorOrThrow("default");
-        List<Node> hostNodes = new ArrayList<>(3);
-        hostNodes.add(nodeRepository.createNode("parent1", "parent1", Optional.empty(), defaultFlavor, NodeType.host));
-        hostNodes.add(nodeRepository.createNode("parent2", "parent2", Optional.empty(), defaultFlavor, NodeType.host));
-        hostNodes.add(nodeRepository.createNode("parent3", "parent3", Optional.empty(), defaultFlavor, NodeType.host));
-        nodeRepository.addNodes(hostNodes);
-
-        Flavor flavor = nodeFlavors.getFlavorOrThrow(flavorName);
-        List<Node> nodes = new ArrayList<>(3);
-        Optional<String> parentHost1 = flavorName.equals("docker") ? Optional.of("parent1") : Optional.empty();
-        Optional<String> parentHost2 = flavorName.equals("docker") ? Optional.of("parent2") : Optional.empty();
-        Optional<String> parentHost3 = flavorName.equals("docker") ? Optional.of("parent3") : Optional.empty();
-        nodes.add(nodeRepository.createNode("node1", "node1", parentHost1, flavor, NodeType.tenant));
-        nodes.add(nodeRepository.createNode("node2", "node2", parentHost2, flavor, NodeType.tenant));
-        nodes.add(nodeRepository.createNode("node3", "node3", parentHost3, flavor, NodeType.tenant));
-        nodeRepository.addNodes(nodes);
-
-        // Set node1 to have failed 4 times before
-        Node node1 = nodeRepository.getNode("node1").get();
-        node1 = node1.with(node1.status().setFailCount(4));
-        nodeRepository.write(node1);
-
-        // Set node2 to have a detected hardware failure
-        setHWFailureForNode("node2");
-
-        // Set node3 to have failed 8 times before and have a HW failure
-        Node node3 = nodeRepository.getNode("node3").get();
-        node3 = node1.with(node3.status().setFailCount(8));
-        nodeRepository.write(node3);
-        setHWFailureForNode("node3");
-
-        // Allocate the nodes
-        List<Node> provisioned = nodeRepository.getNodes(NodeType.tenant, Node.State.provisioned);
-        nodeRepository.setReady(nodeRepository.setDirty(provisioned));
-        nodeRepository.addNodes(Arrays.asList(
-                nodeRepository.createNode("node4", "node4", parentHost1, flavor, NodeType.tenant),
-                nodeRepository.createNode("node5", "node5", parentHost1, flavor, NodeType.tenant)));
-
-        ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"), ApplicationName.from("bar"), InstanceName.from("fuz"));
-        ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"), Version.fromString("6.42"));
-        provisioner.prepare(applicationId, cluster, Capacity.fromNodeCount(3, flavorName), 1, null);
-        NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(curator));
-        provisioner.activate(transaction, applicationId, ProvisioningTester.toHostSpecs(nodes));
-        transaction.commit();
-        assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
-
-        // Fail the nodes
-        nodes.forEach(node -> failNode(node.hostname()));
-        assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-    }
 }
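Editor's note: the last test above advances the clock by 30 minutes, then by another 30 minutes plus one second, because getExpiredNodes uses a strict isBefore comparison: a container node that failed exactly one hour ago has not yet expired. A minimal boundary check of just that comparison, reusing only ManualClock from the tests (run with assertions enabled, -ea):

    import java.time.Duration;
    import java.time.Instant;
    import com.yahoo.test.ManualClock;

    // Boundary behavior of the strict isBefore comparison used by getExpiredNodes.
    class ContainerExpiryBoundary {
        public static void main(String[] args) {
            ManualClock clock = new ManualClock();
            Instant failedAt = clock.instant();
            Duration containerExpiry = Duration.ofHours(1);

            clock.advance(Duration.ofHours(1)); // exactly one hour: not yet expired
            assert !failedAt.plus(containerExpiry).isBefore(clock.instant());

            clock.advance(Duration.ofSeconds(1)); // one second past the hour: expired
            assert failedAt.plus(containerExpiry).isBefore(clock.instant());
        }
    }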