author     Martin Polden <mpolden@mpolden.no>      2017-12-13 11:22:53 +0100
committer  Martin Polden <mpolden@mpolden.no>      2017-12-13 12:12:35 +0100
commit     77588324c47c8d1867b4ddf2f43fe4ee81f26e1d (patch)
tree       3f086ecd88ce55e1396798b7ba3b603f28dc78ce /node-repository
parent     4e7313d8a237011e0bb27dce2ac1f2d97cf81f28 (diff)
Recycle failed container nodes after 1 hour
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java             |  88
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java |   8
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java         | 373
3 files changed, 313 insertions, 156 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 1e0202d4735..031d56e3164 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
+import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeType;
@@ -21,65 +22,108 @@ import java.util.stream.Collectors;
/**
* This moves expired failed nodes:
* <ul>
- * <li>To parked: If the node has known hardware failure, docker hosts are moved to parked only when all its
+ * <li>To parked: If the node has known hardware failure, Docker hosts are moved to parked only when all of their
* children are already in parked
- * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf OR system is CD,
- * as those environments have no protection against users running bogus applications, so
+ * <li>To dirty: If the node has failed less than 5 times OR the environment is dev, test or perf.
+ * Those environments have no protection against users running bogus applications, so
* we cannot use the node failure count to conclude the node has a failure.
* <li>Otherwise the node will remain in failed
* </ul>
- * Failed nodes are typically given a long expiry time to enable us to manually moved them back to
+ * Failed content nodes are given a long expiry time to enable us to manually move them back to
* active to recover data in cases where the node was failed accidentally.
* <p>
+ * Failed container (Vespa, not Docker) nodes are expired early as there's no data to potentially recover.
+ * </p>
+ * <p>
* The purpose of the automatic recycling to dirty + fail count is that nodes which were moved
* to failed due to some undetected hardware failure will end up being failed again.
* When that has happened enough they will not be recycled.
* <p>
- * The Chef recipe running locally on the node may set the hardwareFailureDescription to avoid the node
+ * The Chef recipe running locally on the node may set hardwareFailureDescription to avoid the node
* being automatically recycled in cases where an error has been positively detected.
*
* @author bratseth
+ * @author mpolden
*/
-public class FailedExpirer extends Expirer {
+public class FailedExpirer extends Maintainer {
private static final Logger log = Logger.getLogger(NodeRetirer.class.getName());
+
+ private static final Duration defaultExpiry = Duration.ofDays(4); // Grace period to allow recovery of data
+ private static final Duration containerExpiry = Duration.ofHours(1); // Stateless nodes, no data to recover
+ private static final int maxAllowedFailures = 5; // Stop recycling nodes after this number of failures
+
private final NodeRepository nodeRepository;
private final Zone zone;
+ private final Clock clock;
- public FailedExpirer(NodeRepository nodeRepository, Zone zone, Clock clock,
- Duration failTimeout, JobControl jobControl) {
- super(Node.State.failed, History.Event.Type.failed, nodeRepository, clock, failTimeout, jobControl);
+ public FailedExpirer(NodeRepository nodeRepository, Zone zone, Clock clock, Duration interval,
+ JobControl jobControl) {
+ super(nodeRepository, interval, jobControl);
this.nodeRepository = nodeRepository;
this.zone = zone;
+ this.clock = clock;
}
@Override
- protected void expire(List<Node> expired) {
+ protected void maintain() {
+ List<Node> containerNodes = getExpiredNodes(containerExpiry)
+ .stream()
+ .filter(node -> node.allocation().isPresent() &&
+ node.allocation().get().membership().cluster().type() == ClusterSpec.Type.container)
+ .collect(Collectors.toList());
+ List<Node> remainingNodes = getExpiredNodes(defaultExpiry);
+ remainingNodes.removeAll(containerNodes);
+ recycle(containerNodes);
+ recycle(remainingNodes);
+ }
+
+ /** Get failed nodes that have expired according to given expiry */
+ private List<Node> getExpiredNodes(Duration expiry) {
+ return nodeRepository.getNodes(Node.State.failed).stream()
+ .filter(node -> node.history().event(History.Event.Type.failed)
+ .map(event -> event.at().plus(expiry).isBefore(clock.instant()))
+ .orElse(false))
+ .collect(Collectors.toList());
+ }
+
+ /** Move eligible nodes to dirty. This may be a subset of the given nodes */
+ private void recycle(List<Node> nodes) {
List<Node> nodesToRecycle = new ArrayList<>();
- for (Node recycleCandidate : expired) {
- if (recycleCandidate.status().hardwareFailureDescription().isPresent() || recycleCandidate.status().hardwareDivergence().isPresent()) {
- List<String> nonParkedChildren = recycleCandidate.type() != NodeType.host ? Collections.emptyList() :
- nodeRepository.getChildNodes(recycleCandidate.hostname()).stream()
+ for (Node candidate : nodes) {
+ if (hasHardwareIssue(candidate)) {
+ List<String> unparkedChildren = candidate.type() != NodeType.host ? Collections.emptyList() :
+ nodeRepository.getChildNodes(candidate.hostname()).stream()
.filter(node -> node.state() != Node.State.parked)
.map(Node::hostname)
.collect(Collectors.toList());
- if (nonParkedChildren.isEmpty()) {
- nodeRepository.park(recycleCandidate.hostname(), Agent.system, "Parked by FailedExpirer due to HW failure/divergence on node");
+ if (unparkedChildren.isEmpty()) {
+ nodeRepository.park(candidate.hostname(), Agent.system,
+ "Parked by FailedExpirer due to hardware issue");
} else {
- log.info(String.format("Expired failed node %s with HW failure/divergence is not parked because some of its children" +
- " (%s) are not yet parked", recycleCandidate.hostname(), String.join(", ", nonParkedChildren)));
+ log.info(String.format("Expired failed node %s with hardware issue was not parked because of " +
+ "unparked children: %s", candidate.hostname(),
+ String.join(", ", unparkedChildren)));
}
- } else if (! failCountIndicatesHwFail(zone, recycleCandidate) || recycleCandidate.status().failCount() < 5) {
- nodesToRecycle.add(recycleCandidate);
+ } else if (!failCountIndicatesHardwareIssue(candidate)) {
+ nodesToRecycle.add(candidate);
}
}
nodeRepository.setDirty(nodesToRecycle);
}
- private boolean failCountIndicatesHwFail(Zone zone, Node node) {
+ /** Returns whether the current node fail count should be used as an indicator of a hardware issue */
+ private boolean failCountIndicatesHardwareIssue(Node node) {
if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER) return false;
- return zone.environment() == Environment.prod || zone.environment() == Environment.staging;
+ return (zone.environment() == Environment.prod || zone.environment() == Environment.staging) &&
+ node.status().failCount() >= maxAllowedFailures;
+ }
+
+ /** Returns whether node has any kind of hardware issue */
+ private static boolean hasHardwareIssue(Node node) {
+ return node.status().hardwareFailureDescription().isPresent() ||
+ node.status().hardwareDivergence().isPresent();
}
}
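
The rewritten FailedExpirer keeps two expiry periods instead of one: failed nodes allocated to a container cluster are recycled after one hour, while everything else keeps the four-day grace period for data recovery. A minimal, self-contained sketch of that selection step is shown below; FailedNode and its fields are hypothetical stand-ins for the node-repository's Node type, and only the two durations are taken from the patch.

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

// Simplified sketch of the two-tier expiry rule introduced by this commit.
// FailedNode is a hypothetical stand-in for the node-repository's Node type.
public class ExpirySketch {

    private static final Duration defaultExpiry = Duration.ofDays(4);    // grace period to recover data
    private static final Duration containerExpiry = Duration.ofHours(1); // stateless, nothing to recover

    static class FailedNode {
        final String hostname;
        final boolean container;          // allocated to a container cluster?
        final Optional<Instant> failedAt; // time of the last 'failed' history event

        FailedNode(String hostname, boolean container, Optional<Instant> failedAt) {
            this.hostname = hostname;
            this.container = container;
            this.failedAt = failedAt;
        }
    }

    /** Returns the nodes whose failed event is older than the expiry that applies to them */
    static List<FailedNode> expired(List<FailedNode> failedNodes, Clock clock) {
        return failedNodes.stream()
                          .filter(node -> node.failedAt
                                              .map(at -> at.plus(expiryFor(node)).isBefore(clock.instant()))
                                              .orElse(false))
                          .collect(Collectors.toList());
    }

    private static Duration expiryFor(FailedNode node) {
        return node.container ? containerExpiry : defaultExpiry;
    }

}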
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 5a66621150f..9e826bfcb9a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -72,7 +72,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
retiredExpirer = new RetiredExpirer(nodeRepository, deployer, clock, durationFromEnv("retired_expiry").orElse(defaults.retiredExpiry), jobControl);
retiredEarlyExpirer = new RetiredEarlyExpirer(nodeRepository, durationFromEnv("retired_early_interval").orElse(defaults.retiredEarlyInterval), jobControl, deployer, orchestrator);
inactiveExpirer = new InactiveExpirer(nodeRepository, clock, durationFromEnv("inactive_expiry").orElse(defaults.inactiveExpiry), jobControl);
- failedExpirer = new FailedExpirer(nodeRepository, zone, clock, durationFromEnv("failed_expiry").orElse(defaults.failedExpiry), jobControl);
+ failedExpirer = new FailedExpirer(nodeRepository, zone, clock, durationFromEnv("failed_expirer_interval").orElse(defaults.failedExpirerInterval), jobControl);
dirtyExpirer = new DirtyExpirer(nodeRepository, clock, durationFromEnv("dirty_expiry").orElse(defaults.dirtyExpiry), jobControl);
provisionedExpirer = new ProvisionedExpirer(nodeRepository, clock, durationFromEnv("provisioned_expiry").orElse(defaults.provisionedExpiry), jobControl);
nodeRebooter = new NodeRebooter(nodeRepository, clock, durationFromEnv("reboot_interval").orElse(defaults.rebootInterval), jobControl);
@@ -134,7 +134,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
private final Duration reservationExpiry;
private final Duration inactiveExpiry;
private final Duration retiredExpiry;
- private final Duration failedExpiry;
+ private final Duration failedExpirerInterval;
private final Duration dirtyExpiry;
private final Duration provisionedExpiry;
private final Duration rebootInterval;
@@ -156,7 +156,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
inactiveExpiry = Duration.ofHours(4); // enough time for the application owner to discover and redeploy
retiredExpiry = Duration.ofDays(4); // enough time to migrate data
retiredEarlyInterval = Duration.ofMinutes(29);
- failedExpiry = Duration.ofDays(4); // enough time to recover data even if it happens friday night
+ failedExpirerInterval = Duration.ofMinutes(10);
dirtyExpiry = Duration.ofHours(2); // enough time to clean the node
provisionedExpiry = Duration.ofHours(4);
rebootInterval = Duration.ofDays(30);
@@ -174,7 +174,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
inactiveExpiry = Duration.ofSeconds(2); // support interactive wipe start over
retiredExpiry = Duration.ofMinutes(1);
retiredEarlyInterval = Duration.ofMinutes(5);
- failedExpiry = Duration.ofMinutes(10);
+ failedExpirerInterval = Duration.ofMinutes(10);
dirtyExpiry = Duration.ofMinutes(30);
provisionedExpiry = Duration.ofHours(4);
rebootInterval = Duration.ofDays(30);
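
The NodeRepositoryMaintenance change renames failed_expiry to failed_expirer_interval: the duration handed to FailedExpirer is now how often the maintainer runs (10 minutes in production), while the expiry times themselves now live inside FailedExpirer. The override-with-default shape stays the same; a rough, self-contained sketch of that pattern follows, where the environment-variable lookup and parsing are assumptions and only the orElse(default) shape comes from the patch.

import java.time.Duration;
import java.util.Optional;

// Sketch of the override-with-default pattern used for maintainer intervals.
// How the real durationFromEnv helper names and parses its environment variable
// is an assumption here; only the orElse(default) shape comes from the patch.
public class MaintenanceIntervalSketch {

    /** Reads a duration in minutes from the named environment variable, if it is set */
    static Optional<Duration> durationFromEnv(String name) {
        return Optional.ofNullable(System.getenv(name.toUpperCase()))
                       .map(String::trim)
                       .map(Long::parseLong)
                       .map(Duration::ofMinutes);
    }

    public static void main(String[] args) {
        Duration defaultInterval = Duration.ofMinutes(10); // production default from this patch
        Duration interval = durationFromEnv("failed_expirer_interval").orElse(defaultInterval);
        System.out.println("FailedExpirer runs every " + interval);
    }

}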
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
index 51991a844d7..720c5b05443 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirerTest.java
@@ -9,6 +9,7 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.DockerImage;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.Flavor;
+import com.yahoo.config.provision.HostSpec;
import com.yahoo.config.provision.InstanceName;
import com.yahoo.config.provision.NodeFlavors;
import com.yahoo.config.provision.NodeType;
@@ -18,7 +19,6 @@ import com.yahoo.config.provision.TenantName;
import com.yahoo.config.provision.Zone;
import com.yahoo.test.ManualClock;
import com.yahoo.transaction.NestedTransaction;
-import com.yahoo.vespa.curator.Curator;
import com.yahoo.vespa.curator.mock.MockCurator;
import com.yahoo.vespa.curator.transaction.CuratorTransaction;
import com.yahoo.vespa.hosted.provision.Node;
@@ -26,15 +26,15 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner;
-import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver;
import org.junit.Test;
import java.time.Duration;
-import java.util.ArrayList;
-import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -42,155 +42,268 @@ import static org.junit.Assert.assertEquals;
/**
* @author bratseth
+ * @author mpolden
*/
public class FailedExpirerTest {
- private final Curator curator = new MockCurator();
- private final ManualClock clock = new ManualClock();
- private FailedExpirer failedExpirer;
-
@Test
- public void ensure_failed_nodes_are_deallocated_in_prod() throws InterruptedException {
- failureScenarioIn(SystemName.main, Environment.prod, "default");
- clock.advance(Duration.ofDays(5));
- failedExpirer.run();
-
- assertNodeHostnames(Node.State.failed, "node1");
- assertNodeHostnames(Node.State.parked, "node2", "node3");
+ public void ensure_failed_nodes_are_deallocated_in_prod() {
+ FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+ .withNode("node1")
+ .withNode("node2")
+ .withNode("node3")
+ .setReady("node1", "node2", "node3")
+ .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+ .failNode(4, "node1")
+ .failWithHardwareFailure("node2", "node3");
+
+ scenario.clock().advance(Duration.ofDays(3));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.failed, "node1", "node2", "node3"); // None moved yet
+
+ scenario.clock().advance(Duration.ofDays(2));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.failed, "node1");
+ scenario.assertNodesIn(Node.State.parked, "node2", "node3");
}
@Test
- public void ensure_failed_nodes_are_deallocated_in_dev() throws InterruptedException {
- failureScenarioIn(SystemName.main, Environment.dev, "default");
- clock.advance(Duration.ofDays(5));
- failedExpirer.run();
-
- assertNodeHostnames(Node.State.parked, "node2", "node3");
- assertNodeHostnames(Node.State.dirty, "node1");
+ public void ensure_failed_nodes_are_deallocated_in_dev() {
+ FailureScenario scenario = new FailureScenario(SystemName.main, Environment.dev)
+ .withNode("node1")
+ .withNode("node2")
+ .withNode("node3")
+ .setReady("node1", "node2", "node3")
+ .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+ .failNode(4, "node1")
+ .failWithHardwareFailure("node2", "node3");
+
+ scenario.clock().advance(Duration.ofDays(5));
+ scenario.expirer().run();
+
+ scenario.assertNodesIn(Node.State.parked, "node2", "node3");
+ scenario.assertNodesIn(Node.State.dirty, "node1");
}
@Test
- public void ensure_failed_nodes_are_deallocated_in_cd() throws InterruptedException {
- failureScenarioIn(SystemName.cd, Environment.prod, "default");
- clock.advance(Duration.ofDays(5));
- failedExpirer.run();
-
- assertNodeHostnames(Node.State.failed, "node1");
- assertNodeHostnames(Node.State.parked, "node2", "node3");
+ public void ensure_failed_nodes_are_deallocated_in_cd() {
+ FailureScenario scenario = new FailureScenario(SystemName.cd, Environment.prod)
+ .withNode("node1")
+ .withNode("node2")
+ .withNode("node3")
+ .setReady("node1", "node2", "node3")
+ .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+ .failNode(4, "node1")
+ .failWithHardwareFailure("node2", "node3");
+
+ scenario.clock().advance(Duration.ofDays(5));
+ scenario.expirer().run();
+
+ scenario.assertNodesIn(Node.State.failed, "node1");
+ scenario.assertNodesIn(Node.State.parked, "node2", "node3");
}
@Test
- public void ensure_failed_docker_nodes_are_deallocated() throws InterruptedException {
- failureScenarioIn(SystemName.main, Environment.prod, "docker");
- clock.advance(Duration.ofDays(5));
- failedExpirer.run();
-
- assertNodeHostnames(Node.State.parked, "node2", "node3");
- assertNodeHostnames(Node.State.dirty, "node1");
+ public void ensure_failed_docker_nodes_are_deallocated() {
+ FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3")
+ .setReady("node1", "node2", "node3")
+ .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
+ .failNode(4, "node1")
+ .failWithHardwareFailure("node2", "node3");
+
+ scenario.clock().advance(Duration.ofDays(5));
+ scenario.expirer().run();
+
+ scenario.assertNodesIn(Node.State.parked, "node2", "node3");
+ scenario.assertNodesIn(Node.State.dirty, "node1");
}
@Test
- public void ensure_parked_docker_host() throws InterruptedException {
- failureScenarioIn(SystemName.main, Environment.prod, "docker");
-
- failNode("parent2");
- setHWFailureForNode("parent2");
-
- clock.advance(Duration.ofDays(5));
- failedExpirer.run(); // Run twice because parent can only be parked after the child
- failedExpirer.run();
-
- assertNodeHostnames(Node.State.parked, "parent2", "node2", "node3");
+ public void ensure_parked_docker_host() {
+ FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3")
+ .setReady("node1", "node2", "node3")
+ .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
+ .failNode(8, "node3")
+ .failWithHardwareFailure("node2", "node3")
+ .failWithHardwareFailure("parent2");
+
+ scenario.clock.advance(Duration.ofDays(5));
+ scenario.expirer().run(); // Run twice because parent can only be parked after the child
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.parked, "parent2", "node2", "node3");
}
@Test
- public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() throws InterruptedException {
- failureScenarioIn(SystemName.cd, Environment.prod, "docker");
-
- failNode("parent1");
- setHWFailureForNode("parent1");
- clock.advance(Duration.ofDays(2));
- failNode("node4");
- failNode("node5");
- clock.advance(Duration.ofDays(3));
-
- failedExpirer.run(); // Run twice because parent can only be parked after the child
- failedExpirer.run();
-
- assertNodeHostnames(Node.State.failed, "parent1", "node4", "node5");
+ public void ensure_failed_docker_host_is_not_parked_unless_all_children_are() {
+ FailureScenario scenario = new FailureScenario(SystemName.cd, Environment.prod)
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent1")
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent2")
+ .withNode(NodeType.host, FailureScenario.defaultFlavor, "parent3")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node1", "parent1")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node2", "parent2")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node3", "parent3")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node4", "parent1")
+ .withNode(NodeType.tenant, FailureScenario.dockerFlavor, "node5", "parent1")
+ .setReady("node1", "node2", "node3")
+ .allocate(ClusterSpec.Type.content, FailureScenario.dockerFlavor, "node1", "node2", "node3")
+ .failWithHardwareFailure("parent1");
+
+ scenario.clock().advance(Duration.ofDays(2));
+ scenario.failNode(1, "node4", "node5");
+ scenario.clock().advance(Duration.ofDays(3));
+
+ scenario.expirer().run(); // Run twice because parent can only be parked after the child
+ scenario.expirer().run();
+
+ scenario.assertNodesIn(Node.State.failed, "parent1", "node4", "node5");
}
- private void assertNodeHostnames(Node.State state, String... hostnames) {
- assertEquals(Stream.of(hostnames).collect(Collectors.toSet()),
- failedExpirer.nodeRepository().getNodes(state).stream().map(Node::hostname).collect(Collectors.toSet()));
- }
-
- private void setHWFailureForNode(String hostname) {
- Node node2 = failedExpirer.nodeRepository().getNode(hostname).get();
- node2 = node2.with(node2.status().withHardwareFailureDescription(Optional.of("memory_mcelog")));
- failedExpirer.nodeRepository().write(node2);
+ @Test
+ public void ensure_container_nodes_are_recycled_early() {
+ FailureScenario scenario = new FailureScenario(SystemName.main, Environment.prod)
+ .withNode("node1")
+ .withNode("node2")
+ .withNode("node3")
+ .withNode("node4")
+ .withNode("node5")
+ .withNode("node6")
+ .setReady("node1", "node2", "node3", "node4", "node5", "node6")
+ .allocate(ClusterSpec.Type.content, "node1", "node2", "node3")
+ .allocate(ClusterSpec.Type.container, "node4", "node5", "node6");
+
+ // Vespa container fails
+ scenario.failNode(1, "node4");
+
+ // 30 minutes pass, nothing happens
+ scenario.clock().advance(Duration.ofMinutes(30));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.dirty);
+
+ // Recycles container when more than 1 hour passes
+ scenario.clock().advance(Duration.ofMinutes(30).plus(Duration.ofSeconds(1)));
+ scenario.expirer().run();
+ scenario.assertNodesIn(Node.State.dirty, "node4");
}
- private void failNode(String hostname) {
- failedExpirer.nodeRepository().fail(hostname, Agent.system, "Failing to unit test");
+ private static class FailureScenario {
+
+ private static final NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default", "docker");
+ public static final Flavor defaultFlavor = nodeFlavors.getFlavorOrThrow("default");
+ public static final Flavor dockerFlavor = nodeFlavors.getFlavorOrThrow("docker");
+
+ private final MockCurator curator = new MockCurator();
+ private final ManualClock clock = new ManualClock();
+ private final ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"),
+ ApplicationName.from("bar"),
+ InstanceName.from("default"));
+
+ private final NodeRepository nodeRepository;
+ private final NodeRepositoryProvisioner provisioner;
+ private final FailedExpirer expirer;
+
+ public FailureScenario(SystemName system, Environment environment) {
+ Zone zone = new Zone(system, environment, RegionName.defaultName());
+ this.nodeRepository = new NodeRepository(nodeFlavors, curator, clock, zone,
+ new MockNameResolver().mockAnyLookup(),
+ new DockerImage("docker-image"));
+ this.provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, Zone.defaultZone(), clock,
+ (x, y) -> {});
+ this.expirer = new FailedExpirer(nodeRepository, zone, clock, Duration.ofMinutes(30),
+ new JobControl(nodeRepository.database()));
+ }
+
+ public ManualClock clock() {
+ return clock;
+ }
+
+ public FailedExpirer expirer() {
+ return expirer;
+ }
+
+ public Node get(String hostname) {
+ return nodeRepository.getNode(hostname)
+ .orElseThrow(() -> new IllegalArgumentException("No such node: " + hostname));
+ }
+
+ public FailureScenario withNode(NodeType type, Flavor flavor, String hostname, String parentHostname) {
+ nodeRepository.addNodes(Collections.singletonList(
+ nodeRepository.createNode(UUID.randomUUID().toString(), hostname,
+ Optional.ofNullable(parentHostname), flavor, type)
+ ));
+ return this;
+ }
+
+ public FailureScenario withNode(NodeType type, Flavor flavor, String hostname) {
+ return withNode(type, flavor, hostname,null);
+ }
+
+ public FailureScenario withNode(String hostname) {
+ return withNode(NodeType.tenant, defaultFlavor, hostname, null);
+ }
+
+ public FailureScenario failNode(int times, String... hostname) {
+ Stream.of(hostname).forEach(h -> {
+ Node node = get(h);
+ nodeRepository.write(node.with(node.status().setFailCount(times)));
+ nodeRepository.fail(h, Agent.system, "Failed by unit test");
+ });
+ return this;
+ }
+
+ public FailureScenario failWithHardwareFailure(String... hostname) {
+ Stream.of(hostname).forEach(h -> {
+ Node node = get(h);
+ nodeRepository.write(node.with(node.status().withHardwareFailureDescription(
+ Optional.of("memory_mcelog"))));
+ nodeRepository.fail(h, Agent.system, "Failed by unit test");
+ });
+ return this;
+ }
+
+ public FailureScenario setReady(String... hostname) {
+ List<Node> nodes = Stream.of(hostname)
+ .map(this::get)
+ .collect(Collectors.toList());
+ nodeRepository.setReady(nodeRepository.setDirty(nodes));
+ return this;
+ }
+
+ public FailureScenario allocate(ClusterSpec.Type clusterType, String... hostname) {
+ return allocate(clusterType, defaultFlavor, hostname);
+ }
+
+ public FailureScenario allocate(ClusterSpec.Type clusterType, Flavor flavor, String... hostname) {
+ Set<HostSpec> hosts = Stream.of(hostname)
+ .map(h -> new HostSpec(h, Optional.empty()))
+ .collect(Collectors.toSet());
+ ClusterSpec clusterSpec = ClusterSpec.request(clusterType, ClusterSpec.Id.from("test"),
+ Version.fromString("6.42"));
+ provisioner.prepare(applicationId, clusterSpec, Capacity.fromNodeCount(hostname.length, flavor.name()),
+ 1, null);
+ NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(curator));
+ provisioner.activate(transaction, applicationId, hosts);
+ transaction.commit();
+ return this;
+ }
+
+ public void assertNodesIn(Node.State state, String... hostnames) {
+ assertEquals(Stream.of(hostnames).collect(Collectors.toSet()),
+ nodeRepository.getNodes(state).stream()
+ .map(Node::hostname)
+ .collect(Collectors.toSet()));
+ }
}
- private void failureScenarioIn(SystemName system, Environment environment, String flavorName) {
- NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default", flavorName);
- NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, clock, Zone.defaultZone(),
- new MockNameResolver().mockAnyLookup(),
- new DockerImage("docker-registry.domain.tld:8080/dist/vespa"));
- NodeRepositoryProvisioner provisioner = new NodeRepositoryProvisioner(nodeRepository, nodeFlavors, Zone.defaultZone(), clock, (x,y) -> {});
- failedExpirer = new FailedExpirer(nodeRepository, new Zone(system, environment, RegionName.from("us-west-1")), clock, Duration.ofDays(4), new JobControl(nodeRepository.database()));
-
- Flavor defaultFlavor = nodeFlavors.getFlavorOrThrow("default");
- List<Node> hostNodes = new ArrayList<>(3);
- hostNodes.add(nodeRepository.createNode("parent1", "parent1", Optional.empty(), defaultFlavor, NodeType.host));
- hostNodes.add(nodeRepository.createNode("parent2", "parent2", Optional.empty(), defaultFlavor, NodeType.host));
- hostNodes.add(nodeRepository.createNode("parent3", "parent3", Optional.empty(), defaultFlavor, NodeType.host));
- nodeRepository.addNodes(hostNodes);
-
- Flavor flavor = nodeFlavors.getFlavorOrThrow(flavorName);
- List<Node> nodes = new ArrayList<>(3);
- Optional<String> parentHost1 = flavorName.equals("docker") ? Optional.of("parent1") : Optional.empty();
- Optional<String> parentHost2 = flavorName.equals("docker") ? Optional.of("parent2") : Optional.empty();
- Optional<String> parentHost3 = flavorName.equals("docker") ? Optional.of("parent3") : Optional.empty();
- nodes.add(nodeRepository.createNode("node1", "node1", parentHost1, flavor, NodeType.tenant));
- nodes.add(nodeRepository.createNode("node2", "node2", parentHost2, flavor, NodeType.tenant));
- nodes.add(nodeRepository.createNode("node3", "node3", parentHost3, flavor, NodeType.tenant));
- nodeRepository.addNodes(nodes);
-
- // Set node1 to have failed 4 times before
- Node node1 = nodeRepository.getNode("node1").get();
- node1 = node1.with(node1.status().setFailCount(4));
- nodeRepository.write(node1);
-
- // Set node2 to have a detected hardware failure
- setHWFailureForNode("node2");
-
- // Set node3 to have failed 8 times before and have a HW failure
- Node node3 = nodeRepository.getNode("node3").get();
- node3 = node1.with(node3.status().setFailCount(8));
- nodeRepository.write(node3);
- setHWFailureForNode("node3");
-
- // Allocate the nodes
- List<Node> provisioned = nodeRepository.getNodes(NodeType.tenant, Node.State.provisioned);
- nodeRepository.setReady(nodeRepository.setDirty(provisioned));
- nodeRepository.addNodes(Arrays.asList(
- nodeRepository.createNode("node4", "node4", parentHost1, flavor, NodeType.tenant),
- nodeRepository.createNode("node5", "node5", parentHost1, flavor, NodeType.tenant)));
-
- ApplicationId applicationId = ApplicationId.from(TenantName.from("foo"), ApplicationName.from("bar"), InstanceName.from("fuz"));
- ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"), Version.fromString("6.42"));
- provisioner.prepare(applicationId, cluster, Capacity.fromNodeCount(3, flavorName), 1, null);
- NestedTransaction transaction = new NestedTransaction().add(new CuratorTransaction(curator));
- provisioner.activate(transaction, applicationId, ProvisioningTester.toHostSpecs(nodes));
- transaction.commit();
- assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
-
- // Fail the nodes
- nodes.forEach(node -> failNode(node.hostname()));
- assertEquals(3, nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- }
}