author    Valerij Fredriksen <freva@users.noreply.github.com>    2017-11-16 09:10:49 +0100
committer GitHub <noreply@github.com>    2017-11-16 09:10:49 +0100
commit    d21531b7614f3816dd4650a2f9e846c0a42495c3 (patch)
tree      a7ba2e5c9840aa59de7385bedc5bf01c1796c060 /node-repository
parent    e952b1d5abd4311e3743affa91726a3b09f5a859 (diff)
parent    1a69165c6e149e7c2a905e9ce1b733876549d7b2 (diff)
Merge pull request #3998 from vespa-engine/freva/node-failer
Count Docker nodes when throttling in node failure
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java     | 117
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java |   4
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java |  84
3 files changed, 101 insertions, 104 deletions
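
Editor's note: the behavioural core of this change is in NodeFailer.throttle(). Docker containers are no longer filtered out of the node count used for the failure budget, and a Docker host may still be failed when the budget is exactly reached, so that its children can follow. Below is a minimal, self-contained sketch of that rule; the class, method signature and policy numbers are illustrative stand-ins, not the real NodeFailer API.

public class ThrottleSketch {

    static boolean throttle(long recentlyFailedNodes, int totalNodes,
                            double fractionAllowedToFail, int minimumAllowedToFail,
                            boolean nodeIsHost) {
        // After this change, Docker containers are included in totalNodes
        int allowedFailedNodes = (int) Math.max(totalNodes * fractionAllowedToFail,
                                                minimumAllowedToFail);
        // Throttle when past the budget, or exactly at it and the candidate is not a host
        return allowedFailedNodes < recentlyFailedNodes
                || (allowedFailedNodes == recentlyFailedNodes && !nodeIsHost);
    }

    public static void main(String[] args) {
        // 90 nodes, an assumed 1% fraction and a minimum of 2 give a budget of 2
        System.out.println(throttle(2, 90, 0.01, 2, false)); // true: tenant node throttled at the limit
        System.out.println(throttle(2, 90, 0.01, 2, true));  // false: a host may exceed the limit by one
        System.out.println(throttle(3, 90, 0.01, 2, true));  // true: past the budget, even for a host
    }
}

With those assumed numbers the budget in a 90-node zone is the minimum of 2, which is why the small-zone test below first sees exactly 2 nodes failed, then a host plus its children once the host exemption kicks in.
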
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 1c81d97ddea..266d91e7e3e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
-import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
import com.yahoo.transaction.Mutex;
@@ -24,12 +23,12 @@ import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
-import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
-import java.util.stream.Collectors;
/**
* Maintains information in the node repo about when this node last responded to ping
@@ -76,21 +75,15 @@ public class NodeFailer extends Maintainer {
@Override
protected void maintain() {
// Ready nodes
- updateNodeLivenessEventsForReadyNodes();
- for (Node node : readyNodesWhichAreDead()) {
- // Docker hosts and nodes do not run Vespa services
- if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER || node.type() == NodeType.host) continue;
- if ( ! throttle(node)) nodeRepository().fail(node.hostname(),
- Agent.system, "Not receiving config requests from node");
- }
-
- for (Node node : readyNodesWithHardwareFailure())
- if ( ! throttle(node)) nodeRepository().fail(node.hostname(),
- Agent.system, "Node has hardware failure");
+ try (Mutex lock = nodeRepository().lockUnallocated()) {
+ updateNodeLivenessEventsForReadyNodes();
- for (Node node: readyNodesWithHardwareDivergence())
- if ( ! throttle(node)) nodeRepository().fail(node.hostname(),
- Agent.system, "Node hardware diverges from spec");
+ getReadyNodesByFailureReason().forEach((node, reason) -> {
+ if (!throttle(node)) {
+ nodeRepository().fail(node.hostname(), Agent.system, reason);
+ }
+ });
+ }
// Active nodes
for (Node node : determineActiveNodeDownStatus()) {
@@ -103,59 +96,55 @@ public class NodeFailer extends Maintainer {
private void updateNodeLivenessEventsForReadyNodes() {
// Update node last request events through ZooKeeper to collect request to all config servers.
// We do this here ("lazily") to avoid writing to zk for each config request.
- try (Mutex lock = nodeRepository().lockUnallocated()) {
- for (Node node : nodeRepository().getNodes(Node.State.ready)) {
- Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
- if ( ! lastLocalRequest.isPresent()) continue;
-
- Optional<History.Event> recordedRequest = node.history().event(History.Event.Type.requested);
- if ( ! recordedRequest.isPresent() || recordedRequest.get().at().isBefore(lastLocalRequest.get())) {
- History updatedHistory = node.history().with(new History.Event(History.Event.Type.requested,
- Agent.system,
- lastLocalRequest.get()));
- nodeRepository().write(node.with(updatedHistory));
- }
+ for (Node node : nodeRepository().getNodes(Node.State.ready)) {
+ Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
+ if ( ! lastLocalRequest.isPresent()) continue;
+
+ Optional<History.Event> recordedRequest = node.history().event(History.Event.Type.requested);
+ if ( ! recordedRequest.isPresent() || recordedRequest.get().at().isBefore(lastLocalRequest.get())) {
+ History updatedHistory = node.history().with(new History.Event(History.Event.Type.requested,
+ Agent.system,
+ lastLocalRequest.get()));
+ nodeRepository().write(node.with(updatedHistory));
}
}
}
- private List<Node> readyNodesWhichAreDead() {
- // Allow requests some time to be registered in case all config servers have been down
- if (constructionTime.isAfter(clock.instant().minus(nodeRequestInterval).minus(nodeRequestInterval) ))
- return Collections.emptyList();
-
- // Nodes are taken as dead if they have not made a config request since this instant.
- // Add 10 minutes to the down time limit to allow nodes to make a request that infrequently.
- Instant oldestAcceptableRequestTime = clock.instant().minus(downTimeLimit).minus(nodeRequestInterval);
-
- return nodeRepository().getNodes(Node.State.ready).stream()
- .filter(node -> wasMadeReadyBefore(oldestAcceptableRequestTime, node))
- .filter(node -> ! hasRecordedRequestAfter(oldestAcceptableRequestTime, node))
- .collect(Collectors.toList());
+ private Map<Node, String> getReadyNodesByFailureReason() {
+ Instant oldestAcceptableRequestTime =
+ // Allow requests some time to be registered in case all config servers have been down
+ constructionTime.isAfter(clock.instant().minus(nodeRequestInterval.multipliedBy(2))) ?
+ Instant.EPOCH :
+
+ // Nodes are taken as dead if they have not made a config request since this instant.
+ // Add 10 minutes to the down time limit to allow nodes to make a request that infrequently.
+ clock.instant().minus(downTimeLimit).minus(nodeRequestInterval);
+
+ Map<Node, String> nodesByFailureReason = new HashMap<>();
+ for (Node node : nodeRepository().getNodes(Node.State.ready)) {
+ if (! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) {
+ nodesByFailureReason.put(node, "Not receiving config requests from node");
+ } else if (node.status().hardwareFailureDescription().isPresent()) {
+ nodesByFailureReason.put(node, "Node has hardware failure");
+ } else if (node.status().hardwareDivergence().isPresent()) {
+ nodesByFailureReason.put(node, "Node has hardware divergence");
+ }
+ }
+ return nodesByFailureReason;
}
- private List<Node> readyNodesWithHardwareDivergence() {
- return nodeRepository().getNodes(Node.State.ready).stream()
- .filter(node -> node.status().hardwareDivergence().isPresent())
- .collect(Collectors.toList());
+ private boolean hasNodeRequestedConfigAfter(Node node, Instant instant) {
+ return !wasMadeReadyBefore(node, instant) || hasRecordedRequestAfter(node, instant);
}
- private boolean wasMadeReadyBefore(Instant instant, Node node) {
+ private boolean wasMadeReadyBefore(Node node, Instant instant) {
Optional<History.Event> readiedEvent = node.history().event(History.Event.Type.readied);
- if ( ! readiedEvent.isPresent()) return false;
- return readiedEvent.get().at().isBefore(instant);
+ return readiedEvent.map(event -> event.at().isBefore(instant)).orElse(false);
}
- private boolean hasRecordedRequestAfter(Instant instant, Node node) {
+ private boolean hasRecordedRequestAfter(Node node, Instant instant) {
Optional<History.Event> lastRequest = node.history().event(History.Event.Type.requested);
- if ( ! lastRequest.isPresent()) return false;
- return lastRequest.get().at().isAfter(instant);
- }
-
- private List<Node> readyNodesWithHardwareFailure() {
- return nodeRepository().getNodes(Node.State.ready).stream()
- .filter(node -> node.status().hardwareFailureDescription().isPresent())
- .collect(Collectors.toList());
+ return lastRequest.map(event -> event.at().isAfter(instant)).orElse(false);
}
private boolean applicationSuspended(Node node) {
@@ -272,18 +261,18 @@ public class NodeFailer extends Maintainer {
private boolean throttle(Node node) {
if (throttlePolicy == ThrottlePolicy.disabled) return false;
Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
- List<Node> nodes = nodeRepository().getNodes().stream()
- // Do not consider Docker containers when throttling
- .filter(n -> n.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
- .collect(Collectors.toList());
+ List<Node> nodes = nodeRepository().getNodes();
long recentlyFailedNodes = nodes.stream()
.map(n -> n.history().event(History.Event.Type.failed))
.filter(Optional::isPresent)
.map(Optional::get)
.filter(failedEvent -> failedEvent.at().isAfter(startOfThrottleWindow))
.count();
- boolean throttle = recentlyFailedNodes >= Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail,
- throttlePolicy.minimumAllowedToFail);
+ int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail,
+ throttlePolicy.minimumAllowedToFail);
+
+ boolean throttle = allowedFailedNodes < recentlyFailedNodes ||
+ (allowedFailedNodes == recentlyFailedNodes && node.type() != NodeType.host);
if (throttle) {
log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
throttlePolicy.toHumanReadableString()));
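
Editor's note: the liveness rule behind the new getReadyNodesByFailureReason() can be read in isolation: a ready node counts as alive if it was made ready after the cutoff instant (too recently to judge) or has made a config request after it. A small stand-alone sketch mirroring the hasNodeRequestedConfigAfter() helper above, using plain Instants rather than the production Node/History types:

import java.time.Instant;
import java.util.Optional;

public class ConfigRequestRuleSketch {

    static boolean hasRequestedConfigAfter(Optional<Instant> readiedAt,
                                           Optional<Instant> lastRequestAt,
                                           Instant cutoff) {
        boolean madeReadyBeforeCutoff = readiedAt.map(at -> at.isBefore(cutoff)).orElse(false);
        boolean requestedAfterCutoff  = lastRequestAt.map(at -> at.isAfter(cutoff)).orElse(false);
        return !madeReadyBeforeCutoff || requestedAfterCutoff;
    }

    public static void main(String[] args) {
        Instant cutoff = Instant.parse("2017-11-16T08:00:00Z");
        // Readied an hour before the cutoff, last request after it: alive
        System.out.println(hasRequestedConfigAfter(
                Optional.of(cutoff.minusSeconds(3600)), Optional.of(cutoff.plusSeconds(60)), cutoff)); // true
        // Readied before the cutoff, no request since: considered dead
        System.out.println(hasRequestedConfigAfter(
                Optional.of(cutoff.minusSeconds(3600)), Optional.empty(), cutoff)); // false
        // Readied only after the cutoff: too new to judge, counts as alive
        System.out.println(hasRequestedConfigAfter(
                Optional.of(cutoff.plusSeconds(600)), Optional.empty(), cutoff)); // true
    }
}
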
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 6bcb9426373..0e0195a5bed 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -185,8 +185,8 @@ public class NodeFailTester {
}
public void allNodesMakeAConfigRequestExcept(List<Node> deadNodes) {
- for (Node node : nodeRepository.getNodes(NodeType.tenant)) {
- if ( ! deadNodes.contains(node) && node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
+ for (Node node : nodeRepository.getNodes()) {
+ if ( ! deadNodes.contains(node))
hostLivenessTracker.receivedRequestFrom(node.hostname());
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index c78663415ef..b9b871dfd1f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -5,7 +5,6 @@ import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
-import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException;
import org.junit.Test;
@@ -66,7 +65,7 @@ public class NodeFailerTest {
assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
- // Failures are detected on two ready nodes, which are then failed
+ // Hardware failures are detected on two ready nodes, which are then failed
Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2);
Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3);
tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog"))));
@@ -166,18 +165,16 @@ public class NodeFailerTest {
tester.createReadyNodes(1, 16, "docker");
// For a day all nodes work so nothing happens
- for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
- tester.clock.advance(Duration.ofMinutes(5));
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
+ tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
List<Node> ready = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready);
- List<Node> readyHosts = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready);
- // Two ready nodes die and a ready docker node "dies"
- // (Vespa does not run when in ready state for docker node, so it does not make config requests)
+ // Two ready nodes and a ready docker node die, but only 2 of those are failed out
tester.clock.advance(Duration.ofMinutes(180));
Node dockerNode = ready.stream().filter(node -> node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER).findFirst().get();
List<Node> otherNodes = ready.stream()
@@ -188,18 +185,13 @@ public class NodeFailerTest {
assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- // Another ready node die
+ // Another ready node dies; it and the node that died earlier are now allowed to fail
tester.clock.advance(Duration.ofDays(1));
tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode, otherNodes.get(3));
tester.failer.run();
- assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- assertEquals(ready.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0));
- assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-
- // Ready Docker hosts do not make config requests
- tester.allNodesMakeAConfigRequestExcept(readyHosts.get(0), readyHosts.get(1), readyHosts.get(2));
- tester.failer.run();
- assertEquals(3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size());
+ assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
+ assertEquals(otherNodes.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0));
+ assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
}
@Test
@@ -207,8 +199,8 @@ public class NodeFailerTest {
NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(7);
// For a day all nodes work so nothing happens
- for (int minutes = 0; minutes < 24 * 60; minutes += 5 ) {
- tester.clock.advance(Duration.ofMinutes(5));
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
+ tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
@@ -247,10 +239,10 @@ public class NodeFailerTest {
Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0);
tester.serviceMonitor.setHostDown(downTenant1.hostname());
- // nothing happens the first 45 minutes
- for (int minutes = 0; minutes < 45; minutes += 5 ) {
+ // nothing happens during the entire day because of the failure throttling
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
tester.failer.run();
- tester.clock.advance(Duration.ofMinutes(5));
+ tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size());
}
@@ -374,14 +366,12 @@ public class NodeFailerTest {
public void node_failing_throttle() {
// Throttles based on an absolute number in a small zone
{
- NodeFailTester tester = NodeFailTester.withNoApplications();
- List<Node> readyNodes = tester.createReadyNodes(50);
- List<Node> readyDockerNodes = tester.createReadyNodes(50, 50, "docker");
+ // 50 regular tenant nodes, 10 hosts with 3 tenant nodes each, 90 nodes in total
+ NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(10);
+ List<Node> readyNodes = tester.createReadyNodes(50, 30);
+ List<Node> hosts = tester.nodeRepository.getNodes(NodeType.host);
List<Node> deadNodes = readyNodes.subList(0, 4);
- // Fail 10 Docker containers, should not impact throttling policy
- readyDockerNodes.subList(0, 10)
- .forEach(node -> tester.nodeRepository.fail(node.hostname(), Agent.system, "Failed in test"));
// 2 hours pass, 4 nodes die
for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) {
@@ -391,7 +381,7 @@ public class NodeFailerTest {
// 2 nodes are failed (the minimum number that is always allowed to fail)
tester.failer.run();
- assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size());
+ assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
// 6 more hours pass, no more nodes are failed
for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
@@ -399,15 +389,39 @@ public class NodeFailerTest {
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size());
+ assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
- // 18 more hours pass, it's now 24 hours since the first 2 failed. The remaining 2 are failed
- for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
+ // 2 docker hosts now fail, 1 of them (with all its children) is allowed to fail
+ hosts.subList(0, 2).forEach(host -> {
+ tester.serviceMonitor.setHostDown(host.hostname());
+ deadNodes.add(host);
+ });
+ tester.failer.run();
+ tester.clock.advance(Duration.ofMinutes(61));
+ tester.allNodesMakeAConfigRequestExcept(deadNodes);
+
+ tester.failer.run();
+ assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
+
+ // 24 more hours pass without any other nodes being failed out
+ for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- assertEquals(4, getNonDockerFailedNodes(tester.nodeRepository).size());
+ assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
+
+ // Next, the 2 ready nodes that were dead from the start are failed out, and finally
+ // the second host and all its children are failed
+ tester.clock.advance(Duration.ofMinutes(30));
+ tester.failer.run();
+ assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
+
+ // Nothing else to fail
+ tester.clock.advance(Duration.ofHours(25));
+ tester.allNodesMakeAConfigRequestExcept(deadNodes);
+ tester.failer.run();
+ assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
}
// Throttles based on percentage in large zone
@@ -443,12 +457,6 @@ public class NodeFailerTest {
}
}
- /** Get all failed nodes that are not Docker containers */
- private static List<Node> getNonDockerFailedNodes(NodeRepository nodeRepository) {
- return nodeRepository.getNodes(Node.State.failed).stream()
- .filter(node -> node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
- .collect(Collectors.toList());
- }
/**
* Selects the first parent host that: