author     Valerij Fredriksen <valerijf@oath.com>   2017-11-03 11:05:59 +0100
committer  Valerij Fredriksen <valerijf@oath.com>   2017-11-03 11:05:59 +0100
commit     8fcdfd80c411d38ebbc885b2280be02945bc5c9a (patch)
tree       545b1f860c442dc47de52bf6011af684b0f6f429 /node-repository
parent     bf3d8f80a0612092c8ece9717d5c73697be5b9d0 (diff)
Take docker nodes into throttling count when failing
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java      |  8
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java   |  4
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java   | 59
3 files changed, 26 insertions, 45 deletions
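
At a glance, this commit removes two Docker-specific exclusions in NodeFailer: dead ready nodes are now failed regardless of flavor or node type, and the throttle calculation counts every node in the repository instead of filtering out Docker containers. The following is a minimal, self-contained sketch of the second point, counting recent failures over all nodes inside the throttle window; the class and field names (ThrottleCountSketch, SimpleNode, THROTTLE_WINDOW) are stand-ins, not the real node-repository types, and the 24-hour window is an assumption consistent with the tests below.

    import java.time.Duration;
    import java.time.Instant;
    import java.util.List;
    import java.util.Optional;

    class ThrottleCountSketch {

        // Hypothetical node record: hostname plus the time of its last "failed" event, if any.
        record SimpleNode(String hostname, Optional<Instant> failedAt) {}

        // Assumed window length; the real value comes from the throttle policy.
        static final Duration THROTTLE_WINDOW = Duration.ofHours(24);

        // After this commit every node, Docker container or not, contributes to the count.
        static long recentlyFailed(List<SimpleNode> allNodes, Instant now) {
            Instant startOfWindow = now.minus(THROTTLE_WINDOW);
            return allNodes.stream()
                    .map(SimpleNode::failedAt)
                    .filter(Optional::isPresent)
                    .map(Optional::get)
                    .filter(failedAt -> failedAt.isAfter(startOfWindow))
                    .count();
        }

        public static void main(String[] args) {
            Instant now = Instant.now();
            List<SimpleNode> nodes = List.of(
                    new SimpleNode("tenant1", Optional.of(now.minus(Duration.ofHours(2)))),
                    new SimpleNode("docker1", Optional.of(now.minus(Duration.ofHours(3)))), // now counted too
                    new SimpleNode("tenant2", Optional.empty()));
            System.out.println(recentlyFailed(nodes, now)); // prints 2
        }
    }
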
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 1c81d97ddea..6aa990d05cc 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
-import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
import com.yahoo.transaction.Mutex;
@@ -78,8 +77,6 @@ public class NodeFailer extends Maintainer {
// Ready nodes
updateNodeLivenessEventsForReadyNodes();
for (Node node : readyNodesWhichAreDead()) {
- // Docker hosts and nodes do not run Vespa services
- if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER || node.type() == NodeType.host) continue;
if ( ! throttle(node)) nodeRepository().fail(node.hostname(),
Agent.system, "Not receiving config requests from node");
}
@@ -272,10 +269,7 @@ public class NodeFailer extends Maintainer {
private boolean throttle(Node node) {
if (throttlePolicy == ThrottlePolicy.disabled) return false;
Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
- List<Node> nodes = nodeRepository().getNodes().stream()
- // Do not consider Docker containers when throttling
- .filter(n -> n.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
- .collect(Collectors.toList());
+ List<Node> nodes = nodeRepository().getNodes();
long recentlyFailedNodes = nodes.stream()
.map(n -> n.history().event(History.Event.Type.failed))
.filter(Optional::isPresent)
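
The hunk above ends right after the failure count is taken, so the comparison against the policy is not shown. Judging from the test comments further down (a fixed minimum that is always allowed to fail, plus throttling based on an absolute number in small zones and on a percentage in large zones), the policy appears to allow failures up to the larger of a fixed minimum and a fraction of the zone's nodes. A hedged sketch of that decision, now taken over the full node list; the names and parameters are assumptions, not the actual ThrottlePolicy API:

    // Sketch only: recent failures are allowed up to the larger of a fixed minimum
    // and a fraction of all nodes -- with this commit, Docker containers included.
    class ThrottleDecisionSketch {
        static boolean throttled(long recentlyFailedNodes, int totalNodes,
                                 double fractionAllowedToFail, int minimumAllowedToFail) {
            long allowed = Math.max(minimumAllowedToFail,
                                    (long) (totalNodes * fractionAllowedToFail));
            return recentlyFailedNodes >= allowed;
        }

        public static void main(String[] args) {
            // 100 nodes, a 2% fraction and a floor of 2: the first two failures pass, the third is held back.
            System.out.println(throttled(1, 100, 0.02, 2)); // false -> failing allowed
            System.out.println(throttled(2, 100, 0.02, 2)); // true  -> further failing throttled
        }
    }

The caller pattern is unchanged from the hunk above: NodeFailer only calls nodeRepository().fail(...) when throttle(node) returns false.
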
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 6bcb9426373..0e0195a5bed 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -185,8 +185,8 @@ public class NodeFailTester {
}
public void allNodesMakeAConfigRequestExcept(List<Node> deadNodes) {
- for (Node node : nodeRepository.getNodes(NodeType.tenant)) {
- if ( ! deadNodes.contains(node) && node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
+ for (Node node : nodeRepository.getNodes()) {
+ if ( ! deadNodes.contains(node))
hostLivenessTracker.receivedRequestFrom(node.hostname());
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index c78663415ef..afc298872c8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -5,13 +5,13 @@ import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
-import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException;
import org.junit.Test;
import java.time.Duration;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -66,7 +66,7 @@ public class NodeFailerTest {
assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
- // Failures are detected on two ready nodes, which are then failed
+ // Hardware failures are detected on two ready nodes, which are then failed
Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2);
Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3);
tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog"))));
@@ -166,18 +166,16 @@ public class NodeFailerTest {
tester.createReadyNodes(1, 16, "docker");
// For a day all nodes work so nothing happens
- for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) {
- tester.clock.advance(Duration.ofMinutes(5));
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
+ tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
}
List<Node> ready = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready);
- List<Node> readyHosts = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready);
- // Two ready nodes die and a ready docker node "dies"
- // (Vespa does not run when in ready state for docker node, so it does not make config requests)
+ // Two ready nodes and a ready docker node die, but only 2 of those are failed out
tester.clock.advance(Duration.ofMinutes(180));
Node dockerNode = ready.stream().filter(node -> node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER).findFirst().get();
List<Node> otherNodes = ready.stream()
@@ -188,18 +186,13 @@ public class NodeFailerTest {
assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
- // Another ready node die
+ // Another ready node dies; it and the node that died earlier are now allowed to fail
tester.clock.advance(Duration.ofDays(1));
tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode, otherNodes.get(3));
tester.failer.run();
- assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
- assertEquals(ready.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0));
- assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
-
- // Ready Docker hosts do not make config requests
- tester.allNodesMakeAConfigRequestExcept(readyHosts.get(0), readyHosts.get(1), readyHosts.get(2));
- tester.failer.run();
- assertEquals(3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size());
+ assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
+ assertEquals(otherNodes.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0));
+ assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
}
@Test
@@ -207,8 +200,8 @@ public class NodeFailerTest {
NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(7);
// For a day all nodes work so nothing happens
- for (int minutes = 0; minutes < 24 * 60; minutes += 5 ) {
- tester.clock.advance(Duration.ofMinutes(5));
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
+ tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
tester.failer.run();
assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
@@ -247,10 +240,10 @@ public class NodeFailerTest {
Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0);
tester.serviceMonitor.setHostDown(downTenant1.hostname());
- // nothing happens the first 45 minutes
- for (int minutes = 0; minutes < 45; minutes += 5 ) {
+ // nothing happens during the entire day because of the failure throttling
+ for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
tester.failer.run();
- tester.clock.advance(Duration.ofMinutes(5));
+ tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept();
assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size());
}
@@ -375,13 +368,13 @@ public class NodeFailerTest {
// Throttles based on an absolute number in small zone
{
NodeFailTester tester = NodeFailTester.withNoApplications();
- List<Node> readyNodes = tester.createReadyNodes(50);
- List<Node> readyDockerNodes = tester.createReadyNodes(50, 50, "docker");
+ tester.createReadyNodes(50);
+ tester.createReadyNodes(50, 50, "docker");
+
+ List<Node> readyNodes = tester.nodeRepository.getNodes();
+ Collections.shuffle(readyNodes);
List<Node> deadNodes = readyNodes.subList(0, 4);
- // Fail 10 Docker containers, should not impact throttling policy
- readyDockerNodes.subList(0, 10)
- .forEach(node -> tester.nodeRepository.fail(node.hostname(), Agent.system, "Failed in test"));
// 2 hours pass, 4 nodes die
for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) {
@@ -391,7 +384,7 @@ public class NodeFailerTest {
// 2 nodes are failed (the minimum number that is always allowed to fail)
tester.failer.run();
- assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size());
+ assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
// 6 more hours pass, no more nodes are failed
for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
@@ -399,15 +392,15 @@ public class NodeFailerTest {
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size());
+ assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
// 18 more hours pass, it's now 24 hours since the first 2 failed. The remaining 2 are failed
- for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
+ for (int minutes = 0, interval = 30; minutes <= 18 * 60; minutes += interval) {
tester.clock.advance(Duration.ofMinutes(interval));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
}
tester.failer.run();
- assertEquals(4, getNonDockerFailedNodes(tester.nodeRepository).size());
+ assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size());
}
// Throttles based on percentage in large zone
@@ -443,12 +436,6 @@ public class NodeFailerTest {
}
}
- /** Get all failed nodes that are not Docker containers */
- private static List<Node> getNonDockerFailedNodes(NodeRepository nodeRepository) {
- return nodeRepository.getNodes(Node.State.failed).stream()
- .filter(node -> node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
- .collect(Collectors.toList());
- }
/**
* Selects the first parent host that: