From 8fcdfd80c411d38ebbc885b2280be02945bc5c9a Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Fri, 3 Nov 2017 11:05:59 +0100 Subject: Take docker nodes into throttling count when failing --- .../hosted/provision/maintenance/NodeFailer.java | 8 +-- .../provision/maintenance/NodeFailTester.java | 4 +- .../provision/maintenance/NodeFailerTest.java | 59 +++++++++------------- 3 files changed, 26 insertions(+), 45 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 1c81d97ddea..6aa990d05cc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; -import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.NodeType; import com.yahoo.transaction.Mutex; @@ -78,8 +77,6 @@ public class NodeFailer extends Maintainer { // Ready nodes updateNodeLivenessEventsForReadyNodes(); for (Node node : readyNodesWhichAreDead()) { - // Docker hosts and nodes do not run Vespa services - if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER || node.type() == NodeType.host) continue; if ( ! throttle(node)) nodeRepository().fail(node.hostname(), Agent.system, "Not receiving config requests from node"); } @@ -272,10 +269,7 @@ public class NodeFailer extends Maintainer { private boolean throttle(Node node) { if (throttlePolicy == ThrottlePolicy.disabled) return false; Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow); - List nodes = nodeRepository().getNodes().stream() - // Do not consider Docker containers when throttling - .filter(n -> n.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) - .collect(Collectors.toList()); + List nodes = nodeRepository().getNodes(); long recentlyFailedNodes = nodes.stream() .map(n -> n.history().event(History.Event.Type.failed)) .filter(Optional::isPresent) diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java index 6bcb9426373..0e0195a5bed 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java @@ -185,8 +185,8 @@ public class NodeFailTester { } public void allNodesMakeAConfigRequestExcept(List deadNodes) { - for (Node node : nodeRepository.getNodes(NodeType.tenant)) { - if ( ! deadNodes.contains(node) && node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) + for (Node node : nodeRepository.getNodes()) { + if ( ! deadNodes.contains(node)) hostLivenessTracker.receivedRequestFrom(node.hostname()); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index c78663415ef..afc298872c8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -5,13 +5,13 @@ import com.yahoo.config.provision.Flavor; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.ApplicationStateChangeDeniedException; import org.junit.Test; import java.time.Duration; import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -66,7 +66,7 @@ public class NodeFailerTest { assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); } - // Failures are detected on two ready nodes, which are then failed + // Hardware failures are detected on two ready nodes, which are then failed Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2); Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3); tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog")))); @@ -166,18 +166,16 @@ public class NodeFailerTest { tester.createReadyNodes(1, 16, "docker"); // For a day all nodes work so nothing happens - for (int minutes = 0; minutes < 24 * 60; minutes +=5 ) { - tester.clock.advance(Duration.ofMinutes(5)); + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); assertEquals( 5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); } List ready = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready); - List readyHosts = tester.nodeRepository.getNodes(NodeType.host, Node.State.ready); - // Two ready nodes die and a ready docker node "dies" - // (Vespa does not run when in ready state for docker node, so it does not make config requests) + // Two ready nodes and a ready docker node die, but only 2 of those are failed out tester.clock.advance(Duration.ofMinutes(180)); Node dockerNode = ready.stream().filter(node -> node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER).findFirst().get(); List otherNodes = ready.stream() @@ -188,18 +186,13 @@ public class NodeFailerTest { assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - // Another ready node die + // Another ready node dies and the node that died earlier, are allowed to fail tester.clock.advance(Duration.ofDays(1)); tester.allNodesMakeAConfigRequestExcept(otherNodes.get(0), otherNodes.get(2), dockerNode, otherNodes.get(3)); tester.failer.run(); - assertEquals( 2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); - assertEquals(ready.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0)); - assertEquals( 3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); - - // Ready Docker hosts do not make config requests - tester.allNodesMakeAConfigRequestExcept(readyHosts.get(0), readyHosts.get(1), readyHosts.get(2)); - tester.failer.run(); - assertEquals(3, tester.nodeRepository.getNodes(NodeType.host, Node.State.ready).size()); + assertEquals( 1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size()); + assertEquals(otherNodes.get(1), tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(0)); + assertEquals( 4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size()); } @Test @@ -207,8 +200,8 @@ public class NodeFailerTest { NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(7); // For a day all nodes work so nothing happens - for (int minutes = 0; minutes < 24 * 60; minutes += 5 ) { - tester.clock.advance(Duration.ofMinutes(5)); + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { + tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); tester.failer.run(); assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size()); @@ -247,10 +240,10 @@ public class NodeFailerTest { Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0); tester.serviceMonitor.setHostDown(downTenant1.hostname()); - // nothing happens the first 45 minutes - for (int minutes = 0; minutes < 45; minutes += 5 ) { + // nothing happens during the entire day because of the failure throttling + for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) { tester.failer.run(); - tester.clock.advance(Duration.ofMinutes(5)); + tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(); assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size()); } @@ -375,13 +368,13 @@ public class NodeFailerTest { // Throttles based on a absolute number in small zone { NodeFailTester tester = NodeFailTester.withNoApplications(); - List readyNodes = tester.createReadyNodes(50); - List readyDockerNodes = tester.createReadyNodes(50, 50, "docker"); + tester.createReadyNodes(50); + tester.createReadyNodes(50, 50, "docker"); + + List readyNodes = tester.nodeRepository.getNodes(); + Collections.shuffle(readyNodes); List deadNodes = readyNodes.subList(0, 4); - // Fail 10 Docker containers, should not impact throttling policy - readyDockerNodes.subList(0, 10) - .forEach(node -> tester.nodeRepository.fail(node.hostname(), Agent.system, "Failed in test")); // 2 hours pass, 4 nodes die for (int minutes = 0, interval = 30; minutes < 2 * 60; minutes += interval) { @@ -391,7 +384,7 @@ public class NodeFailerTest { // 2 nodes are failed (the minimum amount that are always allowed to fail) tester.failer.run(); - assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size()); + assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); // 6 more hours pass, no more nodes are failed for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { @@ -399,15 +392,15 @@ public class NodeFailerTest { tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(2, getNonDockerFailedNodes(tester.nodeRepository).size()); + assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); // 18 more hours pass, it's now 24 hours since the first 2 failed. The remaining 2 are failed - for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { + for (int minutes = 0, interval = 30; minutes <= 18 * 60; minutes += interval) { tester.clock.advance(Duration.ofMinutes(interval)); tester.allNodesMakeAConfigRequestExcept(deadNodes); } tester.failer.run(); - assertEquals(4, getNonDockerFailedNodes(tester.nodeRepository).size()); + assertEquals(4, tester.nodeRepository.getNodes(Node.State.failed).size()); } // Throttles based on percentage in large zone @@ -443,12 +436,6 @@ public class NodeFailerTest { } } - /** Get all failed nodes that are not Docker containers */ - private static List getNonDockerFailedNodes(NodeRepository nodeRepository) { - return nodeRepository.getNodes(Node.State.failed).stream() - .filter(node -> node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) - .collect(Collectors.toList()); - } /** * Selects the first parent host that: -- cgit v1.2.3