author     Håkon Hallingstad <hakon@verizonmedia.com>    2019-02-15 13:49:02 +0100
committer  Håkon Hallingstad <hakon@verizonmedia.com>    2019-02-15 13:49:02 +0100
commit     5df006545147994f43f40acea91a7baa1acd93df (patch)
tree       7158a6661ef9030baa6ac25d14c70c76deecb34e /node-repository
parent     0cfa248003df6099bb88821b119fee45791db016 (diff)
Fail instead of retire on failure report in NodeFailer
Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java      |  78
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java  | 117
2 files changed, 38 insertions(+), 157 deletions(-)
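In short, this commit removes the retire-and-rate-limit path for hosts with failure reports: instead of setting wantToRetire on the host and its children, a suspended node whose own host (or parent host) carries failure reports is now collected as a failure candidate and failed directly. The sketch below condenses that new check into self-contained Java; FailureReasonSketch, SimpleNode and their fields are simplified stand-ins for illustration, not the real node-repository classes.

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

// Illustrative condensation of the new NodeFailer decision. The real implementation
// lives in com.yahoo.vespa.hosted.provision.maintenance.NodeFailer and first checks
// hardwareFailureDescription() before falling back to the host's failure reports.
class FailureReasonSketch {

    /** Simplified stand-in for a node: hostname, optional parent, suspension flag, failure reports. */
    record SimpleNode(String hostname, Optional<String> parentHostname,
                      boolean suspended, List<String> failureReports) { }

    private final Map<String, SimpleNode> nodesByHostname = new HashMap<>();

    FailureReasonSketch(List<SimpleNode> allNodes) {
        allNodes.forEach(node -> nodesByHostname.put(node.hostname(), node));
    }

    /** Returns active nodes that should be failed, keyed to a human-readable reason. */
    Map<SimpleNode, String> nodesByFailureReason(List<SimpleNode> activeNodes) {
        Map<SimpleNode, String> reasons = new HashMap<>();
        for (SimpleNode node : activeNodes) {
            if (!node.suspended()) continue;  // only fail nodes that are allowed to be down
            // A child defers to its parent host; a host node acts as its own "host".
            SimpleNode host = node.parentHostname().map(nodesByHostname::get).orElse(node);
            List<String> reports = host.failureReports();
            if (reports.isEmpty()) continue;
            if (host.equals(node))
                reasons.put(node, "Host has failure reports: " + reports);
            else
                reasons.put(node, "Parent (" + host.hostname() + ") has failure reports: " + reports);
        }
        return reasons;
    }
}

Failing directly in this way removes the need for the application lock and the one-host-at-a-time rate limiting that the deleted retire path required, as the remainder of the diff shows.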
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index f6933f7f585..28e036169fa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -2,8 +2,6 @@
package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.cloud.config.ConfigserverConfig;
-import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.config.provision.ApplicationLockException;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
@@ -28,7 +26,6 @@ import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
-import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -134,28 +131,6 @@ public class NodeFailer extends Maintainer {
failActive(node, reason);
}
- // Retire active hosts and their children.
- activeNodes.stream()
- .filter(node -> failAllowedFor(node.type()))
- .filter(node -> !nodesWithFailureReason.contains(node))
- // Defer to parent host (it should also be active)
- .filter(node -> node.parentHostname().isEmpty())
- // This will sort those with wantToRetire first
- .sorted(Comparator.comparing(node -> node.status().wantToRetire(), Comparator.reverseOrder()))
- .filter(node -> {
- if (node.status().wantToRetire()) return true;
- if (node.allocation().map(a -> a.membership().retired()).orElse(false)) return true;
- List<String> reasons = reasonsToRetireActiveParentHost(node);
- if (reasons.size() > 0) {
- retireRecursively(node, reasons, activeNodes);
- return true;
- }
- return false;
- })
- // Only allow 1 active host to be wantToRetire at a time for rate limiting.
- .limit(1)
- .count();
-
metric.set(throttlingActiveMetric, Math.min( 1, throttledNodeFailures), null);
metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
}
@@ -235,8 +210,20 @@ public class NodeFailer extends Maintainer {
for (Node node : activeNodes) {
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
- } else if (node.status().hardwareFailureDescription().isPresent() && nodeSuspended(node)) {
- nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
+ } else if (nodeSuspended(node)) {
+ if (node.status().hardwareFailureDescription().isPresent()) {
+ nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
+ } else {
+ Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node);
+ List<String> failureReports = reasonsToRetireActiveParentHost(hostNode);
+ if (failureReports.size() > 0) {
+ if (hostNode.equals(node)) {
+ nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
+ } else {
+ nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
+ }
+ }
+ }
}
}
return nodesByFailureReason;
@@ -261,43 +248,6 @@ public class NodeFailer extends Maintainer {
reportId + " reported " + report.getCreatedTime() + ": " + report.getDescription());
}
- /**
- * There are reasons why this node should be parked, and we'd like to do it through retiring,
- * including any child nodes.
- */
- private void retireRecursively(Node node, List<String> reasons, List<Node> activeNodes) {
- if (activeNodes != null) {
- List<Node> childNodesToRetire = activeNodes.stream()
- .filter(n -> n.parentHostname().equals(Optional.of(node.hostname())))
- .collect(Collectors.toList());
- for (Node childNode : childNodesToRetire) {
- retireRecursively(childNode, reasons, null);
- }
- }
-
- if (node.status().wantToRetire()) return;
- retireActive(node.hostname(), node.allocation().get().owner(), reasons);
- }
-
- private void retireActive(String hostname, ApplicationId owner, List<String> reasons) {
- // Getting the application lock can take a very long time for the largest applications.
- // Don't bother waiting for too long since retries is automatic with maintainers.
- Duration lockWait = Duration.ofSeconds(10);
- try (Mutex lock = nodeRepository().lock(owner, lockWait)) {
- // Recheck all conditions in case anything has changed
- Optional<Node> node = nodeRepository().getNode(hostname);
- if (node.isEmpty()) return;
- if (node.get().state() != Node.State.active) return;
- if (!node.get().allocation().orElseThrow().owner().equals(owner)) return;
- if (node.get().status().wantToRetire()) return;
-
- log.info("Setting wantToRetire on " + node.get() + " due to these reports: " + reasons);
- nodeRepository().write(node.get().withWantToRetire(true, Agent.NodeFailer, clock.instant()));
- } catch (ApplicationLockException e) {
- log.warning("Failed to get lock on " + owner + " within " + lockWait + " to set wantToRetire, will retry later");
- }
- }
-
/** Returns whether node has any kind of hardware issue */
public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
if (node.status().hardwareFailureDescription().isPresent() || node.status().hardwareDivergence().isPresent()) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 3e022de6a5a..eaacf7c656b 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -53,9 +53,30 @@ public class NodeFailerTest {
tester.nodeRepository.write(updatedNode);
});
+ testNodeFailingWith(tester, hostWithHwFailure);
+ }
+
+ @Test
+ public void fail_nodes_with_severe_reports_if_allowed_to_be_down() {
+ NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6);
+ String hostWithFailureReports = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
+
+ // Set failure report to the parent and all its children.
+ Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
+ tester.nodeRepository.getNodes().stream()
+ .filter(node -> node.hostname().equals(hostWithFailureReports))
+ .forEach(node -> {
+ Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
+ tester.nodeRepository.write(updatedNode);
+ });
+
+ testNodeFailingWith(tester, hostWithFailureReports);
+ }
+
+ private void testNodeFailingWith(NodeFailTester tester, String hostWithHwFailure) {
// The host should have 2 nodes in active and 1 ready
Map<Node.State, List<String>> hostnamesByState = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
- .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
+ .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
assertEquals(2, hostnamesByState.get(Node.State.active).size());
assertEquals(1, hostnamesByState.get(Node.State.ready).size());
@@ -73,7 +94,7 @@ public class NodeFailerTest {
Arrays.asList(hostnamesByState.get(Node.State.active).get(0), hostnamesByState.get(Node.State.ready).get(0)));
expectedHostnamesByState1Iter.put(Node.State.active, hostnamesByState.get(Node.State.active).subList(1, 2));
Map<Node.State, List<String>> hostnamesByState1Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
- .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
+ .collect(Collectors.groupingBy(Node::state, Collectors.mapping(Node::hostname, Collectors.toList())));
assertEquals(expectedHostnamesByState1Iter, hostnamesByState1Iter);
// Suspend the second of the active nodes
@@ -85,7 +106,7 @@ public class NodeFailerTest {
// All of the children should be failed now
Set<Node.State> childStates2Iter = tester.nodeRepository.list().childrenOf(hostWithHwFailure).asList().stream()
- .map(Node::state).collect(Collectors.toSet());
+ .map(Node::state).collect(Collectors.toSet());
assertEquals(Collections.singleton(Node.State.failed), childStates2Iter);
// The host itself is still active as it too must be allowed to suspend
assertEquals(Node.State.active, tester.nodeRepository.getNode(hostWithHwFailure).get().state());
@@ -97,96 +118,6 @@ public class NodeFailerTest {
}
@Test
- public void set_want_to_retire_if_failure_report() {
- NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(6);
- String dockerHostWithFailureReport = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
-
- // Set failure report to the parent and all its children.
- Report badTotalMemorySizeReport = Report.basicReport("badTotalMemorySize", Instant.now(), "too low");
- tester.nodeRepository.getNodes().stream()
- .filter(node -> node.hostname().equals(dockerHostWithFailureReport))
- .forEach(node -> {
- Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
- tester.nodeRepository.write(updatedNode);
- });
-
- {
- // The host is active
- Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow();
- assertEquals(Node.State.active, parentNode.state());
- assertEquals(1, parentNode.reports().getReports().size());
- assertFalse(parentNode.status().wantToRetire());
-
- List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
- assertEquals(3, childNodes.size());
-
- // The 2 active child nodes
- List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList());
- assertEquals(2, activeChildNodes.size());
- assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire()));
-
- // The ready child node
- List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.ready).collect(Collectors.toList());
- assertEquals(1, failedChildNodes.size());
- assertTrue(activeChildNodes.stream().noneMatch(n -> n.status().wantToRetire()));
- }
-
- tester.failer.run();
-
- {
- // The host is active with wantToRetire
- Node parentNode = tester.nodeRepository.getNode(dockerHostWithFailureReport).orElseThrow();
- assertEquals(Node.State.active, parentNode.state());
- assertTrue(parentNode.status().wantToRetire());
-
- List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHostWithFailureReport).asList();
- assertEquals(3, childNodes.size());
-
- // The active nodes -> wantToRetire
- List<Node> activeChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.active).collect(Collectors.toList());
- assertEquals(2, activeChildNodes.size());
- assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
-
- // The ready node -> failed with wantToRetire
- List<Node> failedChildNodes = childNodes.stream().filter(n -> n.state() == Node.State.failed).collect(Collectors.toList());
- assertEquals(1, failedChildNodes.size());
- assertTrue(activeChildNodes.stream().allMatch(n -> n.status().wantToRetire()));
- }
-
- // Set wantToRetire on the second host. Rate limiting will keep it from becoming wantToRetire
-
- String dockerHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, dockerHostWithFailureReport);
- tester.nodeRepository.getNodes().stream()
- .filter(node -> node.hostname().equals(dockerHost2))
- .forEach(node -> {
- Node updatedNode = node.with(node.reports().withReport(badTotalMemorySizeReport));
- tester.nodeRepository.write(updatedNode);
- });
-
- {
- // dockerHost2 is active and with reports
- Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow();
- assertEquals(Node.State.active, parentNode.state());
- assertEquals(1, parentNode.reports().getReports().size());
- assertFalse(parentNode.status().wantToRetire());
-
- List<Node> childNodes = tester.nodeRepository.list().childrenOf(dockerHost2).asList();
- assertEquals(3, childNodes.size());
- }
-
- tester.clock.advance(Duration.ofHours(25));
- tester.allNodesMakeAConfigRequestExcept();
- tester.failer.run();
-
- {
- // dockerHost2 is active with wantToRetire
- Node parentNode = tester.nodeRepository.getNode(dockerHost2).orElseThrow();
- assertEquals(Node.State.active, parentNode.state());
- assertFalse(parentNode.status().wantToRetire());
- }
- }
-
- @Test
public void nodes_for_suspended_applications_are_not_failed() {
NodeFailTester tester = NodeFailTester.withTwoApplications();
tester.suspend(NodeFailTester.app1);