aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2022-01-25 13:27:22 +0100
committerJon Bratseth <bratseth@gmail.com>2022-01-25 13:27:22 +0100
commit9bfff8de475fb7cb8671430fa6883f26d1fe61cf (patch)
tree1b515f5ccf98daf094a31f98e32e444106336de1 /node-repository/src/main/java
parent141924ab31eea1a979608958358c56e7640518c0 (diff)
Increase down grace time while nodes are suspended
Diffstat (limited to 'node-repository/src/main/java')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java70
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java2
3 files changed, 44 insertions, 30 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index ca14a1be4c4..7cf806eb16c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -201,7 +201,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context);
metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context);
- metric.set("failReport", NodeFailer.reasonsToFailParentHost(node).isEmpty() ? 0 : 1, context);
+ metric.set("failReport", NodeFailer.reasonsToFailHost(node).isEmpty() ? 0 : 1, context);
if (node.type().isHost()) {
metric.set("wantToEncrypt", node.reports().getReport("wantToEncrypt").isPresent() ? 1 : 0, context);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index cb5db4e1d76..8ff35d26efd 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -22,10 +22,11 @@ import com.yahoo.yolean.Exceptions;
import java.time.Duration;
import java.time.Instant;
-import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Optional;
+import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
@@ -40,7 +41,6 @@ import java.util.stream.Collectors;
public class NodeFailer extends NodeRepositoryMaintainer {
private static final Logger log = Logger.getLogger(NodeFailer.class.getName());
- private static final Duration nodeRequestInterval = Duration.ofMinutes(10);
/** Metric for number of hosts that we want to fail, but cannot due to throttling */
static final String throttledHostFailuresMetric = "throttledHostFailures";
@@ -53,8 +53,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
private final Deployer deployer;
private final Duration downTimeLimit;
+ private final Duration suspendedDownTimeLimit;
private final Orchestrator orchestrator;
- private final Instant constructionTime;
private final ThrottlePolicy throttlePolicy;
private final Metric metric;
@@ -65,8 +65,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
super(nodeRepository, min(downTimeLimit.dividedBy(2), interval), metric);
this.deployer = deployer;
this.downTimeLimit = downTimeLimit;
+ this.suspendedDownTimeLimit = downTimeLimit.multipliedBy(4); // Allow more downtime when a node is suspended
this.orchestrator = orchestrator;
- this.constructionTime = nodeRepository.clock().instant();
this.throttlePolicy = throttlePolicy;
this.metric = metric;
}
@@ -139,11 +139,11 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return asSuccessFactor(attempts, failures);
}
- private List<FailingNode> findReadyFailingNodes() {
- List<FailingNode> failingNodes = new ArrayList<>();
+ private Collection<FailingNode> findReadyFailingNodes() {
+ Set<FailingNode> failingNodes = new HashSet<>();
for (Node node : nodeRepository().nodes().list(Node.State.ready)) {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
- List<String> failureReports = reasonsToFailParentHost(hostNode);
+ List<String> failureReports = reasonsToFailHost(hostNode);
if (failureReports.size() > 0) {
if (hostNode.equals(node)) {
failingNodes.add(new FailingNode(node, "Host has failure reports: " + failureReports));
@@ -155,35 +155,38 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return failingNodes;
}
- private List<FailingNode> findActiveFailingNodes() {
+ private Collection<FailingNode> findActiveFailingNodes() {
+ Set<FailingNode> failingNodes = new HashSet<>();
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
- Instant graceTimeEnd = clock().instant().minus(downTimeLimit);
- List<FailingNode> failingNodes = new ArrayList<>();
+
for (Node node : activeNodes) {
- if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
+ Instant graceTimeStart = clock().instant().minus(suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
+ if (node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
// Allow a grace period after node re-activation
- if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd))
+ if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
}
- else if (hostSuspended(node, activeNodes)) {
- Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
- if (hostNode.type().isHost()) {
- List<String> failureReports = reasonsToFailParentHost(hostNode);
- if (failureReports.size() > 0) {
- if (hostNode.equals(node)) {
- failingNodes.add(new FailingNode(node, "Host has failure reports: " + failureReports));
- } else {
- failingNodes.add(new FailingNode(node, "Parent (" + hostNode + ") has failure reports: " + failureReports));
- }
+ }
+
+ for (Node node : activeNodes) {
+ if (allSuspended(node, activeNodes)) {
+ Node host = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
+ if (host.type().isHost()) {
+ List<String> failureReports = reasonsToFailHost(host);
+ if ( ! failureReports.isEmpty()) {
+ failingNodes.add(new FailingNode(node, host.equals(node) ?
+ "Host has failure reports: " + failureReports :
+ "Parent " + host + " has failure reports: " + failureReports));
}
}
}
}
+
return failingNodes;
}
- public static List<String> reasonsToFailParentHost(Node hostNode) {
- return hostNode.reports().getReports().stream()
+ public static List<String> reasonsToFailHost(Node host) {
+ return host.reports().getReports().stream()
.filter(report -> report.getType().hostShouldBeFailed())
// The generated string is built from the report's ID, created time, and description only.
.map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription())
@@ -192,8 +195,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
/** Returns whether node has any kind of hardware issue */
static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
- Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.nodes().node(parent)).orElse(node);
- return reasonsToFailParentHost(hostNode).size() > 0;
+ Node host = node.parentHostname().flatMap(parent -> nodeRepository.nodes().node(parent)).orElse(node);
+ return reasonsToFailHost(host).size() > 0;
}
private boolean applicationSuspended(Node node) {
@@ -216,7 +219,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
/** Is the node and all active children suspended? */
- private boolean hostSuspended(Node node, NodeList activeNodes) {
+ private boolean allSuspended(Node node, NodeList activeNodes) {
if (!suspended(node)) return false;
if (node.parentHostname().isPresent()) return true; // optimization
return activeNodes.stream()
@@ -359,6 +362,17 @@ public class NodeFailer extends NodeRepositoryMaintainer {
public Node node() { return node; }
public String reason() { return reason; }
+ @Override
+ public boolean equals(Object other) {
+ if ( ! (other instanceof FailingNode)) return false;
+ return ((FailingNode)other).node().equals(this.node());
+ }
+
+ @Override
+ public int hashCode() {
+ return node.hashCode();
+ }
+
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index 4d4456920e7..4c96e58d954 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -474,7 +474,7 @@ public class Nodes {
if (node.state() == Node.State.ready) return node;
Node parentHost = node.parentHostname().flatMap(this::node).orElse(node);
- List<String> failureReasons = NodeFailer.reasonsToFailParentHost(parentHost);
+ List<String> failureReasons = NodeFailer.reasonsToFailHost(parentHost);
if ( ! failureReasons.isEmpty())
illegal(node + " cannot be readied because it has hard failures: " + failureReasons);