summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2022-04-22 20:56:12 +0200
committerGitHub <noreply@github.com>2022-04-22 20:56:12 +0200
commita623fb063f36fae897757e110ba428a8cfb30472 (patch)
treedae9df67d4b5a032f7428653efef9830aa8d53c0
parent2b0050d8edb0df787e8c0f605e30e6707d476f0c (diff)
parentd64f143362a87c8f8bf31f594897cd7ce87f6bd6 (diff)
Merge pull request #22228 from vespa-engine/bratseth/read-nodes-less
Read nodes less
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java53
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java11
2 files changed, 33 insertions, 31 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 3274f12dbc6..1b0c7602f82 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.Zone;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
@@ -66,45 +67,49 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
@Override
protected double maintain() {
- List<Node> remainingNodes = new ArrayList<>(nodeRepository.nodes().list(Node.State.failed)
- .nodeType(NodeType.tenant, NodeType.host)
- .asList());
+ NodeList allNodes = nodeRepository.nodes().list();
+ List<Node> remainingNodes = new ArrayList<>(allNodes.state(Node.State.failed)
+ .nodeType(NodeType.tenant, NodeType.host)
+ .asList());
- recycleIf(remainingNodes, node -> node.allocation().isEmpty());
- recycleIf(remainingNodes, node ->
- !node.allocation().get().membership().cluster().isStateful() &&
- node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statelessExpiry)));
- recycleIf(remainingNodes, node ->
- node.allocation().get().membership().cluster().isStateful() &&
- node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statefulExpiry)));
+ recycleIf(remainingNodes,
+ node -> node.allocation().isEmpty(),
+ allNodes);
+ recycleIf(remainingNodes,
+ node -> !node.allocation().get().membership().cluster().isStateful() &&
+ node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statelessExpiry)),
+ allNodes);
+ recycleIf(remainingNodes,
+ node -> node.allocation().get().membership().cluster().isStateful() &&
+ node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statefulExpiry)),
+ allNodes);
return 1.0;
}
/** Recycle the nodes matching condition, and remove those nodes from the nodes list. */
- private void recycleIf(List<Node> nodes, Predicate<Node> recycleCondition) {
- List<Node> nodesToRecycle = nodes.stream().filter(recycleCondition).collect(Collectors.toList());
- nodes.removeAll(nodesToRecycle);
- recycle(nodesToRecycle);
+ private void recycleIf(List<Node> failedNodes, Predicate<Node> recycleCondition, NodeList allNodes) {
+ List<Node> nodesToRecycle = failedNodes.stream().filter(recycleCondition).collect(Collectors.toList());
+ failedNodes.removeAll(nodesToRecycle);
+ recycle(nodesToRecycle, allNodes);
}
/** Move eligible nodes to dirty or parked. This may be a subset of the given nodes */
- private void recycle(List<Node> nodes) {
+ private void recycle(List<Node> nodes, NodeList allNodes) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node candidate : nodes) {
- if (broken(candidate)) {
+ if (broken(candidate, allNodes)) {
List<String> unparkedChildren = !candidate.type().isHost() ? List.of() :
- nodeRepository.nodes().list()
- .childrenOf(candidate)
- .not().state(Node.State.parked)
- .mapToList(Node::hostname);
+ allNodes.childrenOf(candidate)
+ .not().state(Node.State.parked)
+ .mapToList(Node::hostname);
if (unparkedChildren.isEmpty()) {
nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer,
"Parked by FailedExpirer due to hardware issue or high fail count");
} else {
log.info(String.format("Expired failed node %s with hardware issue was not parked because of " +
- "unparked children: %s", candidate.hostname(),
- String.join(", ", unparkedChildren)));
+ "unparked children: %s",
+ candidate.hostname(), String.join(", ", unparkedChildren)));
}
} else {
nodesToRecycle.add(candidate);
@@ -114,8 +119,8 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
}
/** Returns whether node is broken and cannot be recycled */
- private boolean broken(Node node) {
- return NodeFailer.hasHardwareIssue(node, nodeRepository) ||
+ private boolean broken(Node node, NodeList allNodes) {
+ return NodeFailer.hasHardwareIssue(node, allNodes) ||
(node.type().isHost() && node.status().failCount() >= maxAllowedFailures);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 237cbaedf46..3c5b20da4d0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -164,7 +164,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
for (Node node : activeNodes) {
if (allSuspended(node, activeNodes)) {
- Node host = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
+ Node host = node.parentHostname().flatMap(parent -> activeNodes.node(parent)).orElse(node);
if (host.type().isHost()) {
List<String> failureReports = reasonsToFailHost(host);
if ( ! failureReports.isEmpty()) {
@@ -188,8 +188,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
/** Returns whether node has any kind of hardware issue */
- static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
- Node host = node.parentHostname().flatMap(parent -> nodeRepository.nodes().node(parent)).orElse(node);
+ static boolean hasHardwareIssue(Node node, NodeList allNodes) {
+ Node host = node.parentHostname().flatMap(parent -> allNodes.node(parent)).orElse(node);
return reasonsToFailHost(host).size() > 0;
}
@@ -207,10 +207,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
private boolean allSuspended(Node node, NodeList activeNodes) {
if (!nodeRepository().nodes().suspended(node)) return false;
if (node.parentHostname().isPresent()) return true; // optimization
- return activeNodes.stream()
- .filter(childNode -> childNode.parentHostname().isPresent() &&
- childNode.parentHostname().get().equals(node.hostname()))
- .allMatch(nodeRepository().nodes()::suspended);
+ return activeNodes.childrenOf(node.hostname()).stream().allMatch(nodeRepository().nodes()::suspended);
}
/**