aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2021-04-12 10:16:25 +0200
committerJon Bratseth <bratseth@gmail.com>2021-04-12 10:16:25 +0200
commit034c06f6e47b59f5634f6faaa316995c115c83ed (patch)
treeef76aef674977db0db22fcd215915060e630a8f6 /node-repository
parent0d2381234ad4d0a18b08ce15efd550474249e657 (diff)
Node failing improvements
- Fail hosts that wants to fail and do not have active children - Clear want to fail on failing in case the nodes are later reactivated
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java25
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java20
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java2
5 files changed, 42 insertions, 9 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index de72024cb77..3dbafdc2aba 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -196,7 +196,7 @@ public final class Node implements Nodelike {
/**
* Returns a copy of this where wantToFail is set to true and history is updated to reflect this.
*/
- public Node withWantToFail(boolean wantToFail, Agent agent, String reason, Instant at) {
+ public Node withWantToFail(boolean wantToFail, Agent agent, Instant at) {
Node node = this.with(status.withWantToFail(wantToFail));
if (wantToFail)
node = node.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at)));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index ac6ecd98fac..04102c3c38e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -10,6 +10,7 @@ import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
@@ -110,6 +111,22 @@ public class NodeFailer extends NodeRepositoryMaintainer {
failActive(node, reason);
}
+ // Active hosts
+ NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
+ for (Node host : activeNodes.hosts().failing()) {
+ if ( ! activeNodes.childrenOf(host).isEmpty()) continue;
+ Optional<NodeMutex> locked = Optional.empty();
+ try {
+ locked = nodeRepository().nodes().lockAndGet(host);
+ if (locked.isEmpty()) continue;
+ nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer,
+ "Host should be failed and have no tenant nodes");
+ }
+ finally {
+ locked.ifPresent(NodeMutex::close);
+ }
+ }
+
int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures);
metric.set(throttlingActiveMetric, throttlingActive, null);
metric.set(throttledHostFailuresMetric, throttledHostFailures, null);
@@ -277,7 +294,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
if (! allTenantNodesFailedOutSuccessfully) return false;
- wantToFail(node, true, reason, lock);
+ wantToFail(node, true, lock);
try {
deployment.get().activate();
return true;
@@ -289,7 +306,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
} catch (RuntimeException e) {
// Reset want to fail: We'll retry failing unless it heals in the meantime
nodeRepository().nodes().node(node.hostname())
- .ifPresent(n -> wantToFail(n, false, "Could not fail", lock));
+ .ifPresent(n -> wantToFail(n, false, lock));
log.log(Level.WARNING, "Could not fail " + node + " for " + node.allocation().get().owner() +
" for " + reason + ": " + Exceptions.toMessageString(e));
return false;
@@ -297,8 +314,8 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
}
- private void wantToFail(Node node, boolean wantToFail, String reason, Mutex lock) {
- nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, reason, clock().instant()), lock);
+ private void wantToFail(Node node, boolean wantToFail, Mutex lock) {
+ nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
}
/** Returns true if node failing should be throttled */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index 9ac33643148..3e4221ecfcd 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -123,7 +123,7 @@ public class Nodes {
/** Adds a list of newly created reserved nodes to the node repository */
public List<Node> addReservedNodes(LockedNodeList nodes) {
for (Node node : nodes) {
- if ( ! node.flavor().getType().equals(Flavor.Type.DOCKER_CONTAINER))
+ if ( node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER)
illegal("Cannot add " + node + ": This is not a child node");
if (node.allocation().isEmpty())
illegal("Cannot add " + node + ": Child nodes need to be allocated");
@@ -257,7 +257,21 @@ public class Nodes {
* transaction commits.
*/
public List<Node> fail(List<Node> nodes, ApplicationTransaction transaction) {
- return db.writeTo(Node.State.failed, nodes, Agent.application, Optional.of("Failed by application"), transaction.nested());
+ return fail(nodes, Agent.application, "Failed by application", transaction.nested());
+ }
+
+ public List<Node> fail(List<Node> nodes, Agent agent, String reason) {
+ NestedTransaction transaction = new NestedTransaction();
+ nodes = fail(nodes, agent, reason, transaction);
+ transaction.commit();;
+ return nodes;
+ }
+
+ private List<Node> fail(List<Node> nodes, Agent agent, String reason, NestedTransaction transaction) {
+ nodes = nodes.stream()
+ .map(n -> n.withWantToFail(false, agent, clock.instant()))
+ .collect(Collectors.toList());
+ return db.writeTo(Node.State.failed, nodes, agent, Optional.of(reason), transaction);
}
/** Move nodes to the dirty state */
@@ -347,7 +361,7 @@ public class Nodes {
private Node failOrMark(Node node, Agent agent, String reason, Mutex lock) {
if (node.state() == Node.State.active) {
- node = node.withWantToFail(true, agent, reason, clock.instant());
+ node = node.withWantToFail(true, agent, clock.instant());
write(node, lock);
return node;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index 6ab8fc8ad49..8eca4ff2d95 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -206,7 +206,7 @@ public class NodePrioritizer {
/** Returns whether we are allocating to replace a failed node */
private boolean isReplacement(NodeList nodesInCluster) {
- int failedNodesInCluster = nodesInCluster.failing().size();
+ int failedNodesInCluster = nodesInCluster.failing().size() + nodesInCluster.state(Node.State.failed).size();
if (failedNodesInCluster == 0) return false;
return ! requestedNodes.fulfilledBy(nodesInCluster.size() - failedNodesInCluster);
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index bb954058916..0c48c6dfd83 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -475,6 +475,7 @@ public class NodeFailerTest {
assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size());
assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(6, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
+ assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size());
// Now lets fail an active tenant node
@@ -514,6 +515,7 @@ public class NodeFailerTest {
assertEquals(6, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size());
assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size());
+
// We have only 5 hosts remaining, so if we fail another host, we should only be able to redeploy app1's
// node, while app2's should remain
String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());