diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-04-12 10:16:25 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-04-12 10:16:25 +0200 |
commit | 034c06f6e47b59f5634f6faaa316995c115c83ed (patch) | |
tree | ef76aef674977db0db22fcd215915060e630a8f6 | |
parent | 0d2381234ad4d0a18b08ce15efd550474249e657 (diff) |
Node failing improvements
- Fail hosts that want to fail and do not have active children
- Clear wantToFail on failing, in case the nodes are later reactivated
5 files changed, 42 insertions, 9 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index de72024cb77..3dbafdc2aba 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -196,7 +196,7 @@ public final class Node implements Nodelike { /** * Returns a copy of this where wantToFail is set to true and history is updated to reflect this. */ - public Node withWantToFail(boolean wantToFail, Agent agent, String reason, Instant at) { + public Node withWantToFail(boolean wantToFail, Agent agent, Instant at) { Node node = this.with(status.withWantToFail(wantToFail)); if (wantToFail) node = node.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at))); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index ac6ecd98fac..04102c3c38e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -10,6 +10,7 @@ import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; @@ -110,6 +111,22 @@ public class NodeFailer extends NodeRepositoryMaintainer { failActive(node, reason); } + // Active hosts + NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); + for (Node host : activeNodes.hosts().failing()) { + if ( ! 
activeNodes.childrenOf(host).isEmpty()) continue; + Optional<NodeMutex> locked = Optional.empty(); + try { + locked = nodeRepository().nodes().lockAndGet(host); + if (locked.isEmpty()) continue; + nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer, + "Host should be failed and have no tenant nodes"); + } + finally { + locked.ifPresent(NodeMutex::close); + } + } + int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures); metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); @@ -277,7 +294,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { } if (! allTenantNodesFailedOutSuccessfully) return false; - wantToFail(node, true, reason, lock); + wantToFail(node, true, lock); try { deployment.get().activate(); return true; @@ -289,7 +306,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { } catch (RuntimeException e) { // Reset want to fail: We'll retry failing unless it heals in the meantime nodeRepository().nodes().node(node.hostname()) - .ifPresent(n -> wantToFail(n, false, "Could not fail", lock)); + .ifPresent(n -> wantToFail(n, false, lock)); log.log(Level.WARNING, "Could not fail " + node + " for " + node.allocation().get().owner() + " for " + reason + ": " + Exceptions.toMessageString(e)); return false; @@ -297,8 +314,8 @@ public class NodeFailer extends NodeRepositoryMaintainer { } } - private void wantToFail(Node node, boolean wantToFail, String reason, Mutex lock) { - nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, reason, clock().instant()), lock); + private void wantToFail(Node node, boolean wantToFail, Mutex lock) { + nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); } /** Returns true if node failing should be throttled */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java 
b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 9ac33643148..3e4221ecfcd 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -123,7 +123,7 @@ public class Nodes { /** Adds a list of newly created reserved nodes to the node repository */ public List<Node> addReservedNodes(LockedNodeList nodes) { for (Node node : nodes) { - if ( ! node.flavor().getType().equals(Flavor.Type.DOCKER_CONTAINER)) + if ( node.flavor().getType() != Flavor.Type.DOCKER_CONTAINER) illegal("Cannot add " + node + ": This is not a child node"); if (node.allocation().isEmpty()) illegal("Cannot add " + node + ": Child nodes need to be allocated"); @@ -257,7 +257,21 @@ public class Nodes { * transaction commits. */ public List<Node> fail(List<Node> nodes, ApplicationTransaction transaction) { - return db.writeTo(Node.State.failed, nodes, Agent.application, Optional.of("Failed by application"), transaction.nested()); + return fail(nodes, Agent.application, "Failed by application", transaction.nested()); + } + + public List<Node> fail(List<Node> nodes, Agent agent, String reason) { + NestedTransaction transaction = new NestedTransaction(); + nodes = fail(nodes, agent, reason, transaction); + transaction.commit();; + return nodes; + } + + private List<Node> fail(List<Node> nodes, Agent agent, String reason, NestedTransaction transaction) { + nodes = nodes.stream() + .map(n -> n.withWantToFail(false, agent, clock.instant())) + .collect(Collectors.toList()); + return db.writeTo(Node.State.failed, nodes, agent, Optional.of(reason), transaction); } /** Move nodes to the dirty state */ @@ -347,7 +361,7 @@ public class Nodes { private Node failOrMark(Node node, Agent agent, String reason, Mutex lock) { if (node.state() == Node.State.active) { - node = node.withWantToFail(true, agent, reason, clock.instant()); + node = node.withWantToFail(true, 
agent, clock.instant()); write(node, lock); return node; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java index 6ab8fc8ad49..8eca4ff2d95 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java @@ -206,7 +206,7 @@ public class NodePrioritizer { /** Returns whether we are allocating to replace a failed node */ private boolean isReplacement(NodeList nodesInCluster) { - int failedNodesInCluster = nodesInCluster.failing().size(); + int failedNodesInCluster = nodesInCluster.failing().size() + nodesInCluster.state(Node.State.failed).size(); if (failedNodesInCluster == 0) return false; return ! requestedNodes.fulfilledBy(nodesInCluster.size() - failedNodesInCluster); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index bb954058916..0c48c6dfd83 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -475,6 +475,7 @@ public class NodeFailerTest { assertEquals(8, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.tenant).size()); assertEquals(10, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(6, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); + assertEquals(1, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.host).size()); // Now lets fail an active tenant node @@ -514,6 +515,7 @@ public class NodeFailerTest { 
assertEquals(6, tester.nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.tenant).size()); assertEquals(5, tester.nodeRepository.nodes().list(Node.State.active).nodeType(NodeType.host).size()); + // We have only 5 hosts remaining, so if we fail another host, we should only be able to redeploy app1's // node, while app2's should remain String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get()); |