diff options
author | Martin Polden <mpolden@mpolden.no> | 2023-03-09 13:26:35 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2023-03-10 11:26:29 +0100 |
commit | f6702b43e47bfdc925b3d9187766cdd7d4f82bf9 (patch) | |
tree | efef95543cd301f4e511d9fb73d8d66901877aa5 /node-repository/src/main/java/com/yahoo/vespa | |
parent | 73465b058689a0c7ae5599083c4ae9f31557584b (diff) |
Do not hold application lock while replacing failing node
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 63 |
1 files changed, 36 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 84a45de39d7..afea08711fa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -182,27 +182,26 @@ public class NodeFailer extends NodeRepositoryMaintainer { * Called when a node should be moved to the failed state: Do that if it seems safe, * which is when the node repo has available capacity to replace the node (and all its tenant nodes if host). * Otherwise not replacing the node ensures (by Orchestrator check) that no further action will be taken. - * - * @return whether node was successfully failed */ - private boolean failActive(FailingNode failing) { + private void failActive(FailingNode failing) { Optional<Deployment> deployment = deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5)); - if (deployment.isEmpty()) return false; + if (deployment.isEmpty()) return; // If the active node that we are trying to fail is of type host, we need to successfully fail all // the children nodes running on it before we fail the host. Failing a child node in a dynamically // provisioned zone may require provisioning new hosts that require the host application lock to be held, // so we must release ours before failing the children. List<FailingNode> activeChildrenToFail = new ArrayList<>(); + boolean redeploy = false; try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) { // Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner))) - return false; + return; if (lock.node().state() == Node.State.failed) - return true; + return; if (!Objects.equals(failing.node().state(), lock.node().state())) - return false; + return; failing = new FailingNode(lock.node(), failing.reason); String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason(); @@ -216,36 +215,46 @@ public class NodeFailer extends NodeRepositoryMaintainer { if (activeChildrenToFail.isEmpty()) { log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason); - wantToFail(failing.node(), true, lock); - try { - deployment.get().activate(); - return true; - } catch (TransientException | UncheckedTimeoutException e) { - log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + - " with a transient error, will be retried by application maintainer: " + - Exceptions.toMessageString(e)); - return true; - } catch (RuntimeException e) { - // Reset want to fail: We'll retry failing unless it heals in the meantime - nodeRepository().nodes().node(failing.node().hostname()) - .ifPresent(n -> wantToFail(n, false, lock)); - log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + - " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); - return false; - } + markWantToFail(failing.node(), true, lock); + redeploy = true; } } + // Redeploy to replace failing node + if (redeploy) { + redeploy(deployment.get(), failing); + return; + } + // In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned, // so failActive() may take a long time to complete, but the remaining children should be fast. activeChildrenToFail.forEach(this::failActive); - return false; } - private void wantToFail(Node node, boolean wantToFail, Mutex lock) { - if (!node.status().wantToFail()) + private void redeploy(Deployment deployment, FailingNode failing) { + try { + deployment.activate(); + } catch (TransientException | UncheckedTimeoutException e) { + log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + + " with a transient error, will be retried by application maintainer: " + + Exceptions.toMessageString(e)); + } catch (RuntimeException e) { + // Reset want to fail: We'll retry failing unless it heals in the meantime + Optional<NodeMutex> optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node()); + if (optionalNodeMutex.isEmpty()) return; + try (var nodeMutex = optionalNodeMutex.get()) { + markWantToFail(nodeMutex.node(), false, nodeMutex); + log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + + " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); + } + } + } + + private void markWantToFail(Node node, boolean wantToFail, Mutex lock) { + if (node.status().wantToFail() != wantToFail) { nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); + } } /** Returns true if node failing should be throttled */ |