diff options
author | Harald Musum <musum@verizonmedia.com> | 2020-09-29 14:31:02 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-09-29 14:31:02 +0200 |
commit | cae8ba3bfc5573828a6e4a28d2616694b09cb616 (patch) | |
tree | 1b1e0592fd6648eb3a1cf7b2b41427121959bdfa | |
parent | e6fe148c64d16d57d61ec9bd7239e65081e5fdeb (diff) | |
parent | b5f7391d32057b5283eb28bfbd7536df94c21a3e (diff) |
Merge pull request #14617 from vespa-engine/mpolden/less-node-failer-locking
Take application lock once per run MERGEOK
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 52 |
1 files changed, 31 insertions, 21 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 77ef88f0952..2ce3fa6c0f6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.google.common.util.concurrent.UncheckedTimeoutException; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostLivenessTracker; @@ -187,27 +188,29 @@ public class NodeFailer extends NodeRepositoryMaintainer { * Otherwise we remove any "down" history record. */ private void updateNodeDownState() { - Map<String, Node> activeNodesByHostname = nodeRepository().getNodes(Node.State.active).stream() - .collect(Collectors.toMap(Node::hostname, node -> node)); - - serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName() - .forEach((hostName, serviceInstances) -> { - Node node = activeNodesByHostname.get(hostName.s()); - if (node == null) return; - try (var lock = nodeRepository().lock(node.allocation().get().owner())) { - Optional<Node> currentNode = nodeRepository().getNode(node.hostname(), Node.State.active); // re-get inside lock - if (currentNode.isEmpty()) return; // Node disappeared since acquiring lock - node = currentNode.get(); - if (badNode(serviceInstances)) { - recordAsDown(node, lock); - } else { - clearDownRecord(node, lock); - } - } - catch (UncheckedTimeoutException e) { - // Ignore - node may be locked on this round due to deployment - } - }); + NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active)); + serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { + Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first(); + if (node.isEmpty()) return; + + // Already correct record, nothing to do + boolean badNode = badNode(serviceInstances); + if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return; + + // Lock and update status + ApplicationId owner = node.get().allocation().get().owner(); + try (var lock = nodeRepository().lock(owner, Duration.ofSeconds(1))) { + node = getNode(hostname.toString(), owner, lock); // Re-get inside lock + if (node.isEmpty()) return; // Node disappeared or changed allocation + if (badNode) { + recordAsDown(node.get(), lock); + } else { + clearDownRecord(node.get(), lock); + } + } catch (UncheckedTimeoutException ignored) { + // Fine, we'll try updating this node in the next run + } + }); } private Map<Node, String> getActiveNodesByFailureReason(List<Node> activeNodes) { @@ -248,6 +251,13 @@ public class NodeFailer extends NodeRepositoryMaintainer { return reasonsToFailParentHost(hostNode).size() > 0; } + /** Get node by given hostname and application. The applicationLock must be held when calling this */ + private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) { + return nodeRepository().getNode(hostname, Node.State.active) + .filter(node -> node.allocation().isPresent()) + .filter(node -> node.allocation().get().owner().equals(application)); + } + private boolean expectConfigRequests(Node node) { return !node.type().isHost(); } |