summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarald Musum <musum@verizonmedia.com>2020-09-29 14:31:02 +0200
committerGitHub <noreply@github.com>2020-09-29 14:31:02 +0200
commitcae8ba3bfc5573828a6e4a28d2616694b09cb616 (patch)
tree1b1e0592fd6648eb3a1cf7b2b41427121959bdfa
parente6fe148c64d16d57d61ec9bd7239e65081e5fdeb (diff)
parentb5f7391d32057b5283eb28bfbd7536df94c21a3e (diff)
Merge pull request #14617 from vespa-engine/mpolden/less-node-failer-locking
Take application lock once per run MERGEOK
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java52
1 files changed, 31 insertions, 21 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 77ef88f0952..2ce3fa6c0f6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.hosted.provision.maintenance;
import com.google.common.util.concurrent.UncheckedTimeoutException;
+import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
@@ -187,27 +188,29 @@ public class NodeFailer extends NodeRepositoryMaintainer {
* Otherwise we remove any "down" history record.
*/
private void updateNodeDownState() {
- Map<String, Node> activeNodesByHostname = nodeRepository().getNodes(Node.State.active).stream()
- .collect(Collectors.toMap(Node::hostname, node -> node));
-
- serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName()
- .forEach((hostName, serviceInstances) -> {
- Node node = activeNodesByHostname.get(hostName.s());
- if (node == null) return;
- try (var lock = nodeRepository().lock(node.allocation().get().owner())) {
- Optional<Node> currentNode = nodeRepository().getNode(node.hostname(), Node.State.active); // re-get inside lock
- if (currentNode.isEmpty()) return; // Node disappeared since acquiring lock
- node = currentNode.get();
- if (badNode(serviceInstances)) {
- recordAsDown(node, lock);
- } else {
- clearDownRecord(node, lock);
- }
- }
- catch (UncheckedTimeoutException e) {
- // Ignore - node may be locked on this round due to deployment
- }
- });
+ NodeList activeNodes = NodeList.copyOf(nodeRepository().getNodes(Node.State.active));
+ serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
+ Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first();
+ if (node.isEmpty()) return;
+
+ // Already correct record, nothing to do
+ boolean badNode = badNode(serviceInstances);
+ if (badNode == node.get().history().event(History.Event.Type.down).isPresent()) return;
+
+ // Lock and update status
+ ApplicationId owner = node.get().allocation().get().owner();
+ try (var lock = nodeRepository().lock(owner, Duration.ofSeconds(1))) {
+ node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
+ if (node.isEmpty()) return; // Node disappeared or changed allocation
+ if (badNode) {
+ recordAsDown(node.get(), lock);
+ } else {
+ clearDownRecord(node.get(), lock);
+ }
+ } catch (UncheckedTimeoutException ignored) {
+ // Fine, we'll try updating this node in the next run
+ }
+ });
}
private Map<Node, String> getActiveNodesByFailureReason(List<Node> activeNodes) {
@@ -248,6 +251,13 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return reasonsToFailParentHost(hostNode).size() > 0;
}
+ /** Get node by given hostname and application. The applicationLock must be held when calling this */
+ private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) {
+ return nodeRepository().getNode(hostname, Node.State.active)
+ .filter(node -> node.allocation().isPresent())
+ .filter(node -> node.allocation().get().owner().equals(application));
+ }
+
private boolean expectConfigRequests(Node node) {
return !node.type().isHost();
}