From 9bf4d074cb4c14a7bbd70c441a80cd68fbf88e92 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Thu, 19 Sep 2019 11:16:10 +0200 Subject: Take symmetric vip actions. Only act on full info --- .../com/yahoo/search/cluster/BaseNodeMonitor.java | 12 ++-- .../com/yahoo/search/cluster/ClusterMonitor.java | 4 +- .../java/com/yahoo/search/cluster/NodeManager.java | 3 + .../yahoo/search/dispatch/searchcluster/Node.java | 12 +++- .../dispatch/searchcluster/SearchCluster.java | 76 ++++++++++++++-------- 5 files changed, 70 insertions(+), 37 deletions(-) (limited to 'container-search') diff --git a/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java b/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java index 59674d25402..d21ef35bcc2 100644 --- a/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java +++ b/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java @@ -26,25 +26,25 @@ public abstract class BaseNodeMonitor { /** The object representing the monitored node */ protected T node; - protected boolean isWorking=true; + protected boolean isWorking = true; /** Whether this node is quarantined for unstability */ - protected boolean isQuarantined=false; + protected boolean isQuarantined = false; /** The last time this node failed, in ms */ - protected long failedAt=0; + protected long failedAt = 0; /** The last time this node responded (failed or succeeded), in ms */ - protected long respondedAt=0; + protected long respondedAt = 0; /** The last time this node responded successfully */ - protected long succeededAt=0; + protected long succeededAt = 0; /** The configuration of this monitor */ protected MonitorConfiguration configuration; /** Is the node we monitor part of an internal Vespa cluster or not */ - private boolean internal=false; + private boolean internal; public BaseNodeMonitor(boolean internal) { this.internal=internal; diff --git 
a/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java b/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java index ac0c8375f04..b79f6f49c19 100644 --- a/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java +++ b/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java @@ -24,7 +24,7 @@ public class ClusterMonitor { private MonitorConfiguration configuration = new MonitorConfiguration(); - private static Logger log=Logger.getLogger(ClusterMonitor.class.getName()); + private static Logger log = Logger.getLogger(ClusterMonitor.class.getName()); private NodeManager nodeManager; @@ -69,6 +69,7 @@ public class ClusterMonitor { /** Called from ClusterSearcher/NodeManager when a node failed */ public synchronized void failed(T node, ErrorMessage error) { + nodeManager.statusIsKnown(node); BaseNodeMonitor monitor = nodeMonitors.get(node); boolean wasWorking = monitor.isWorking(); monitor.failed(error); @@ -79,6 +80,7 @@ public class ClusterMonitor { /** Called when a node responded */ public synchronized void responded(T node) { + nodeManager.statusIsKnown(node); BaseNodeMonitor monitor = nodeMonitors.get(node); boolean wasFailing =! 
monitor.isWorking(); monitor.responded(); diff --git a/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java b/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java index 9b20139e3c5..ef10680a4ae 100644 --- a/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java +++ b/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java @@ -11,6 +11,9 @@ import java.util.concurrent.Executor; */ public interface NodeManager { + /** Called when we gain evidence about whether or not a node is working */ + default void statusIsKnown(T node) { } + /** Called when a failed node is working (ready for production) again */ void working(T node); diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java index a71ce0354f9..0dd933fc24e 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java @@ -19,6 +19,7 @@ public class Node { private final int fs4port; final int group; + private final AtomicBoolean statusIsKnown = new AtomicBoolean(false); private final AtomicBoolean working = new AtomicBoolean(true); private final AtomicLong activeDocuments = new AtomicLong(0); @@ -45,9 +46,13 @@ public class Node { /** Returns the id of this group this node belongs to */ public int group() { return group; } - public void setWorking(boolean working) { - this.working.lazySet(working); - } + /** Note that we know the status of this node */ + public void setStatusIsKnown() { statusIsKnown.lazySet(true); } + + /** Returns whether we know the status of this node */ + public boolean getStatusIsKnown() { return statusIsKnown.get(); } + + public void setWorking(boolean working) { this.working.lazySet(working); } /** Returns whether this node is currently responding to requests */ public boolean 
isWorking() { return working.get(); } @@ -77,4 +82,5 @@ public class Node { @Override public String toString() { return "search node " + hostname + ":" + fs4port + " in group " + group; } + } diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java index fed4ffb1f08..37ccd340f8a 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java @@ -205,30 +205,52 @@ public class SearchCluster implements NodeManager { return localCorpusDispatchTarget; } - /** Used by the cluster monitor to manage node status */ + /** Called by the cluster monitor whenever we get information (positive or negative) about a node */ + @Override + public void statusIsKnown(Node node) { + node.setStatusIsKnown(); + } + + /** Called by the cluster monitor when node state changes to working */ @Override public void working(Node node) { node.setWorking(true); - - if (usesLocalCorpusIn(node)) - vipStatus.addToRotation(clusterId); + updateVipStatusOnNodeChange(node, true); } - /** Used by the cluster monitor to manage node status */ + /** Called by the cluster monitor when node state changes to failed */ @Override public void failed(Node node) { node.setWorking(false); - - if (usesLocalCorpusIn(node)) - vipStatus.removeFromRotation(clusterId); + updateVipStatusOnNodeChange(node, false); } private void updateSufficientCoverage(Group group, boolean sufficientCoverage) { + if (sufficientCoverage == group.hasSufficientCoverage()) return; // no change + group.setHasSufficientCoverage(sufficientCoverage); + updateVipStatusOnCoverageChange(group, sufficientCoverage); + } - boolean isInRotation = vipStatus.isInRotation(); - boolean hasChanged = sufficientCoverage != group.hasSufficientCoverage(); + private void 
updateVipStatusOnNodeChange(Node node, boolean working) { + if (usesLocalCorpusIn(node)) { // follow the status of the local corpus + if (working) + vipStatus.addToRotation(clusterId); + else + vipStatus.removeFromRotation(clusterId); + } + else { + if ( ! hasInformationAboutAllNodes()) return; + if (hasWorkingNodesWithDocumentsOnline()) + vipStatus.addToRotation(clusterId); + else + vipStatus.removeFromRotation(clusterId); + } + } + + private void updateVipStatusOnCoverageChange(Group group, boolean sufficientCoverage) { + boolean isInRotation = vipStatus.isInRotation(); if (usesLocalCorpusIn(group)) { // follow the status of the local corpus if (sufficientCoverage) vipStatus.addToRotation(clusterId); @@ -241,6 +263,14 @@ public class SearchCluster implements NodeManager { } } + private boolean hasInformationAboutAllNodes() { + return nodesByHost.values().stream().allMatch(Node::getStatusIsKnown); + } + + private boolean hasWorkingNodesWithDocumentsOnline() { + return nodesByHost.values().stream().anyMatch(node -> node.isWorking() && node.getActiveDocuments() > 0); + } + private boolean usesLocalCorpusIn(Node node) { return localCorpusDispatchTarget.isPresent() && localCorpusDispatchTarget.get().equals(node); } @@ -252,8 +282,8 @@ public class SearchCluster implements NodeManager { /** Used by the cluster monitor to manage node status */ @Override public void ping(Node node, Executor executor) { - if (pingFactory == null) // not initialized yet - return; + if (pingFactory == null) return; // not initialized yet + FutureTask futurePong = new FutureTask<>(pingFactory.createPinger(node, clusterMonitor)); executor.execute(futurePong); Pong pong = getPong(futurePong, node); @@ -276,9 +306,10 @@ public class SearchCluster implements NodeManager { // group will always be marked sufficient for use. 
updateSufficientCoverage(group, true); boolean fullCoverage = isGroupCoverageSufficient(group.workingNodes(), group.nodes().size(), group.getActiveDocuments(), - group.getActiveDocuments()); + group.getActiveDocuments()); trackGroupCoverageChanges(0, group, fullCoverage, group.getActiveDocuments()); } + private void pingIterationCompletedMultipleGroups() { int numGroups = orderedGroups.size(); // Update active documents per group and use it to decide if the group should be active @@ -303,13 +334,7 @@ public class SearchCluster implements NodeManager { trackGroupCoverageChanges(i, group, sufficientCoverage, averageDocumentsInOtherGroups); } } - private boolean areAllNodesDownInAllgroups() { - for(int i = 0; i < groups.size(); i++) { - Group group = orderedGroups.get(i); - if (group.workingNodes() > 0) return false; - } - return true; - } + /** * Update statistics after a round of issuing pings. * Note that this doesn't wait for pings to return, so it will typically accumulate data from @@ -323,9 +348,6 @@ public class SearchCluster implements NodeManager { } else { pingIterationCompletedMultipleGroups(); } - if ( areAllNodesDownInAllgroups() ) { - vipStatus.removeFromRotation(clusterId); - } } private boolean isGroupCoverageSufficient(int workingNodes, int nodesInGroup, long activeDocuments, long averageDocumentsInOtherGroups) { @@ -401,11 +423,11 @@ public class SearchCluster implements NodeManager { if (changed) { int requiredNodes = groupSize() - dispatchConfig.maxNodesDownPerGroup(); if (fullCoverage) { - log.info(() -> String.format("Group %d is now good again (%d/%d active docs, coverage %d/%d)", index, - group.getActiveDocuments(), averageDocuments, group.workingNodes(), groupSize())); + log.info(() -> String.format("Group %d is now good again (%d/%d active docs, coverage %d/%d)", + index, group.getActiveDocuments(), averageDocuments, group.workingNodes(), groupSize())); } else { - log.warning(() -> String.format("Coverage of group %d is only %d/%d (requires 
%d)", index, group.workingNodes(), groupSize(), - requiredNodes)); + log.warning(() -> String.format("Coverage of group %d is only %d/%d (requires %d)", + index, group.workingNodes(), groupSize(), requiredNodes)); } } } -- cgit v1.2.3