summary | refs | log | tree | commit | diff | stats
path: root/container-search
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@verizonmedia.com> 2019-09-19 11:16:10 +0200
committer: Jon Bratseth <bratseth@verizonmedia.com> 2019-09-19 11:16:10 +0200
commit: 9bf4d074cb4c14a7bbd70c441a80cd68fbf88e92 (patch)
tree: c7e54d7ad39e48e8b71e0642f3112cd86877a533 /container-search
parent: a6a3377aa89aa520bfe3a61bb76eaf4d6636388a (diff)
Take symmetric vip actions. Only act on full info
Diffstat (limited to 'container-search')
-rw-r--r-- container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java 12
-rw-r--r-- container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java 4
-rw-r--r-- container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java 3
-rw-r--r-- container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java 12
-rw-r--r-- container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java 76
5 files changed, 70 insertions, 37 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java b/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java
index 59674d25402..d21ef35bcc2 100644
--- a/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java
+++ b/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java
@@ -26,25 +26,25 @@ public abstract class BaseNodeMonitor<T> {
/** The object representing the monitored node */
protected T node;
- protected boolean isWorking=true;
+ protected boolean isWorking = true;
/** Whether this node is quarantined for unstability */
- protected boolean isQuarantined=false;
+ protected boolean isQuarantined = false;
/** The last time this node failed, in ms */
- protected long failedAt=0;
+ protected long failedAt = 0;
/** The last time this node responded (failed or succeeded), in ms */
- protected long respondedAt=0;
+ protected long respondedAt = 0;
/** The last time this node responded successfully */
- protected long succeededAt=0;
+ protected long succeededAt = 0;
/** The configuration of this monitor */
protected MonitorConfiguration configuration;
/** Is the node we monitor part of an internal Vespa cluster or not */
- private boolean internal=false;
+ private boolean internal;
public BaseNodeMonitor(boolean internal) {
this.internal=internal;
diff --git a/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java b/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java
index ac0c8375f04..b79f6f49c19 100644
--- a/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java
+++ b/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java
@@ -24,7 +24,7 @@ public class ClusterMonitor<T> {
private MonitorConfiguration configuration = new MonitorConfiguration();
- private static Logger log=Logger.getLogger(ClusterMonitor.class.getName());
+ private static Logger log = Logger.getLogger(ClusterMonitor.class.getName());
private NodeManager<T> nodeManager;
@@ -69,6 +69,7 @@ public class ClusterMonitor<T> {
/** Called from ClusterSearcher/NodeManager when a node failed */
public synchronized void failed(T node, ErrorMessage error) {
+ nodeManager.statusIsKnown(node);
BaseNodeMonitor<T> monitor = nodeMonitors.get(node);
boolean wasWorking = monitor.isWorking();
monitor.failed(error);
@@ -79,6 +80,7 @@ public class ClusterMonitor<T> {
/** Called when a node responded */
public synchronized void responded(T node) {
+ nodeManager.statusIsKnown(node);
BaseNodeMonitor<T> monitor = nodeMonitors.get(node);
boolean wasFailing =! monitor.isWorking();
monitor.responded();
diff --git a/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java b/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java
index 9b20139e3c5..ef10680a4ae 100644
--- a/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java
+++ b/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java
@@ -11,6 +11,9 @@ import java.util.concurrent.Executor;
*/
public interface NodeManager<T> {
+ /** Called when we gain evidence about whether or not a node is working */
+ default void statusIsKnown(T node) { }
+
/** Called when a failed node is working (ready for production) again */
void working(T node);
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java
index a71ce0354f9..0dd933fc24e 100644
--- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java
+++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java
@@ -19,6 +19,7 @@ public class Node {
private final int fs4port;
final int group;
+ private final AtomicBoolean statusIsKnown = new AtomicBoolean(false);
private final AtomicBoolean working = new AtomicBoolean(true);
private final AtomicLong activeDocuments = new AtomicLong(0);
@@ -45,9 +46,13 @@ public class Node {
/** Returns the id of this group this node belongs to */
public int group() { return group; }
- public void setWorking(boolean working) {
- this.working.lazySet(working);
- }
+ /** Note that we know the status of this node */
+ public void setStatusIsKnown() { statusIsKnown.lazySet(true); }
+
+ /** Returns whether we know the status of this node */
+ public boolean getStatusIsKnown() { return statusIsKnown.get(); }
+
+ public void setWorking(boolean working) { this.working.lazySet(working); }
/** Returns whether this node is currently responding to requests */
public boolean isWorking() { return working.get(); }
@@ -77,4 +82,5 @@ public class Node {
@Override
public String toString() { return "search node " + hostname + ":" + fs4port + " in group " + group; }
+
}
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
index fed4ffb1f08..37ccd340f8a 100644
--- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
+++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
@@ -205,30 +205,52 @@ public class SearchCluster implements NodeManager<Node> {
return localCorpusDispatchTarget;
}
- /** Used by the cluster monitor to manage node status */
+ /** Called by the cluster monitor whenever we get information (positive or negative) about a node */
+ @Override
+ public void statusIsKnown(Node node) {
+ node.setStatusIsKnown();
+ }
+
+ /** Called by the cluster monitor when node state changes to working */
@Override
public void working(Node node) {
node.setWorking(true);
-
- if (usesLocalCorpusIn(node))
- vipStatus.addToRotation(clusterId);
+ updateVipStatusOnNodeChange(node, true);
}
- /** Used by the cluster monitor to manage node status */
+ /** Called by the cluster monitor when node state changes to failed */
@Override
public void failed(Node node) {
node.setWorking(false);
-
- if (usesLocalCorpusIn(node))
- vipStatus.removeFromRotation(clusterId);
+ updateVipStatusOnNodeChange(node, false); // not working: a failed local corpus node must leave rotation, as before
}
private void updateSufficientCoverage(Group group, boolean sufficientCoverage) {
+ if (sufficientCoverage == group.hasSufficientCoverage()) return; // no change
+
group.setHasSufficientCoverage(sufficientCoverage);
+ updateVipStatusOnCoverageChange(group, sufficientCoverage);
+ }
- boolean isInRotation = vipStatus.isInRotation();
- boolean hasChanged = sufficientCoverage != group.hasSufficientCoverage();
+ /**
+ * Updates the VIP rotation status of this cluster after the working state of a node changed.
+ *
+ * If the node is the local corpus dispatch target, rotation follows that node directly:
+ * the cluster is added to rotation when working is true and removed when it is false.
+ * Otherwise the working argument is not consulted: nothing happens until the status of every
+ * node is known, and then the cluster is in rotation exactly when at least one working node
+ * has active documents.
+ *
+ * @param node the node whose state changed
+ * @param working the new state of the node, true if it is working and false if it failed
+ */
+ private void updateVipStatusOnNodeChange(Node node, boolean working) {
+ if (usesLocalCorpusIn(node)) { // follow the status of the local corpus
+ if (working)
+ vipStatus.addToRotation(clusterId);
+ else
+ vipStatus.removeFromRotation(clusterId);
+ }
+ else {
+ // Only act on full information: wait until every node has reported a status
+ if ( ! hasInformationAboutAllNodes()) return;
+ if (hasWorkingNodesWithDocumentsOnline())
+ vipStatus.addToRotation(clusterId);
+ else
+ vipStatus.removeFromRotation(clusterId);
+ }
+ }
+
+ private void updateVipStatusOnCoverageChange(Group group, boolean sufficientCoverage) {
+ boolean isInRotation = vipStatus.isInRotation();
if (usesLocalCorpusIn(group)) { // follow the status of the local corpus
if (sufficientCoverage)
vipStatus.addToRotation(clusterId);
@@ -241,6 +263,14 @@ public class SearchCluster implements NodeManager<Node> {
}
}
+ /** Returns true if the status of every node in nodesByHost has been marked as known (also true when there are no nodes) */
+ private boolean hasInformationAboutAllNodes() {
+ return nodesByHost.values().stream().allMatch(Node::getStatusIsKnown);
+ }
+
+ /** Returns true if at least one node in nodesByHost is working and reports more than zero active documents */
+ private boolean hasWorkingNodesWithDocumentsOnline() {
+ return nodesByHost.values().stream().anyMatch(node -> node.isWorking() && node.getActiveDocuments() > 0);
+ }
+
private boolean usesLocalCorpusIn(Node node) {
return localCorpusDispatchTarget.isPresent() && localCorpusDispatchTarget.get().equals(node);
}
@@ -252,8 +282,8 @@ public class SearchCluster implements NodeManager<Node> {
/** Used by the cluster monitor to manage node status */
@Override
public void ping(Node node, Executor executor) {
- if (pingFactory == null) // not initialized yet
- return;
+ if (pingFactory == null) return; // not initialized yet
+
FutureTask<Pong> futurePong = new FutureTask<>(pingFactory.createPinger(node, clusterMonitor));
executor.execute(futurePong);
Pong pong = getPong(futurePong, node);
@@ -276,9 +306,10 @@ public class SearchCluster implements NodeManager<Node> {
// group will always be marked sufficient for use.
updateSufficientCoverage(group, true);
boolean fullCoverage = isGroupCoverageSufficient(group.workingNodes(), group.nodes().size(), group.getActiveDocuments(),
- group.getActiveDocuments());
+ group.getActiveDocuments());
trackGroupCoverageChanges(0, group, fullCoverage, group.getActiveDocuments());
}
+
private void pingIterationCompletedMultipleGroups() {
int numGroups = orderedGroups.size();
// Update active documents per group and use it to decide if the group should be active
@@ -303,13 +334,7 @@ public class SearchCluster implements NodeManager<Node> {
trackGroupCoverageChanges(i, group, sufficientCoverage, averageDocumentsInOtherGroups);
}
}
- private boolean areAllNodesDownInAllgroups() {
- for(int i = 0; i < groups.size(); i++) {
- Group group = orderedGroups.get(i);
- if (group.workingNodes() > 0) return false;
- }
- return true;
- }
+
/**
* Update statistics after a round of issuing pings.
* Note that this doesn't wait for pings to return, so it will typically accumulate data from
@@ -323,9 +348,6 @@ public class SearchCluster implements NodeManager<Node> {
} else {
pingIterationCompletedMultipleGroups();
}
- if ( areAllNodesDownInAllgroups() ) {
- vipStatus.removeFromRotation(clusterId);
- }
}
private boolean isGroupCoverageSufficient(int workingNodes, int nodesInGroup, long activeDocuments, long averageDocumentsInOtherGroups) {
@@ -401,11 +423,11 @@ public class SearchCluster implements NodeManager<Node> {
if (changed) {
int requiredNodes = groupSize() - dispatchConfig.maxNodesDownPerGroup();
if (fullCoverage) {
- log.info(() -> String.format("Group %d is now good again (%d/%d active docs, coverage %d/%d)", index,
- group.getActiveDocuments(), averageDocuments, group.workingNodes(), groupSize()));
+ log.info(() -> String.format("Group %d is now good again (%d/%d active docs, coverage %d/%d)",
+ index, group.getActiveDocuments(), averageDocuments, group.workingNodes(), groupSize()));
} else {
- log.warning(() -> String.format("Coverage of group %d is only %d/%d (requires %d)", index, group.workingNodes(), groupSize(),
- requiredNodes));
+ log.warning(() -> String.format("Coverage of group %d is only %d/%d (requires %d)",
+ index, group.workingNodes(), groupSize(), requiredNodes));
}
}
}