From d4c44d4a41d477182630ca0e8821c2e466adf06b Mon Sep 17 00:00:00 2001 From: Arne H Juul Date: Fri, 3 Feb 2017 15:07:02 +0100 Subject: assume node down until proven otherwise * change initializer values in NodeMonitor to assume the node is down and has no search nodes online until we see some data from it. Also, update vip status as soon as we have some nodes attached. This is to avoid the container believing it should be in service at startup and then changing its mind when it discovers that underlying services are down. --- .../com/yahoo/prelude/cluster/ClusterMonitor.java | 1 + .../com/yahoo/prelude/cluster/NodeMonitor.java | 27 +++++++++------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'container-search') diff --git a/container-search/src/main/java/com/yahoo/prelude/cluster/ClusterMonitor.java b/container-search/src/main/java/com/yahoo/prelude/cluster/ClusterMonitor.java index 871ecc37ea5..0312db914df 100644 --- a/container-search/src/main/java/com/yahoo/prelude/cluster/ClusterMonitor.java +++ b/container-search/src/main/java/com/yahoo/prelude/cluster/ClusterMonitor.java @@ -67,6 +67,7 @@ public class ClusterMonitor implements Runnable, Freezable { if (isFrozen()) throw new IllegalStateException("Can not add new nodes after ClusterMonitor has been frozen."); nodeMonitors.put(node, new NodeMonitor(node)); + updateVipStatus(); } /** Called from ClusterSearcher/NodeManager when a node failed */ diff --git a/container-search/src/main/java/com/yahoo/prelude/cluster/NodeMonitor.java b/container-search/src/main/java/com/yahoo/prelude/cluster/NodeMonitor.java index c06b7fe04ba..ab52bab08a0 100644 --- a/container-search/src/main/java/com/yahoo/prelude/cluster/NodeMonitor.java +++ b/container-search/src/main/java/com/yahoo/prelude/cluster/NodeMonitor.java @@ -15,6 +15,7 @@ import com.yahoo.search.result.ErrorMessage; * * * @author bratseth @@ -27,13 +28,13 @@ public class NodeMonitor { /** The object representing the monitored node */ private final VespaBackEndSearcher node; - private boolean isWorking = true; + private boolean isWorking = false; /** The last time this node responded successfully */ private long succeededAt = 0; /** Whether it is assumed the node has documents available to serve */ - private boolean searchNodesOnline = true; + private boolean searchNodesOnline = false; /** * Creates a new node monitor for a node @@ -66,7 +67,8 @@ public class NodeMonitor { long respondedAt = System.currentTimeMillis(); if (error.getCode() == BACKEND_COMMUNICATION_ERROR.code - || error.getCode() == NO_ANSWER_WHEN_PINGING_NODE.code) { + || error.getCode() == NO_ANSWER_WHEN_PINGING_NODE.code) + { // Only count not being able to talk to backend at all // as errors we care about if ((respondedAt - succeededAt) > 10000) { @@ -83,26 +85,19 @@ public class NodeMonitor { public void responded(boolean searchNodesOnline) { succeededAt = System.currentTimeMillis(); this.searchNodesOnline = searchNodesOnline; - atStartUp = false; - - if ( ! isWorking) + if (! isWorking) setWorking(true, "Responds correctly"); + atStartUp = false; } /** Changes the state of this node if required */ private void setWorking(boolean working, String explanation) { if (isWorking == working) return; // Old news - String explanationToLog; - if (explanation == null) - explanationToLog = ""; - else - explanationToLog = ": " + explanation; - - if (working) - log.info("Putting " + node + " in service" + explanationToLog); - else if ( ! atStartUp) - log.info("Taking " + node + " out of service" + explanationToLog); + if (working && ! atStartUp) + log.info("Putting " + node + " in service:" + explanation); + else if (! atStartUp) + log.info("Taking " + node + " out of service:" + explanation); isWorking = working; } -- cgit v1.2.3