From 13b9f16b13a370ddaea0460d8d3721e5f2ef1c8f Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Thu, 19 Sep 2019 21:19:48 +0200 Subject: Transition from down to up initially - Use tri-state logic for working/failing/unknown - Be initially down in test and verify we come up --- .../com/yahoo/search/cluster/BaseNodeMonitor.java | 4 +- .../com/yahoo/search/cluster/ClusterMonitor.java | 21 ++++----- .../java/com/yahoo/search/cluster/NodeManager.java | 3 -- .../yahoo/search/cluster/TrafficNodeMonitor.java | 9 +++- .../com/yahoo/search/dispatch/InvokerFactory.java | 13 +++--- .../yahoo/search/dispatch/searchcluster/Group.java | 4 +- .../yahoo/search/dispatch/searchcluster/Node.java | 17 ++++---- .../dispatch/searchcluster/SearchCluster.java | 23 +++------- .../java/com/yahoo/search/result/ErrorMessage.java | 1 + .../cluster/test/ClusteredConnectionTestCase.java | 3 +- .../dispatch/searchcluster/SearchClusterTest.java | 51 ++++++++++++++-------- 11 files changed, 78 insertions(+), 71 deletions(-) (limited to 'container-search/src') diff --git a/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java b/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java index d21ef35bcc2..dd01d895963 100644 --- a/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java +++ b/container-search/src/main/java/com/yahoo/search/cluster/BaseNodeMonitor.java @@ -54,10 +54,12 @@ public abstract class BaseNodeMonitor { /** * Returns whether this node is currently in a state suitable - * for receiving traffic. As far as we know, that is + * for receiving traffic (default true) */ public boolean isWorking() { return isWorking; } + /** @deprecated Not used */ + @Deprecated // TODO: Remove on Vespa 8 public boolean isQuarantined() { return isQuarantined; } /** diff --git a/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java b/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java index b79f6f49c19..015adcf3490 100644 --- a/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java +++ b/container-search/src/main/java/com/yahoo/search/cluster/ClusterMonitor.java @@ -33,7 +33,7 @@ public class ClusterMonitor { private volatile boolean shutdown = false; /** A map from Node to corresponding MonitoredNode */ - private final Map> nodeMonitors = Collections.synchronizedMap(new java.util.LinkedHashMap<>()); + private final Map> nodeMonitors = Collections.synchronizedMap(new java.util.LinkedHashMap<>()); public ClusterMonitor(NodeManager manager) { nodeManager = manager; @@ -56,8 +56,7 @@ public class ClusterMonitor { * @param internal whether or not this node is internal to this cluster */ public void add(T node, boolean internal) { - BaseNodeMonitor monitor = new TrafficNodeMonitor<>(node, configuration, internal); - nodeMonitors.put(node, monitor); + nodeMonitors.put(node, new TrafficNodeMonitor<>(node, configuration, internal)); } /** @@ -69,24 +68,20 @@ public class ClusterMonitor { /** Called from ClusterSearcher/NodeManager when a node failed */ public synchronized void failed(T node, ErrorMessage error) { - nodeManager.statusIsKnown(node); - BaseNodeMonitor monitor = nodeMonitors.get(node); - boolean wasWorking = monitor.isWorking(); + TrafficNodeMonitor monitor = nodeMonitors.get(node); + Boolean wasWorking = monitor.isKnownWorking(); monitor.failed(error); - if (wasWorking && !monitor.isWorking()) { + if (wasWorking != monitor.isKnownWorking()) nodeManager.failed(node); - } } /** Called when a node responded */ public synchronized void responded(T node) { - nodeManager.statusIsKnown(node); - BaseNodeMonitor monitor = nodeMonitors.get(node); - boolean wasFailing =! monitor.isWorking(); + TrafficNodeMonitor monitor = nodeMonitors.get(node); + Boolean wasWorking = monitor.isKnownWorking(); monitor.responded(); - if (wasFailing && monitor.isWorking()) { + if (wasWorking != monitor.isKnownWorking()) nodeManager.working(monitor.getNode()); - } } /** diff --git a/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java b/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java index ef10680a4ae..9b20139e3c5 100644 --- a/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java +++ b/container-search/src/main/java/com/yahoo/search/cluster/NodeManager.java @@ -11,9 +11,6 @@ import java.util.concurrent.Executor; */ public interface NodeManager { - /** Called when we gain evidence about whether or not a node is working */ - default void statusIsKnown(T node) { } - /** Called when a failed node is working (ready for production) again */ void working(T node); diff --git a/container-search/src/main/java/com/yahoo/search/cluster/TrafficNodeMonitor.java b/container-search/src/main/java/com/yahoo/search/cluster/TrafficNodeMonitor.java index ea881ad8b48..e484811ab38 100644 --- a/container-search/src/main/java/com/yahoo/search/cluster/TrafficNodeMonitor.java +++ b/container-search/src/main/java/com/yahoo/search/cluster/TrafficNodeMonitor.java @@ -60,9 +60,14 @@ public class TrafficNodeMonitor extends BaseNodeMonitor { setWorking(true,"Responds correctly"); } + /** + * Returns whether this node is currently is a state suitable for receiving traffic, or null if not known + */ + public Boolean isKnownWorking() { return atStartUp ? null : isWorking; } + /** Thread-safely changes the state of this node if required */ - protected synchronized void setWorking(boolean working,String explanation) { - if (this.isWorking==working) return; // Old news + protected synchronized void setWorking(boolean working, String explanation) { + if (this.isWorking == working) return; // Old news if (explanation==null) { explanation=""; diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/InvokerFactory.java b/container-search/src/main/java/com/yahoo/search/dispatch/InvokerFactory.java index 31af74a39b2..78c50254a84 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/InvokerFactory.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/InvokerFactory.java @@ -48,21 +48,24 @@ public abstract class InvokerFactory { * @return Optional containing the SearchInvoker or empty if some node in the * list is invalid and the remaining coverage is not sufficient */ - public Optional createSearchInvoker(VespaBackEndSearcher searcher, Query query, OptionalInt groupId, List nodes, - boolean acceptIncompleteCoverage) { + public Optional createSearchInvoker(VespaBackEndSearcher searcher, + Query query, + OptionalInt groupId, + List nodes, + boolean acceptIncompleteCoverage) { List invokers = new ArrayList<>(nodes.size()); Set failed = null; for (Node node : nodes) { boolean nodeAdded = false; - if (node.isWorking()) { + if (node.isWorking() != Boolean.FALSE) { Optional invoker = createNodeSearchInvoker(searcher, query, node); - if(invoker.isPresent()) { + if (invoker.isPresent()) { invokers.add(invoker.get()); nodeAdded = true; } } - if (!nodeAdded) { + if ( ! nodeAdded) { if (failed == null) { failed = new HashSet<>(); } diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java index 93cbd77cef7..0e4e87b9a6a 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Group.java @@ -54,7 +54,7 @@ public class Group { public int workingNodes() { int nodesUp = 0; for (Node node : nodes) { - if (node.isWorking()) { + if (node.isWorking() == Boolean.TRUE) { nodesUp++; } } @@ -64,7 +64,7 @@ public class Group { void aggregateActiveDocuments() { long activeDocumentsInGroup = 0; for (Node node : nodes) { - if (node.isWorking()) { + if (node.isWorking() == Boolean.TRUE) { activeDocumentsInGroup += node.getActiveDocuments(); } } diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java index 0dd933fc24e..b47f2fefa5b 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/Node.java @@ -46,16 +46,15 @@ public class Node { /** Returns the id of this group this node belongs to */ public int group() { return group; } - /** Note that we know the status of this node */ - public void setStatusIsKnown() { statusIsKnown.lazySet(true); } - - /** Returns whether we know the status of this node */ - public boolean getStatusIsKnown() { return statusIsKnown.get(); } - - public void setWorking(boolean working) { this.working.lazySet(working); } + public void setWorking(boolean working) { + this.statusIsKnown.lazySet(true); + this.working.lazySet(working); + } - /** Returns whether this node is currently responding to requests */ - public boolean isWorking() { return working.get(); } + /** Returns whether this node is currently responding to requests, or null if status is not known */ + public Boolean isWorking() { + return statusIsKnown.get() ? working.get() : null; + } /** Updates the active documents on this node */ public void setActiveDocuments(long activeDocuments) { diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java index e2719545a6a..277252d14f0 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java @@ -95,13 +95,8 @@ public class SearchCluster implements NodeManager { this.pingFactory = pingFactory; for (var group : orderedGroups) { - for (var node : group.nodes()) { - // cluster monitor will only call working() when the - // node transitions from down to up, so we need to - // register the initial (working) state here: - working(node); + for (var node : group.nodes()) clusterMonitor.add(node, true); - } } } @@ -199,18 +194,12 @@ public class SearchCluster implements NodeManager { Group localSearchGroup = groups.get(localCorpusDispatchTarget.get().group()); if ( ! localSearchGroup.hasSufficientCoverage()) return Optional.empty(); - // Only use direct dispatch if the local search node is up - if ( ! localCorpusDispatchTarget.get().isWorking()) return Optional.empty(); + // Only use direct dispatch if the local search node is not down + if ( localCorpusDispatchTarget.get().isWorking() == Boolean.FALSE) return Optional.empty(); return localCorpusDispatchTarget; } - /** Called by the cluster monitor whenever we get information (positive or negative) about a node */ - @Override - public void statusIsKnown(Node node) { - node.setStatusIsKnown(); - } - /** Called by the cluster monitor when node state changes to working */ @Override public void working(Node node) { @@ -244,7 +233,7 @@ public class SearchCluster implements NodeManager { private void updateVipStatusOnCoverageChange(Group group, boolean sufficientCoverage) { if ( localCorpusDispatchTarget.isEmpty()) { // consider entire cluster - if (vipStatus.isInRotation() && sufficientCoverage) + if ( ! vipStatus.isInRotation() && sufficientCoverage) vipStatus.addToRotation(clusterId); } else if (usesLocalCorpusIn(group)) { // follow the status of this group @@ -260,11 +249,11 @@ public class SearchCluster implements NodeManager { } private boolean hasInformationAboutAllNodes() { - return nodesByHost.values().stream().allMatch(Node::getStatusIsKnown); + return nodesByHost.values().stream().allMatch(node -> node.isWorking() != null); } private boolean hasWorkingNodes() { - return nodesByHost.values().stream().anyMatch(Node::isWorking); + return nodesByHost.values().stream().anyMatch(node -> node.isWorking() != Boolean.FALSE ); } private boolean usesLocalCorpusIn(Node node) { diff --git a/container-search/src/main/java/com/yahoo/search/result/ErrorMessage.java b/container-search/src/main/java/com/yahoo/search/result/ErrorMessage.java index a58db5a542e..47221ee00f5 100644 --- a/container-search/src/main/java/com/yahoo/search/result/ErrorMessage.java +++ b/container-search/src/main/java/com/yahoo/search/result/ErrorMessage.java @@ -103,6 +103,7 @@ public class ErrorMessage extends com.yahoo.processing.request.ErrorMessage { /** Creates an error indicating that there was a general error communicating with a backend service. */ public static ErrorMessage createBackendCommunicationError(String detailedMessage) { + if (1==1) throw new RuntimeException(); return new ErrorMessage(BACKEND_COMMUNICATION_ERROR.code, "Backend communication error", detailedMessage); } diff --git a/container-search/src/test/java/com/yahoo/search/cluster/test/ClusteredConnectionTestCase.java b/container-search/src/test/java/com/yahoo/search/cluster/test/ClusteredConnectionTestCase.java index 84c10991293..79e96a7c5a2 100644 --- a/container-search/src/test/java/com/yahoo/search/cluster/test/ClusteredConnectionTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/cluster/test/ClusteredConnectionTestCase.java @@ -114,7 +114,8 @@ public class ClusteredConnectionTestCase { connection0.setInService(false); forcePing(myBackend); r=new Execution(myBackend, Execution.Context.createContextStub()).search(new SimpleQuery(0)); - assertEquals("No backends in service. Try later",r.hits().getError().getMessage()); + System.out.println(r.hits().getError().getDetailedMessage()); + assertEquals("No backends in service. Try later", r.hits().getError().getMessage()); connection2.setInService(true); connection1.setInService(true); diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java index bbaf512534a..df1049f499d 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java @@ -19,13 +19,16 @@ import java.util.concurrent.atomic.AtomicInteger; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +/** + * @author baldersheim + */ public class SearchClusterTest { static class State { final String clusterId; final int nodesPerGroup; final VipStatus vipStatus; - final SearchCluster sc; + final SearchCluster searchCluster; final List numDocsPerNode; List pingCounts; State(String clusterId, int nodesPergroup, String ... nodeNames) { @@ -35,9 +38,6 @@ public class SearchClusterTest { this.clusterId = clusterId; this.nodesPerGroup = nodesPergroup; vipStatus = new VipStatus(new QrSearchersConfig.Builder().searchcluster(new QrSearchersConfig.Searchcluster.Builder().name(clusterId)).build(), new ClustersStatus()); - assertFalse(vipStatus.isInRotation()); - vipStatus.addToRotation(clusterId); - assertTrue(vipStatus.isInRotation()); numDocsPerNode = new ArrayList<>(nodeNames.size()); pingCounts = new ArrayList<>(nodeNames.size()); List nodes = new ArrayList<>(nodeNames.size()); @@ -49,10 +49,10 @@ public class SearchClusterTest { numDocsPerNode.add(new AtomicInteger(1)); pingCounts.add(new AtomicInteger(0)); } - sc = new SearchCluster(clusterId, MockSearchCluster.createDispatchConfig(nodes),nodes.size() / nodesPergroup, vipStatus); + searchCluster = new SearchCluster(clusterId, MockSearchCluster.createDispatchConfig(nodes), nodes.size() / nodesPergroup, vipStatus); } void startMonitoring() { - sc.startClusterMonitoring(new Factory(nodesPerGroup, numDocsPerNode, pingCounts)); + searchCluster.startClusterMonitoring(new Factory(nodesPerGroup, numDocsPerNode, pingCounts)); } private static int getMaxValue(List list) { int max = list.get(0).get(); @@ -118,19 +118,25 @@ public class SearchClusterTest { } @Test - public void requireThatVipStatusIsDefaultUp() { + public void requireThatVipStatusIsDefaultDownButComesUpAfterPinging() { State test = new State("cluster.1", 2, "a", "b"); + assertTrue(test.searchCluster.localCorpusDispatchTarget().isEmpty()); + + assertFalse(test.vipStatus.isInRotation()); + test.startMonitoring(); + test.waitOneFullPingRound(); assertTrue(test.vipStatus.isInRotation()); - assertTrue(test.sc.localCorpusDispatchTarget().isEmpty()); } @Test public void requireThatZeroDocsAreFine() { State test = new State("cluster.1", 2,"a", "b"); + test.startMonitoring(); + test.waitOneFullPingRound(); + assertTrue(test.vipStatus.isInRotation()); - assertTrue(test.sc.localCorpusDispatchTarget().isEmpty()); + assertTrue(test.searchCluster.localCorpusDispatchTarget().isEmpty()); - test.startMonitoring(); test.numDocsPerNode.get(0).set(-1); test.numDocsPerNode.get(1).set(-1); test.waitOneFullPingRound(); @@ -141,19 +147,25 @@ public class SearchClusterTest { } @Test - public void requireThatVipStatusIsDefaultUpWithLocalDispatch() { + public void requireThatVipStatusIsDefaultDownWithLocalDispatch() { State test = new State("cluster.1", 1, HostName.getLocalhost(), "b"); + assertTrue(test.searchCluster.localCorpusDispatchTarget().isPresent()); + + assertFalse(test.vipStatus.isInRotation()); + test.startMonitoring(); + test.waitOneFullPingRound(); assertTrue(test.vipStatus.isInRotation()); - assertTrue(test.sc.localCorpusDispatchTarget().isPresent()); } @Test public void requireThatVipStatusDownWhenLocalIsDown() { State test = new State("cluster.1",1,HostName.getLocalhost(), "b"); - assertTrue(test.vipStatus.isInRotation()); - assertTrue(test.sc.localCorpusDispatchTarget().isPresent()); test.startMonitoring(); + test.waitOneFullPingRound(); + assertTrue(test.vipStatus.isInRotation()); + assertTrue(test.searchCluster.localCorpusDispatchTarget().isPresent()); + test.waitOneFullPingRound(); assertTrue(test.vipStatus.isInRotation()); test.numDocsPerNode.get(0).set(-1); @@ -183,10 +195,11 @@ public class SearchClusterTest { private void verifyThatVipStatusDownRequireAllNodesDown(int numGroups, int nodesPerGroup) { List nodeNames = generateNodeNames(numGroups, nodesPerGroup); State test = new State("cluster.1", nodesPerGroup, nodeNames); + test.startMonitoring(); + test.waitOneFullPingRound(); assertTrue(test.vipStatus.isInRotation()); - assertTrue(test.sc.localCorpusDispatchTarget().isEmpty()); + assertTrue(test.searchCluster.localCorpusDispatchTarget().isEmpty()); - test.startMonitoring(); test.waitOneFullPingRound(); assertTrue(test.vipStatus.isInRotation()); @@ -214,13 +227,15 @@ public class SearchClusterTest { } return nodeNames; } + private void verifyThatVipStatusUpRequireOnlyOneOnlineNode(int numGroups, int nodesPerGroup) { List nodeNames = generateNodeNames(numGroups, nodesPerGroup); State test = new State("cluster.1", nodesPerGroup, nodeNames); + test.startMonitoring(); + test.waitOneFullPingRound(); assertTrue(test.vipStatus.isInRotation()); - assertTrue(test.sc.localCorpusDispatchTarget().isEmpty()); + assertTrue(test.searchCluster.localCorpusDispatchTarget().isEmpty()); - test.startMonitoring(); for (int i=0; i < test.numDocsPerNode.size()-1; i++) { test.numDocsPerNode.get(i).set(-1); } -- cgit v1.2.3