From 712ad877d53849772f29b6962a5cb261131e3668 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Wed, 15 Apr 2020 10:15:25 +0000 Subject: Introduce top-k-probability and use it to fetch correct proper amount of hits from each partition --- container-search/pom.xml | 5 ++++ .../search/dispatch/InterleavedSearchInvoker.java | 2 +- .../dispatch/searchcluster/SearchCluster.java | 29 ++++++++++++++++++++++ .../dispatch/searchcluster/SearchClusterTest.java | 12 +++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) (limited to 'container-search') diff --git a/container-search/pom.xml b/container-search/pom.xml index 84ee5b2bc65..6fa32947869 100644 --- a/container-search/pom.xml +++ b/container-search/pom.xml @@ -131,6 +131,11 @@ protobuf-java compile + + org.apache.commons + commons-math3 + compile + javax.xml.bind jaxb-api diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java b/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java index cec3e94d551..bae1eb03e5f 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java @@ -81,7 +81,7 @@ public class InterleavedSearchInvoker extends SearchInvoker implements ResponseM int originalHits = query.getHits(); int originalOffset = query.getOffset(); - query.setHits(query.getHits() + query.getOffset()); + query.setHits(searchCluster.estimateHitsToFetch(query.getHits() + query.getOffset(), invokers.size())); query.setOffset(0); for (SearchInvoker invoker : invokers) { diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java index 7862648ba51..e94cd085a1a 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java @@ -11,6 +11,7 @@ import com.yahoo.prelude.Pong; import com.yahoo.search.cluster.ClusterMonitor; import com.yahoo.search.cluster.NodeManager; import com.yahoo.vespa.config.search.DispatchConfig; +import org.apache.commons.math3.distribution.TDistribution; import java.util.LinkedHashMap; import java.util.List; @@ -38,8 +39,27 @@ public class SearchCluster implements NodeManager { private final ImmutableList orderedGroups; private final VipStatus vipStatus; private final PingFactory pingFactory; + private final TopKEstimator hitEstimator; private long nextLogTime = 0; + static class TopKEstimator { + private final TDistribution studentT; + private final double p; + + TopKEstimator(double freedom, double wantedprobability) { + this.studentT = new TDistribution(null, freedom); + p = wantedprobability; + } + double estimateExactK(double k, double n) { + double variance = k * 1/n * (1 - 1/n); + double p_inverse = 1 - (1 - p)/n; + return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance); + } + int estimateK(double k, double n) { + return (int)Math.ceil(estimateExactK(k, n)); + } + } + /** * A search node on this local machine having the entire corpus, which we therefore * should prefer to dispatch directly to, or empty if there is no such local search node. @@ -76,6 +96,9 @@ public class SearchCluster implements NodeManager { for (Node node : nodes) nodesByHostBuilder.put(node.hostname(), node); this.nodesByHost = nodesByHostBuilder.build(); + hitEstimator = ((0.0 < dispatchConfig.topKProbability()) && (dispatchConfig.topKProbability() < 1.0)) + ? new TopKEstimator(30.0, dispatchConfig.topKProbability()) + : null; this.localCorpusDispatchTarget = findLocalCorpusDispatchTarget(HostName.getLocalhost(), size, @@ -240,6 +263,12 @@ public class SearchCluster implements NodeManager { vipStatus.removeFromRotation(clusterId); } + public int estimateHitsToFetch(int wantedHits, int numPartitions) { + return ((hitEstimator == null) || (numPartitions <= 1)) + ? wantedHits + : hitEstimator.estimateK(wantedHits, numPartitions); + } + public boolean hasInformationAboutAllNodes() { return nodesByHost.values().stream().allMatch(node -> node.isWorking() != null); } diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java index ad281aeda7d..840edd3a419 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java @@ -27,6 +27,7 @@ import static org.junit.Assert.assertTrue; * @author baldersheim */ public class SearchClusterTest { + private static final double EPSILON = 0.00000000001; static class State implements AutoCloseable{ @@ -334,4 +335,15 @@ public class SearchClusterTest { assertEquals(3, node.getLastReceivedPongId()); } + @Test + public void requireHitsAreEstimatedAccordingToPartitionsAndProbability() { + SearchCluster.TopKEstimator estimator = new SearchCluster.TopKEstimator(30, 0.999); + assertEquals(91.97368471911312, estimator.estimateExactK(200, 3), EPSILON); + assertEquals(92, estimator.estimateK(200, 3)); + assertEquals(37.96328109101396, estimator.estimateExactK(200, 10), EPSILON); + assertEquals(38, estimator.estimateK(200, 10)); + assertEquals(23.815737601023095, estimator.estimateExactK(200, 20), EPSILON); + assertEquals(24, estimator.estimateK(200, 20)); + } + } -- cgit v1.2.3