diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2020-04-15 11:58:29 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2020-04-15 11:58:29 +0000 |
commit | a4e565c808f7999f561b0dad881f3a34040ab5d7 (patch) | |
tree | a4f30fd3fd32d3d18494b47a61e58f95bfc16fb3 /container-search | |
parent | 2cebe16d47872f08472ac52cbfc9e1102fb695da (diff) |
Make SearchCluster.TopKEstimator a top level class.
Diffstat (limited to 'container-search')
4 files changed, 52 insertions, 37 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java new file mode 100644 index 00000000000..374f919e2bb --- /dev/null +++ b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java @@ -0,0 +1,30 @@ +package com.yahoo.search.dispatch; + +import org.apache.commons.math3.distribution.TDistribution; + +/** + * Use StudentT distribution and estimate how many hits you need from each partition + * to to get the globally top-k documents with the desired probability + * @author baldersheim + */ +public class TopKEstimator { + private final TDistribution studentT; + private final double p; + private final boolean estimate; + + public TopKEstimator(double freedom, double wantedprobability) { + this.studentT = new TDistribution(null, freedom); + p = wantedprobability; + estimate = (0.0 < p) && (p < 1.0); + } + double estimateExactK(double k, double n) { + double variance = k * 1/n * (1 - 1/n); + double p_inverse = 1 - (1 - p)/n; + return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance); + } + public int estimateK(int k, int n) { + return (estimate && n > 1) + ? (int)Math.ceil(estimateExactK(k, n)) + : k; + } +} diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java index e94cd085a1a..5acafb9e0a5 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java @@ -10,8 +10,8 @@ import com.yahoo.net.HostName; import com.yahoo.prelude.Pong; import com.yahoo.search.cluster.ClusterMonitor; import com.yahoo.search.cluster.NodeManager; +import com.yahoo.search.dispatch.TopKEstimator; import com.yahoo.vespa.config.search.DispatchConfig; -import org.apache.commons.math3.distribution.TDistribution; import java.util.LinkedHashMap; import java.util.List; @@ -42,24 +42,6 @@ public class SearchCluster implements NodeManager<Node> { private final TopKEstimator hitEstimator; private long nextLogTime = 0; - static class TopKEstimator { - private final TDistribution studentT; - private final double p; - - TopKEstimator(double freedom, double wantedprobability) { - this.studentT = new TDistribution(null, freedom); - p = wantedprobability; - } - double estimateExactK(double k, double n) { - double variance = k * 1/n * (1 - 1/n); - double p_inverse = 1 - (1 - p)/n; - return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance); - } - int estimateK(double k, double n) { - return (int)Math.ceil(estimateExactK(k, n)); - } - } - /** * A search node on this local machine having the entire corpus, which we therefore * should prefer to dispatch directly to, or empty if there is no such local search node. @@ -96,9 +78,7 @@ public class SearchCluster implements NodeManager<Node> { for (Node node : nodes) nodesByHostBuilder.put(node.hostname(), node); this.nodesByHost = nodesByHostBuilder.build(); - hitEstimator = ((0.0 < dispatchConfig.topKProbability()) && (dispatchConfig.topKProbability() < 1.0)) - ? new TopKEstimator(30.0, dispatchConfig.topKProbability()) - : null; + hitEstimator = new TopKEstimator(30.0, dispatchConfig.topKProbability()); this.localCorpusDispatchTarget = findLocalCorpusDispatchTarget(HostName.getLocalhost(), size, @@ -264,9 +244,7 @@ public class SearchCluster implements NodeManager<Node> { } public int estimateHitsToFetch(int wantedHits, int numPartitions) { - return ((hitEstimator == null) || (numPartitions <= 1)) - ? wantedHits - : hitEstimator.estimateK(wantedHits, numPartitions); + return hitEstimator.estimateK(wantedHits, numPartitions); } public boolean hasInformationAboutAllNodes() { diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java new file mode 100644 index 00000000000..6ef28119c23 --- /dev/null +++ b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java @@ -0,0 +1,18 @@ +package com.yahoo.search.dispatch; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class TopKEstimatorTest { + @Test + public void requireHitsAreEstimatedAccordingToPartitionsAndProbability() { + TopKEstimator estimator = new TopKEstimator(30, 0.999); + assertEquals(91.97368471911312, estimator.estimateExactK(200, 3), 0.0); + assertEquals(92, estimator.estimateK(200, 3)); + assertEquals(37.96328109101396, estimator.estimateExactK(200, 10), 0.0); + assertEquals(38, estimator.estimateK(200, 10)); + assertEquals(23.815737601023095, estimator.estimateExactK(200, 20), 0.0); + assertEquals(24, estimator.estimateK(200, 20)); + } +} diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java index 840edd3a419..09024150a9a 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java @@ -8,6 +8,7 @@ import com.yahoo.net.HostName; import com.yahoo.prelude.Pong; import com.yahoo.search.cluster.ClusterMonitor; import com.yahoo.search.dispatch.MockSearchCluster; +import com.yahoo.search.dispatch.TopKEstimator; import com.yahoo.search.result.ErrorMessage; import org.junit.Test; @@ -27,7 +28,6 @@ import static org.junit.Assert.assertTrue; * @author baldersheim */ public class SearchClusterTest { - private static final double EPSILON = 0.00000000001; static class State implements AutoCloseable{ @@ -335,15 +335,4 @@ public class SearchClusterTest { assertEquals(3, node.getLastReceivedPongId()); } - @Test - public void requireHitsAreEstimatedAccordingToPartitionsAndProbability() { - SearchCluster.TopKEstimator estimator = new SearchCluster.TopKEstimator(30, 0.999); - assertEquals(91.97368471911312, estimator.estimateExactK(200, 3), EPSILON); - assertEquals(92, estimator.estimateK(200, 3)); - assertEquals(37.96328109101396, estimator.estimateExactK(200, 10), EPSILON); - assertEquals(38, estimator.estimateK(200, 10)); - assertEquals(23.815737601023095, estimator.estimateExactK(200, 20), EPSILON); - assertEquals(24, estimator.estimateK(200, 20)); - } - } |