summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--configdefinitions/src/vespa/dispatch.def10
-rw-r--r--container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java30
-rw-r--r--container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java28
-rw-r--r--container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java18
-rw-r--r--container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java13
5 files changed, 58 insertions, 41 deletions
diff --git a/configdefinitions/src/vespa/dispatch.def b/configdefinitions/src/vespa/dispatch.def
index 3f553b5b8ba..0776e648ad7 100644
--- a/configdefinitions/src/vespa/dispatch.def
+++ b/configdefinitions/src/vespa/dispatch.def
@@ -23,11 +23,13 @@ distributionPolicy enum { ROUNDROBIN, ADAPTIVE } default=ROUNDROBIN
## don't use it if you don't (really) mean it.
maxHitsPerNode int default=2147483647
-## Probability for getting the correct topK documents.
-## A value of 1.0 will ask all partitions for topK documents.
-## Any value between <0, 1> will use a Student T fith 30 degrees freedom and compute a K value that
-## will give you the topK documents according to this formulae.
+## Probability for getting the K best hits (topK).
+## A value of 1.0 will ask all N partitions for K hits.
+## Any value between <0, 1> will use a Student T with 30 degrees freedom and compute a value Q that
+## will give you the globally K best hits according to this formula with the desired probability.
## q = k/n + qT (p',30) x √(k × (1/n) × (1 − 1/n))
+## With a probability of 0.999 and K=200 and N=10 will give a Q of 38, meaning that you only need to fetch 19% compared to
+## default setting of 1.0. This is a significant optimisation with with very little loss in presicion.
topKProbability double default=1.0
# Is multi-level dispatch configured for this cluster
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java
new file mode 100644
index 00000000000..374f919e2bb
--- /dev/null
+++ b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java
@@ -0,0 +1,30 @@
+package com.yahoo.search.dispatch;
+
+import org.apache.commons.math3.distribution.TDistribution;
+
+/**
+ * Use StudentT distribution and estimate how many hits you need from each partition
+ * to to get the globally top-k documents with the desired probability
+ * @author baldersheim
+ */
+public class TopKEstimator {
+ private final TDistribution studentT;
+ private final double p;
+ private final boolean estimate;
+
+ public TopKEstimator(double freedom, double wantedprobability) {
+ this.studentT = new TDistribution(null, freedom);
+ p = wantedprobability;
+ estimate = (0.0 < p) && (p < 1.0);
+ }
+ double estimateExactK(double k, double n) {
+ double variance = k * 1/n * (1 - 1/n);
+ double p_inverse = 1 - (1 - p)/n;
+ return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance);
+ }
+ public int estimateK(int k, int n) {
+ return (estimate && n > 1)
+ ? (int)Math.ceil(estimateExactK(k, n))
+ : k;
+ }
+}
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
index e94cd085a1a..5acafb9e0a5 100644
--- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
+++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
@@ -10,8 +10,8 @@ import com.yahoo.net.HostName;
import com.yahoo.prelude.Pong;
import com.yahoo.search.cluster.ClusterMonitor;
import com.yahoo.search.cluster.NodeManager;
+import com.yahoo.search.dispatch.TopKEstimator;
import com.yahoo.vespa.config.search.DispatchConfig;
-import org.apache.commons.math3.distribution.TDistribution;
import java.util.LinkedHashMap;
import java.util.List;
@@ -42,24 +42,6 @@ public class SearchCluster implements NodeManager<Node> {
private final TopKEstimator hitEstimator;
private long nextLogTime = 0;
- static class TopKEstimator {
- private final TDistribution studentT;
- private final double p;
-
- TopKEstimator(double freedom, double wantedprobability) {
- this.studentT = new TDistribution(null, freedom);
- p = wantedprobability;
- }
- double estimateExactK(double k, double n) {
- double variance = k * 1/n * (1 - 1/n);
- double p_inverse = 1 - (1 - p)/n;
- return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance);
- }
- int estimateK(double k, double n) {
- return (int)Math.ceil(estimateExactK(k, n));
- }
- }
-
/**
* A search node on this local machine having the entire corpus, which we therefore
* should prefer to dispatch directly to, or empty if there is no such local search node.
@@ -96,9 +78,7 @@ public class SearchCluster implements NodeManager<Node> {
for (Node node : nodes)
nodesByHostBuilder.put(node.hostname(), node);
this.nodesByHost = nodesByHostBuilder.build();
- hitEstimator = ((0.0 < dispatchConfig.topKProbability()) && (dispatchConfig.topKProbability() < 1.0))
- ? new TopKEstimator(30.0, dispatchConfig.topKProbability())
- : null;
+ hitEstimator = new TopKEstimator(30.0, dispatchConfig.topKProbability());
this.localCorpusDispatchTarget = findLocalCorpusDispatchTarget(HostName.getLocalhost(),
size,
@@ -264,9 +244,7 @@ public class SearchCluster implements NodeManager<Node> {
}
public int estimateHitsToFetch(int wantedHits, int numPartitions) {
- return ((hitEstimator == null) || (numPartitions <= 1))
- ? wantedHits
- : hitEstimator.estimateK(wantedHits, numPartitions);
+ return hitEstimator.estimateK(wantedHits, numPartitions);
}
public boolean hasInformationAboutAllNodes() {
diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java
new file mode 100644
index 00000000000..6ef28119c23
--- /dev/null
+++ b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java
@@ -0,0 +1,18 @@
+package com.yahoo.search.dispatch;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TopKEstimatorTest {
+ @Test
+ public void requireHitsAreEstimatedAccordingToPartitionsAndProbability() {
+ TopKEstimator estimator = new TopKEstimator(30, 0.999);
+ assertEquals(91.97368471911312, estimator.estimateExactK(200, 3), 0.0);
+ assertEquals(92, estimator.estimateK(200, 3));
+ assertEquals(37.96328109101396, estimator.estimateExactK(200, 10), 0.0);
+ assertEquals(38, estimator.estimateK(200, 10));
+ assertEquals(23.815737601023095, estimator.estimateExactK(200, 20), 0.0);
+ assertEquals(24, estimator.estimateK(200, 20));
+ }
+}
diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java
index 840edd3a419..09024150a9a 100644
--- a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java
+++ b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java
@@ -8,6 +8,7 @@ import com.yahoo.net.HostName;
import com.yahoo.prelude.Pong;
import com.yahoo.search.cluster.ClusterMonitor;
import com.yahoo.search.dispatch.MockSearchCluster;
+import com.yahoo.search.dispatch.TopKEstimator;
import com.yahoo.search.result.ErrorMessage;
import org.junit.Test;
@@ -27,7 +28,6 @@ import static org.junit.Assert.assertTrue;
* @author baldersheim
*/
public class SearchClusterTest {
- private static final double EPSILON = 0.00000000001;
static class State implements AutoCloseable{
@@ -335,15 +335,4 @@ public class SearchClusterTest {
assertEquals(3, node.getLastReceivedPongId());
}
- @Test
- public void requireHitsAreEstimatedAccordingToPartitionsAndProbability() {
- SearchCluster.TopKEstimator estimator = new SearchCluster.TopKEstimator(30, 0.999);
- assertEquals(91.97368471911312, estimator.estimateExactK(200, 3), EPSILON);
- assertEquals(92, estimator.estimateK(200, 3));
- assertEquals(37.96328109101396, estimator.estimateExactK(200, 10), EPSILON);
- assertEquals(38, estimator.estimateK(200, 10));
- assertEquals(23.815737601023095, estimator.estimateExactK(200, 20), EPSILON);
- assertEquals(24, estimator.estimateK(200, 20));
- }
-
}