summaryrefslogtreecommitdiffstats
path: root/container-search
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2020-05-28 15:12:02 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2020-05-28 15:12:02 +0000
commitf5a172a11f6ed80c606ca8f8bd3600e1d499d0b4 (patch)
treebd81b4a3e118da8dc4e59ce29d1477835e672ddb /container-search
parent01b28baa634e2e6c1b9e6830df74a9fd6f208e04 (diff)
Allow from 5% skew in document distribution and still get good results when asking for many hits.
Diffstat (limited to 'container-search')
-rw-r--r--container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java9
-rw-r--r--container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java3
-rw-r--r--container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java2
-rw-r--r--container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java55
4 files changed, 45 insertions, 24 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java
index d3f222a9f3e..d48e337b0c1 100644
--- a/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java
+++ b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java
@@ -12,16 +12,23 @@ public class TopKEstimator {
private final TDistribution studentT;
private final double defaultP;
private final boolean estimate;
+ private final double skewFactor;
private static boolean needEstimate(double p) {
return (0.0 < p) && (p < 1.0);
}
- public TopKEstimator(double freedom, double defaultProbability) {
+ TopKEstimator(double freedom, double defaultProbability) {
+ this(freedom, defaultProbability, 0.0);
+ }
+ public TopKEstimator(double freedom, double defaultProbability, double skewFactor) {
this.studentT = new TDistribution(null, freedom);
defaultP = defaultProbability;
estimate = needEstimate(defaultP);
+ this.skewFactor = skewFactor;
}
double estimateExactK(double k, double n, double p) {
+ double p_max = (1 + skewFactor)/n;
+ n = Math.max(1, 1/p_max);
double variance = k * 1/n * (1 - 1/n);
double p_inverse = 1 - (1 - p)/n;
return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance);
diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
index 7dfc03fd2d7..2f62b07ac04 100644
--- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
+++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java
@@ -41,6 +41,7 @@ public class SearchCluster implements NodeManager<Node> {
private final PingFactory pingFactory;
private final TopKEstimator hitEstimator;
private long nextLogTime = 0;
+ private static final double SKEW_FACTOR = 0.05;
/**
* A search node on this local machine having the entire corpus, which we therefore
@@ -78,7 +79,7 @@ public class SearchCluster implements NodeManager<Node> {
for (Node node : nodes)
nodesByHostBuilder.put(node.hostname(), node);
this.nodesByHost = nodesByHostBuilder.build();
- hitEstimator = new TopKEstimator(30.0, dispatchConfig.topKProbability());
+ hitEstimator = new TopKEstimator(30.0, dispatchConfig.topKProbability(), SKEW_FACTOR);
this.localCorpusDispatchTarget = findLocalCorpusDispatchTarget(HostName.getLocalhost(),
size,
diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java
index e16f09a58ab..2bfa778a2ba 100644
--- a/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java
+++ b/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java
@@ -228,7 +228,7 @@ public class InterleavedSearchInvokerTest {
@Test
public void requireThatTopKProbabilityOverrideTakesEffect() throws IOException {
validateThatTopKProbabilityOverrideTakesEffect(null, 8);
- validateThatTopKProbabilityOverrideTakesEffect(0.8, 6);
+ validateThatTopKProbabilityOverrideTakesEffect(0.8, 7);
}
@Test
diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java
index a0f1b32ec13..271e145087b 100644
--- a/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java
+++ b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java
@@ -85,24 +85,17 @@ public class TopKEstimatorTest {
@Test
public void requireThatLargeKAreSane() {
- TopKEstimator estimator = new TopKEstimator(30, 0.9999);
- assertEquals(6, estimator.estimateK(10, 10));
- assertEquals(9, estimator.estimateK(20, 10));
- assertEquals(14, estimator.estimateK(40, 10));
- assertEquals(22, estimator.estimateK(80, 10));
- assertEquals(26, estimator.estimateK(100, 10));
- assertEquals(42, estimator.estimateK(200, 10));
- assertEquals(71, estimator.estimateK(400, 10));
- assertEquals(123, estimator.estimateK(800, 10));
- assertEquals(148, estimator.estimateK(1000, 10));
- assertEquals(268, estimator.estimateK(2000, 10));
- assertEquals(496, estimator.estimateK(4000, 10));
- assertEquals(936, estimator.estimateK(8000, 10));
- assertEquals(1152, estimator.estimateK(10000, 10));
- assertEquals(2215, estimator.estimateK(20000, 10));
- assertEquals(4304, estimator.estimateK(40000, 10));
- assertEquals(8429, estimator.estimateK(80000, 10));
- assertEquals(10480, estimator.estimateK(100000, 10));
+ System.out.println(dumpProbability(10, 0.05));
+ TopKEstimator idealEstimator = new TopKEstimator(30, 0.9999);
+ TopKEstimator skewedEstimator = new TopKEstimator(30, 0.9999, 0.05);
+ int [] K = {10, 20, 40, 80, 100, 200, 400, 800, 1000, 2000, 4000, 8000, 10000, 20000, 40000, 80000, 100000};
+ int [] expecedWithZeroSkew = {6, 9, 14, 22, 26, 42, 71, 123, 148, 268, 496, 936, 1152, 2215, 4304, 8429, 10480};
+ int [] expecedWith5pSkew = {6, 10, 14, 23, 26, 43, 73, 128, 154, 280, 518, 979, 1205, 2319, 4509, 8837, 10989};
+ for (int i = 0; i < K.length; i++) {
+ assertEquals(expecedWithZeroSkew[i], idealEstimator.estimateK(K[i], 10));
+ assertEquals(expecedWith5pSkew[i], skewedEstimator.estimateK(K[i], 10));
+ }
+
String expected =
"Prob/Hits: 1.0000000000 0.9999000000 0.9999900000 0.9999990000 0.9999999000 0.9999999900 0.9999999990 0.9999999999\n" +
" 10: 10.000 6.000 7.000 8.000 9.000 10.000 10.000 10.000\n" +
@@ -122,15 +115,35 @@ public class TopKEstimatorTest {
" 40000: 10.000 1.076 1.088 1.101 1.114 1.127 1.141 1.156\n" +
" 80000: 10.000 1.054 1.062 1.071 1.080 1.090 1.100 1.110\n" +
" 100000: 10.000 1.048 1.056 1.064 1.072 1.080 1.089 1.098\n";
- assertEquals(expected, dumpProbability(10));
+ assertEquals(expected, dumpProbability(10, 0.0));
+ String expectedSkew =
+ "Prob/Hits: 1.0000000000 0.9999000000 0.9999900000 0.9999990000 0.9999999000 0.9999999900 0.9999999990 0.9999999999\n" +
+ " 10: 10.000 6.000 7.000 8.000 9.000 10.000 10.000 10.000\n" +
+ " 20: 10.000 5.000 5.500 6.000 6.500 7.000 7.500 8.500\n" +
+ " 40: 10.000 3.500 4.000 4.500 4.750 5.250 5.750 6.250\n" +
+ " 80: 10.000 2.875 3.125 3.375 3.750 4.000 4.375 4.625\n" +
+ " 100: 10.000 2.600 2.900 3.100 3.400 3.700 4.000 4.300\n" +
+ " 200: 10.000 2.150 2.350 2.500 2.700 2.900 3.100 3.300\n" +
+ " 400: 10.000 1.825 1.950 2.075 2.225 2.350 2.500 2.650\n" +
+ " 800: 10.000 1.600 1.688 1.775 1.875 1.975 2.075 2.175\n" +
+ " 1000: 10.000 1.540 1.620 1.700 1.790 1.870 1.960 2.060\n" +
+ " 2000: 10.000 1.400 1.455 1.510 1.570 1.630 1.695 1.760\n" +
+ " 4000: 10.000 1.295 1.335 1.375 1.418 1.460 1.505 1.553\n" +
+ " 8000: 10.000 1.224 1.251 1.280 1.309 1.340 1.371 1.405\n" +
+ " 10000: 10.000 1.205 1.230 1.255 1.282 1.309 1.337 1.367\n" +
+ " 20000: 10.000 1.160 1.177 1.195 1.214 1.233 1.253 1.275\n" +
+ " 40000: 10.000 1.127 1.140 1.153 1.166 1.179 1.194 1.209\n" +
+ " 80000: 10.000 1.105 1.114 1.123 1.132 1.141 1.152 1.162\n" +
+ " 100000: 10.000 1.099 1.107 1.115 1.123 1.132 1.141 1.150\n";
+ assertEquals(expectedSkew, dumpProbability(10, 0.05));
}
/**
* This make a table showing how many more hits will be fetched as a factor of hits requested.
* It shows how it varies with probability and hits requested for a given number of partitions.
*/
- private String dumpProbability(int numPartitions) {
- TopKEstimator estimator = new TopKEstimator(30, 0.9999);
+ private String dumpProbability(int numPartitions, double skewFactor) {
+ TopKEstimator estimator = new TopKEstimator(30, 0.9999, skewFactor);
int [] K = {10, 20, 40, 80, 100, 200, 400, 800, 1000, 2000, 4000, 8000, 10000, 20000, 40000, 80000, 100000};
double [] P = {1.0, 0.9999, 0.99999, 0.999999, 0.9999999, 0.99999999, 0.999999999, 0.9999999999};
int n = numPartitions;