From f5a172a11f6ed80c606ca8f8bd3600e1d499d0b4 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Thu, 28 May 2020 15:12:02 +0000 Subject: Allow from 5% skew in document distribution and still get good results when asking for many hits. --- .../com/yahoo/search/dispatch/TopKEstimator.java | 9 +++- .../dispatch/searchcluster/SearchCluster.java | 3 +- .../dispatch/InterleavedSearchInvokerTest.java | 2 +- .../yahoo/search/dispatch/TopKEstimatorTest.java | 55 +++++++++++++--------- 4 files changed, 45 insertions(+), 24 deletions(-) diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java index d3f222a9f3e..d48e337b0c1 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java @@ -12,16 +12,23 @@ public class TopKEstimator { private final TDistribution studentT; private final double defaultP; private final boolean estimate; + private final double skewFactor; private static boolean needEstimate(double p) { return (0.0 < p) && (p < 1.0); } - public TopKEstimator(double freedom, double defaultProbability) { + TopKEstimator(double freedom, double defaultProbability) { + this(freedom, defaultProbability, 0.0); + } + public TopKEstimator(double freedom, double defaultProbability, double skewFactor) { this.studentT = new TDistribution(null, freedom); defaultP = defaultProbability; estimate = needEstimate(defaultP); + this.skewFactor = skewFactor; } double estimateExactK(double k, double n, double p) { + double p_max = (1 + skewFactor)/n; + n = Math.max(1, 1/p_max); double variance = k * 1/n * (1 - 1/n); double p_inverse = 1 - (1 - p)/n; return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance); diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java index 7dfc03fd2d7..2f62b07ac04 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java @@ -41,6 +41,7 @@ public class SearchCluster implements NodeManager { private final PingFactory pingFactory; private final TopKEstimator hitEstimator; private long nextLogTime = 0; + private static final double SKEW_FACTOR = 0.05; /** * A search node on this local machine having the entire corpus, which we therefore @@ -78,7 +79,7 @@ public class SearchCluster implements NodeManager { for (Node node : nodes) nodesByHostBuilder.put(node.hostname(), node); this.nodesByHost = nodesByHostBuilder.build(); - hitEstimator = new TopKEstimator(30.0, dispatchConfig.topKProbability()); + hitEstimator = new TopKEstimator(30.0, dispatchConfig.topKProbability(), SKEW_FACTOR); this.localCorpusDispatchTarget = findLocalCorpusDispatchTarget(HostName.getLocalhost(), size, diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java index e16f09a58ab..2bfa778a2ba 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java @@ -228,7 +228,7 @@ public class InterleavedSearchInvokerTest { @Test public void requireThatTopKProbabilityOverrideTakesEffect() throws IOException { validateThatTopKProbabilityOverrideTakesEffect(null, 8); - validateThatTopKProbabilityOverrideTakesEffect(0.8, 6); + validateThatTopKProbabilityOverrideTakesEffect(0.8, 7); } @Test diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java index a0f1b32ec13..271e145087b 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java @@ -85,24 +85,17 @@ public class TopKEstimatorTest { @Test public void requireThatLargeKAreSane() { - TopKEstimator estimator = new TopKEstimator(30, 0.9999); - assertEquals(6, estimator.estimateK(10, 10)); - assertEquals(9, estimator.estimateK(20, 10)); - assertEquals(14, estimator.estimateK(40, 10)); - assertEquals(22, estimator.estimateK(80, 10)); - assertEquals(26, estimator.estimateK(100, 10)); - assertEquals(42, estimator.estimateK(200, 10)); - assertEquals(71, estimator.estimateK(400, 10)); - assertEquals(123, estimator.estimateK(800, 10)); - assertEquals(148, estimator.estimateK(1000, 10)); - assertEquals(268, estimator.estimateK(2000, 10)); - assertEquals(496, estimator.estimateK(4000, 10)); - assertEquals(936, estimator.estimateK(8000, 10)); - assertEquals(1152, estimator.estimateK(10000, 10)); - assertEquals(2215, estimator.estimateK(20000, 10)); - assertEquals(4304, estimator.estimateK(40000, 10)); - assertEquals(8429, estimator.estimateK(80000, 10)); - assertEquals(10480, estimator.estimateK(100000, 10)); + System.out.println(dumpProbability(10, 0.05)); + TopKEstimator idealEstimator = new TopKEstimator(30, 0.9999); + TopKEstimator skewedEstimator = new TopKEstimator(30, 0.9999, 0.05); + int [] K = {10, 20, 40, 80, 100, 200, 400, 800, 1000, 2000, 4000, 8000, 10000, 20000, 40000, 80000, 100000}; + int [] expecedWithZeroSkew = {6, 9, 14, 22, 26, 42, 71, 123, 148, 268, 496, 936, 1152, 2215, 4304, 8429, 10480}; + int [] expecedWith5pSkew = {6, 10, 14, 23, 26, 43, 73, 128, 154, 280, 518, 979, 1205, 2319, 4509, 8837, 10989}; + for (int i = 0; i < K.length; i++) { + assertEquals(expecedWithZeroSkew[i], idealEstimator.estimateK(K[i], 10)); + assertEquals(expecedWith5pSkew[i], skewedEstimator.estimateK(K[i], 10)); + } + String expected = "Prob/Hits: 1.0000000000 0.9999000000 0.9999900000 0.9999990000 0.9999999000 0.9999999900 0.9999999990 0.9999999999\n" + " 10: 10.000 6.000 7.000 8.000 9.000 10.000 10.000 10.000\n" + @@ -122,15 +115,35 @@ public class TopKEstimatorTest { " 40000: 10.000 1.076 1.088 1.101 1.114 1.127 1.141 1.156\n" + " 80000: 10.000 1.054 1.062 1.071 1.080 1.090 1.100 1.110\n" + " 100000: 10.000 1.048 1.056 1.064 1.072 1.080 1.089 1.098\n"; - assertEquals(expected, dumpProbability(10)); + assertEquals(expected, dumpProbability(10, 0.0)); + String expectedSkew = + "Prob/Hits: 1.0000000000 0.9999000000 0.9999900000 0.9999990000 0.9999999000 0.9999999900 0.9999999990 0.9999999999\n" + + " 10: 10.000 6.000 7.000 8.000 9.000 10.000 10.000 10.000\n" + + " 20: 10.000 5.000 5.500 6.000 6.500 7.000 7.500 8.500\n" + + " 40: 10.000 3.500 4.000 4.500 4.750 5.250 5.750 6.250\n" + + " 80: 10.000 2.875 3.125 3.375 3.750 4.000 4.375 4.625\n" + + " 100: 10.000 2.600 2.900 3.100 3.400 3.700 4.000 4.300\n" + + " 200: 10.000 2.150 2.350 2.500 2.700 2.900 3.100 3.300\n" + + " 400: 10.000 1.825 1.950 2.075 2.225 2.350 2.500 2.650\n" + + " 800: 10.000 1.600 1.688 1.775 1.875 1.975 2.075 2.175\n" + + " 1000: 10.000 1.540 1.620 1.700 1.790 1.870 1.960 2.060\n" + + " 2000: 10.000 1.400 1.455 1.510 1.570 1.630 1.695 1.760\n" + + " 4000: 10.000 1.295 1.335 1.375 1.418 1.460 1.505 1.553\n" + + " 8000: 10.000 1.224 1.251 1.280 1.309 1.340 1.371 1.405\n" + + " 10000: 10.000 1.205 1.230 1.255 1.282 1.309 1.337 1.367\n" + + " 20000: 10.000 1.160 1.177 1.195 1.214 1.233 1.253 1.275\n" + + " 40000: 10.000 1.127 1.140 1.153 1.166 1.179 1.194 1.209\n" + + " 80000: 10.000 1.105 1.114 1.123 1.132 1.141 1.152 1.162\n" + + " 100000: 10.000 1.099 1.107 1.115 1.123 1.132 1.141 1.150\n"; + assertEquals(expectedSkew, dumpProbability(10, 0.05)); } /** * This make a table showing how many more hits will be fetched as a factor of hits requested. * It shows how it varies with probability and hits requested for a given number of partitions. */ - private String dumpProbability(int numPartitions) { - TopKEstimator estimator = new TopKEstimator(30, 0.9999); + private String dumpProbability(int numPartitions, double skewFactor) { + TopKEstimator estimator = new TopKEstimator(30, 0.9999, skewFactor); int [] K = {10, 20, 40, 80, 100, 200, 400, 800, 1000, 2000, 4000, 8000, 10000, 20000, 40000, 80000, 100000}; double [] P = {1.0, 0.9999, 0.99999, 0.999999, 0.9999999, 0.99999999, 0.999999999, 0.9999999999}; int n = numPartitions; -- cgit v1.2.3