diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2020-04-16 02:28:52 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-04-16 02:28:52 +0200 |
commit | b2519d490b2bb1d4e28e9c6c5e3ed72ee16b5469 (patch) | |
tree | c3a4fa47a30163d46bc01b9c97c539da9593cca8 | |
parent | bdb570a9e21410108bbb56f183bad1603c45c1fc (diff) | |
parent | cc436f402118300a5ffba223480cd63da2345008 (diff) |
Merge pull request #12918 from vespa-engine/balder/top-k-probability
Introduce top-k-probability and use it to fetch correct proper amount…
26 files changed, 232 insertions, 7 deletions
diff --git a/application/pom.xml b/application/pom.xml index d173fec19cf..cd27b53f557 100644 --- a/application/pom.xml +++ b/application/pom.xml @@ -120,6 +120,10 @@ <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + </dependency> <dependency> <groupId>com.yahoo.vespa</groupId> diff --git a/config-model-api/src/main/java/com/yahoo/config/model/api/ModelContext.java b/config-model-api/src/main/java/com/yahoo/config/model/api/ModelContext.java index b8f03794301..f58fb5fbb69 100644 --- a/config-model-api/src/main/java/com/yahoo/config/model/api/ModelContext.java +++ b/config-model-api/src/main/java/com/yahoo/config/model/api/ModelContext.java @@ -60,13 +60,20 @@ public interface ModelContext { // TODO: Only needed for LbServicesProducerTest default boolean useDedicatedNodeForLogserver() { return true; } + // TODO Revisit in May or June 2020 boolean useAdaptiveDispatch(); // TODO: Remove after April 2020 default Optional<TlsSecrets> tlsSecrets() { return Optional.empty(); } default Optional<EndpointCertificateSecrets> endpointCertificateSecrets() { return Optional.empty(); } + + // TODO Revisit in May or June 2020 double defaultTermwiseLimit(); + + // TODO Revisit in May or June 2020 + double defaultTopKProbability(); + boolean useBucketSpaceMetric(); default boolean useNewAthenzFilter() { return true; } // TODO bjorncs: Remove after end of April diff --git a/config-model/src/main/java/com/yahoo/config/model/deploy/TestProperties.java b/config-model/src/main/java/com/yahoo/config/model/deploy/TestProperties.java index 4c9e9489c63..a772d7c8a1f 100644 --- a/config-model/src/main/java/com/yahoo/config/model/deploy/TestProperties.java +++ b/config-model/src/main/java/com/yahoo/config/model/deploy/TestProperties.java @@ -40,6 +40,7 @@ public class TestProperties implements ModelContext.Properties { private boolean 
isFirstTimeDeployment = false; private boolean useDedicatedNodeForLogserver = false; private boolean useAdaptiveDispatch = false; + private double topKProbability = 1.0; private double defaultTermwiseLimit = 1.0; private Optional<EndpointCertificateSecrets> endpointCertificateSecrets = Optional.empty(); private AthenzDomain athenzDomain; @@ -61,6 +62,7 @@ public class TestProperties implements ModelContext.Properties { @Override public Optional<EndpointCertificateSecrets> endpointCertificateSecrets() { return endpointCertificateSecrets; } @Override public Optional<TlsSecrets> tlsSecrets() { return endpointCertificateSecrets.map(TlsSecrets::new); } @Override public double defaultTermwiseLimit() { return defaultTermwiseLimit; } + @Override public double defaultTopKProbability() { return topKProbability; } @Override public boolean useBucketSpaceMetric() { return true; } @Override public Optional<AthenzDomain> athenzDomain() { return Optional.ofNullable(athenzDomain); } @@ -69,6 +71,11 @@ public class TestProperties implements ModelContext.Properties { return this; } + public TestProperties setTopKProbability(double probability) { + topKProbability = probability; + return this; + } + public TestProperties setApplicationId(ApplicationId applicationId) { this.applicationId = applicationId; return this; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/DispatchTuning.java b/config-model/src/main/java/com/yahoo/vespa/model/content/DispatchTuning.java index 0d15207b6ce..0f9eb5341ab 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/DispatchTuning.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/DispatchTuning.java @@ -11,18 +11,25 @@ public class DispatchTuning { public static final DispatchTuning empty = new DispatchTuning.Builder().build(); - public enum DispatchPolicy { ROUNDROBIN, ADAPTIVE}; + public enum DispatchPolicy { ROUNDROBIN, ADAPTIVE} private final Integer maxHitsPerPartition; private DispatchPolicy 
dispatchPolicy; private final Double minGroupCoverage; private final Double minActiveDocsCoverage; + public Double getTopkProbability() { + return topkProbability; + } + + private final Double topkProbability; + private DispatchTuning(Builder builder) { maxHitsPerPartition = builder.maxHitsPerPartition; dispatchPolicy = builder.dispatchPolicy; minGroupCoverage = builder.minGroupCoverage; minActiveDocsCoverage = builder.minActiveDocsCoverage; + topkProbability = builder.topKProbability; } /** Returns the max number of hits to fetch from each partition, or null to fetch all */ @@ -46,6 +53,7 @@ public class DispatchTuning { private DispatchPolicy dispatchPolicy; private Double minGroupCoverage; private Double minActiveDocsCoverage; + private Double topKProbability; public DispatchTuning build() { return new DispatchTuning(this); @@ -55,6 +63,10 @@ public class DispatchTuning { this.maxHitsPerPartition = maxHitsPerPartition; return this; } + public Builder setTopKProbability(Double topKProbability) { + this.topKProbability = topKProbability; + return this; + } public Builder setDispatchPolicy(String policy) { if (policy != null) dispatchPolicy = toDispatchPolicy(policy); diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DomTuningDispatchBuilder.java b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DomTuningDispatchBuilder.java index b53d66632a8..d599a1a1aca 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DomTuningDispatchBuilder.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DomTuningDispatchBuilder.java @@ -23,6 +23,7 @@ public class DomTuningDispatchBuilder { return builder.build(); } builder.setMaxHitsPerPartition(dispatchElement.childAsInteger("max-hits-per-partition")); + builder.setTopKProbability(dispatchElement.childAsDouble("top-k-probability")); builder.setDispatchPolicy(dispatchElement.childAsString("dispatch-policy")); 
builder.setMinGroupCoverage(dispatchElement.childAsDouble("min-group-coverage")); builder.setMinActiveDocsCoverage(dispatchElement.childAsDouble("min-active-docs-coverage")); diff --git a/config-model/src/main/java/com/yahoo/vespa/model/search/IndexedSearchCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/search/IndexedSearchCluster.java index 9746c50450e..56adc227df4 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/search/IndexedSearchCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/search/IndexedSearchCluster.java @@ -53,6 +53,7 @@ public class IndexedSearchCluster extends SearchCluster private final DispatchGroup rootDispatch; private DispatchSpec dispatchSpec; private final boolean useAdaptiveDispatch; + private final double defaultTopKProbability; private List<SearchNode> searchNodes = new ArrayList<>(); /** @@ -70,6 +71,7 @@ public class IndexedSearchCluster extends SearchCluster unionCfg = new UnionConfiguration(this, documentDbs); rootDispatch = new DispatchGroup(this); useAdaptiveDispatch = deployState.getProperties().useAdaptiveDispatch(); + defaultTopKProbability = deployState.getProperties().defaultTopKProbability(); } @Override @@ -307,7 +309,11 @@ public class IndexedSearchCluster extends SearchCluster } if (useAdaptiveDispatch) builder.distributionPolicy(DistributionPolicy.ADAPTIVE); - + if (tuning.dispatch.getTopkProbability() != null) { + builder.topKProbability(tuning.dispatch.getTopkProbability()); + } else { + builder.topKProbability(defaultTopKProbability); + } if (tuning.dispatch.getMinActiveDocsCoverage() != null) builder.minActivedocsPercentage(tuning.dispatch.getMinActiveDocsCoverage()); if (tuning.dispatch.getMinGroupCoverage() != null) diff --git a/config-model/src/main/resources/schema/content.rnc b/config-model/src/main/resources/schema/content.rnc index b1821680b14..481d82ebb4b 100644 --- a/config-model/src/main/resources/schema/content.rnc +++ 
b/config-model/src/main/resources/schema/content.rnc @@ -85,6 +85,7 @@ DispatchTuning = element dispatch { element dispatch-policy { string "round-robin" | string "adaptive" | string "random" }? & element min-group-coverage { xsd:double }? & element min-active-docs-coverage { xsd:double }? & + element top-k-probability { xsd:double }? & element use-local-node { string "true" | string "false" }? } diff --git a/config-model/src/test/java/com/yahoo/vespa/model/content/ContentClusterTest.java b/config-model/src/test/java/com/yahoo/vespa/model/content/ContentClusterTest.java index b08cc92d20c..4d5df7c1965 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/content/ContentClusterTest.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/content/ContentClusterTest.java @@ -933,6 +933,17 @@ public class ContentClusterTest extends ContentBaseTest { assertEquals(distributionBits, storDistributormanagerConfig.minsplitcount()); } + private void verifyTopKProbabilityPropertiesControl(double topKProbability) { + VespaModel model = createEnd2EndOneNode(new TestProperties().setTopKProbability(topKProbability)); + + ContentCluster cc = model.getContentClusters().get("storage"); + DispatchConfig.Builder builder = new DispatchConfig.Builder(); + cc.getSearch().getConfig(builder); + + DispatchConfig cfg = new DispatchConfig(builder); + assertEquals(topKProbability, cfg.topKProbability(), 0.0); + } + private void verifyRoundRobinPropertiesControl(boolean useAdaptiveDispatch) { VespaModel model = createEnd2EndOneNode(new TestProperties().setUseAdaptiveDispatch(useAdaptiveDispatch)); @@ -946,7 +957,6 @@ public class ContentClusterTest extends ContentBaseTest { } else { assertEquals(DispatchConfig.DistributionPolicy.ROUNDROBIN, cfg.distributionPolicy()); } - } @Test @@ -955,5 +965,12 @@ public class ContentClusterTest extends ContentBaseTest { verifyRoundRobinPropertiesControl(true); } + @Test + public void default_topKprobability_controlled_by_properties() { + 
verifyTopKProbabilityPropertiesControl(1.0); + verifyTopKProbabilityPropertiesControl(0.999); + verifyTopKProbabilityPropertiesControl(0.77); + } + } diff --git a/config-model/src/test/java/com/yahoo/vespa/model/content/DispatchTuningTest.java b/config-model/src/test/java/com/yahoo/vespa/model/content/DispatchTuningTest.java index f708d7673e2..8a46aaaa230 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/content/DispatchTuningTest.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/content/DispatchTuningTest.java @@ -19,11 +19,13 @@ public class DispatchTuningTest { .setDispatchPolicy("round-robin") .setMinGroupCoverage(7.5) .setMinActiveDocsCoverage(12.5) + .setTopKProbability(18.3) .build(); assertEquals(69, dispatch.getMaxHitsPerPartition().intValue()); assertEquals(7.5, dispatch.getMinGroupCoverage().doubleValue(), 0.0); assertEquals(12.5, dispatch.getMinActiveDocsCoverage().doubleValue(), 0.0); assertTrue(DispatchTuning.DispatchPolicy.ROUNDROBIN == dispatch.getDispatchPolicy()); + assertEquals(18.3, dispatch.getTopkProbability(), 0.0); } @Test public void requireThatRandomDispatchWork() { @@ -52,6 +54,7 @@ public class DispatchTuningTest { assertNull(dispatch.getDispatchPolicy()); assertNull(dispatch.getMinActiveDocsCoverage()); assertNull(dispatch.getMinGroupCoverage()); + assertNull(dispatch.getTopkProbability()); } } diff --git a/config-model/src/test/java/com/yahoo/vespa/model/content/cluster/DomDispatchTuningBuilderTest.java b/config-model/src/test/java/com/yahoo/vespa/model/content/cluster/DomDispatchTuningBuilderTest.java index 7fa27f74d74..abfb03e41dd 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/content/cluster/DomDispatchTuningBuilderTest.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/content/cluster/DomDispatchTuningBuilderTest.java @@ -47,6 +47,7 @@ public class DomDispatchTuningBuilderTest { assertNull(dispatch.getMinGroupCoverage()); assertNull(dispatch.getMinActiveDocsCoverage()); 
assertNull(dispatch.getDispatchPolicy()); + assertNull(dispatch.getTopkProbability()); } @Test @@ -58,12 +59,14 @@ public class DomDispatchTuningBuilderTest { " <max-hits-per-partition>69</max-hits-per-partition>" + " <min-group-coverage>7.5</min-group-coverage>" + " <min-active-docs-coverage>12.5</min-active-docs-coverage>" + + " <top-k-probability>0.999</top-k-probability>" + " </dispatch>" + " </tuning>" + "</content>"); assertEquals(69, dispatch.getMaxHitsPerPartition().intValue()); assertEquals(7.5, dispatch.getMinGroupCoverage().doubleValue(), 0.0); assertEquals(12.5, dispatch.getMinActiveDocsCoverage().doubleValue(), 0.0); + assertEquals(0.999, dispatch.getTopkProbability().doubleValue(), 0.0); } @Test public void requireThatTuningDispatchPolicyRoundRobin() throws Exception { diff --git a/configdefinitions/src/vespa/dispatch.def b/configdefinitions/src/vespa/dispatch.def index 21001eb3af0..0776e648ad7 100644 --- a/configdefinitions/src/vespa/dispatch.def +++ b/configdefinitions/src/vespa/dispatch.def @@ -23,6 +23,15 @@ distributionPolicy enum { ROUNDROBIN, ADAPTIVE } default=ROUNDROBIN ## don't use it if you don't (really) mean it. maxHitsPerNode int default=2147483647 +## Probability for getting the K best hits (topK). +## A value of 1.0 will ask all N partitions for K hits. +## Any value between <0, 1> will use a Student T with 30 degrees of freedom and compute a value Q that +## will give you the globally K best hits according to this formula with the desired probability. +## q = k/n + qT (p',30) x √(k × (1/n) × (1 − 1/n)) +## With a probability of 0.999 and K=200 and N=10 will give a Q of 38, meaning that you only need to fetch 19% compared to +## default setting of 1.0. This is a significant optimisation with very little loss in precision.
+topKProbability double default=1.0 + # Is multi-level dispatch configured for this cluster # Deprecated, will go away soon, NOOP useMultilevelDispatch bool default=false diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/ModelContextImpl.java b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/ModelContextImpl.java index 2b25b69d09c..a3624f52139 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/ModelContextImpl.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/ModelContextImpl.java @@ -146,6 +146,7 @@ public class ModelContextImpl implements ModelContext { private final boolean isBootstrap; private final boolean isFirstTimeDeployment; private final boolean useAdaptiveDispatch; + private final double defaultTopKprobability; private final Optional<EndpointCertificateSecrets> endpointCertificateSecrets; private final double defaultTermwiseLimit; private final boolean useBucketSpaceMetric; @@ -182,6 +183,8 @@ public class ModelContextImpl implements ModelContext { this.endpointCertificateSecrets = endpointCertificateSecrets; defaultTermwiseLimit = Flags.DEFAULT_TERM_WISE_LIMIT.bindTo(flagSource) .with(FetchVector.Dimension.APPLICATION_ID, applicationId.serializedForm()).value(); + defaultTopKprobability = Flags.DEFAULT_TOP_K_PROBABILITY.bindTo(flagSource) + .with(FetchVector.Dimension.APPLICATION_ID, applicationId.serializedForm()).value(); this.useBucketSpaceMetric = Flags.USE_BUCKET_SPACE_METRIC.bindTo(flagSource) .with(FetchVector.Dimension.APPLICATION_ID, applicationId.serializedForm()).value(); this.proxyProtocol = Flags.PROXY_PROTOCOL.bindTo(flagSource) @@ -239,6 +242,11 @@ public class ModelContextImpl implements ModelContext { public double defaultTermwiseLimit() { return defaultTermwiseLimit; } @Override + public double defaultTopKProbability() { + return defaultTopKprobability; + } + + @Override public boolean useBucketSpaceMetric() { return useBucketSpaceMetric; } 
@Override diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/ModelContextImplTest.java b/configserver/src/test/java/com/yahoo/vespa/config/server/ModelContextImplTest.java index 39dccc6b482..03c6bad79a8 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/ModelContextImplTest.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/ModelContextImplTest.java @@ -84,6 +84,9 @@ public class ModelContextImplTest { assertEquals(Optional.empty(), context.wantedDockerImageRepository()); assertEquals(new Version(7), context.modelVespaVersion()); assertEquals(new Version(8), context.wantedNodeVespaVersion()); + assertEquals(1.0, context.properties().defaultTermwiseLimit(), 0.0); + assertEquals(1.0, context.properties().defaultTopKProbability(), 0.0); + assertFalse(context.properties().useAdaptiveDispatch()); } } diff --git a/container-dev/pom.xml b/container-dev/pom.xml index 738a4cc8700..1bb06ab9694 100644 --- a/container-dev/pom.xml +++ b/container-dev/pom.xml @@ -176,6 +176,10 @@ <groupId>com.google.protobuf</groupId> <artifactId>protobuf-java</artifactId> </exclusion> + <exclusion> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + </exclusion> </exclusions> </dependency> <dependency> diff --git a/container-search/pom.xml b/container-search/pom.xml index 84ee5b2bc65..6fa32947869 100644 --- a/container-search/pom.xml +++ b/container-search/pom.xml @@ -132,6 +132,11 @@ <scope>compile</scope> </dependency> <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <scope>compile</scope> + </dependency> + <dependency> <groupId>javax.xml.bind</groupId> <artifactId>jaxb-api</artifactId> <scope>test</scope> diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/Dispatcher.java b/container-search/src/main/java/com/yahoo/search/dispatch/Dispatcher.java index 9b42ce03e6d..626cf087aca 100644 --- 
a/container-search/src/main/java/com/yahoo/search/dispatch/Dispatcher.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/Dispatcher.java @@ -51,6 +51,7 @@ public class Dispatcher extends AbstractComponent { public static final String DISPATCH = "dispatch"; private static final String INTERNAL = "internal"; private static final String PROTOBUF = "protobuf"; + private static final String TOP_K_PROBABILITY = "topKProbability"; private static final String INTERNAL_METRIC = "dispatch_internal"; @@ -59,6 +60,9 @@ public class Dispatcher extends AbstractComponent { /** If enabled, search queries will use protobuf rpc */ public static final CompoundName dispatchProtobuf = CompoundName.fromComponents(DISPATCH, PROTOBUF); + /** If set will control computation of how many hits will be fetched from each partition.*/ + public static final CompoundName topKProbability = CompoundName.fromComponents(DISPATCH, TOP_K_PROBABILITY); + /** A model of the search cluster this dispatches to */ private final SearchCluster searchCluster; private final ClusterMonitor clusterMonitor; @@ -80,6 +84,7 @@ public class Dispatcher extends AbstractComponent { argumentType.setBuiltin(true); argumentType.addField(new FieldDescription(INTERNAL, FieldType.booleanType)); argumentType.addField(new FieldDescription(PROTOBUF, FieldType.booleanType)); + argumentType.addField(new FieldDescription(TOP_K_PROBABILITY, FieldType.doubleType)); argumentType.freeze(); } diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java b/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java index cec3e94d551..e62848a7f9e 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/InterleavedSearchInvoker.java @@ -81,7 +81,12 @@ public class InterleavedSearchInvoker extends SearchInvoker implements ResponseM int originalHits = 
query.getHits(); int originalOffset = query.getOffset(); - query.setHits(query.getHits() + query.getOffset()); + int neededHits = originalHits + originalOffset; + Double topkProbabilityOverrride = query.properties().getDouble(Dispatcher.topKProbability); + int q = (topkProbabilityOverrride != null) + ? searchCluster.estimateHitsToFetch(neededHits, invokers.size(), topkProbabilityOverrride) + : searchCluster.estimateHitsToFetch(neededHits, invokers.size()); + query.setHits(q); query.setOffset(0); for (SearchInvoker invoker : invokers) { @@ -321,4 +326,7 @@ public class InterleavedSearchInvoker extends SearchInvoker implements ResponseM protected LinkedBlockingQueue<SearchInvoker> newQueue() { return new LinkedBlockingQueue<>(); } + + // For testing + Collection<SearchInvoker> invokers() { return invokers; } } diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java new file mode 100644 index 00000000000..8003d9c6744 --- /dev/null +++ b/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java @@ -0,0 +1,42 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.search.dispatch; + +import org.apache.commons.math3.distribution.TDistribution; + +/** + * Use StudentT distribution and estimate how many hits you need from each partition + * to get the globally top-k documents with the desired probability + * @author baldersheim + */ +public class TopKEstimator { + private final TDistribution studentT; + private final double defaultP; + private final boolean estimate; + + private static boolean needEstimate(double p) { + return (0.0 < p) && (p < 1.0); + } + public TopKEstimator(double freedom, double defaultProbability) { + this.studentT = new TDistribution(null, freedom); + defaultP = defaultProbability; + estimate = needEstimate(defaultP); + } + double estimateExactK(double k, double n, double p) { + double variance = k * 1/n * (1 - 1/n); + double p_inverse = 1 - (1 - p)/n; + return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance); + } + double estimateExactK(double k, double n) { + return estimateExactK(k, n, defaultP); + } + public int estimateK(int k, int n) { + return (estimate && n > 1) + ? (int)Math.ceil(estimateExactK(k, n, defaultP)) + : k; + } + public int estimateK(int k, int n, double p) { + return (needEstimate(p) && (n > 1)) + ?
(int)Math.ceil(estimateExactK(k, n, p)) + : k; + } +} diff --git a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java index 27b4472e324..7dfc03fd2d7 100644 --- a/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java +++ b/container-search/src/main/java/com/yahoo/search/dispatch/searchcluster/SearchCluster.java @@ -10,6 +10,7 @@ import com.yahoo.net.HostName; import com.yahoo.prelude.Pong; import com.yahoo.search.cluster.ClusterMonitor; import com.yahoo.search.cluster.NodeManager; +import com.yahoo.search.dispatch.TopKEstimator; import com.yahoo.vespa.config.search.DispatchConfig; import java.util.LinkedHashMap; @@ -38,6 +39,7 @@ public class SearchCluster implements NodeManager<Node> { private final ImmutableList<Group> orderedGroups; private final VipStatus vipStatus; private final PingFactory pingFactory; + private final TopKEstimator hitEstimator; private long nextLogTime = 0; /** @@ -76,6 +78,7 @@ public class SearchCluster implements NodeManager<Node> { for (Node node : nodes) nodesByHostBuilder.put(node.hostname(), node); this.nodesByHost = nodesByHostBuilder.build(); + hitEstimator = new TopKEstimator(30.0, dispatchConfig.topKProbability()); this.localCorpusDispatchTarget = findLocalCorpusDispatchTarget(HostName.getLocalhost(), size, @@ -240,6 +243,13 @@ public class SearchCluster implements NodeManager<Node> { vipStatus.removeFromRotation(clusterId); } + public int estimateHitsToFetch(int wantedHits, int numPartitions) { + return hitEstimator.estimateK(wantedHits, numPartitions); + } + public int estimateHitsToFetch(int wantedHits, int numPartitions, double topKProbability) { + return hitEstimator.estimateK(wantedHits, numPartitions, topKProbability); + } + public boolean hasInformationAboutAllNodes() { return nodesByHost.values().stream().allMatch(node -> node.isWorking() != null); } diff 
--git a/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java index 27685426cf8..e16f09a58ab 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/InterleavedSearchInvokerTest.java @@ -204,6 +204,33 @@ public class InterleavedSearchInvokerTest { private static final List<Double> A5Aux = Arrays.asList(-1.0,11.0,8.5,7.5,-7.0,3.0,2.0); private static final List<Double> B5Aux = Arrays.asList(9.0,8.0,-3.0,7.0,6.0,1.0, -1.0); + private void validateThatTopKProbabilityOverrideTakesEffect(Double topKProbability, int expectedK) throws IOException { + InterleavedSearchInvoker invoker = createInterLeavedTestInvoker(A5, B5); + query.setHits(8); + query.properties().set(Dispatcher.topKProbability, topKProbability); + SearchInvoker [] invokers = invoker.invokers().toArray(new SearchInvoker[0]); + Result result = invoker.search(query, null); + assertEquals(2, invokers.length); + assertEquals(expectedK, ((MockInvoker)invokers[0]).hitsRequested); + assertEquals(8, result.hits().size()); + assertEquals(11.0, result.hits().get(0).getRelevance().getScore(), DELTA); + assertEquals(9.0, result.hits().get(1).getRelevance().getScore(), DELTA); + assertEquals(8.5, result.hits().get(2).getRelevance().getScore(), DELTA); + assertEquals(8.0, result.hits().get(3).getRelevance().getScore(), DELTA); + assertEquals(7.5, result.hits().get(4).getRelevance().getScore(), DELTA); + assertEquals(7.0, result.hits().get(5).getRelevance().getScore(), DELTA); + assertEquals(6.0, result.hits().get(6).getRelevance().getScore(), DELTA); + assertEquals(3.0, result.hits().get(7).getRelevance().getScore(), DELTA); + assertEquals(0, result.getQuery().getOffset()); + assertEquals(8, result.getQuery().getHits()); + } + + @Test + public void 
requireThatTopKProbabilityOverrideTakesEffect() throws IOException { + validateThatTopKProbabilityOverrideTakesEffect(null, 8); + validateThatTopKProbabilityOverrideTakesEffect(0.8, 6); + } + @Test public void requireThatMergeOfConcreteHitsObeySorting() throws IOException { InterleavedSearchInvoker invoker = createInterLeavedTestInvoker(A5, B5); diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/MockInvoker.java b/container-search/src/test/java/com/yahoo/search/dispatch/MockInvoker.java index c5fbda7c2f5..c159293d7d9 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/MockInvoker.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/MockInvoker.java @@ -17,6 +17,7 @@ class MockInvoker extends SearchInvoker { private final Coverage coverage; private Query query; private List<Hit> hits; + int hitsRequested; protected MockInvoker(int key, Coverage coverage) { super(Optional.of(new Node(key, "?", 0))); @@ -35,6 +36,7 @@ class MockInvoker extends SearchInvoker { @Override protected void sendSearchRequest(Query query) throws IOException { this.query = query; + hitsRequested = query.getHits(); } @Override diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java new file mode 100644 index 00000000000..c14e4f984f1 --- /dev/null +++ b/container-search/src/test/java/com/yahoo/search/dispatch/TopKEstimatorTest.java @@ -0,0 +1,28 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.search.dispatch; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class TopKEstimatorTest { + @Test + public void requireHitsAreEstimatedAccordingToPartitionsAndProbability() { + TopKEstimator estimator = new TopKEstimator(30, 0.999); + assertEquals(91.97368471911312, estimator.estimateExactK(200, 3), 0.0); + assertEquals(92, estimator.estimateK(200, 3)); + assertEquals(37.96328109101396, estimator.estimateExactK(200, 10), 0.0); + assertEquals(38, estimator.estimateK(200, 10)); + assertEquals(23.815737601023095, estimator.estimateExactK(200, 20), 0.0); + assertEquals(24, estimator.estimateK(200, 20)); + + assertEquals(37.96328109101396, estimator.estimateExactK(200, 10, 0.999), 0.0); + assertEquals(38, estimator.estimateK(200, 10, 0.999)); + assertEquals(34.36212304875885, estimator.estimateExactK(200, 10, 0.99), 0.0); + assertEquals(35, estimator.estimateK(200, 10, 0.99)); + assertEquals(41.44244358524574, estimator.estimateExactK(200, 10, 0.9999), 0.0); + assertEquals(42, estimator.estimateK(200, 10, 0.9999)); + assertEquals(44.909040374464155, estimator.estimateExactK(200, 10, 0.99999), 0.0); + assertEquals(45, estimator.estimateK(200, 10, 0.99999)); + } +} diff --git a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java index ad281aeda7d..09024150a9a 100644 --- a/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java +++ b/container-search/src/test/java/com/yahoo/search/dispatch/searchcluster/SearchClusterTest.java @@ -8,6 +8,7 @@ import com.yahoo.net.HostName; import com.yahoo.prelude.Pong; import com.yahoo.search.cluster.ClusterMonitor; import com.yahoo.search.dispatch.MockSearchCluster; +import com.yahoo.search.dispatch.TopKEstimator; import com.yahoo.search.result.ErrorMessage; import org.junit.Test; diff --git 
a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java index 5166568348d..9c06a3b269f 100644 --- a/flags/src/main/java/com/yahoo/vespa/flags/Flags.java +++ b/flags/src/main/java/com/yahoo/vespa/flags/Flags.java @@ -148,7 +148,13 @@ public class Flags { public static final UnboundDoubleFlag DEFAULT_TERM_WISE_LIMIT = defineDoubleFlag( "default-term-wise-limit", 1.0, - "Node resource memory in Gb for admin cluster nodes", + "Default limit for when to apply termwise query evaluation", + "Takes effect at redeployment", + APPLICATION_ID); + + public static final UnboundDoubleFlag DEFAULT_TOP_K_PROBABILITY = defineDoubleFlag( + "default-top-k-probability", 1.0, + "Default probability that you will get the globally top K documents when merging many partitions.", "Takes effect at redeployment", APPLICATION_ID); diff --git a/parent/pom.xml b/parent/pom.xml index a6c606d2c31..454b4677cfa 100644 --- a/parent/pom.xml +++ b/parent/pom.xml @@ -479,6 +479,11 @@ <version>${athenz.version}</version> </dependency> <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>${commons.math3.version}</version> + </dependency> + <dependency> <groupId>commons-cli</groupId> <artifactId>commons-cli</artifactId> <version>1.4</version> @@ -755,6 +760,7 @@ --> <curator.version>2.9.1</curator.version> <jna.version>4.5.2</jna.version> + <commons.math3.version>3.6.1</commons.math3.version> <junit.version>5.4.2</junit.version> <maven-assembly-plugin.version>3.1.1</maven-assembly-plugin.version> <maven-bundle-plugin.version>3.5.0</maven-bundle-plugin.version> diff --git a/processing/src/main/java/com/yahoo/processing/request/Properties.java b/processing/src/main/java/com/yahoo/processing/request/Properties.java index cadc658417b..9362de59203 100644 --- a/processing/src/main/java/com/yahoo/processing/request/Properties.java +++ b/processing/src/main/java/com/yahoo/processing/request/Properties.java @@ 
-543,7 +543,7 @@ public class Properties implements Cloneable { /** * Returns a property as a Double * - * @return the integer value of the name, or null if the property is null + * @return the double value of the name, or null if the property is null * @throws NumberFormatException if the given parameter exists but have a value which * is not parseable as a number */ @@ -554,7 +554,7 @@ public class Properties implements Cloneable { /** * Returns a property as a Double * - * @return the integer value of the name, or null if the property is null + * @return the double value of the name, or null if the property is null * @throws NumberFormatException if the given parameter exists but have a value which * is not parseable as a number */ |