diff options
12 files changed, 80 insertions, 11 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp index 0b2660824c0..919309c5dae 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp @@ -9,6 +9,7 @@ #include <vespa/searchlib/queryeval/intermediate_blueprints.h> #include <vespa/searchlib/queryeval/equiv_blueprint.h> #include <vespa/searchlib/queryeval/get_weight_from_node.h> +#include <vespa/searchlib/attribute/attribute_blueprint_params.h> #include <vespa/vespalib/util/issue.h> using namespace search::queryeval; @@ -21,7 +22,7 @@ namespace { struct Mixer { std::unique_ptr<OrBlueprint> attributes; - Mixer() : attributes() {} + Mixer() noexcept: attributes() {} void addAttribute(Blueprint::UP attr) { if ( ! attributes) { @@ -66,7 +67,7 @@ private: void buildIntermediate(IntermediateBlueprint *b, NodeType &n) __attribute__((noinline)); void buildWeakAnd(ProtonWeakAnd &n) { - auto *wand = new WeakAndBlueprint(n.getTargetNumHits()); + auto *wand = new WeakAndBlueprint(n.getTargetNumHits(), _requestContext.get_attribute_blueprint_params().weakand_range); Blueprint::UP result(wand); for (auto node : n.getChildren()) { uint32_t weight = getWeightFromNode(*node).percent(); diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp index 532ec2f63bd..06290386a31 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp @@ -340,6 +340,7 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu double upper_limit = GlobalFilterUpperLimit::lookup(rank_properties, rank_setup.get_global_filter_upper_limit()); double target_hits_max_adjustment_factor = TargetHitsMaxAdjustmentFactor::lookup(rank_properties, rank_setup.get_target_hits_max_adjustment_factor()); auto fuzzy_matching_algorithm = FuzzyAlgorithm::lookup(rank_properties, rank_setup.get_fuzzy_matching_algorithm()); + double weakand_range = temporary::WeakAndRange::lookup(rank_properties, rank_setup.get_weakand_range()); // Note that we count the reserved docid 0 as active. // This ensures that when searchable-copies=1, the ratio is 1.0. @@ -348,7 +349,8 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu return {lower_limit * active_hit_ratio, upper_limit * active_hit_ratio, target_hits_max_adjustment_factor, - fuzzy_matching_algorithm}; + fuzzy_matching_algorithm, + weakand_range}; } AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext, diff --git a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp index e1f3f0805d9..8a0bc28f4dd 100644 --- a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp +++ b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp @@ -63,4 +63,27 @@ TEST("require that DotProductScorer calculates term score") EXPECT_EQUAL(11u, itr->_unpackDocId); } +TEST("test bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer(1000000, 1.0); + EXPECT_EQUAL(13410046, scorer.calculateMaxScore(1, 1)); + EXPECT_EQUAL(11464136, scorer.calculateMaxScore(10, 1)); + EXPECT_EQUAL(6907256, scorer.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(4605121, scorer.calculateMaxScore(10000, 1)); + EXPECT_EQUAL(2302581, scorer.calculateMaxScore(100000, 1)); + EXPECT_EQUAL(693147, scorer.calculateMaxScore(500000, 1)); + EXPECT_EQUAL(105360, scorer.calculateMaxScore(900000, 1)); + EXPECT_EQUAL(10050, scorer.calculateMaxScore(990000, 1)); +} + +TEST("test limited range of bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer08(1000000, 0.8); + wand::Bm25TermFrequencyScorer scorer10(1000000, 1.0); + EXPECT_EQUAL(8207814, scorer08.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(2690049, scorer08.calculateMaxScore(990000, 1)); + EXPECT_EQUAL(6907256, scorer10.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(10050, scorer10.calculateMaxScore(990000, 1)); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h index e2928710a32..ac6fc6f603a 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h @@ -16,15 +16,18 @@ struct AttributeBlueprintParams double global_filter_upper_limit; double target_hits_max_adjustment_factor; vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm; + double weakand_range; AttributeBlueprintParams(double global_filter_lower_limit_in, double global_filter_upper_limit_in, double target_hits_max_adjustment_factor_in, - vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in) + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in, + double weakand_range_in) : global_filter_lower_limit(global_filter_lower_limit_in), global_filter_upper_limit(global_filter_upper_limit_in), target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in), - fuzzy_matching_algorithm(fuzzy_matching_algorithm_in) + fuzzy_matching_algorithm(fuzzy_matching_algorithm_in), + weakand_range(weakand_range_in) { } @@ -32,7 +35,8 @@ struct AttributeBlueprintParams : AttributeBlueprintParams(fef::indexproperties::matching::GlobalFilterLowerLimit::DEFAULT_VALUE, fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE, fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE, - fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE) + fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE, + fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE) { } }; diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index 4637ad5a4e8..1f88c34bef3 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -179,6 +179,21 @@ namespace onsummary { namespace temporary { +const vespalib::string WeakAndRange::NAME("vespa.weakand.range"); +const double WeakAndRange::DEFAULT_VALUE(0.0); + +double +WeakAndRange::lookup(const Properties &props) +{ + return lookup(props, DEFAULT_VALUE); +} + +double +WeakAndRange::lookup(const Properties &props, double defaultValue) +{ + return lookupDouble(props, NAME, defaultValue); +} + } namespace mutate { diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index db8de8209a9..db97db0d894 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -178,6 +178,18 @@ namespace mutate { // Add temporary flags used for safe rollout of new features here namespace temporary { +/** + * A number in the range [0,1] for the effective idf range for WeakAndOperator. + * 1.0 will give the complete range as used by default by bm25. + * idf = (1.0 - range) + (range * idf) + * 0.0 which is default gives default legacy behavior. + **/ +struct WeakAndRange { + static const vespalib::string NAME; + static const double DEFAULT_VALUE; + static double lookup(const Properties &props); + static double lookup(const Properties &props, double defaultValue); +}; } namespace mutate::on_match { diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index aadc5300ede..25588cf3229 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -71,6 +71,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i _global_filter_lower_limit(0.0), _global_filter_upper_limit(1.0), _target_hits_max_adjustment_factor(20.0), + _weakand_range(0.0), _fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm::DfaTable), _mutateOnMatch(), _mutateOnFirstPhase(), @@ -126,6 +127,7 @@ RankSetup::configure() set_global_filter_upper_limit(matching::GlobalFilterUpperLimit::lookup(_indexEnv.getProperties())); set_target_hits_max_adjustment_factor(matching::TargetHitsMaxAdjustmentFactor::lookup(_indexEnv.getProperties())); set_fuzzy_matching_algorithm(matching::FuzzyAlgorithm::lookup(_indexEnv.getProperties())); + set_weakand_range(temporary::WeakAndRange::lookup(_indexEnv.getProperties())); _mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties()); _mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties()); _mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index d8b977a0331..f20ecd4b42b 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -80,6 +80,7 @@ private: double _global_filter_lower_limit; double _global_filter_upper_limit; double _target_hits_max_adjustment_factor; + double _weakand_range; vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm; MutateOperation _mutateOnMatch; MutateOperation _mutateOnFirstPhase; @@ -402,6 +403,8 @@ public: double get_target_hits_max_adjustment_factor() const { return _target_hits_max_adjustment_factor; } void set_fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm v) { _fuzzy_matching_algorithm = v; } vespalib::FuzzyMatchingAlgorithm get_fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; } + void set_weakand_range(double v) { _weakand_range = v; } + double get_weakand_range() const { return _weakand_range; } /** * This method may be used to indicate that certain features diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp index 74c8b3534b8..99f7604e1a3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp @@ -492,9 +492,9 @@ WeakAndBlueprint::createIntermediateSearch(MultiSearch::Children sub_searches, _weights[i], getChild(i).getState().estimate().estHits); } - return (true) + return (_idf_range == 0.0) ? WeakAndSearch::create(terms, wand::TermFrequencyScorer(), _n, strict()) - : WeakAndSearch::create(terms, wand::Bm25TermFrequencyScorer(get_docid_limit(), 1.0), _n, strict()); + : WeakAndSearch::create(terms, wand::Bm25TermFrequencyScorer(get_docid_limit(), _idf_range), _n, strict()); } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h index ade4c9318e4..7f4796c5f43 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h @@ -90,6 +90,7 @@ class WeakAndBlueprint : public IntermediateBlueprint { private: uint32_t _n; + float _idf_range; std::vector<uint32_t> _weights; AnyFlow my_flow(InFlow in_flow) const override; @@ -107,7 +108,8 @@ public: fef::MatchData &md) const override; SearchIterator::UP createFilterSearch(FilterConstraint constraint) const override; - explicit WeakAndBlueprint(uint32_t n) noexcept : _n(n) {} + explicit WeakAndBlueprint(uint32_t n) noexcept : WeakAndBlueprint(n, 0.0) {} + WeakAndBlueprint(uint32_t n, float idf_range) noexcept : _n(n), _idf_range(idf_range), _weights() {} ~WeakAndBlueprint() override; void addTerm(Blueprint::UP bp, uint32_t weight) { addChild(std::move(bp)); diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h index 69901993dfe..bc666c9996b 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h @@ -384,7 +384,7 @@ DualHeap<FutureHeap, PastHeap>::stringify() const { } //----------------------------------------------------------------------------- -#define TermFrequencyScorer_TERM_SCORE_FACTOR 1000000.0 +constexpr double TermFrequencyScorer_TERM_SCORE_FACTOR = 1000000.0; /** * Scorer used with WeakAndAlgorithm that calculates a pseudo term frequency @@ -416,9 +416,13 @@ public: _range(range), _max_idf(Bm25Executor::calculate_inverse_document_frequency(1, _num_docs)) { } + double apply_range(double idf) const noexcept { + return (1.0 - _range)*_max_idf + _range * idf; + } // weight * idf, scaled to fixedpoint score_t calculateMaxScore(double estHits, double weight) const noexcept { - return weight * Bm25Executor::calculate_inverse_document_frequency(estHits, _num_docs); + return score_t(TermFrequencyScorer_TERM_SCORE_FACTOR * weight * + apply_range(Bm25Executor::calculate_inverse_document_frequency(estHits, _num_docs))); } score_t calculateMaxScore(const Term &term) const noexcept { diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp index 58ffcfe17bc..b0e1fee8e51 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp @@ -145,6 +145,7 @@ WeakAndSearch::create(const Terms &terms, uint32_t n, bool strict) //----------------------------------------------------------------------------- template SearchIterator::UP WeakAndSearch::create<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::create<wand::Bm25TermFrequencyScorer>(const Terms &terms, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict); template SearchIterator::UP WeakAndSearch::createArrayWand<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); template SearchIterator::UP WeakAndSearch::createHeapWand<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); |