diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-04-29 13:44:11 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-04-29 13:44:11 +0000 |
commit | 388c29c6db7dea0ad256db7b3abf9e5f9d5cfb60 (patch) | |
tree | 5248ae2b5212eb9b28df1509e487ff1a5f951916 /searchlib | |
parent | f58fab3b4c021ff03f9d8e70f3bc029de8c0d13b (diff) |
Wire control of scoring range for weakand scorer from rank/query properties.
Diffstat (limited to 'searchlib')
10 files changed, 74 insertions, 8 deletions
diff --git a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp index e1f3f0805d9..8a0bc28f4dd 100644 --- a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp +++ b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp @@ -63,4 +63,27 @@ TEST("require that DotProductScorer calculates term score") EXPECT_EQUAL(11u, itr->_unpackDocId); } +TEST("test bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer(1000000, 1.0); + EXPECT_EQUAL(13410046, scorer.calculateMaxScore(1, 1)); + EXPECT_EQUAL(11464136, scorer.calculateMaxScore(10, 1)); + EXPECT_EQUAL(6907256, scorer.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(4605121, scorer.calculateMaxScore(10000, 1)); + EXPECT_EQUAL(2302581, scorer.calculateMaxScore(100000, 1)); + EXPECT_EQUAL(693147, scorer.calculateMaxScore(500000, 1)); + EXPECT_EQUAL(105360, scorer.calculateMaxScore(900000, 1)); + EXPECT_EQUAL(10050, scorer.calculateMaxScore(990000, 1)); +} + +TEST("test limited range of bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer08(1000000, 0.8); + wand::Bm25TermFrequencyScorer scorer10(1000000, 1.0); + EXPECT_EQUAL(8207814, scorer08.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(2690049, scorer08.calculateMaxScore(990000, 1)); + EXPECT_EQUAL(6907256, scorer10.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(10050, scorer10.calculateMaxScore(990000, 1)); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h index e2928710a32..ac6fc6f603a 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h @@ -16,15 +16,18 @@ struct AttributeBlueprintParams double global_filter_upper_limit; double target_hits_max_adjustment_factor; 
vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm; + double weakand_range; AttributeBlueprintParams(double global_filter_lower_limit_in, double global_filter_upper_limit_in, double target_hits_max_adjustment_factor_in, - vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in) + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in, + double weakand_range_in) : global_filter_lower_limit(global_filter_lower_limit_in), global_filter_upper_limit(global_filter_upper_limit_in), target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in), - fuzzy_matching_algorithm(fuzzy_matching_algorithm_in) + fuzzy_matching_algorithm(fuzzy_matching_algorithm_in), + weakand_range(weakand_range_in) { } @@ -32,7 +35,8 @@ struct AttributeBlueprintParams : AttributeBlueprintParams(fef::indexproperties::matching::GlobalFilterLowerLimit::DEFAULT_VALUE, fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE, fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE, - fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE) + fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE, + fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE) { } }; diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index 4637ad5a4e8..1f88c34bef3 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -179,6 +179,21 @@ namespace onsummary { namespace temporary { +const vespalib::string WeakAndRange::NAME("vespa.weakand.range"); +const double WeakAndRange::DEFAULT_VALUE(0.0); + +double +WeakAndRange::lookup(const Properties &props) +{ + return lookup(props, DEFAULT_VALUE); +} + +double +WeakAndRange::lookup(const Properties &props, double defaultValue) +{ + return lookupDouble(props, NAME, defaultValue); +} + } namespace mutate { diff --git 
a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index db8de8209a9..db97db0d894 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -178,6 +178,18 @@ namespace mutate { // Add temporary flags used for safe rollout of new features here namespace temporary { +/** + * A number in the range [0,1] for the effective idf range for WeakAndOperator. + * 1.0 will give the complete range as used by default by bm25. + * idf = (1.0 - range) * max_idf + (range * idf) + * 0.0 which is default gives default legacy behavior. + **/ +struct WeakAndRange { + static const vespalib::string NAME; + static const double DEFAULT_VALUE; + static double lookup(const Properties &props); + static double lookup(const Properties &props, double defaultValue); +}; } namespace mutate::on_match { diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index aadc5300ede..25588cf3229 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -71,6 +71,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i _global_filter_lower_limit(0.0), _global_filter_upper_limit(1.0), _target_hits_max_adjustment_factor(20.0), + _weakand_range(0.0), _fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm::DfaTable), _mutateOnMatch(), _mutateOnFirstPhase(), @@ -126,6 +127,7 @@ RankSetup::configure() set_global_filter_upper_limit(matching::GlobalFilterUpperLimit::lookup(_indexEnv.getProperties())); set_target_hits_max_adjustment_factor(matching::TargetHitsMaxAdjustmentFactor::lookup(_indexEnv.getProperties())); set_fuzzy_matching_algorithm(matching::FuzzyAlgorithm::lookup(_indexEnv.getProperties())); + set_weakand_range(temporary::WeakAndRange::lookup(_indexEnv.getProperties())); _mutateOnMatch._attribute = 
mutate::on_match::Attribute::lookup(_indexEnv.getProperties()); _mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties()); _mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index d8b977a0331..f20ecd4b42b 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -80,6 +80,7 @@ private: double _global_filter_lower_limit; double _global_filter_upper_limit; double _target_hits_max_adjustment_factor; + double _weakand_range; vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm; MutateOperation _mutateOnMatch; MutateOperation _mutateOnFirstPhase; @@ -402,6 +403,8 @@ public: double get_target_hits_max_adjustment_factor() const { return _target_hits_max_adjustment_factor; } void set_fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm v) { _fuzzy_matching_algorithm = v; } vespalib::FuzzyMatchingAlgorithm get_fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; } + void set_weakand_range(double v) { _weakand_range = v; } + double get_weakand_range() const { return _weakand_range; } /** * This method may be used to indicate that certain features diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp index 74c8b3534b8..99f7604e1a3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp @@ -492,9 +492,9 @@ WeakAndBlueprint::createIntermediateSearch(MultiSearch::Children sub_searches, _weights[i], getChild(i).getState().estimate().estHits); } - return (true) + return (_idf_range == 0.0) ? 
WeakAndSearch::create(terms, wand::TermFrequencyScorer(), _n, strict()) - : WeakAndSearch::create(terms, wand::Bm25TermFrequencyScorer(get_docid_limit(), 1.0), _n, strict()); + : WeakAndSearch::create(terms, wand::Bm25TermFrequencyScorer(get_docid_limit(), _idf_range), _n, strict()); } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h index ade4c9318e4..7f4796c5f43 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h @@ -90,6 +90,7 @@ class WeakAndBlueprint : public IntermediateBlueprint { private: uint32_t _n; + float _idf_range; std::vector<uint32_t> _weights; AnyFlow my_flow(InFlow in_flow) const override; @@ -107,7 +108,8 @@ public: fef::MatchData &md) const override; SearchIterator::UP createFilterSearch(FilterConstraint constraint) const override; - explicit WeakAndBlueprint(uint32_t n) noexcept : _n(n) {} + explicit WeakAndBlueprint(uint32_t n) noexcept : WeakAndBlueprint(n, 0.0) {} + WeakAndBlueprint(uint32_t n, float idf_range) noexcept : _n(n), _idf_range(idf_range), _weights() {} ~WeakAndBlueprint() override; void addTerm(Blueprint::UP bp, uint32_t weight) { addChild(std::move(bp)); diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h index 69901993dfe..bc666c9996b 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h @@ -384,7 +384,7 @@ DualHeap<FutureHeap, PastHeap>::stringify() const { } //----------------------------------------------------------------------------- -#define TermFrequencyScorer_TERM_SCORE_FACTOR 1000000.0 +constexpr double TermFrequencyScorer_TERM_SCORE_FACTOR = 1000000.0; /** * Scorer used with WeakAndAlgorithm that calculates a pseudo term frequency @@ -416,9 +416,13 
@@ public: _range(range), _max_idf(Bm25Executor::calculate_inverse_document_frequency(1, _num_docs)) { } + double apply_range(double idf) const noexcept { + return (1.0 - _range)*_max_idf + _range * idf; + } // weight * idf, scaled to fixedpoint score_t calculateMaxScore(double estHits, double weight) const noexcept { - return weight * Bm25Executor::calculate_inverse_document_frequency(estHits, _num_docs); + return score_t(TermFrequencyScorer_TERM_SCORE_FACTOR * weight * + apply_range(Bm25Executor::calculate_inverse_document_frequency(estHits, _num_docs))); } score_t calculateMaxScore(const Term &term) const noexcept { diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp index 58ffcfe17bc..b0e1fee8e51 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp @@ -145,6 +145,7 @@ WeakAndSearch::create(const Terms &terms, uint32_t n, bool strict) //----------------------------------------------------------------------------- template SearchIterator::UP WeakAndSearch::create<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::create<wand::Bm25TermFrequencyScorer>(const Terms &terms, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict); template SearchIterator::UP WeakAndSearch::createArrayWand<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); template SearchIterator::UP WeakAndSearch::createHeapWand<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); |