diff options
17 files changed, 199 insertions, 86 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp index 0b2660824c0..919309c5dae 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp @@ -9,6 +9,7 @@ #include <vespa/searchlib/queryeval/intermediate_blueprints.h> #include <vespa/searchlib/queryeval/equiv_blueprint.h> #include <vespa/searchlib/queryeval/get_weight_from_node.h> +#include <vespa/searchlib/attribute/attribute_blueprint_params.h> #include <vespa/vespalib/util/issue.h> using namespace search::queryeval; @@ -21,7 +22,7 @@ namespace { struct Mixer { std::unique_ptr<OrBlueprint> attributes; - Mixer() : attributes() {} + Mixer() noexcept: attributes() {} void addAttribute(Blueprint::UP attr) { if ( ! attributes) { @@ -66,7 +67,7 @@ private: void buildIntermediate(IntermediateBlueprint *b, NodeType &n) __attribute__((noinline)); void buildWeakAnd(ProtonWeakAnd &n) { - auto *wand = new WeakAndBlueprint(n.getTargetNumHits()); + auto *wand = new WeakAndBlueprint(n.getTargetNumHits(), _requestContext.get_attribute_blueprint_params().weakand_range); Blueprint::UP result(wand); for (auto node : n.getChildren()) { uint32_t weight = getWeightFromNode(*node).percent(); diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp index 532ec2f63bd..06290386a31 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp @@ -340,6 +340,7 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu double upper_limit = GlobalFilterUpperLimit::lookup(rank_properties, rank_setup.get_global_filter_upper_limit()); double target_hits_max_adjustment_factor = TargetHitsMaxAdjustmentFactor::lookup(rank_properties, rank_setup.get_target_hits_max_adjustment_factor()); auto fuzzy_matching_algorithm = FuzzyAlgorithm::lookup(rank_properties, rank_setup.get_fuzzy_matching_algorithm()); + double weakand_range = temporary::WeakAndRange::lookup(rank_properties, rank_setup.get_weakand_range()); // Note that we count the reserved docid 0 as active. // This ensures that when searchable-copies=1, the ratio is 1.0. @@ -348,7 +349,8 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu return {lower_limit * active_hit_ratio, upper_limit * active_hit_ratio, target_hits_max_adjustment_factor, - fuzzy_matching_algorithm}; + fuzzy_matching_algorithm, + weakand_range}; } AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext, diff --git a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp index bddc9f92111..48ddeed47e9 100644 --- a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp +++ b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp @@ -27,8 +27,9 @@ LOG_SETUP("blueprint_test"); using namespace search::queryeval; -using namespace search::fef; using namespace search::query; +using search::fef::MatchData; +using search::queryeval::Blueprint; using search::BitVector; using BlueprintVector = std::vector<std::unique_ptr<Blueprint>>; using vespalib::Slime; diff --git a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp index 5e056eb6c0e..55dc3868ed4 100644 --- a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp +++ b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp @@ -29,17 +29,17 @@ struct Stats { size_t unpackCnt; size_t skippedDocs; size_t skippedHits; - Stats() : hitCnt(0), seekCnt(0), unpackCnt(0), + Stats() noexcept : hitCnt(0), seekCnt(0), unpackCnt(0), skippedDocs(0), skippedHits(0) {} - void hit() { + void hit() noexcept { ++hitCnt; } - void seek(size_t docs, size_t hits) { + void seek(size_t docs, size_t hits) noexcept { ++seekCnt; skippedDocs += docs; skippedHits += hits; } - void unpack() { + void unpack() noexcept { ++unpackCnt; } void print() { @@ -77,7 +77,7 @@ struct ModSearch : SearchIterator { } } void doUnpack(uint32_t docid) override { - if (tfmd != NULL) { + if (tfmd != nullptr) { tfmd->reset(docid); search::fef::TermFieldMatchDataPosition pos; pos.setElementWeight(info.getMaxWeight()); @@ -96,16 +96,16 @@ ModSearch::~ModSearch() = default; struct WandFactory { virtual std::string name() const = 0; virtual SearchIterator::UP create(const wand::Terms &terms) = 0; - virtual ~WandFactory() {} + virtual ~WandFactory() = default; }; struct VespaWandFactory : WandFactory { uint32_t n; - VespaWandFactory(uint32_t n_in) : n(n_in) {} + explicit VespaWandFactory(uint32_t n_in) noexcept : n(n_in) {} ~VespaWandFactory() override; - virtual std::string name() const override { return make_string("VESPA WAND (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(WeakAndSearch::create(terms, n, true)); + std::string name() const override { return make_string("VESPA WAND (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return WeakAndSearch::create(terms, n, true); } }; @@ -113,11 +113,11 @@ VespaWandFactory::~VespaWandFactory() = default; struct VespaArrayWandFactory : WandFactory { uint32_t n; - VespaArrayWandFactory(uint32_t n_in) : n(n_in) {} + explicit VespaArrayWandFactory(uint32_t n_in) noexcept : n(n_in) {} ~VespaArrayWandFactory() override; - virtual std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(WeakAndSearch::createArrayWand(terms, n, true)); + std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return WeakAndSearch::createArrayWand(terms, wand::TermFrequencyScorer(), n, true); } }; @@ -125,11 +125,11 @@ VespaArrayWandFactory::~VespaArrayWandFactory() = default; struct VespaHeapWandFactory : WandFactory { uint32_t n; - VespaHeapWandFactory(uint32_t n_in) : n(n_in) {} + explicit VespaHeapWandFactory(uint32_t n_in) noexcept : n(n_in) {} ~VespaHeapWandFactory() override; - virtual std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(WeakAndSearch::createHeapWand(terms, n, true)); + std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return WeakAndSearch::createHeapWand(terms, wand::TermFrequencyScorer(), n, true); } }; @@ -138,39 +138,39 @@ VespaHeapWandFactory::~VespaHeapWandFactory() = default; struct VespaParallelWandFactory : public WandFactory { SharedWeakAndPriorityQueue scores; TermFieldMatchData rootMatchData; - VespaParallelWandFactory(uint32_t n) : scores(n), rootMatchData() {} + explicit VespaParallelWandFactory(uint32_t n) noexcept : scores(n), rootMatchData() {} ~VespaParallelWandFactory() override; - virtual std::string name() const override { return make_string("VESPA PWAND (n=%u)", scores.getScoresToTrack()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(ParallelWeakAndSearch::create(terms, + std::string name() const override { return make_string("VESPA PWAND (n=%u)", scores.getScoresToTrack()); } + SearchIterator::UP create(const wand::Terms &terms) override { + return ParallelWeakAndSearch::create(terms, PWMatchParams(scores, 0, 1, 1), - PWRankParams(rootMatchData, MatchData::UP()), true)); + PWRankParams(rootMatchData, {}), true); } }; VespaParallelWandFactory::~VespaParallelWandFactory() = default; struct VespaParallelArrayWandFactory : public VespaParallelWandFactory { - VespaParallelArrayWandFactory(uint32_t n) : VespaParallelWandFactory(n) {} + VespaParallelArrayWandFactory(uint32_t n) noexcept : VespaParallelWandFactory(n) {} ~VespaParallelArrayWandFactory() override; - virtual std::string name() const override { return make_string("VESPA ARRAY PWAND (n=%u)", scores.getScoresToTrack()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(ParallelWeakAndSearch::createArrayWand(terms, + std::string name() const override { return make_string("VESPA ARRAY PWAND (n=%u)", scores.getScoresToTrack()); } + SearchIterator::UP create(const wand::Terms &terms) override { + return ParallelWeakAndSearch::createArrayWand(terms, PWMatchParams(scores, 0, 1, 1), - PWRankParams(rootMatchData, MatchData::UP()), true)); + PWRankParams(rootMatchData, {}), true); } }; VespaParallelArrayWandFactory::~VespaParallelArrayWandFactory() = default; struct VespaParallelHeapWandFactory : public VespaParallelWandFactory { - VespaParallelHeapWandFactory(uint32_t n) : VespaParallelWandFactory(n) {} + explicit VespaParallelHeapWandFactory(uint32_t n) noexcept : VespaParallelWandFactory(n) {} ~VespaParallelHeapWandFactory() override; - virtual std::string name() const override { return make_string("VESPA HEAP PWAND (n=%u)", scores.getScoresToTrack()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(ParallelWeakAndSearch::createHeapWand(terms, + std::string name() const override { return make_string("VESPA HEAP PWAND (n=%u)", scores.getScoresToTrack()); } + SearchIterator::UP create(const wand::Terms &terms) override { + return ParallelWeakAndSearch::createHeapWand(terms, PWMatchParams(scores, 0, 1, 1), - PWRankParams(rootMatchData, MatchData::UP()), true)); + PWRankParams(rootMatchData, {}), true); } }; @@ -178,11 +178,11 @@ VespaParallelHeapWandFactory::~VespaParallelHeapWandFactory() = default; struct TermFrequencyRiseWandFactory : WandFactory { uint32_t n; - TermFrequencyRiseWandFactory(uint32_t n_in) : n(n_in) {} + explicit TermFrequencyRiseWandFactory(uint32_t n_in) noexcept : n(n_in) {} ~TermFrequencyRiseWandFactory() override; - virtual std::string name() const override { return make_string("RISE WAND TF (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(new rise::TermFrequencyRiseWand(terms, n)); + std::string name() const override { return make_string("RISE WAND TF (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return std::make_unique<rise::TermFrequencyRiseWand>(terms, n); } }; @@ -190,11 +190,11 @@ TermFrequencyRiseWandFactory::~TermFrequencyRiseWandFactory() = default; struct DotProductRiseWandFactory : WandFactory { uint32_t n; - DotProductRiseWandFactory(uint32_t n_in) : n(n_in) {} + explicit DotProductRiseWandFactory(uint32_t n_in) noexcept : n(n_in) {} ~DotProductRiseWandFactory() override; - virtual std::string name() const override { return make_string("RISE WAND DP (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(new rise::DotProductRiseWand(terms, n)); + std::string name() const override { return make_string("RISE WAND DP (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return std::make_unique<rise::DotProductRiseWand>(terms, n); } }; @@ -204,13 +204,13 @@ struct FilterFactory : WandFactory { WandFactory &factory; Stats stats; uint32_t n; - FilterFactory(WandFactory &f, uint32_t n_in) : factory(f), n(n_in) {} + FilterFactory(WandFactory &f, uint32_t n_in) noexcept : factory(f), n(n_in) {} ~FilterFactory() override; - virtual std::string name() const override { return make_string("Filter (mod=%u) [%s]", n, factory.name().c_str()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { + std::string name() const override { return make_string("Filter (mod=%u) [%s]", n, factory.name().c_str()); } + SearchIterator::UP create(const wand::Terms &terms) override { AndNotSearch::Children children; children.push_back(factory.create(terms)); - children.emplace_back(new ModSearch(stats, n, search::endDocId, n, NULL)); + children.emplace_back(new ModSearch(stats, n, search::endDocId, n, nullptr)); return AndNotSearch::create(std::move(children), true); } }; @@ -220,8 +220,8 @@ FilterFactory::~FilterFactory() = default; struct Setup { Stats stats; vespalib::duration minTime; - Setup() : stats(), minTime(10000s) {} - virtual ~Setup() {} + Setup() noexcept : stats(), minTime(10000s) {} + virtual ~Setup() = default; virtual std::string name() const = 0; virtual SearchIterator::UP create() = 0; void perform() { @@ -256,10 +256,10 @@ struct WandSetup : Setup { MatchData::UP matchData; WandSetup(WandFactory &f, uint32_t c, uint32_t l) : Setup(), factory(f), childCnt(c), limit(l), weight(100), matchData() {} ~WandSetup() override; - virtual std::string name() const override { + std::string name() const override { return make_string("Wand Setup (terms=%u,docs=%u) [%s]", childCnt, limit, factory.name().c_str()); } - virtual SearchIterator::UP create() override { + SearchIterator::UP create() override { MatchDataLayout layout; std::vector<TermFieldHandle> handles; for (size_t i = 0; i < childCnt; ++i) { diff --git a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp index e1f3f0805d9..8a0bc28f4dd 100644 --- a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp +++ b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp @@ -63,4 +63,27 @@ TEST("require that DotProductScorer calculates term score") EXPECT_EQUAL(11u, itr->_unpackDocId); } +TEST("test bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer(1000000, 1.0); + EXPECT_EQUAL(13410046, scorer.calculateMaxScore(1, 1)); + EXPECT_EQUAL(11464136, scorer.calculateMaxScore(10, 1)); + EXPECT_EQUAL(6907256, scorer.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(4605121, scorer.calculateMaxScore(10000, 1)); + EXPECT_EQUAL(2302581, scorer.calculateMaxScore(100000, 1)); + EXPECT_EQUAL(693147, scorer.calculateMaxScore(500000, 1)); + EXPECT_EQUAL(105360, scorer.calculateMaxScore(900000, 1)); + EXPECT_EQUAL(10050, scorer.calculateMaxScore(990000, 1)); +} + +TEST("test limited range of bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer08(1000000, 0.8); + wand::Bm25TermFrequencyScorer scorer10(1000000, 1.0); + EXPECT_EQUAL(8207814, scorer08.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(2690049, scorer08.calculateMaxScore(990000, 1)); + EXPECT_EQUAL(6907256, scorer10.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(10050, scorer10.calculateMaxScore(990000, 1)); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h index e2928710a32..ac6fc6f603a 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h @@ -16,15 +16,18 @@ struct AttributeBlueprintParams double global_filter_upper_limit; double target_hits_max_adjustment_factor; vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm; + double weakand_range; AttributeBlueprintParams(double global_filter_lower_limit_in, double global_filter_upper_limit_in, double target_hits_max_adjustment_factor_in, - vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in) + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in, + double weakand_range_in) : global_filter_lower_limit(global_filter_lower_limit_in), global_filter_upper_limit(global_filter_upper_limit_in), target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in), - fuzzy_matching_algorithm(fuzzy_matching_algorithm_in) + fuzzy_matching_algorithm(fuzzy_matching_algorithm_in), + weakand_range(weakand_range_in) { } @@ -32,7 +35,8 @@ struct AttributeBlueprintParams : AttributeBlueprintParams(fef::indexproperties::matching::GlobalFilterLowerLimit::DEFAULT_VALUE, fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE, fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE, - fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE) + fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE, + fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE) { } }; diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp index 505b8166ee7..03d2e94b5d0 100644 --- a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp +++ b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp @@ -68,7 +68,7 @@ Bm25Executor::Bm25Executor(const fef::FieldInfo& field, } double -Bm25Executor::calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count) +Bm25Executor::calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count) noexcept { return std::log(1 + (static_cast<double>(total_doc_count - matching_doc_count + 0.5) / static_cast<double>(matching_doc_count + 0.5))); diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.h b/searchlib/src/vespa/searchlib/features/bm25_feature.h index a1b45375285..637d656990b 100644 --- a/searchlib/src/vespa/searchlib/features/bm25_feature.h +++ b/searchlib/src/vespa/searchlib/features/bm25_feature.h @@ -39,7 +39,7 @@ public: double k1_param, double b_param); - double static calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count); + double static calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count) noexcept; void handle_bind_match_data(const fef::MatchData& match_data) override; void execute(uint32_t docId) override; diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index 4637ad5a4e8..1f88c34bef3 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -179,6 +179,21 @@ namespace onsummary { namespace temporary { +const vespalib::string WeakAndRange::NAME("vespa.weakand.range"); +const double WeakAndRange::DEFAULT_VALUE(0.0); + +double +WeakAndRange::lookup(const Properties &props) +{ + return lookup(props, DEFAULT_VALUE); +} + +double +WeakAndRange::lookup(const Properties &props, double defaultValue) +{ + return lookupDouble(props, NAME, defaultValue); +} + } namespace mutate { diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index db8de8209a9..d047eb13347 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -178,6 +178,18 @@ namespace mutate { // Add temporary flags used for safe rollout of new features here namespace temporary { +/** + * A number in the range [0,1] for the effective idf range for WeakAndOperator. + * 1.0 will give the complete range as used by default by bm25. + * scaled_idf = (1.0 - range) * max_idf + (range * idf) + * 0.0 which is default gives default legacy behavior. + **/ +struct WeakAndRange { + static const vespalib::string NAME; + static const double DEFAULT_VALUE; + static double lookup(const Properties &props); + static double lookup(const Properties &props, double defaultValue); +}; } namespace mutate::on_match { diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index aadc5300ede..25588cf3229 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -71,6 +71,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i _global_filter_lower_limit(0.0), _global_filter_upper_limit(1.0), _target_hits_max_adjustment_factor(20.0), + _weakand_range(0.0), _fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm::DfaTable), _mutateOnMatch(), _mutateOnFirstPhase(), @@ -126,6 +127,7 @@ RankSetup::configure() set_global_filter_upper_limit(matching::GlobalFilterUpperLimit::lookup(_indexEnv.getProperties())); set_target_hits_max_adjustment_factor(matching::TargetHitsMaxAdjustmentFactor::lookup(_indexEnv.getProperties())); set_fuzzy_matching_algorithm(matching::FuzzyAlgorithm::lookup(_indexEnv.getProperties())); + set_weakand_range(temporary::WeakAndRange::lookup(_indexEnv.getProperties())); _mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties()); _mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties()); _mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index d8b977a0331..f20ecd4b42b 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -80,6 +80,7 @@ private: double _global_filter_lower_limit; double _global_filter_upper_limit; double _target_hits_max_adjustment_factor; + double _weakand_range; vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm; MutateOperation _mutateOnMatch; MutateOperation _mutateOnFirstPhase; @@ -402,6 +403,8 @@ public: double get_target_hits_max_adjustment_factor() const { return _target_hits_max_adjustment_factor; } void set_fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm v) { _fuzzy_matching_algorithm = v; } vespalib::FuzzyMatchingAlgorithm get_fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; } + void set_weakand_range(double v) { _weakand_range = v; } + double get_weakand_range() const { return _weakand_range; } /** * This method may be used to indicate that certain features diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp index 33b249572f0..99f7604e1a3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp @@ -492,7 +492,9 @@ WeakAndBlueprint::createIntermediateSearch(MultiSearch::Children sub_searches, _weights[i], getChild(i).getState().estimate().estHits); } - return WeakAndSearch::create(terms, _n, strict()); + return (_idf_range == 0.0) + ? WeakAndSearch::create(terms, wand::TermFrequencyScorer(), _n, strict()) + : WeakAndSearch::create(terms, wand::Bm25TermFrequencyScorer(get_docid_limit(), _idf_range), _n, strict()); } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h index ade4c9318e4..7f4796c5f43 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h @@ -90,6 +90,7 @@ class WeakAndBlueprint : public IntermediateBlueprint { private: uint32_t _n; + float _idf_range; std::vector<uint32_t> _weights; AnyFlow my_flow(InFlow in_flow) const override; @@ -107,7 +108,8 @@ public: fef::MatchData &md) const override; SearchIterator::UP createFilterSearch(FilterConstraint constraint) const override; - explicit WeakAndBlueprint(uint32_t n) noexcept : _n(n) {} + explicit WeakAndBlueprint(uint32_t n) noexcept : WeakAndBlueprint(n, 0.0) {} + WeakAndBlueprint(uint32_t n, float idf_range) noexcept : _n(n), _idf_range(idf_range), _weights() {} ~WeakAndBlueprint() override; void addTerm(Blueprint::UP bp, uint32_t weight) { addChild(std::move(bp)); diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h index 4e781f8497b..88f0c9288f9 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h @@ -2,10 +2,9 @@ #pragma once -#include <algorithm> -#include <cmath> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> +#include <vespa/searchlib/features/bm25_feature.h> #include <vespa/searchlib/queryeval/searchiterator.h> #include <vespa/searchlib/queryeval/iterator_pack.h> #include <vespa/searchlib/attribute/posting_iterator_pack.h> @@ -13,20 +12,16 @@ #include <vespa/vespalib/util/priority_queue.h> #include <vespa/searchlib/attribute/i_docid_with_weight_posting_store.h> #include <vespa/vespalib/util/stringfmt.h> +#include <cmath> namespace search::queryeval::wand { //----------------------------------------------------------------------------- -struct Term; -using Terms = std::vector<Term>; using score_t = int64_t; using docid_t = uint32_t; using ref_t = uint16_t; -using Attr = IDirectPostingStore; -using AttrDictEntry = Attr::LookupResult; - //----------------------------------------------------------------------------- /** @@ -46,7 +41,7 @@ struct Term { Term(SearchIterator *s, int32_t w, uint32_t e) noexcept : Term(s, w, e, nullptr) {} Term(SearchIterator::UP s, int32_t w, uint32_t e) noexcept : Term(s.release(), w, e, nullptr) {} }; - +using Terms = std::vector<Term>; //----------------------------------------------------------------------------- // input manipulation utilities @@ -75,7 +70,7 @@ auto assemble(const F &f, const Order &order)->std::vector<decltype(f(0))> { } int32_t get_max_weight(const SearchIterator &search) { - const MinMaxPostingInfo *minMax = dynamic_cast<const MinMaxPostingInfo *>(search.getPostingInfo()); + const auto *minMax = dynamic_cast<const MinMaxPostingInfo *>(search.getPostingInfo()); return (minMax != nullptr) ? minMax->getMaxWeight() : std::numeric_limits<int32_t>::max(); } @@ -291,7 +286,7 @@ struct VectorizedAttributeTerms : VectorizedState<DocidWithWeightIteratorPack> { **/ struct DocIdOrder { const docid_t *termPos; - explicit DocIdOrder(docid_t *pos) noexcept : termPos(pos) {} + explicit DocIdOrder(const docid_t *pos) noexcept : termPos(pos) {} bool at_end(ref_t ref) const noexcept { return termPos[ref] == search::endDocId; } docid_t get_pos(ref_t ref) const noexcept { return termPos[ref]; } bool operator()(ref_t a, ref_t b) const noexcept { @@ -389,7 +384,7 @@ DualHeap<FutureHeap, PastHeap>::stringify() const { } //----------------------------------------------------------------------------- -#define TermFrequencyScorer_TERM_SCORE_FACTOR 1000000.0 +constexpr double TermFrequencyScorer_TERM_SCORE_FACTOR = 1000000.0; /** * Scorer used with WeakAndAlgorithm that calculates a pseudo term frequency @@ -412,6 +407,38 @@ struct TermFrequencyScorer } }; +class Bm25TermFrequencyScorer +{ +public: + using Bm25Executor = features::Bm25Executor; + Bm25TermFrequencyScorer(uint32_t num_docs, float range) noexcept + : _num_docs(num_docs), + _range(range), + _max_idf(Bm25Executor::calculate_inverse_document_frequency(1, _num_docs)) + { } + double apply_range(double idf) const noexcept { + return (1.0 - _range)*_max_idf + _range * idf; + } + // weight * scaled_bm25_idf, scaled to fixedpoint + score_t calculateMaxScore(double estHits, double weight) const noexcept { + return score_t(TermFrequencyScorer_TERM_SCORE_FACTOR * weight * + apply_range(Bm25Executor::calculate_inverse_document_frequency(estHits, _num_docs))); + } + + score_t calculateMaxScore(const Term &term) const noexcept { + return calculateMaxScore(term.estHits, term.weight) + 1; + } + + template <typename Input> + score_t calculate_max_score(const Input &input, ref_t ref) const noexcept { + return calculateMaxScore(input.get_est_hits(ref), input.get_weight(ref)) + 1; + } +private: + uint32_t _num_docs; + float _range; + double _max_idf; +}; + //----------------------------------------------------------------------------- /** @@ -453,14 +480,14 @@ struct DotProductScorer // used with parallel wand where we can safely discard hits based on score struct GreaterThan { score_t threshold; - GreaterThan(score_t t) : threshold(t) {} + explicit GreaterThan(score_t t) noexcept : threshold(t) {} bool operator()(score_t score) const { return (score > threshold); } }; // used with old-style vespa wand to ensure at least AND'ish results struct GreaterThanEqual { score_t threshold; - GreaterThanEqual(score_t t) : threshold(t) {} + explicit GreaterThanEqual(score_t t) noexcept : threshold(t) {} bool operator()(score_t score) const { return (score >= threshold); } }; diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp index 04b1cb75da4..cf3fd44ad4f 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp @@ -42,8 +42,9 @@ private: } public: - WeakAndSearchLR(const Terms &terms, uint32_t n) - : _terms(terms, TermFrequencyScorer(), 0, {}), + template<typename Scorer> + WeakAndSearchLR(const Terms &terms, const Scorer & scorer, uint32_t n) + : _terms(terms, scorer, 0, {}), _heaps(DocIdOrder(_terms.docId()), _terms.size()), _algo(), _threshold(1), @@ -102,36 +103,50 @@ WeakAndSearch::visitMembers(vespalib::ObjectVisitor &visitor) const //----------------------------------------------------------------------------- +template<typename Scorer> SearchIterator::UP -WeakAndSearch::createArrayWand(const Terms &terms, uint32_t n, bool strict) +WeakAndSearch::createArrayWand(const Terms &terms, const Scorer & scorer, uint32_t n, bool strict) { if (strict) { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, true>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, true>>(terms, scorer, n); } else { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, false>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, false>>(terms, scorer, n); } } +template<typename Scorer> SearchIterator::UP -WeakAndSearch::createHeapWand(const Terms &terms, uint32_t n, bool strict) +WeakAndSearch::createHeapWand(const Terms &terms, const Scorer & scorer, uint32_t n, bool strict) { if (strict) { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, true>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, true>>(terms, scorer, n); } else { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, false>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, false>>(terms, scorer, n); } } +template<typename Scorer> SearchIterator::UP -WeakAndSearch::create(const Terms &terms, uint32_t n, bool strict) +WeakAndSearch::create(const Terms &terms, const Scorer & scorer, uint32_t n, bool strict) { if (terms.size() < 128) { - return createArrayWand(terms, n, strict); + return createArrayWand(terms, scorer, n, strict); } else { - return createHeapWand(terms, n, strict); + return createHeapWand(terms, scorer, n, strict); } } +SearchIterator::UP +WeakAndSearch::create(const Terms &terms, uint32_t n, bool strict) +{ + return create(terms, wand::TermFrequencyScorer(), n, strict); +} + //----------------------------------------------------------------------------- +template SearchIterator::UP WeakAndSearch::create<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::create<wand::Bm25TermFrequencyScorer>(const Terms &terms, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::createArrayWand<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::createHeapWand<wand::TermFrequencyScorer>(const Terms &terms, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); + } diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h index 6a56a04887c..a91b2860a63 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h @@ -15,8 +15,12 @@ struct WeakAndSearch : SearchIterator { virtual const Terms &getTerms() const = 0; virtual uint32_t getN() const = 0; void visitMembers(vespalib::ObjectVisitor &visitor) const override; - static SearchIterator::UP createArrayWand(const Terms &terms, uint32_t n, bool strict); - static SearchIterator::UP createHeapWand(const Terms &terms, uint32_t n, bool strict); + template<typename Scorer> + static SearchIterator::UP createArrayWand(const Terms &terms, const Scorer & scorer, uint32_t n, bool strict); + template<typename Scorer> + static SearchIterator::UP createHeapWand(const Terms &terms, const Scorer & scorer, uint32_t n, bool strict); + template<typename Scorer> + static SearchIterator::UP create(const Terms &terms, const Scorer & scorer, uint32_t n, bool strict); static SearchIterator::UP create(const Terms &terms, uint32_t n, bool strict); }; |