From 08ea0e2ff052b0b44a78dfb01d4f81d80dd2709e Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Thu, 6 Jul 2023 13:39:06 +0000 Subject: Make average field length configurable for bm25 rank feature. --- searchlib/src/tests/features/bm25/bm25_test.cpp | 15 ++++++++++ .../src/vespa/searchlib/features/bm25_feature.cpp | 34 +++++++++++++++++----- .../src/vespa/searchlib/features/bm25_feature.h | 5 +++- 3 files changed, 45 insertions(+), 9 deletions(-) (limited to 'searchlib/src') diff --git a/searchlib/src/tests/features/bm25/bm25_test.cpp b/searchlib/src/tests/features/bm25/bm25_test.cpp index 3a9dfffa8d7..a3a19762101 100644 --- a/searchlib/src/tests/features/bm25/bm25_test.cpp +++ b/searchlib/src/tests/features/bm25/bm25_test.cpp @@ -77,6 +77,12 @@ TEST_F(Bm25BlueprintTest, blueprint_setup_fails_when_b_param_is_malformed) expect_setup_fail({"is"}); } +TEST_F(Bm25BlueprintTest, blueprint_setup_fails_when_avg_field_length_is_malformed) +{ + index_env.getProperties().add("bm25(is).averageFieldLength", "malformed"); + expect_setup_fail({"is"}); +} + TEST_F(Bm25BlueprintTest, blueprint_setup_succeeds_for_index_field) { expect_setup_succeed({"is"}); @@ -243,6 +249,15 @@ TEST_F(Bm25ExecutorTest, b_param_can_be_overriden) EXPECT_TRUE(execute(score(3.0, 20, idf(25)))); } +TEST_F(Bm25ExecutorTest, avg_field_length_can_be_overriden) +{ + test.getIndexEnv().getProperties().add("bm25(foo).averageFieldLength", "15"); + setup(); + prepare_term(0, 0, 3, 20); + scorer.avg_field_length = 15; + EXPECT_TRUE(execute(score(3.0, 20, idf(25)))); +} + TEST_F(Bm25ExecutorTest, inverse_document_frequency_can_be_overriden_with_significance) { test.getQueryEnv().getProperties().add("vespa.term.0.significance", "0.35"); diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp index 66658fd2ca5..ccc3abbb299 100644 --- a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp +++ 
b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp @@ -26,6 +26,7 @@ using fef::ITermData; using fef::ITermFieldData; using fef::MatchDataDetails; using fef::objectstore::as_value; +using vespalib::Trinary; namespace { @@ -99,7 +100,7 @@ Bm25Executor::execute(uint32_t doc_id) outputs().set_number(0, score); } -bool +Trinary Bm25Blueprint::lookup_param(const fef::Properties& props, const vespalib::string& param, double& result) const { vespalib::string key = getBaseName() + "(" + _field->name() + ")." + param; @@ -107,13 +108,25 @@ Bm25Blueprint::lookup_param(const fef::Properties& props, const vespalib::string if (value.found()) { try { result = std::stod(value.get()); + return Trinary::True; } catch (const std::invalid_argument& ex) { LOG(warning, "Not able to convert rank property '%s': '%s' to a double value", key.c_str(), value.get().c_str()); - return false; + return Trinary::Undefined; } } - return true; + return Trinary::False; +} + +Trinary +Bm25Blueprint::lookup_param(const fef::Properties& props, const vespalib::string& param, std::optional<double>& result) const +{ + double tmp_result; + auto lres = lookup_param(props, param, tmp_result); + if (lres == Trinary::True) { + result = tmp_result; + } + return lres; } double constexpr default_k1_param = 1.2; @@ -123,7 +136,8 @@ Bm25Blueprint::Bm25Blueprint() : Blueprint("bm25"), _field(nullptr), _k1_param(default_k1_param), - _b_param(default_b_param) + _b_param(default_b_param), + _avg_field_length() { } @@ -152,10 +166,13 @@ Bm25Blueprint::setup(const fef::IIndexEnvironment& env, const fef::ParameterList const auto& field_name = params[0].getValue(); _field = env.getFieldByName(field_name); - if (!lookup_param(env.getProperties(), "k1", _k1_param)) { + if (lookup_param(env.getProperties(), "k1", _k1_param) == Trinary::Undefined) { + return false; + } + if (lookup_param(env.getProperties(), "b", _b_param) == Trinary::Undefined) { return false; } - if (!lookup_param(env.getProperties(), "b", _b_param)) { + if
(lookup_param(env.getProperties(), "averageFieldLength", _avg_field_length) == Trinary::Undefined) { return false; } @@ -178,7 +195,8 @@ Bm25Blueprint::prepareSharedState(const fef::IQueryEnvironment& env, fef::IObjec { vespalib::string key = make_avg_field_length_key(getBaseName(), _field->name()); if (store.get(key) == nullptr) { - store.add(key, std::make_unique<AnyWrapper<double>>(env.get_average_field_length(_field->name()))); + double avg_field_length = _avg_field_length.value_or(env.get_average_field_length(_field->name())); + store.add(key, std::make_unique<AnyWrapper<double>>(avg_field_length)); } } @@ -188,7 +206,7 @@ Bm25Blueprint::createExecutor(const fef::IQueryEnvironment& env, vespalib::Stash const auto* lookup_result = env.getObjectStore().get(make_avg_field_length_key(getBaseName(), _field->name())); double avg_field_length = lookup_result != nullptr ? as_value<double>(*lookup_result) : - env.get_average_field_length(_field->name()); + _avg_field_length.value_or(env.get_average_field_length(_field->name())); return stash.create<Bm25Executor>(*_field, env, avg_field_length, _k1_param, _b_param); } diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.h b/searchlib/src/vespa/searchlib/features/bm25_feature.h index 769fd0f0c07..68b6159285e 100644 --- a/searchlib/src/vespa/searchlib/features/bm25_feature.h +++ b/searchlib/src/vespa/searchlib/features/bm25_feature.h @@ -2,6 +2,7 @@ #include #include +#include <optional> namespace search::features { @@ -53,8 +54,10 @@ private: const fef::FieldInfo* _field; double _k1_param; double _b_param; + std::optional<double> _avg_field_length; - bool lookup_param(const fef::Properties& props, const vespalib::string& param, double& result) const; + vespalib::Trinary lookup_param(const fef::Properties& props, const vespalib::string& param, double& result) const; + vespalib::Trinary lookup_param(const fef::Properties& props, const vespalib::string& param, std::optional<double>& result) const; public: Bm25Blueprint(); -- cgit v1.2.3