diff options
author | Geir Storli <geirst@verizonmedia.com> | 2019-06-14 15:50:59 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-06-14 15:50:59 +0200 |
commit | 427c172017b75d93b7084cf578da940a2acf03da (patch) | |
tree | 91775316091af11b8ec89acb7c70f4734987025c /searchlib | |
parent | c55418a1cc47dd66d685cd9b37f12f27bc74178d (diff) | |
parent | 459dc7a3d7a2272a1eb505793781ca079279aa2f (diff) |
Merge pull request #9795 from vespa-engine/geirst/bm25-feature-support-override-idf
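Add support to override inverse document frequency in bm25 feature by specifying the term significance in the query properties (e.g. `vespa.term.<uniqueId>.significance`).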
Diffstat (limited to 'searchlib')
4 files changed, 64 insertions, 20 deletions
diff --git a/searchlib/src/tests/features/bm25/bm25_test.cpp b/searchlib/src/tests/features/bm25/bm25_test.cpp index eb2f46650a6..55c9caa6c0f 100644 --- a/searchlib/src/tests/features/bm25/bm25_test.cpp +++ b/searchlib/src/tests/features/bm25/bm25_test.cpp @@ -135,6 +135,7 @@ struct Bm25ExecutorTest : public ::testing::Test { void add_query_term(const vespalib::string& field_name, uint32_t matching_doc_count) { auto* term = test.getQueryEnv().getBuilder().addIndexNode({field_name}); term->field(0).setDocFreq(matching_doc_count, total_doc_count); + term->setUniqueId(test.getQueryEnv().getNumTerms() - 1); } void setup() { EXPECT_TRUE(test.setup()); @@ -236,4 +237,12 @@ TEST_F(Bm25ExecutorTest, b_param_can_be_overriden) EXPECT_TRUE(execute(score(3.0, 20, idf(25)))); } +TEST_F(Bm25ExecutorTest, inverse_document_frequency_can_be_overriden_with_significance) +{ + test.getQueryEnv().getProperties().add("vespa.term.0.significance", "0.35"); + setup(); + prepare_term(0, 0, 3, 20); + EXPECT_TRUE(execute(score(3.0, 20, 0.35))); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp index e89655a75bb..6e889b48343 100644 --- a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp +++ b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp @@ -1,6 +1,7 @@ // Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "bm25_feature.h" +#include "utils.h" #include <vespa/searchlib/fef/itermdata.h> #include <vespa/searchlib/fef/itermfielddata.h> #include <vespa/searchlib/fef/objectstore.h> @@ -23,6 +24,21 @@ using fef::ITermFieldData; using fef::MatchDataDetails; using fef::objectstore::as_value; +namespace { + +double +get_inverse_document_frequency(const ITermFieldData& term_field, + const fef::IQueryEnvironment& env, + const ITermData& term) + +{ + double fallback = Bm25Executor::calculate_inverse_document_frequency(term_field.get_matching_doc_count(), + term_field.get_total_doc_count()); + return util::lookupSignificance(env, term, fallback); +} + +} + Bm25Executor::Bm25Executor(const fef::FieldInfo& field, const fef::IQueryEnvironment& env, double avg_field_length, @@ -39,10 +55,8 @@ Bm25Executor::Bm25Executor(const fef::FieldInfo& field, for (size_t j = 0; j < term->numFields(); ++j) { const ITermFieldData& term_field = term->field(j); if (field.id() == term_field.getFieldId()) { - // TODO: Add support for using significance instead of default idf if specified in the query _terms.emplace_back(term_field.getHandle(MatchDataDetails::Cheap), - calculate_inverse_document_frequency(term_field.get_matching_doc_count(), - term_field.get_total_doc_count())); + get_inverse_document_frequency(term_field, env, *term)); } } } @@ -114,7 +128,7 @@ Bm25Blueprint::visitDumpFeatures(const fef::IIndexEnvironment& env, fef::IDumpFe { (void) env; (void) visitor; - // TODO: Implement + // TODO: Implement when feature is supported end-2-end with both memory and disk index. 
} fef::Blueprint::UP diff --git a/searchlib/src/vespa/searchlib/features/documenttestutils.cpp b/searchlib/src/vespa/searchlib/features/documenttestutils.cpp index e13699576cd..1768eb0a216 100644 --- a/searchlib/src/vespa/searchlib/features/documenttestutils.cpp +++ b/searchlib/src/vespa/searchlib/features/documenttestutils.cpp @@ -18,7 +18,8 @@ using namespace search::fef; namespace search::features::util { -feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env, uint32_t termId, feature_t fallback) +feature_t +lookupConnectedness(const search::fef::IQueryEnvironment& env, uint32_t termId, feature_t fallback) { if (termId == 0) { return fallback; // no previous term @@ -26,14 +27,15 @@ feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env, uint32 const ITermData * data = env.getTerm(termId); const ITermData * prev = env.getTerm(termId - 1); - if (data == NULL || prev == NULL) { + if (data == nullptr || prev == nullptr) { return fallback; // default value } return lookupConnectedness(env, data->getUniqueId(), prev->getUniqueId(), fallback); } -feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env, - uint32_t currUniqueId, uint32_t prevUniqueId, feature_t fallback) +feature_t +lookupConnectedness(const search::fef::IQueryEnvironment& env, + uint32_t currUniqueId, uint32_t prevUniqueId, feature_t fallback) { // Connectedness of 0.5 between term with unique id 2 and term with unique id 1 is represented as: // [vespa.term.2.connexity: "1", vespa.term.2.connexity: "0.5"] @@ -49,33 +51,40 @@ feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env, return fallback; } -feature_t lookupSignificance(const search::fef::IQueryEnvironment & env, uint32_t termId, feature_t fallback) +feature_t +lookupSignificance(const search::fef::IQueryEnvironment& env, const ITermData& term, feature_t fallback) { - const ITermData * data = env.getTerm(termId); - if (data == NULL) { - return fallback; - } - // 
Significance of 0.5 for term with unique id 1 is represented as: // [vespa.term.1.significance: "0.5"] vespalib::asciistream os; - os << "vespa.term." << data->getUniqueId() << ".significance"; + os << "vespa.term." << term.getUniqueId() << ".significance"; Property p = env.getProperties().lookup(os.str()); if (p.found()) { return strToNum<feature_t>(p.get()); } - return fallback; } -double getRobertsonSparckJonesWeight(double docCount, double docsInCorpus) +feature_t +lookupSignificance(const search::fef::IQueryEnvironment& env, uint32_t termId, feature_t fallback) +{ + const ITermData* term = env.getTerm(termId); + if (term == nullptr) { + return fallback; + } + return lookupSignificance(env, *term, fallback); +} + +double +getRobertsonSparckJonesWeight(double docCount, double docsInCorpus) { return std::log((docsInCorpus - docCount + 0.5)/(docCount + 0.5)); } static const double N = 1000000.0; -feature_t getSignificance(double docFreq) +feature_t +getSignificance(double docFreq) { if (docFreq < (1.0/N)) { docFreq = 1.0/N; @@ -95,7 +104,8 @@ feature_t getSignificance(double docFreq) #endif } -feature_t getSignificance(const search::fef::ITermData &termData) +feature_t +getSignificance(const search::fef::ITermData& termData) { typedef search::fef::ITermFieldRangeAdapter FRA; double df = 0; @@ -115,7 +125,7 @@ lookupTable(const search::fef::IIndexEnvironment & env, const vespalib::string & vespalib::string tn1 = env.getProperties().lookup(featureName, table).get(fallback); vespalib::string tn2 = env.getProperties().lookup(featureName, table, fieldName).get(tn1); const search::fef::Table * retval = env.getTableManager().getTable(tn2); - if (retval == NULL) { + if (retval == nullptr) { LOG(warning, "Could not find the %s '%s' to be used for field '%s' in feature '%s'", table.c_str(), tn2.c_str(), fieldName.c_str(), featureName.c_str()); } diff --git a/searchlib/src/vespa/searchlib/features/utils.h b/searchlib/src/vespa/searchlib/features/utils.h index 
859c66af66a..890c7cc5225 100644 --- a/searchlib/src/vespa/searchlib/features/utils.h +++ b/searchlib/src/vespa/searchlib/features/utils.h @@ -116,6 +116,17 @@ feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env, * Uses the property map of the query environment to lookup this data. * * @param env The query environment. + * @param term The term data. + * @param fallback The value to return if the significance was not found in the property map. + * @return The significance. + */ +feature_t lookupSignificance(const search::fef::IQueryEnvironment& env, const search::fef::ITermData& term, feature_t fallback); + +/** + * Returns the significance of the given term. + * Uses the property map of the query environment to lookup this data. + * + * @param env The query environment. * @param termId The term id. * @param fallback The value to return if the significance was not found in the property map. * @return The significance. |