summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2019-06-14 10:43:28 +0000
committerGeir Storli <geirst@verizonmedia.com>2019-06-14 10:46:09 +0000
commit83adcebd451d3ca08227624a58f2ad27360603d0 (patch)
treefc18be2c2d9928a6b44260ac30e59354a8d89dd5 /searchlib
parent436735628ef9428187fea8d3c7a5aecd3477784d (diff)
Add support to override inverse document frequency in bm25 feature by using significance passed down with the query.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/features/bm25/bm25_test.cpp9
-rw-r--r--searchlib/src/vespa/searchlib/features/bm25_feature.cpp22
-rw-r--r--searchlib/src/vespa/searchlib/features/documenttestutils.cpp42
-rw-r--r--searchlib/src/vespa/searchlib/features/utils.h11
4 files changed, 64 insertions, 20 deletions
diff --git a/searchlib/src/tests/features/bm25/bm25_test.cpp b/searchlib/src/tests/features/bm25/bm25_test.cpp
index eb2f46650a6..55c9caa6c0f 100644
--- a/searchlib/src/tests/features/bm25/bm25_test.cpp
+++ b/searchlib/src/tests/features/bm25/bm25_test.cpp
@@ -135,6 +135,7 @@ struct Bm25ExecutorTest : public ::testing::Test {
void add_query_term(const vespalib::string& field_name, uint32_t matching_doc_count) {
auto* term = test.getQueryEnv().getBuilder().addIndexNode({field_name});
term->field(0).setDocFreq(matching_doc_count, total_doc_count);
+ term->setUniqueId(test.getQueryEnv().getNumTerms() - 1);
}
void setup() {
EXPECT_TRUE(test.setup());
@@ -236,4 +237,12 @@ TEST_F(Bm25ExecutorTest, b_param_can_be_overriden)
EXPECT_TRUE(execute(score(3.0, 20, idf(25))));
}
+TEST_F(Bm25ExecutorTest, inverse_document_frequency_can_be_overriden_with_significance)
+{
+ test.getQueryEnv().getProperties().add("vespa.term.0.significance", "0.35");
+ setup();
+ prepare_term(0, 0, 3, 20);
+ EXPECT_TRUE(execute(score(3.0, 20, 0.35)));
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp
index e89655a75bb..6e889b48343 100644
--- a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp
+++ b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp
@@ -1,6 +1,7 @@
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "bm25_feature.h"
+#include "utils.h"
#include <vespa/searchlib/fef/itermdata.h>
#include <vespa/searchlib/fef/itermfielddata.h>
#include <vespa/searchlib/fef/objectstore.h>
@@ -23,6 +24,21 @@ using fef::ITermFieldData;
using fef::MatchDataDetails;
using fef::objectstore::as_value;
+namespace {
+
+double
+get_inverse_document_frequency(const ITermFieldData& term_field,
+ const fef::IQueryEnvironment& env,
+ const ITermData& term)
+
+{
+ double fallback = Bm25Executor::calculate_inverse_document_frequency(term_field.get_matching_doc_count(),
+ term_field.get_total_doc_count());
+ return util::lookupSignificance(env, term, fallback);
+}
+
+}
+
Bm25Executor::Bm25Executor(const fef::FieldInfo& field,
const fef::IQueryEnvironment& env,
double avg_field_length,
@@ -39,10 +55,8 @@ Bm25Executor::Bm25Executor(const fef::FieldInfo& field,
for (size_t j = 0; j < term->numFields(); ++j) {
const ITermFieldData& term_field = term->field(j);
if (field.id() == term_field.getFieldId()) {
- // TODO: Add support for using significance instead of default idf if specified in the query
_terms.emplace_back(term_field.getHandle(MatchDataDetails::Cheap),
- calculate_inverse_document_frequency(term_field.get_matching_doc_count(),
- term_field.get_total_doc_count()));
+ get_inverse_document_frequency(term_field, env, *term));
}
}
}
@@ -114,7 +128,7 @@ Bm25Blueprint::visitDumpFeatures(const fef::IIndexEnvironment& env, fef::IDumpFe
{
(void) env;
(void) visitor;
- // TODO: Implement
+ // TODO: Implement when feature is supported end-to-end with both memory and disk index.
}
fef::Blueprint::UP
diff --git a/searchlib/src/vespa/searchlib/features/documenttestutils.cpp b/searchlib/src/vespa/searchlib/features/documenttestutils.cpp
index e13699576cd..43c51b120bf 100644
--- a/searchlib/src/vespa/searchlib/features/documenttestutils.cpp
+++ b/searchlib/src/vespa/searchlib/features/documenttestutils.cpp
@@ -18,7 +18,8 @@ using namespace search::fef;
namespace search::features::util {
-feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env, uint32_t termId, feature_t fallback)
+feature_t
+lookupConnectedness(const search::fef::IQueryEnvironment& env, uint32_t termId, feature_t fallback)
{
if (termId == 0) {
return fallback; // no previous term
@@ -26,14 +27,15 @@ feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env, uint32
const ITermData * data = env.getTerm(termId);
const ITermData * prev = env.getTerm(termId - 1);
- if (data == NULL || prev == NULL) {
+ if (data == nullptr || prev == nullptr) {
return fallback; // default value
}
return lookupConnectedness(env, data->getUniqueId(), prev->getUniqueId(), fallback);
}
-feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env,
- uint32_t currUniqueId, uint32_t prevUniqueId, feature_t fallback)
+feature_t
+lookupConnectedness(const search::fef::IQueryEnvironment& env,
+ uint32_t currUniqueId, uint32_t prevUniqueId, feature_t fallback)
{
// Connectedness of 0.5 between term with unique id 2 and term with unique id 1 is represented as:
// [vespa.term.2.connexity: "1", vespa.term.2.connexity: "0.5"]
@@ -49,33 +51,40 @@ feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env,
return fallback;
}
-feature_t lookupSignificance(const search::fef::IQueryEnvironment & env, uint32_t termId, feature_t fallback)
+feature_t
+lookupSignificance(const search::fef::IQueryEnvironment& env, const ITermData& data, feature_t fallback)
{
- const ITermData * data = env.getTerm(termId);
- if (data == NULL) {
- return fallback;
- }
-
// Significance of 0.5 for term with unique id 1 is represented as:
// [vespa.term.1.significance: "0.5"]
vespalib::asciistream os;
- os << "vespa.term." << data->getUniqueId() << ".significance";
+ os << "vespa.term." << data.getUniqueId() << ".significance";
Property p = env.getProperties().lookup(os.str());
if (p.found()) {
return strToNum<feature_t>(p.get());
}
-
return fallback;
}
-double getRobertsonSparckJonesWeight(double docCount, double docsInCorpus)
+feature_t
+lookupSignificance(const search::fef::IQueryEnvironment& env, uint32_t termId, feature_t fallback)
+{
+ const ITermData * data = env.getTerm(termId);
+ if (data == nullptr) {
+ return fallback;
+ }
+ return lookupSignificance(env, *data, fallback);
+}
+
+double
+getRobertsonSparckJonesWeight(double docCount, double docsInCorpus)
{
return std::log((docsInCorpus - docCount + 0.5)/(docCount + 0.5));
}
static const double N = 1000000.0;
-feature_t getSignificance(double docFreq)
+feature_t
+getSignificance(double docFreq)
{
if (docFreq < (1.0/N)) {
docFreq = 1.0/N;
@@ -95,7 +104,8 @@ feature_t getSignificance(double docFreq)
#endif
}
-feature_t getSignificance(const search::fef::ITermData &termData)
+feature_t
+getSignificance(const search::fef::ITermData& termData)
{
typedef search::fef::ITermFieldRangeAdapter FRA;
double df = 0;
@@ -115,7 +125,7 @@ lookupTable(const search::fef::IIndexEnvironment & env, const vespalib::string &
vespalib::string tn1 = env.getProperties().lookup(featureName, table).get(fallback);
vespalib::string tn2 = env.getProperties().lookup(featureName, table, fieldName).get(tn1);
const search::fef::Table * retval = env.getTableManager().getTable(tn2);
- if (retval == NULL) {
+ if (retval == nullptr) {
LOG(warning, "Could not find the %s '%s' to be used for field '%s' in feature '%s'",
table.c_str(), tn2.c_str(), fieldName.c_str(), featureName.c_str());
}
diff --git a/searchlib/src/vespa/searchlib/features/utils.h b/searchlib/src/vespa/searchlib/features/utils.h
index 859c66af66a..b103483934c 100644
--- a/searchlib/src/vespa/searchlib/features/utils.h
+++ b/searchlib/src/vespa/searchlib/features/utils.h
@@ -116,6 +116,17 @@ feature_t lookupConnectedness(const search::fef::IQueryEnvironment & env,
* Uses the property map of the query environment to lookup this data.
*
* @param env The query environment.
+ * @param data The term data.
+ * @param fallback The value to return if the significance was not found in the property map.
+ * @return The significance.
+ */
+feature_t lookupSignificance(const search::fef::IQueryEnvironment& env, const search::fef::ITermData& data, feature_t fallback);
+
+/**
+ * Returns the significance of the given term.
+ * Uses the property map of the query environment to lookup this data.
+ *
+ * @param env The query environment.
* @param termId The term id.
* @param fallback The value to return if the significance was not found in the property map.
* @return The significance.