summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Henning Baldersheim <balder@yahoo-inc.com>, 2024-06-25 18:02:12 +0200
committer: GitHub <noreply@github.com>, 2024-06-25 18:02:12 +0200
commit: c01c4d0d365088fa79d4240eba6119405cd5fed0 (patch)
tree: f56d9738b5f795d304ad3bd55126a95b278ff739
parent: 7c0ac144b1c3ea28bb03843f364a718f63cdabbc (diff)
parent: 1ccf72ba01db7324bc3caad84311512caea66d79 (diff)
Merge pull request #31722 from vespa-engine/toregge/add-utility-functions-to-get-document-frequency-for-a-term (tag: v8.363.17)
Add utility functions to get document frequency for a term.
-rw-r--r--  searchlib/src/tests/features/util/CMakeLists.txt  |  1
-rw-r--r--  searchlib/src/tests/features/util/util_test.cpp  |  77
-rw-r--r--  searchlib/src/vespa/searchlib/features/document_frequency.h  |  26
-rw-r--r--  searchlib/src/vespa/searchlib/features/utils.cpp  |  28
-rw-r--r--  searchlib/src/vespa/searchlib/features/utils.h  |  8
5 files changed, 118 insertions, 22 deletions
diff --git a/searchlib/src/tests/features/util/CMakeLists.txt b/searchlib/src/tests/features/util/CMakeLists.txt
index 0eee4d3b7ac..1315734eee7 100644
--- a/searchlib/src/tests/features/util/CMakeLists.txt
+++ b/searchlib/src/tests/features/util/CMakeLists.txt
@@ -4,5 +4,6 @@ vespa_add_executable(searchlib_util_test_app TEST
util_test.cpp
DEPENDS
vespa_searchlib
+ GTest::gtest
)
vespa_add_test(NAME searchlib_util_test_app COMMAND searchlib_util_test_app)
diff --git a/searchlib/src/tests/features/util/util_test.cpp b/searchlib/src/tests/features/util/util_test.cpp
index 7f3d8ad209f..e51eb8e77b8 100644
--- a/searchlib/src/tests/features/util/util_test.cpp
+++ b/searchlib/src/tests/features/util/util_test.cpp
@@ -1,8 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/testkit/test_kit.h>
#include <vespa/searchlib/features/utils.h>
#include <vespa/searchlib/fef/test/indexenvironment.h>
#include <vespa/searchlib/fef/test/queryenvironment.h>
+#include <vespa/vespalib/gtest/gtest.h>
using namespace search;
using namespace search::fef;
@@ -10,6 +10,14 @@ using namespace search::fef::test;
using namespace search::features;
using namespace search::features::util;
+namespace search::features::util {
+
+void PrintTo(const DocumentFrequency& document_frequency, std::ostream* os) {
+ *os << "{" << document_frequency.frequency << "," << document_frequency.count << "}";
+}
+
+}
+
SimpleTermData make_term(uint32_t uid) {
SimpleTermData term;
term.setUniqueId(uid);
@@ -30,31 +38,56 @@ struct TermLabelFixture {
}
};
-TEST_F("require that label can be mapped to term", TermLabelFixture) {
- EXPECT_EQUAL((ITermData*)&f1.queryEnv.getTerms()[0], util::getTermByLabel(f1.queryEnv, "foo"));
- EXPECT_EQUAL((ITermData*)0, util::getTermByLabel(f1.queryEnv, "bar"));
- EXPECT_EQUAL((ITermData*)&f1.queryEnv.getTerms()[2], util::getTermByLabel(f1.queryEnv, "baz"));
- EXPECT_EQUAL((ITermData*)0, util::getTermByLabel(f1.queryEnv, "fox"));
- EXPECT_EQUAL((ITermData*)0, util::getTermByLabel(f1.queryEnv, "unknown"));
+TEST(UtilsTest, require_that_label_can_be_mapped_to_term)
+{
+ TermLabelFixture f1;
+ EXPECT_EQ((ITermData*)&f1.queryEnv.getTerms()[0], getTermByLabel(f1.queryEnv, "foo"));
+ EXPECT_EQ((ITermData*)0, getTermByLabel(f1.queryEnv, "bar"));
+ EXPECT_EQ((ITermData*)&f1.queryEnv.getTerms()[2], getTermByLabel(f1.queryEnv, "baz"));
+ EXPECT_EQ((ITermData*)0, getTermByLabel(f1.queryEnv, "fox"));
+ EXPECT_EQ((ITermData*)0, getTermByLabel(f1.queryEnv, "unknown"));
}
template <typename T>
-void verifyStrToNum() {
- EXPECT_EQUAL(-17, static_cast<long>(strToNum<T>("-17")));
- EXPECT_EQUAL(-1, static_cast<long>(strToNum<T>("-1")));
- EXPECT_EQUAL(0, static_cast<long>(strToNum<T>("0")));
- EXPECT_EQUAL(1, static_cast<long>(strToNum<T>("1")));
- EXPECT_EQUAL(17, static_cast<long>(strToNum<T>("17")));
- EXPECT_EQUAL(0, static_cast<long>(strToNum<T>("0x0")));
- EXPECT_EQUAL(1, static_cast<long>(strToNum<T>("0x1")));
- EXPECT_EQUAL(27, static_cast<long>(strToNum<T>("0x1b")));
+void verifyStrToNum(const std::string& label) {
+ SCOPED_TRACE(label);
+ EXPECT_EQ(-17, static_cast<long>(strToNum<T>("-17")));
+ EXPECT_EQ(-1, static_cast<long>(strToNum<T>("-1")));
+ EXPECT_EQ(0, static_cast<long>(strToNum<T>("0")));
+ EXPECT_EQ(1, static_cast<long>(strToNum<T>("1")));
+ EXPECT_EQ(17, static_cast<long>(strToNum<T>("17")));
+ EXPECT_EQ(0, static_cast<long>(strToNum<T>("0x0")));
+ EXPECT_EQ(1, static_cast<long>(strToNum<T>("0x1")));
+ EXPECT_EQ(27, static_cast<long>(strToNum<T>("0x1b")));
+}
+
+TEST(UtilsTest, verify_str2Num)
+{
+ verifyStrToNum<int8_t>("int8_t");
+ verifyStrToNum<int16_t>("int16_t");
+ verifyStrToNum<int32_t>("int32_t");
+ verifyStrToNum<int64_t>("int64_t");
}
-TEST("verify str2Num") {
- verifyStrToNum<int8_t>();
- verifyStrToNum<int16_t>();
- verifyStrToNum<int32_t>();
- verifyStrToNum<int64_t>();
+TEST(UtilsTest, lookup_document_frequency)
+{
+ using OptDF = std::optional<DocumentFrequency>;
+ IndexEnvironment index_env;;
+ QueryEnvironment query_env(&index_env);
+ query_env.getTerms() = std::vector<SimpleTermData>{make_term(0), make_term(5), make_term(6), make_term(10)};
+ // Properties not used due to bad unique id
+ query_env.getProperties().add("vespa.term.0.docfreq", "11");
+ query_env.getProperties().add("vespa.term.0.docfreq", "17");
+ // Incomplete properties, thus not used
+ query_env.getProperties().add("vespa.term.6.docfreq", "5");
+ // Complete properties
+ query_env.getProperties().add("vespa.term.10.docfreq", "10");
+ query_env.getProperties().add("vespa.term.10.docfreq", "15");
+ EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 0)); // bad unique id
+ EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 1)); // missing properties
+ EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 2)); // incomplete properties
+ EXPECT_EQ(OptDF({10, 15}), lookup_document_frequency(query_env, 3));
+ EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 4)); // term not found
}
-TEST_MAIN() { TEST_RUN_ALL(); }
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/features/document_frequency.h b/searchlib/src/vespa/searchlib/features/document_frequency.h
new file mode 100644
index 00000000000..f84e12d9e5c
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/features/document_frequency.h
@@ -0,0 +1,26 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <cstdint>
+
+namespace search::features::util {
+
+/*
+ * Struct containing the raw data used to calculate significance.
+ */
+struct DocumentFrequency {
+ uint64_t frequency; // number of documents containing the word
+ uint64_t count; // total number of documents
+
+ DocumentFrequency(uint64_t document_frequency_in, uint64_t document_count_in)
+ : frequency(document_frequency_in),
+ count(document_count_in)
+ {
+ }
+ bool operator==(const DocumentFrequency& rhs) const noexcept {
+ return frequency == rhs.frequency && count == rhs.count;
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/features/utils.cpp b/searchlib/src/vespa/searchlib/features/utils.cpp
index fce151bc8ec..6555db03824 100644
--- a/searchlib/src/vespa/searchlib/features/utils.cpp
+++ b/searchlib/src/vespa/searchlib/features/utils.cpp
@@ -181,4 +181,32 @@ getTermByLabel(const search::fef::IQueryEnvironment &env, const vespalib::string
return 0;
}
+std::optional<DocumentFrequency>
+lookup_document_frequency(const search::fef::IQueryEnvironment& env, const ITermData& term)
+{
+ vespalib::asciistream os;
+ auto unique_id = term.getUniqueId();
+ if (unique_id != 0) {
+ os << "vespa.term." << unique_id << ".docfreq";
+ Property p = env.getProperties().lookup(os.str());
+ if (p.size() == 2) {
+ // we have a defined document frequency
+ auto document_frequency = strToNum<uint64_t>(p.getAt(0));
+ auto document_count = strToNum<uint64_t>(p.getAt(1));
+ return DocumentFrequency(document_frequency, document_count);
+ }
+ }
+ return {};
+}
+
+std::optional<DocumentFrequency>
+lookup_document_frequency(const search::fef::IQueryEnvironment& env, uint32_t termId)
+{
+ const ITermData* term = env.getTerm(termId);
+ if (term == nullptr) {
+ return {};
+ }
+ return lookup_document_frequency(env, *term);
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/features/utils.h b/searchlib/src/vespa/searchlib/features/utils.h
index 518dbd42073..a0ca4b8be2a 100644
--- a/searchlib/src/vespa/searchlib/features/utils.h
+++ b/searchlib/src/vespa/searchlib/features/utils.h
@@ -2,6 +2,7 @@
#pragma once
+#include "document_frequency.h"
#include <vespa/searchlib/fef/iqueryenvironment.h>
#include <vespa/searchlib/fef/table.h>
#include <vespa/searchlib/fef/termfieldmatchdata.h>
@@ -10,6 +11,7 @@
#include <vespa/searchlib/common/feature.h>
#include <vespa/vespalib/util/string_hash.h>
#include <limits>
+#include <optional>
namespace search::features::util {
@@ -191,4 +193,10 @@ getTermFieldHandle(const search::fef::IQueryEnvironment &env, uint32_t termId, u
const search::fef::ITermData *
getTermByLabel(const search::fef::IQueryEnvironment &env, const vespalib::string &label);
+std::optional<DocumentFrequency>
+lookup_document_frequency(const search::fef::IQueryEnvironment& env, const search::fef::ITermData& term);
+
+std::optional<DocumentFrequency>
+lookup_document_frequency(const search::fef::IQueryEnvironment& env, uint32_t termId);
+
}