diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-06-25 18:02:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-25 18:02:12 +0200 |
commit | c01c4d0d365088fa79d4240eba6119405cd5fed0 (patch) | |
tree | f56d9738b5f795d304ad3bd55126a95b278ff739 /searchlib/src | |
parent | 7c0ac144b1c3ea28bb03843f364a718f63cdabbc (diff) | |
parent | 1ccf72ba01db7324bc3caad84311512caea66d79 (diff) |
Merge pull request #31722 from vespa-engine/toregge/add-utility-functions-to-get-document-frequency-for-a-term (tag: v8.363.17)
Add utility functions to get document frequency for a term.
Diffstat (limited to 'searchlib/src')
-rw-r--r-- | searchlib/src/tests/features/util/CMakeLists.txt | 1 | ||||
-rw-r--r-- | searchlib/src/tests/features/util/util_test.cpp | 77 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/features/document_frequency.h | 26 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/features/utils.cpp | 28 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/features/utils.h | 8 |
5 files changed, 118 insertions, 22 deletions
diff --git a/searchlib/src/tests/features/util/CMakeLists.txt b/searchlib/src/tests/features/util/CMakeLists.txt index 0eee4d3b7ac..1315734eee7 100644 --- a/searchlib/src/tests/features/util/CMakeLists.txt +++ b/searchlib/src/tests/features/util/CMakeLists.txt @@ -4,5 +4,6 @@ vespa_add_executable(searchlib_util_test_app TEST util_test.cpp DEPENDS vespa_searchlib + GTest::gtest ) vespa_add_test(NAME searchlib_util_test_app COMMAND searchlib_util_test_app) diff --git a/searchlib/src/tests/features/util/util_test.cpp b/searchlib/src/tests/features/util/util_test.cpp index 7f3d8ad209f..e51eb8e77b8 100644 --- a/searchlib/src/tests/features/util/util_test.cpp +++ b/searchlib/src/tests/features/util/util_test.cpp @@ -1,8 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vespalib/testkit/test_kit.h> #include <vespa/searchlib/features/utils.h> #include <vespa/searchlib/fef/test/indexenvironment.h> #include <vespa/searchlib/fef/test/queryenvironment.h> +#include <vespa/vespalib/gtest/gtest.h> using namespace search; using namespace search::fef; @@ -10,6 +10,14 @@ using namespace search::fef::test; using namespace search::features; using namespace search::features::util; +namespace search::features::util { + +void PrintTo(const DocumentFrequency& document_frequency, std::ostream* os) { + *os << "{" << document_frequency.frequency << "," << document_frequency.count << "}"; +} + +} + SimpleTermData make_term(uint32_t uid) { SimpleTermData term; term.setUniqueId(uid); @@ -30,31 +38,56 @@ struct TermLabelFixture { } }; -TEST_F("require that label can be mapped to term", TermLabelFixture) { - EXPECT_EQUAL((ITermData*)&f1.queryEnv.getTerms()[0], util::getTermByLabel(f1.queryEnv, "foo")); - EXPECT_EQUAL((ITermData*)0, util::getTermByLabel(f1.queryEnv, "bar")); - EXPECT_EQUAL((ITermData*)&f1.queryEnv.getTerms()[2], util::getTermByLabel(f1.queryEnv, "baz")); - EXPECT_EQUAL((ITermData*)0, 
util::getTermByLabel(f1.queryEnv, "fox")); - EXPECT_EQUAL((ITermData*)0, util::getTermByLabel(f1.queryEnv, "unknown")); +TEST(UtilsTest, require_that_label_can_be_mapped_to_term) +{ + TermLabelFixture f1; + EXPECT_EQ((ITermData*)&f1.queryEnv.getTerms()[0], getTermByLabel(f1.queryEnv, "foo")); + EXPECT_EQ((ITermData*)0, getTermByLabel(f1.queryEnv, "bar")); + EXPECT_EQ((ITermData*)&f1.queryEnv.getTerms()[2], getTermByLabel(f1.queryEnv, "baz")); + EXPECT_EQ((ITermData*)0, getTermByLabel(f1.queryEnv, "fox")); + EXPECT_EQ((ITermData*)0, getTermByLabel(f1.queryEnv, "unknown")); } template <typename T> -void verifyStrToNum() { - EXPECT_EQUAL(-17, static_cast<long>(strToNum<T>("-17"))); - EXPECT_EQUAL(-1, static_cast<long>(strToNum<T>("-1"))); - EXPECT_EQUAL(0, static_cast<long>(strToNum<T>("0"))); - EXPECT_EQUAL(1, static_cast<long>(strToNum<T>("1"))); - EXPECT_EQUAL(17, static_cast<long>(strToNum<T>("17"))); - EXPECT_EQUAL(0, static_cast<long>(strToNum<T>("0x0"))); - EXPECT_EQUAL(1, static_cast<long>(strToNum<T>("0x1"))); - EXPECT_EQUAL(27, static_cast<long>(strToNum<T>("0x1b"))); +void verifyStrToNum(const std::string& label) { + SCOPED_TRACE(label); + EXPECT_EQ(-17, static_cast<long>(strToNum<T>("-17"))); + EXPECT_EQ(-1, static_cast<long>(strToNum<T>("-1"))); + EXPECT_EQ(0, static_cast<long>(strToNum<T>("0"))); + EXPECT_EQ(1, static_cast<long>(strToNum<T>("1"))); + EXPECT_EQ(17, static_cast<long>(strToNum<T>("17"))); + EXPECT_EQ(0, static_cast<long>(strToNum<T>("0x0"))); + EXPECT_EQ(1, static_cast<long>(strToNum<T>("0x1"))); + EXPECT_EQ(27, static_cast<long>(strToNum<T>("0x1b"))); +} + +TEST(UtilsTest, verify_str2Num) +{ + verifyStrToNum<int8_t>("int8_t"); + verifyStrToNum<int16_t>("int16_t"); + verifyStrToNum<int32_t>("int32_t"); + verifyStrToNum<int64_t>("int64_t"); } -TEST("verify str2Num") { - verifyStrToNum<int8_t>(); - verifyStrToNum<int16_t>(); - verifyStrToNum<int32_t>(); - verifyStrToNum<int64_t>(); +TEST(UtilsTest, lookup_document_frequency) +{ + using OptDF 
= std::optional<DocumentFrequency>; + IndexEnvironment index_env;; + QueryEnvironment query_env(&index_env); + query_env.getTerms() = std::vector<SimpleTermData>{make_term(0), make_term(5), make_term(6), make_term(10)}; + // Properties not used due to bad unique id + query_env.getProperties().add("vespa.term.0.docfreq", "11"); + query_env.getProperties().add("vespa.term.0.docfreq", "17"); + // Incomplete properties, thus not used + query_env.getProperties().add("vespa.term.6.docfreq", "5"); + // Complete properties + query_env.getProperties().add("vespa.term.10.docfreq", "10"); + query_env.getProperties().add("vespa.term.10.docfreq", "15"); + EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 0)); // bad unique id + EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 1)); // missing properties + EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 2)); // incomplete properties + EXPECT_EQ(OptDF({10, 15}), lookup_document_frequency(query_env, 3)); + EXPECT_EQ(OptDF(), lookup_document_frequency(query_env, 4)); // term not found } -TEST_MAIN() { TEST_RUN_ALL(); } +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/features/document_frequency.h b/searchlib/src/vespa/searchlib/features/document_frequency.h new file mode 100644 index 00000000000..f84e12d9e5c --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/document_frequency.h @@ -0,0 +1,26 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cstdint> + +namespace search::features::util { + +/* + * Struct containing the raw data used to calculate significance. 
+ */ +struct DocumentFrequency { + uint64_t frequency; // number of documents containing the word + uint64_t count; // total number of documents + + DocumentFrequency(uint64_t document_frequency_in, uint64_t document_count_in) + : frequency(document_frequency_in), + count(document_count_in) + { + } + bool operator==(const DocumentFrequency& rhs) const noexcept { + return frequency == rhs.frequency && count == rhs.count; + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/features/utils.cpp b/searchlib/src/vespa/searchlib/features/utils.cpp index fce151bc8ec..6555db03824 100644 --- a/searchlib/src/vespa/searchlib/features/utils.cpp +++ b/searchlib/src/vespa/searchlib/features/utils.cpp @@ -181,4 +181,32 @@ getTermByLabel(const search::fef::IQueryEnvironment &env, const vespalib::string return 0; } +std::optional<DocumentFrequency> +lookup_document_frequency(const search::fef::IQueryEnvironment& env, const ITermData& term) +{ + vespalib::asciistream os; + auto unique_id = term.getUniqueId(); + if (unique_id != 0) { + os << "vespa.term." 
<< unique_id << ".docfreq"; + Property p = env.getProperties().lookup(os.str()); + if (p.size() == 2) { + // we have a defined document frequency + auto document_frequency = strToNum<uint64_t>(p.getAt(0)); + auto document_count = strToNum<uint64_t>(p.getAt(1)); + return DocumentFrequency(document_frequency, document_count); + } + } + return {}; +} + +std::optional<DocumentFrequency> +lookup_document_frequency(const search::fef::IQueryEnvironment& env, uint32_t termId) +{ + const ITermData* term = env.getTerm(termId); + if (term == nullptr) { + return {}; + } + return lookup_document_frequency(env, *term); +} + } diff --git a/searchlib/src/vespa/searchlib/features/utils.h b/searchlib/src/vespa/searchlib/features/utils.h index 518dbd42073..a0ca4b8be2a 100644 --- a/searchlib/src/vespa/searchlib/features/utils.h +++ b/searchlib/src/vespa/searchlib/features/utils.h @@ -2,6 +2,7 @@ #pragma once +#include "document_frequency.h" #include <vespa/searchlib/fef/iqueryenvironment.h> #include <vespa/searchlib/fef/table.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> @@ -10,6 +11,7 @@ #include <vespa/searchlib/common/feature.h> #include <vespa/vespalib/util/string_hash.h> #include <limits> +#include <optional> namespace search::features::util { @@ -191,4 +193,10 @@ getTermFieldHandle(const search::fef::IQueryEnvironment &env, uint32_t termId, u const search::fef::ITermData * getTermByLabel(const search::fef::IQueryEnvironment &env, const vespalib::string &label); +std::optional<DocumentFrequency> +lookup_document_frequency(const search::fef::IQueryEnvironment& env, const search::fef::ITermData& term); + +std::optional<DocumentFrequency> +lookup_document_frequency(const search::fef::IQueryEnvironment& env, uint32_t termId); + } |