diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-01-24 16:37:15 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-01-24 16:37:15 +0100 |
commit | 5c2aca998192db6b0d4cbcd054aa11db158b298b (patch) | |
tree | beea1df066868901c116801fa27b61bfb004bcdf /searchsummary | |
parent | bb1a582cbf3de4854243f88f05a73b355f00a3d0 (diff) |
Add new KeywordExtractor with two factories (one each for indexed search
and streaming search).
Diffstat (limited to 'searchsummary')
8 files changed, 198 insertions, 0 deletions
diff --git a/searchsummary/CMakeLists.txt b/searchsummary/CMakeLists.txt index 9c9079e6ed5..451c90c752d 100644 --- a/searchsummary/CMakeLists.txt +++ b/searchsummary/CMakeLists.txt @@ -20,6 +20,7 @@ vespa_define_module( src/tests/docsummary/attribute_combiner src/tests/docsummary/attributedfw src/tests/docsummary/document_id_dfw + src/tests/docsummary/keyword_extractor_factory src/tests/docsummary/matched_elements_filter src/tests/docsummary/result_class src/tests/docsummary/slime_filler diff --git a/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt b/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt new file mode 100644 index 00000000000..1cb555f3bd8 --- /dev/null +++ b/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchsummary_keyword_extractor_factory_test_app TEST + SOURCES + keyword_extractor_factory_test.cpp + DEPENDS + searchsummary + GTest::GTest +) +vespa_add_test(NAME searchsummary_keyword_extractor_factory_test_app COMMAND searchsummary_keyword_extractor_factory_test_app) diff --git a/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp b/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp new file mode 100644 index 00000000000..8ba91699ae6 --- /dev/null +++ b/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp @@ -0,0 +1,73 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchsummary/docsummary/i_keyword_extractor.h> +#include <vespa/searchsummary/docsummary/keyword_extractor_factory.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::docsummary::IKeywordExtractor; +using search::docsummary::IKeywordExtractorFactory; +using search::docsummary::KeywordExtractorFactory; +using search::index::Schema; + +using FieldSet = Schema::FieldSet; + +class KeywordExtractorFactoryTest : public testing::Test { + std::unique_ptr<IKeywordExtractorFactory> _factory; + Schema _schema; + +protected: + KeywordExtractorFactoryTest(); + ~KeywordExtractorFactoryTest() override; + + void make_factory() { + _factory = std::make_unique<KeywordExtractorFactory>(_schema); + } + + bool check_index(const vespalib::string &index_name, const vespalib::string& summary_field) { + if (!_factory) { + make_factory(); + } + auto extractor = _factory->make(summary_field); + return extractor->isLegalIndex(index_name); + } + + void add_field_set(const vespalib::string& field_set_name, const std::vector<vespalib::string>& field_names) { + FieldSet field_set(field_set_name); + for (auto& field_name : field_names) { + field_set.addField(field_name); + } + _schema.addFieldSet(field_set); + _factory.reset(); + } +}; + + +KeywordExtractorFactoryTest::KeywordExtractorFactoryTest() + : testing::Test(), + _factory() +{ +} + +KeywordExtractorFactoryTest::~KeywordExtractorFactoryTest() = default; + +TEST_F(KeywordExtractorFactoryTest, empty_schema) +{ + EXPECT_TRUE(check_index("foo", "foo")); + EXPECT_FALSE(check_index("bar", "foo")); + EXPECT_FALSE(check_index("foo", "bar")); +} + +TEST_F(KeywordExtractorFactoryTest, field_set_is_checked) +{ + add_field_set("ab", {"cd", "de"}); + add_field_set("gh", {"cd"}); + EXPECT_TRUE(check_index("cd", "cd")); + EXPECT_TRUE(check_index("ab", "cd")); + EXPECT_TRUE(check_index("gh", "cd")); + EXPECT_TRUE(check_index("de", "de")); + EXPECT_TRUE(check_index("ab", "de")); + EXPECT_FALSE(check_index("gh", "de")); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index cfc3eb6536d..34e902461f4 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -23,6 +23,8 @@ vespa_add_library(searchsummary_docsummary OBJECT juniper_dfw_term_visitor.cpp juniper_query_adapter.cpp juniperproperties.cpp + keyword_extractor.cpp + keyword_extractor_factory.cpp legacy_keyword_extractor.cpp legacy_keyword_extractor_factory.cpp linguisticsannotation.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp new file mode 100644 index 00000000000..71b685c6317 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "keyword_extractor.h" +#include <vespa/vespalib/stllike/hash_set.hpp> + +namespace search::docsummary { + +KeywordExtractor::KeywordExtractor(StringSet indexes) + : IKeywordExtractor(), + _indexes(std::move(indexes)) +{ +} + +KeywordExtractor::~KeywordExtractor() = default; + +bool +KeywordExtractor::isLegalIndex(vespalib::stringref idx) const +{ + return _indexes.contains(idx); +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h new file mode 100644 index 00000000000..a2b1fba96f1 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h @@ -0,0 +1,24 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_keyword_extractor.h" +#include <vespa/vespalib/stllike/hash_set.h> + +namespace search::docsummary { + +/* + * Class for checking if query term index name indicates that + * related query term is useful from the perspective of juniper. + */ +class KeywordExtractor : public IKeywordExtractor +{ + using StringSet = vespalib::hash_set<vespalib::string>; + StringSet _indexes; +public: + KeywordExtractor(StringSet indexes); + ~KeywordExtractor() override; + bool isLegalIndex(vespalib::stringref idx) const override; +}; + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp new file mode 100644 index 00000000000..f749e6e42a1 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp @@ -0,0 +1,41 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "keyword_extractor_factory.h" +#include "keyword_extractor.h" +#include <vespa/searchcommon/common/schema.h> +#include <vespa/vespalib/stllike/hash_map.hpp> +#include <vespa/vespalib/stllike/hash_set.hpp> + +namespace search::docsummary { + +KeywordExtractorFactory::KeywordExtractorFactory(const search::index::Schema& schema) + : IKeywordExtractorFactory(), + _index_map() +{ + for (uint32_t i = 0; i < schema.getNumFieldSets(); ++i) { + auto& field_set = schema.getFieldSet(i); + auto& fields = field_set.getFields(); + for (auto& field : fields) { + auto& vec = _index_map[field]; + vec.emplace_back(field_set.getName()); + } + } +} + +KeywordExtractorFactory::~KeywordExtractorFactory() = default; + +std::shared_ptr<const IKeywordExtractor> +KeywordExtractorFactory::make(vespalib::stringref input_field) const +{ + vespalib::hash_set<vespalib::string> indexes; + indexes.insert(input_field); + auto itr = _index_map.find(input_field); + if (itr != _index_map.end()) { + for (auto& index : itr->second) { + indexes.insert(index); + } + } + return std::make_shared<KeywordExtractor>(std::move(indexes)); +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h new file mode 100644 index 00000000000..e22475eb842 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h @@ -0,0 +1,26 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_keyword_extractor_factory.h" +#include <vespa/searchcommon/common/schema.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <vector> + +namespace search::index { class Schema; } + +namespace search::docsummary { + +/* + * Class for creating an instance of IKeywordExtractor. + */ +class KeywordExtractorFactory : public IKeywordExtractorFactory +{ + vespalib::hash_map<vespalib::string, std::vector<vespalib::string>> _index_map; +public: + KeywordExtractorFactory(const search::index::Schema& schema); + ~KeywordExtractorFactory() override; + std::shared_ptr<const IKeywordExtractor> make(vespalib::stringref input_field) const override; +}; + +} |