diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-01-24 16:37:15 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-01-24 16:37:15 +0100 |
commit | 5c2aca998192db6b0d4cbcd054aa11db158b298b (patch) | |
tree | beea1df066868901c116801fa27b61bfb004bcdf | |
parent | bb1a582cbf3de4854243f88f05a73b355f00a3d0 (diff) |
Add new KeywordExtractor with two factories (one each for indexed search
and streaming search).
14 files changed, 444 insertions, 0 deletions
diff --git a/searchsummary/CMakeLists.txt b/searchsummary/CMakeLists.txt index 9c9079e6ed5..451c90c752d 100644 --- a/searchsummary/CMakeLists.txt +++ b/searchsummary/CMakeLists.txt @@ -20,6 +20,7 @@ vespa_define_module( src/tests/docsummary/attribute_combiner src/tests/docsummary/attributedfw src/tests/docsummary/document_id_dfw + src/tests/docsummary/keyword_extractor_factory src/tests/docsummary/matched_elements_filter src/tests/docsummary/result_class src/tests/docsummary/slime_filler diff --git a/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt b/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt new file mode 100644 index 00000000000..1cb555f3bd8 --- /dev/null +++ b/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchsummary_keyword_extractor_factory_test_app TEST + SOURCES + keyword_extractor_factory_test.cpp + DEPENDS + searchsummary + GTest::GTest +) +vespa_add_test(NAME searchsummary_keyword_extractor_factory_test_app COMMAND searchsummary_keyword_extractor_factory_test_app) diff --git a/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp b/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp new file mode 100644 index 00000000000..8ba91699ae6 --- /dev/null +++ b/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp @@ -0,0 +1,73 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchsummary/docsummary/i_keyword_extractor.h> +#include <vespa/searchsummary/docsummary/keyword_extractor_factory.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::docsummary::IKeywordExtractor; +using search::docsummary::IKeywordExtractorFactory; +using search::docsummary::KeywordExtractorFactory; +using search::index::Schema; + +using FieldSet = Schema::FieldSet; + +class KeywordExtractorFactoryTest : public testing::Test { + std::unique_ptr<IKeywordExtractorFactory> _factory; + Schema _schema; + +protected: + KeywordExtractorFactoryTest(); + ~KeywordExtractorFactoryTest() override; + + void make_factory() { + _factory = std::make_unique<KeywordExtractorFactory>(_schema); + } + + bool check_index(const vespalib::string &index_name, const vespalib::string& summary_field) { + if (!_factory) { + make_factory(); + } + auto extractor = _factory->make(summary_field); + return extractor->isLegalIndex(index_name); + } + + void add_field_set(const vespalib::string& field_set_name, const std::vector<vespalib::string>& field_names) { + FieldSet field_set(field_set_name); + for (auto& field_name : field_names) { + field_set.addField(field_name); + } + _schema.addFieldSet(field_set); + _factory.reset(); + } +}; + + +KeywordExtractorFactoryTest::KeywordExtractorFactoryTest() + : testing::Test(), + _factory() +{ +} + +KeywordExtractorFactoryTest::~KeywordExtractorFactoryTest() = default; + +TEST_F(KeywordExtractorFactoryTest, empty_schema) +{ + EXPECT_TRUE(check_index("foo", "foo")); + EXPECT_FALSE(check_index("bar", "foo")); + EXPECT_FALSE(check_index("foo", "bar")); +} + +TEST_F(KeywordExtractorFactoryTest, field_set_is_checked) +{ + add_field_set("ab", {"cd", "de"}); + add_field_set("gh", {"cd"}); + EXPECT_TRUE(check_index("cd", "cd")); + EXPECT_TRUE(check_index("ab", "cd")); + EXPECT_TRUE(check_index("gh", "cd")); + EXPECT_TRUE(check_index("de", "de")); + EXPECT_TRUE(check_index("ab", "de")); + EXPECT_FALSE(check_index("gh", "de")); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index cfc3eb6536d..34e902461f4 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -23,6 +23,8 @@ vespa_add_library(searchsummary_docsummary OBJECT juniper_dfw_term_visitor.cpp juniper_query_adapter.cpp juniperproperties.cpp + keyword_extractor.cpp + keyword_extractor_factory.cpp legacy_keyword_extractor.cpp legacy_keyword_extractor_factory.cpp linguisticsannotation.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp new file mode 100644 index 00000000000..71b685c6317 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "keyword_extractor.h" +#include <vespa/vespalib/stllike/hash_set.hpp> + +namespace search::docsummary { + +KeywordExtractor::KeywordExtractor(StringSet indexes) + : IKeywordExtractor(), + _indexes(std::move(indexes)) +{ +} + +KeywordExtractor::~KeywordExtractor() = default; + +bool +KeywordExtractor::isLegalIndex(vespalib::stringref idx) const +{ + return _indexes.contains(idx); +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h new file mode 100644 index 00000000000..a2b1fba96f1 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h @@ -0,0 +1,24 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_keyword_extractor.h" +#include <vespa/vespalib/stllike/hash_set.h> + +namespace search::docsummary { + +/* + * Class for checking if query term index name indicates that + * related query term is useful from the perspective of juniper. + */ +class KeywordExtractor : public IKeywordExtractor +{ + using StringSet = vespalib::hash_set<vespalib::string>; + StringSet _indexes; +public: + KeywordExtractor(StringSet indexes); + ~KeywordExtractor() override; + bool isLegalIndex(vespalib::stringref idx) const override; +}; + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp new file mode 100644 index 00000000000..f749e6e42a1 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp @@ -0,0 +1,41 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "keyword_extractor_factory.h" +#include "keyword_extractor.h" +#include <vespa/searchcommon/common/schema.h> +#include <vespa/vespalib/stllike/hash_map.hpp> +#include <vespa/vespalib/stllike/hash_set.hpp> + +namespace search::docsummary { + +KeywordExtractorFactory::KeywordExtractorFactory(const search::index::Schema& schema) + : IKeywordExtractorFactory(), + _index_map() +{ + for (uint32_t i = 0; i < schema.getNumFieldSets(); ++i) { + auto& field_set = schema.getFieldSet(i); + auto& fields = field_set.getFields(); + for (auto& field : fields) { + auto& vec = _index_map[field]; + vec.emplace_back(field_set.getName()); + } + } +} + +KeywordExtractorFactory::~KeywordExtractorFactory() = default; + +std::shared_ptr<const IKeywordExtractor> +KeywordExtractorFactory::make(vespalib::stringref input_field) const +{ + vespalib::hash_set<vespalib::string> indexes; + indexes.insert(input_field); + auto itr = _index_map.find(input_field); + if (itr != _index_map.end()) { + for (auto& index : itr->second) { + indexes.insert(index); + } + } + return std::make_shared<KeywordExtractor>(std::move(indexes)); +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h new file mode 100644 index 00000000000..e22475eb842 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h @@ -0,0 +1,26 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_keyword_extractor_factory.h" +#include <vespa/searchcommon/common/schema.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <vector> + +namespace search::index { class Schema; } + +namespace search::docsummary { + +/* + * Class for creating an instance of IKeywordExtractor. + */ +class KeywordExtractorFactory : public IKeywordExtractorFactory +{ + vespalib::hash_map<vespalib::string, std::vector<vespalib::string>> _index_map; +public: + KeywordExtractorFactory(const search::index::Schema& schema); + ~KeywordExtractorFactory() override; + std::shared_ptr<const IKeywordExtractor> make(vespalib::stringref input_field) const override; +}; + +} diff --git a/streamingvisitors/CMakeLists.txt b/streamingvisitors/CMakeLists.txt index 0e7789a21b9..adfee1a76ae 100644 --- a/streamingvisitors/CMakeLists.txt +++ b/streamingvisitors/CMakeLists.txt @@ -26,6 +26,7 @@ vespa_define_module( src/tests/charbuffer src/tests/docsum src/tests/document + src/tests/keyword_extractor_factory src/tests/searcher src/tests/textutil ) diff --git a/streamingvisitors/src/tests/keyword_extractor_factory/CMakeLists.txt b/streamingvisitors/src/tests/keyword_extractor_factory/CMakeLists.txt new file mode 100644 index 00000000000..54e2368f200 --- /dev/null +++ b/streamingvisitors/src/tests/keyword_extractor_factory/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(streamingvisitors_keyword_extractor_factory_test_app TEST + SOURCES + keyword_extractor_factory_test.cpp + DEPENDS + streamingvisitors + GTest::GTest +) +vespa_add_test(NAME streamingvisitors_keyword_extractor_factory_test_app COMMAND streamingvisitors_keyword_extractor_factory_test_app) diff --git a/streamingvisitors/src/tests/keyword_extractor_factory/keyword_extractor_factory_test.cpp b/streamingvisitors/src/tests/keyword_extractor_factory/keyword_extractor_factory_test.cpp new file mode 100644 index 00000000000..6ed4dfa1425 --- /dev/null +++ b/streamingvisitors/src/tests/keyword_extractor_factory/keyword_extractor_factory_test.cpp @@ -0,0 +1,116 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchsummary/docsummary/i_keyword_extractor.h> +#include <vespa/vsm/vsm/keyword_extractor_factory.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::docsummary::IKeywordExtractor; +using search::docsummary::IKeywordExtractorFactory; +using vespa::config::search::vsm::VsmfieldsConfig; +using vespa::config::search::vsm::VsmfieldsConfigBuilder; +using vespa::config::search::vsm::VsmsummaryConfig; +using vespa::config::search::vsm::VsmsummaryConfigBuilder; +using vsm::KeywordExtractorFactory; + +class KeywordExtractorFactoryTest : public testing::Test { + std::unique_ptr<IKeywordExtractorFactory> _factory; + VsmfieldsConfigBuilder _fields; + VsmsummaryConfigBuilder _summary; +protected: + KeywordExtractorFactoryTest(); + ~KeywordExtractorFactoryTest() override; + + void make_factory() { + _factory = std::make_unique<KeywordExtractorFactory>(_fields, _summary); + } + + bool check_index(const vespalib::string &index_name, const vespalib::string& summary_field) { + if (!_factory) { + make_factory(); + } + auto extractor = _factory->make(summary_field); + return extractor->isLegalIndex(index_name); + } + + void add_summary_field(const vespalib::string& summary_field_name, const std::vector<vespalib::string>& field_names) + { + VsmsummaryConfigBuilder::Fieldmap field_map; + field_map.summary = summary_field_name; + for (auto& field_name : field_names) { + VsmsummaryConfigBuilder::Fieldmap::Document document; + document.field = field_name; + field_map.document.emplace_back(document); + } + _summary.fieldmap.emplace_back(field_map); + _factory.reset(); + } + void add_index(const vespalib::string& index_name, const std::vector<vespalib::string>& field_names) + { + if (_fields.documenttype.empty()) { + _fields.documenttype.resize(1); + _fields.documenttype.back().name = "dummy"; + } + VsmfieldsConfigBuilder::Documenttype::Index index; + index.name = index_name; + for (auto& field_name : field_names) { + VsmfieldsConfigBuilder::Documenttype::Index::Field field; + field.name = field_name; + index.field.emplace_back(field); + } + _fields.documenttype.back().index.emplace_back(index); + _factory.reset(); + } +}; + + +KeywordExtractorFactoryTest::KeywordExtractorFactoryTest() + : testing::Test(), + _factory() +{ +} + +KeywordExtractorFactoryTest::~KeywordExtractorFactoryTest() = default; + +TEST_F(KeywordExtractorFactoryTest, empty_config) +{ + EXPECT_FALSE(check_index("foo", "foo")); +} + +TEST_F(KeywordExtractorFactoryTest, implied_identity_mapping_for_summary_field) +{ + add_index("foo", {"bar"}); + EXPECT_FALSE(check_index("foo", "foo")); + EXPECT_TRUE(check_index("foo", "bar")); +} + +TEST_F(KeywordExtractorFactoryTest, two_source_fields_for_summary_field) +{ + add_index("bar", {"bar"}); + add_index("baz", {"baz"}); + add_summary_field("foo", {"bar", "baz"}); + EXPECT_FALSE(check_index("foo", "foo")); + EXPECT_TRUE(check_index("bar", "foo")); + EXPECT_TRUE(check_index("bar", "bar")); + EXPECT_TRUE(check_index("baz", "foo")); + EXPECT_TRUE(check_index("baz", "baz")); +} + +TEST_F(KeywordExtractorFactoryTest, two_source_fields_for_summary_field_and_multiple_indexes) +{ + add_index("bar", {"bar"}); + add_index("baz", {"baz"}); + add_index("both", {"bar", "baz"}); + add_summary_field("foo", {"bar", "baz"}); + EXPECT_FALSE(check_index("foo", "foo")); + EXPECT_TRUE(check_index("both", "foo")); + EXPECT_TRUE(check_index("bar", "foo")); + EXPECT_TRUE(check_index("baz", "foo")); + EXPECT_TRUE(check_index("both", "bar")); + EXPECT_TRUE(check_index("bar", "bar")); + EXPECT_FALSE(check_index("baz", "bar")); + EXPECT_TRUE(check_index("both", "baz")); + EXPECT_FALSE(check_index("bar", "baz")); + EXPECT_TRUE(check_index("baz", "baz")); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt index cf121aead4b..67acbc1a391 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt +++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt @@ -6,6 +6,7 @@ vespa_add_library(vsm_vsmbase OBJECT docsum_field_writer_factory.cpp fieldsearchspec.cpp flattendocsumwriter.cpp + keyword_extractor_factory.cpp snippetmodifier.cpp vsm-adapter.cpp DEPENDS diff --git a/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.cpp b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.cpp new file mode 100644 index 00000000000..5319f554c81 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.cpp @@ -0,0 +1,80 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "keyword_extractor_factory.h" +#include <vespa/searchsummary/docsummary/keyword_extractor.h> +#include <vespa/vespalib/stllike/hash_map.hpp> +#include <vespa/vespalib/stllike/hash_set.hpp> +#include <cassert> +#include <vespa/log/log.h> +LOG_SETUP(".vsm.keyword_extractor_factory"); + +using search::docsummary::IKeywordExtractor; +using search::docsummary::IKeywordExtractorFactory; +using search::docsummary::KeywordExtractor; +using vespa::config::search::vsm::VsmfieldsConfig; +using vespa::config::search::vsm::VsmsummaryConfig; + +namespace vsm { + +KeywordExtractorFactory::KeywordExtractorFactory(VsmfieldsConfig& vsm_fields_config, + VsmsummaryConfig& vsm_summary_config) + : IKeywordExtractorFactory(), + _index_map(), + _field_map() +{ + populate_index_map(vsm_fields_config); + populate_field_map(vsm_summary_config); +} + +KeywordExtractorFactory::~KeywordExtractorFactory() = default; + +void +KeywordExtractorFactory::populate_index_map(VsmfieldsConfig& vsm_fields_config) +{ + for (auto& doctype : vsm_fields_config.documenttype) { + for (auto& index : doctype.index) { + for (auto& field : index.field) { + _index_map[field.name].insert(index.name); + } + } + } +} + +void +KeywordExtractorFactory::populate_field_map(VsmsummaryConfig& vsm_summary_config) +{ + for (auto& summary_field : vsm_summary_config.fieldmap) { + for (auto& document : summary_field.document) { + _field_map[summary_field.summary].insert(document.field); + } + } +} + +void +KeywordExtractorFactory::populate_indexes(StringSet& indexes, const vespalib::string& field) const +{ + auto itr = _index_map.find(field); + if (itr != _index_map.end()) { + for (auto& index : itr->second) { + indexes.insert(index); + } + } +} + +std::shared_ptr<const IKeywordExtractor> +KeywordExtractorFactory::make(vespalib::stringref input_field) const +{ + StringSet indexes; + auto itr = _field_map.find(input_field); + if (itr != _field_map.end()) { + for (auto& field : itr->second) { + populate_indexes(indexes, field); + } + } else { + // Assume identity mapping vsm summary field -> document field + populate_indexes(indexes, input_field); + } + return std::make_shared<KeywordExtractor>(std::move(indexes)); +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.h b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.h new file mode 100644 index 00000000000..6ffcbd6f84b --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.h @@ -0,0 +1,39 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchsummary/docsummary/i_keyword_extractor_factory.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <vespa/vespalib/stllike/hash_set.h> +#include <vespa/vsm/config/config-vsmfields.h> +#include <vespa/vsm/config/config-vsmsummary.h> + +namespace vsm { + +/* + * Class for creating an instance of IKeywordExtractor for streaming search. + * + * vsm summary fields are treated as document fields by the summary framework + * in the searchsummary module, cf. IDocsumStoreDocument. + */ +class KeywordExtractorFactory : public search::docsummary::IKeywordExtractorFactory +{ +public: + using VsmfieldsConfig = vespa::config::search::vsm::VsmfieldsConfig; + using VsmsummaryConfig = vespa::config::search::vsm::VsmsummaryConfig; +private: + using StringSet = vespalib::hash_set<vespalib::string>; + using StringSetMap = vespalib::hash_map<vespalib::string, StringSet>; + StringSetMap _index_map; // document field -> indexes + StringSetMap _field_map; // vsm summary field -> document fields + void populate_index_map(VsmfieldsConfig& vsm_fields_config); + void populate_field_map(VsmsummaryConfig& vsm_summary_config); + void populate_indexes(StringSet& indexes, const vespalib::string& field) const; +public: + KeywordExtractorFactory(VsmfieldsConfig& vsm_fields_config, + VsmsummaryConfig& vsm_summary_config); + ~KeywordExtractorFactory() override; + std::shared_ptr<const search::docsummary::IKeywordExtractor> make(vespalib::stringref input_field) const override; +}; + +} |