From 5c2aca998192db6b0d4cbcd054aa11db158b298b Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 24 Jan 2023 16:37:15 +0100 Subject: Add new KeywordExtractor with two factories (one each for indexed search and streaming search). --- .../tests/keyword_extractor_factory/CMakeLists.txt | 9 ++ .../keyword_extractor_factory_test.cpp | 116 +++++++++++++++++++++ streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt | 1 + .../vespa/vsm/vsm/keyword_extractor_factory.cpp | 80 ++++++++++++++ .../src/vespa/vsm/vsm/keyword_extractor_factory.h | 39 +++++++ 5 files changed, 245 insertions(+) create mode 100644 streamingvisitors/src/tests/keyword_extractor_factory/CMakeLists.txt create mode 100644 streamingvisitors/src/tests/keyword_extractor_factory/keyword_extractor_factory_test.cpp create mode 100644 streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.cpp create mode 100644 streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.h (limited to 'streamingvisitors/src') diff --git a/streamingvisitors/src/tests/keyword_extractor_factory/CMakeLists.txt b/streamingvisitors/src/tests/keyword_extractor_factory/CMakeLists.txt new file mode 100644 index 00000000000..54e2368f200 --- /dev/null +++ b/streamingvisitors/src/tests/keyword_extractor_factory/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(streamingvisitors_keyword_extractor_factory_test_app TEST
+    SOURCES
+    keyword_extractor_factory_test.cpp
+    DEPENDS
+    streamingvisitors
+    GTest::GTest
+)
+vespa_add_test(NAME streamingvisitors_keyword_extractor_factory_test_app COMMAND streamingvisitors_keyword_extractor_factory_test_app)
diff --git a/streamingvisitors/src/tests/keyword_extractor_factory/keyword_extractor_factory_test.cpp b/streamingvisitors/src/tests/keyword_extractor_factory/keyword_extractor_factory_test.cpp
new file mode 100644
index 00000000000..6ed4dfa1425
--- /dev/null
+++ b/streamingvisitors/src/tests/keyword_extractor_factory/keyword_extractor_factory_test.cpp
@@ -0,0 +1,116 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// NOTE(review): the three #include targets below and several template argument lists (unique_ptr, make_unique, vector) are blank in this rendering; restore them from the upstream commit before building.
+#include
+#include
+#include
+
+using search::docsummary::IKeywordExtractor;
+using search::docsummary::IKeywordExtractorFactory;
+using vespa::config::search::vsm::VsmfieldsConfig;
+using vespa::config::search::vsm::VsmfieldsConfigBuilder;
+using vespa::config::search::vsm::VsmsummaryConfig;
+using vespa::config::search::vsm::VsmsummaryConfigBuilder;
+using vsm::KeywordExtractorFactory;
+// Fixture that accumulates vsm fields/summary config in builders and creates the factory under test on demand.
+class KeywordExtractorFactoryTest : public testing::Test {
+    std::unique_ptr _factory; // cached factory; reset whenever the config changes. NOTE(review): presumably std::unique_ptr<KeywordExtractorFactory> - confirm
+    VsmfieldsConfigBuilder _fields; // index -> document field config handed to the factory
+    VsmsummaryConfigBuilder _summary; // summary field -> document field config handed to the factory
+protected:
+    KeywordExtractorFactoryTest();
+    ~KeywordExtractorFactoryTest() override;
+    // (Re)creates the factory from the current contents of _fields and _summary.
+    void make_factory() {
+        _factory = std::make_unique(_fields, _summary); // NOTE(review): presumably make_unique<KeywordExtractorFactory> - confirm
+    }
+    // Asks the extractor made for summary_field whether index_name is a legal index; builds the factory first if none is cached.
+    bool check_index(const vespalib::string &index_name, const vespalib::string& summary_field) {
+        if (!_factory) {
+            make_factory();
+        }
+        auto extractor = _factory->make(summary_field);
+        return extractor->isLegalIndex(index_name);
+    }
+    // Appends a fieldmap entry (summary_field_name -> each of field_names) to _summary and drops the cached factory.
+    void add_summary_field(const vespalib::string& summary_field_name, const std::vector& field_names)
+    {
+        VsmsummaryConfigBuilder::Fieldmap field_map;
+        field_map.summary = summary_field_name;
+        for (auto& field_name : field_names) {
+            VsmsummaryConfigBuilder::Fieldmap::Document document;
+            document.field = field_name;
+            field_map.document.emplace_back(document);
+        }
+        _summary.fieldmap.emplace_back(field_map);
+        _factory.reset(); // force check_index() to rebuild the factory from the updated config
+    }
+    void add_index(const vespalib::string& index_name, const std::vector& field_names)
+    { // Appends an index covering field_names to the last document type and drops the cached factory.
+        if (_fields.documenttype.empty()) { // first call: create the single document type, named "dummy"
+            _fields.documenttype.resize(1);
+            _fields.documenttype.back().name = "dummy";
+        }
+        VsmfieldsConfigBuilder::Documenttype::Index index;
+        index.name = index_name;
+        for (auto& field_name : field_names) {
+            VsmfieldsConfigBuilder::Documenttype::Index::Field field;
+            field.name = field_name;
+            index.field.emplace_back(field);
+        }
+        _fields.documenttype.back().index.emplace_back(index);
+        _factory.reset(); // force check_index() to rebuild the factory from the updated config
+    }
+};
+
+
+KeywordExtractorFactoryTest::KeywordExtractorFactoryTest()
+    : testing::Test(),
+      _factory()
+{
+}
+
+KeywordExtractorFactoryTest::~KeywordExtractorFactoryTest() = default;
+
+TEST_F(KeywordExtractorFactoryTest, empty_config)
+{ // With nothing configured, "foo" is not a legal index for summary field "foo".
+    EXPECT_FALSE(check_index("foo", "foo"));
+}
+
+TEST_F(KeywordExtractorFactoryTest, implied_identity_mapping_for_summary_field)
+{ // No fieldmap entry is added: index "foo" (over field "bar") is legal for summary field "bar" but not for "foo".
+    add_index("foo", {"bar"});
+    EXPECT_FALSE(check_index("foo", "foo"));
+    EXPECT_TRUE(check_index("foo", "bar"));
+}
+
+TEST_F(KeywordExtractorFactoryTest, two_source_fields_for_summary_field)
+{ // Summary field "foo" is mapped to document fields "bar" and "baz", so the indexes of both are legal for it.
+    add_index("bar", {"bar"});
+    add_index("baz", {"baz"});
+    add_summary_field("foo", {"bar", "baz"});
+    EXPECT_FALSE(check_index("foo", "foo"));
+    EXPECT_TRUE(check_index("bar", "foo"));
+    EXPECT_TRUE(check_index("bar", "bar"));
+    EXPECT_TRUE(check_index("baz", "foo"));
+    EXPECT_TRUE(check_index("baz", "baz"));
+}
+
+TEST_F(KeywordExtractorFactoryTest, two_source_fields_for_summary_field_and_multiple_indexes)
+{ // Index "both" covers both fields and is legal for "foo", "bar" and "baz"; the single-field indexes are not legal for the other field.
+    add_index("bar", {"bar"});
+    add_index("baz", {"baz"});
+    add_index("both", {"bar", "baz"});
+    add_summary_field("foo", {"bar", "baz"});
+    EXPECT_FALSE(check_index("foo", "foo"));
+    EXPECT_TRUE(check_index("both", "foo"));
+    EXPECT_TRUE(check_index("bar", "foo"));
+    EXPECT_TRUE(check_index("baz", "foo"));
+    EXPECT_TRUE(check_index("both", "bar"));
+    EXPECT_TRUE(check_index("bar", "bar"));
+    EXPECT_FALSE(check_index("baz", "bar"));
+    EXPECT_TRUE(check_index("both", "baz"));
+    EXPECT_FALSE(check_index("bar", "baz"));
+    EXPECT_TRUE(check_index("baz", "baz"));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
index cf121aead4b..67acbc1a391 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
@@ -6,6 +6,7 @@ vespa_add_library(vsm_vsmbase OBJECT
     docsum_field_writer_factory.cpp
     fieldsearchspec.cpp
     flattendocsumwriter.cpp
+    keyword_extractor_factory.cpp
     snippetmodifier.cpp
     vsm-adapter.cpp
     DEPENDS
diff --git a/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.cpp b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.cpp
new file mode 100644
index 00000000000..5319f554c81
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.cpp
@@ -0,0 +1,80 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// NOTE(review): five of the #include targets below and several template argument lists (shared_ptr, make_shared, hash_set, hash_map) are blank in this rendering; restore them from the upstream commit before building.
+#include "keyword_extractor_factory.h"
+#include
+#include
+#include
+#include
+#include
+LOG_SETUP(".vsm.keyword_extractor_factory");
+
+using search::docsummary::IKeywordExtractor;
+using search::docsummary::IKeywordExtractorFactory;
+using search::docsummary::KeywordExtractor;
+using vespa::config::search::vsm::VsmfieldsConfig;
+using vespa::config::search::vsm::VsmsummaryConfig;
+
+namespace vsm {
+// Builds both lookup maps (document field -> indexes, vsm summary field -> document fields) from the given configs.
+KeywordExtractorFactory::KeywordExtractorFactory(VsmfieldsConfig& vsm_fields_config,
+                                                 VsmsummaryConfig& vsm_summary_config)
+    : IKeywordExtractorFactory(),
+      _index_map(),
+      _field_map()
+{
+    populate_index_map(vsm_fields_config);
+    populate_field_map(vsm_summary_config);
+}
+
+KeywordExtractorFactory::~KeywordExtractorFactory() = default;
+// Records, for every field of every index of every document type, that the field is covered by that index.
+void
+KeywordExtractorFactory::populate_index_map(VsmfieldsConfig& vsm_fields_config)
+{
+    for (auto& doctype : vsm_fields_config.documenttype) {
+        for (auto& index : doctype.index) {
+            for (auto& field : index.field) {
+                _index_map[field.name].insert(index.name); // operator[] creates the per-field set on first use
+            }
+        }
+    }
+}
+// Records, for every fieldmap entry, the document fields listed for its summary field.
+void
+KeywordExtractorFactory::populate_field_map(VsmsummaryConfig& vsm_summary_config)
+{
+    for (auto& summary_field : vsm_summary_config.fieldmap) {
+        for (auto& document : summary_field.document) {
+            _field_map[summary_field.summary].insert(document.field);
+        }
+    }
+}
+// Adds every index recorded for document field 'field' to 'indexes'; leaves 'indexes' untouched if the field is unknown.
+void
+KeywordExtractorFactory::populate_indexes(StringSet& indexes, const vespalib::string& field) const
+{
+    auto itr = _index_map.find(field);
+    if (itr != _index_map.end()) {
+        for (auto& index : itr->second) {
+            indexes.insert(index);
+        }
+    }
+}
+// Creates an extractor from the union of the indexes covering the document fields behind input_field.
+std::shared_ptr
+KeywordExtractorFactory::make(vespalib::stringref input_field) const
+{
+    StringSet indexes;
+    auto itr = _field_map.find(input_field);
+    if (itr != _field_map.end()) {
+        for (auto& field : itr->second) {
+            populate_indexes(indexes, field);
+        }
+    } else {
+        // Assume identity mapping vsm summary field -> document field
+        populate_indexes(indexes, input_field);
+    }
+    return std::make_shared(std::move(indexes)); // NOTE(review): template argument blank here - presumably KeywordExtractor (see using-declaration above); confirm
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.h b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.h
new file mode 100644
index 00000000000..6ffcbd6f84b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/keyword_extractor_factory.h
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace vsm {
+
+/*
+ * Class for creating an instance of IKeywordExtractor for streaming search.
+ * make() resolves a vsm summary field to its document fields and collects the indexes covering them.
+ * vsm summary fields are treated as document fields by the summary framework
+ * in the searchsummary module, cf. IDocsumStoreDocument.
+ */
+class KeywordExtractorFactory : public search::docsummary::IKeywordExtractorFactory
+{
+public:
+    using VsmfieldsConfig = vespa::config::search::vsm::VsmfieldsConfig;
+    using VsmsummaryConfig = vespa::config::search::vsm::VsmsummaryConfig;
+private:
+    using StringSet = vespalib::hash_set; // NOTE(review): template argument blank in this rendering - presumably a set of vespalib::string; confirm
+    using StringSetMap = vespalib::hash_map; // NOTE(review): template arguments blank - presumably vespalib::string -> StringSet; confirm
+    StringSetMap _index_map; // document field -> indexes
+    StringSetMap _field_map; // vsm summary field -> document fields
+    void populate_index_map(VsmfieldsConfig& vsm_fields_config); // fills _index_map
+    void populate_field_map(VsmsummaryConfig& vsm_summary_config); // fills _field_map
+    void populate_indexes(StringSet& indexes, const vespalib::string& field) const; // adds the indexes recorded for 'field' to 'indexes'
+public:
+    KeywordExtractorFactory(VsmfieldsConfig& vsm_fields_config,
+                            VsmsummaryConfig& vsm_summary_config);
+    ~KeywordExtractorFactory() override;
+    std::shared_ptr make(vespalib::stringref input_field) const override; // falls back to treating input_field as a document field when it has no fieldmap entry
+};
+
+}
--
cgit v1.2.3