aboutsummaryrefslogtreecommitdiffstats
path: root/searchsummary
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2023-01-24 16:37:15 +0100
committerTor Egge <Tor.Egge@online.no>2023-01-24 16:37:15 +0100
commit5c2aca998192db6b0d4cbcd054aa11db158b298b (patch)
treebeea1df066868901c116801fa27b61bfb004bcdf /searchsummary
parentbb1a582cbf3de4854243f88f05a73b355f00a3d0 (diff)
Add new KeywordExtractor with two factories (one each for indexed search
and streaming search).
Diffstat (limited to 'searchsummary')
-rw-r--r--searchsummary/CMakeLists.txt1
-rw-r--r--searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt9
-rw-r--r--searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp73
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt2
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp22
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h24
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp41
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h26
8 files changed, 198 insertions, 0 deletions
diff --git a/searchsummary/CMakeLists.txt b/searchsummary/CMakeLists.txt
index 9c9079e6ed5..451c90c752d 100644
--- a/searchsummary/CMakeLists.txt
+++ b/searchsummary/CMakeLists.txt
@@ -20,6 +20,7 @@ vespa_define_module(
src/tests/docsummary/attribute_combiner
src/tests/docsummary/attributedfw
src/tests/docsummary/document_id_dfw
+ src/tests/docsummary/keyword_extractor_factory
src/tests/docsummary/matched_elements_filter
src/tests/docsummary/result_class
src/tests/docsummary/slime_filler
diff --git a/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt b/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt
new file mode 100644
index 00000000000..1cb555f3bd8
--- /dev/null
+++ b/searchsummary/src/tests/docsummary/keyword_extractor_factory/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchsummary_keyword_extractor_factory_test_app TEST
+ SOURCES
+ keyword_extractor_factory_test.cpp
+ DEPENDS
+ searchsummary
+ GTest::GTest
+)
+vespa_add_test(NAME searchsummary_keyword_extractor_factory_test_app COMMAND searchsummary_keyword_extractor_factory_test_app)
diff --git a/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp b/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp
new file mode 100644
index 00000000000..8ba91699ae6
--- /dev/null
+++ b/searchsummary/src/tests/docsummary/keyword_extractor_factory/keyword_extractor_factory_test.cpp
@@ -0,0 +1,73 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchcommon/common/schema.h>
+#include <vespa/searchsummary/docsummary/i_keyword_extractor.h>
+#include <vespa/searchsummary/docsummary/keyword_extractor_factory.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using search::docsummary::IKeywordExtractor;
+using search::docsummary::IKeywordExtractorFactory;
+using search::docsummary::KeywordExtractorFactory;
+using search::index::Schema;
+
+using FieldSet = Schema::FieldSet;
+
+class KeywordExtractorFactoryTest : public testing::Test { // Fixture: builds a KeywordExtractorFactory from a schema edited by the test.
+ std::unique_ptr<IKeywordExtractorFactory> _factory; // Created lazily by check_index(); reset whenever the schema changes.
+ Schema _schema; // Schema the factory is built from; extended by add_field_set().
+
+protected:
+ KeywordExtractorFactoryTest();
+ ~KeywordExtractorFactoryTest() override;
+
+ void make_factory() { // (Re)build the factory from the current schema.
+ _factory = std::make_unique<KeywordExtractorFactory>(_schema);
+ }
+
+ bool check_index(const vespalib::string &index_name, const vespalib::string& summary_field) { // True if the extractor made for summary_field accepts index_name.
+ if (!_factory) {
+ make_factory();
+ }
+ auto extractor = _factory->make(summary_field);
+ return extractor->isLegalIndex(index_name);
+ }
+
+ void add_field_set(const vespalib::string& field_set_name, const std::vector<vespalib::string>& field_names) { // Add a field set with the given member fields to the schema.
+ FieldSet field_set(field_set_name);
+ for (auto& field_name : field_names) {
+ field_set.addField(field_name);
+ }
+ _schema.addFieldSet(field_set);
+ _factory.reset(); // Drop the stale factory so the next check_index() rebuilds it from the updated schema.
+ }
+};
+
+
+KeywordExtractorFactoryTest::KeywordExtractorFactoryTest() // Starts without a factory; check_index() creates it on first use.
+ : testing::Test(),
+ _factory()
+{
+}
+
+KeywordExtractorFactoryTest::~KeywordExtractorFactoryTest() = default;
+
+TEST_F(KeywordExtractorFactoryTest, empty_schema) // Without field sets, only the summary field's own name is a legal index.
+{
+ EXPECT_TRUE(check_index("foo", "foo"));
+ EXPECT_FALSE(check_index("bar", "foo"));
+ EXPECT_FALSE(check_index("foo", "bar"));
+}
+
+TEST_F(KeywordExtractorFactoryTest, field_set_is_checked) // A field set name is a legal index for exactly the fields that set contains.
+{
+ add_field_set("ab", {"cd", "de"});
+ add_field_set("gh", {"cd"});
+ EXPECT_TRUE(check_index("cd", "cd"));
+ EXPECT_TRUE(check_index("ab", "cd"));
+ EXPECT_TRUE(check_index("gh", "cd"));
+ EXPECT_TRUE(check_index("de", "de"));
+ EXPECT_TRUE(check_index("ab", "de"));
+ EXPECT_FALSE(check_index("gh", "de")); // "gh" contains only "cd", so it is not legal for "de".
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
index cfc3eb6536d..34e902461f4 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
+++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
@@ -23,6 +23,8 @@ vespa_add_library(searchsummary_docsummary OBJECT
juniper_dfw_term_visitor.cpp
juniper_query_adapter.cpp
juniperproperties.cpp
+ keyword_extractor.cpp
+ keyword_extractor_factory.cpp
legacy_keyword_extractor.cpp
legacy_keyword_extractor_factory.cpp
linguisticsannotation.cpp
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp
new file mode 100644
index 00000000000..71b685c6317
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.cpp
@@ -0,0 +1,22 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "keyword_extractor.h"
+#include <vespa/vespalib/stllike/hash_set.hpp>
+
+namespace search::docsummary {
+
+KeywordExtractor::KeywordExtractor(StringSet indexes) // Takes the set of legal index names by value and moves it into the member.
+ : IKeywordExtractor(),
+ _indexes(std::move(indexes))
+{
+}
+
+KeywordExtractor::~KeywordExtractor() = default;
+
+bool
+KeywordExtractor::isLegalIndex(vespalib::stringref idx) const // True iff idx is one of the index names given at construction.
+{
+ return _indexes.contains(idx);
+}
+
+}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h
new file mode 100644
index 00000000000..a2b1fba96f1
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor.h
@@ -0,0 +1,24 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "i_keyword_extractor.h"
+#include <vespa/vespalib/stllike/hash_set.h>
+
+namespace search::docsummary {
+
+/*
+ * Checks whether a query term's index name is one of a fixed set of index names,
+ * i.e. whether the related query term is useful from the perspective of juniper.
+ */
+class KeywordExtractor : public IKeywordExtractor
+{
+ using StringSet = vespalib::hash_set<vespalib::string>;
+ StringSet _indexes; // Index names accepted by isLegalIndex().
+public:
+ explicit KeywordExtractor(StringSet indexes); // explicit: a string set must not convert implicitly to an extractor.
+ ~KeywordExtractor() override;
+ bool isLegalIndex(vespalib::stringref idx) const override; // True iff idx is in the set given at construction.
+};
+
+}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp
new file mode 100644
index 00000000000..f749e6e42a1
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.cpp
@@ -0,0 +1,41 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "keyword_extractor_factory.h"
+#include "keyword_extractor.h"
+#include <vespa/searchcommon/common/schema.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
+#include <vespa/vespalib/stllike/hash_set.hpp>
+
+namespace search::docsummary {
+
+KeywordExtractorFactory::KeywordExtractorFactory(const search::index::Schema& schema) // Builds a map from field name to the names of the field sets containing that field.
+ : IKeywordExtractorFactory(),
+ _index_map()
+{
+ for (uint32_t i = 0; i < schema.getNumFieldSets(); ++i) {
+ auto& field_set = schema.getFieldSet(i);
+ auto& fields = field_set.getFields();
+ for (auto& field : fields) {
+ auto& vec = _index_map[field]; // Field sets recorded so far for this field.
+ vec.emplace_back(field_set.getName());
+ }
+ }
+}
+
+KeywordExtractorFactory::~KeywordExtractorFactory() = default;
+
+std::shared_ptr<const IKeywordExtractor>
+KeywordExtractorFactory::make(vespalib::stringref input_field) const // Extractor accepting input_field itself plus every field set that contains it.
+{
+ vespalib::hash_set<vespalib::string> indexes;
+ indexes.insert(input_field); // The field's own name is always a legal index.
+ auto itr = _index_map.find(input_field);
+ if (itr != _index_map.end()) {
+ for (auto& index : itr->second) {
+ indexes.insert(index);
+ }
+ }
+ return std::make_shared<KeywordExtractor>(std::move(indexes));
+}
+
+}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h
new file mode 100644
index 00000000000..e22475eb842
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/keyword_extractor_factory.h
@@ -0,0 +1,26 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "i_keyword_extractor_factory.h"
+#include <vespa/searchcommon/common/schema.h>
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <vector>
+
+namespace search::index { class Schema; }
+
+namespace search::docsummary {
+
+/*
+ * Creates IKeywordExtractor instances; each accepts the input field name and the names of the schema field sets containing that field.
+ */
+class KeywordExtractorFactory : public IKeywordExtractorFactory
+{
+ vespalib::hash_map<vespalib::string, std::vector<vespalib::string>> _index_map; // Field name -> names of the field sets containing it.
+public:
+ explicit KeywordExtractorFactory(const search::index::Schema& schema); // explicit: a Schema must not convert implicitly to a factory.
+ ~KeywordExtractorFactory() override;
+ std::shared_ptr<const IKeywordExtractor> make(vespalib::stringref input_field) const override; // Builds the extractor for one summary input field.
+};
+
+}