From aea9c1ee84d1e17c2a6dc11e01f2981d724c3822 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 27 Mar 2024 14:52:53 +0100 Subject: Add streaming mode version of tokens document field writer. --- .../docsummary/docsum_field_writer_factory.cpp | 6 +- .../docsummary/docsum_field_writer_factory.h | 1 + streamingvisitors/CMakeLists.txt | 1 + .../src/tests/tokens_converter/CMakeLists.txt | 10 +++ .../tokens_converter/tokens_converter_test.cpp | 91 ++++++++++++++++++++++ streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt | 2 + .../vespa/vsm/vsm/docsum_field_writer_factory.cpp | 28 +++++++ .../src/vespa/vsm/vsm/tokens_converter.cpp | 59 ++++++++++++++ .../src/vespa/vsm/vsm/tokens_converter.h | 27 +++++++ streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp | 37 +++++++++ streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h | 28 +++++++ 11 files changed, 287 insertions(+), 3 deletions(-) create mode 100644 streamingvisitors/src/tests/tokens_converter/CMakeLists.txt create mode 100644 streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp create mode 100644 streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp create mode 100644 streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h create mode 100644 streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp create mode 100644 streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp index aa4a4342b0d..b11a0eb73cc 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp @@ -50,14 +50,14 @@ throw_if_nullptr(const std::unique_ptr& writer, } } +} + void -throw_missing_source(const vespalib::string& command) +DocsumFieldWriterFactory::throw_missing_source(const vespalib::string& command) { throw IllegalArgumentException("Missing source for command '" + command + "'."); } -} - std::unique_ptr DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& field_name, const vespalib::string& command, diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h index d4f52811687..d98f689fa3f 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h @@ -20,6 +20,7 @@ class DocsumFieldWriterFactory : public IDocsumFieldWriterFactory const IDocsumEnvironment& _env; const IQueryTermFilterFactory& _query_term_filter_factory; protected: + static void throw_missing_source(const vespalib::string& command); const IDocsumEnvironment& getEnvironment() const noexcept { return _env; } bool has_attribute_manager() const noexcept; public: diff --git a/streamingvisitors/CMakeLists.txt b/streamingvisitors/CMakeLists.txt index 2abbfd4a64e..a990f6a4de0 100644 --- a/streamingvisitors/CMakeLists.txt +++ b/streamingvisitors/CMakeLists.txt @@ -30,4 +30,5 @@ vespa_define_module( src/tests/searcher src/tests/searchvisitor src/tests/textutil + src/tests/tokens_converter ) diff --git a/streamingvisitors/src/tests/tokens_converter/CMakeLists.txt b/streamingvisitors/src/tests/tokens_converter/CMakeLists.txt new file mode 100644 index 00000000000..01a1fc965af --- /dev/null +++ b/streamingvisitors/src/tests/tokens_converter/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(streamingvisitors_tokens_converter_test_app TEST + SOURCES + tokens_converter_test.cpp + DEPENDS + streamingvisitors + GTest::gtest +) + +vespa_add_test(NAME streamingvisitors_tokens_converter_test_app COMMAND streamingvisitors_tokens_converter_test_app) diff --git a/streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp b/streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp new file mode 100644 index 00000000000..1d9c953519d --- /dev/null +++ b/streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp @@ -0,0 +1,91 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include +#include +#include +#include +#include + +using document::StringFieldValue; +using search::Normalizing; +using vespalib::SimpleBuffer; +using vespalib::Slime; +using vespalib::slime::JsonFormat; +using vespalib::slime::SlimeInserter; +using vsm::TokensConverter; + +namespace { + +vespalib::string +slime_to_string(const Slime& slime) +{ + SimpleBuffer buf; + JsonFormat::encode(slime, buf, true); + return buf.get().make_string(); +} + +} + +class TokensConverterTest : public testing::Test +{ +protected: + TokensConverterTest(); + ~TokensConverterTest() override; +; + vespalib::string convert(const StringFieldValue& fv, bool exact_match, Normalizing normalize_mode); +}; + +TokensConverterTest::TokensConverterTest() + : testing::Test() +{ +} + +TokensConverterTest::~TokensConverterTest() = default; + +vespalib::string +TokensConverterTest::convert(const StringFieldValue& fv, bool exact_match, Normalizing normalize_mode) +{ + TokensConverter converter(exact_match, normalize_mode); + Slime slime; + SlimeInserter inserter(slime); + converter.convert(fv, inserter); + return slime_to_string(slime); +} + +TEST_F(TokensConverterTest, convert_empty_string) +{ + vespalib::string exp(R"([])"); + StringFieldValue plain_string(""); + EXPECT_EQ(exp, convert(plain_string, false, Normalizing::NONE)); + EXPECT_EQ(exp, convert(plain_string, true, Normalizing::NONE)); +} + +TEST_F(TokensConverterTest, convert_exact_match) +{ + vespalib::string exp_none(R"([".Foo Bar Baz."])"); + vespalib::string exp_lowercase(R"([".foo bar baz."])"); + StringFieldValue plain_string(".Foo Bar Baz."); + EXPECT_EQ(exp_none, convert(plain_string, true, Normalizing::NONE)); + EXPECT_EQ(exp_lowercase, convert(plain_string, true, Normalizing::LOWERCASE)); +} + +TEST_F(TokensConverterTest, convert_tokenized_string) +{ + vespalib::string exp_none(R"(["Foo","Bar"])"); + vespalib::string exp_lowercase(R"(["foo","bar"])"); + StringFieldValue value(".Foo Bar."); + EXPECT_EQ(exp_none, convert(value, false, Normalizing::NONE)); + EXPECT_EQ(exp_lowercase, convert(value, false, Normalizing::LOWERCASE)); +} + +TEST_F(TokensConverterTest, convert_with_folding) +{ + vespalib::string exp_exact_match_folded(R"(["moerk vaarkveld"])"); + vespalib::string exp_tokenized_folded(R"(["moerk","vaarkveld"])"); + StringFieldValue value("Mørk vårkveld"); + EXPECT_EQ(exp_exact_match_folded, convert(value, true, Normalizing::LOWERCASE_AND_FOLD)); + EXPECT_EQ(exp_tokenized_folded, convert(value, false, Normalizing::LOWERCASE_AND_FOLD)); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt index a50a541f3b7..d13e2e1fd54 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt +++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt @@ -8,6 +8,8 @@ vespa_add_library(vsm_vsmbase OBJECT flattendocsumwriter.cpp query_term_filter_factory.cpp snippetmodifier.cpp + tokens_converter.cpp + tokens_dfw.cpp vsm-adapter.cpp DEPENDS vsm_vconfig diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp index 180200e2eaf..95e0f961d13 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "docsum_field_writer_factory.h" +#include "fieldsearchspec.h" +#include "tokens_dfw.h" #include #include #include @@ -8,8 +10,10 @@ #include #include #include +#include using search::MatchingElementsFields; +using search::Normalizing; using search::docsummary::CopyDFW; using search::docsummary::DocsumFieldWriter; using search::docsummary::EmptyDFW; @@ -35,6 +39,23 @@ void populate_fields(MatchingElementsFields& fields, VsmfieldsConfig& fields_con } } +bool is_exact_match(vespalib::stringref arg1) { + return ((arg1 == "exact") || (arg1 == "word")); +} + +std::unique_ptr +make_tokens_dfw(const vespalib::string& source, VsmfieldsConfig& fields_config) +{ + bool exact_match = false; + Normalizing normalize_mode = Normalizing::LOWERCASE; + auto it = std::find_if(fields_config.fieldspec.begin(), fields_config.fieldspec.end(), [&source](auto& fs) { return source == fs.name; }); + if (it != fields_config.fieldspec.end()) { + exact_match = is_exact_match(it->arg1); + normalize_mode = FieldSearchSpecMap::convert_normalize_mode(it->normalize); + } + return std::make_unique(source, exact_match, normalize_mode); +} + } DocsumFieldWriterFactory::DocsumFieldWriterFactory(bool use_v8_geo_positions, const IDocsumEnvironment& env, const IQueryTermFilterFactory& query_term_filter_factory, const vespa::config::search::vsm::VsmfieldsConfig& vsm_fields_config) @@ -68,6 +89,13 @@ DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& fie vespalib::string source_field = source.empty() ? field_name : source; populate_fields(*matching_elems_fields, _vsm_fields_config, source_field); fieldWriter = MatchedElementsFilterDFW::create(source_field, matching_elems_fields); + } else if ((command == command::tokens) || + (command == command::attribute_tokens)) { + if (!source.empty()) { + fieldWriter = make_tokens_dfw(source, _vsm_fields_config); + } else { + throw_missing_source(command); + } } else { return search::docsummary::DocsumFieldWriterFactory::create_docsum_field_writer(field_name, command, source, matching_elems_fields); } diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp new file mode 100644 index 00000000000..3534b751356 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp @@ -0,0 +1,59 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokens_converter.h" +#include +#include +#include +#include +#include + +using document::StringFieldValue; +using search::byte; +using vespalib::Utf8Writer; +using vespalib::slime::ArrayInserter; +using vespalib::slime::Cursor; +using vespalib::slime::Inserter; +using vsm::TokenizeReader; + +namespace vsm { + +TokensConverter::TokensConverter(bool exact_match, search::Normalizing normalize_mode) + : IStringFieldConverter(), + _text(), + _exact_match(exact_match), + _normalize_mode(normalize_mode) +{ +} + +TokensConverter::~TokensConverter() = default; + +void +TokensConverter::convert(const StringFieldValue &input, Inserter& inserter) +{ + _text = input.getValueRef(); + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + vespalib::Array buf(_text.size() + 1, 0); + vespalib::string scratch; + TokenizeReader reader(reinterpret_cast (_text.data()), _text.size(), buf.data()); + for (;;) { + auto len = _exact_match ? reader.tokenize_exact_match(_normalize_mode) : reader.tokenize(_normalize_mode); + if (len == 0) { + break; + } + scratch.clear(); + Utf8Writer w(scratch); + for (size_t i = 0; i < len; ++i) { + w.putChar(buf[i]); + } + ai.insertString(scratch); + } +} + +bool +TokensConverter::render_weighted_set_as_array() const +{ + return true; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h new file mode 100644 index 00000000000..689b0b95b02 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h @@ -0,0 +1,27 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include + +namespace vsm { + +/* + * Class converting a string field value into an array + * containing the tokens. + */ +class TokensConverter : public search::docsummary::IStringFieldConverter +{ + vespalib::stringref _text; + bool _exact_match; + search::Normalizing _normalize_mode; + +public: + TokensConverter(bool exact_match, search::Normalizing normalize_mode); + ~TokensConverter() override; + void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; + bool render_weighted_set_as_array() const override; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp new file mode 100644 index 00000000000..7861c13a179 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp @@ -0,0 +1,37 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokens_dfw.h" +#include +#include "tokens_converter.h" + +using search::docsummary::IDocsumStoreDocument; +using search::docsummary::GetDocsumsState; + +namespace vsm { + +TokensDFW::TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode) + : DocsumFieldWriter(), + _input_field_name(input_field_name), + _exact_match(exact_match), + _normalize_mode(normalize_mode) +{ +} + +TokensDFW::~TokensDFW() = default; + +bool +TokensDFW::isGenerated() const +{ + return false; +} + +void +TokensDFW::insertField(uint32_t, const IDocsumStoreDocument* doc, GetDocsumsState&, vespalib::slime::Inserter& target) const +{ + if (doc != nullptr) { + TokensConverter converter(_exact_match, _normalize_mode); + doc->insert_summary_field(_input_field_name, target, &converter); + } +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h new file mode 100644 index 00000000000..23fd869d005 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h @@ -0,0 +1,28 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include + +namespace vsm { + +/* + * Class for writing annotated string field values from document as + * arrays containing the tokens. + */ +class TokensDFW : public search::docsummary::DocsumFieldWriter +{ +private: + vespalib::string _input_field_name; + bool _exact_match; + search::Normalizing _normalize_mode; + +public: + explicit TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode); + ~TokensDFW() override; + bool isGenerated() const override; + void insertField(uint32_t docid, const search::docsummary::IDocsumStoreDocument* doc, search::docsummary::GetDocsumsState& state, vespalib::slime::Inserter& target) const override; +}; + +} -- cgit v1.2.3 From 65d47f5dc645c2f746cb31a40891af99bfdeba64 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 27 Mar 2024 15:35:02 +0100 Subject: Update class comment. --- streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h index 23fd869d005..4199630a94d 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h +++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h @@ -8,8 +8,9 @@ namespace vsm { /* - * Class for writing annotated string field values from document as - * arrays containing the tokens. + * Class for writing string field values from document as + * arrays containing the tokens. Tokenization is performed + * on the fly using the exact_match and normalize_mode settings. */ class TokensDFW : public search::docsummary::DocsumFieldWriter { -- cgit v1.2.3 From 086944ee64cb439c336d8d94d6af1c253a22697b Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 27 Mar 2024 15:36:01 +0100 Subject: Update class comment. --- streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h index e32761f38b3..68fcfd6e8eb 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h +++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h @@ -8,7 +8,7 @@ namespace vsm { /* - * Factory interface class for creating docsum field writers, adjusted for + * Factory class for creating docsum field writers, adjusted for * streaming search. */ class DocsumFieldWriterFactory : public search::docsummary::DocsumFieldWriterFactory -- cgit v1.2.3