aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@vespa.ai>2024-03-27 16:16:56 +0100
committerGitHub <noreply@github.com>2024-03-27 16:16:56 +0100
commit45e8c0c2b2ab7b3ac9d8e6a8f174b08ac15b342e (patch)
treeb886a2b83b89265239fc22f7a78fda01082832d6
parentaa19da6beecdf82393af0f99a12940fb9db92bd9 (diff)
parent086944ee64cb439c336d8d94d6af1c253a22697b (diff)
Merge pull request #30747 from vespa-engine/toregge/add-streaming-mode-version-of-tokens-dfw
Add streaming mode version of tokens document field writer.
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp6
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h1
-rw-r--r--streamingvisitors/CMakeLists.txt1
-rw-r--r--streamingvisitors/src/tests/tokens_converter/CMakeLists.txt10
-rw-r--r--streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp91
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt2
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp28
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h2
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp59
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h27
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp37
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h29
12 files changed, 289 insertions, 4 deletions
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp
index aa4a4342b0d..b11a0eb73cc 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp
@@ -50,14 +50,14 @@ throw_if_nullptr(const std::unique_ptr<DocsumFieldWriter>& writer,
}
}
+}
+
void
-throw_missing_source(const vespalib::string& command)
+DocsumFieldWriterFactory::throw_missing_source(const vespalib::string& command)
{
throw IllegalArgumentException("Missing source for command '" + command + "'.");
}
-}
-
std::unique_ptr<DocsumFieldWriter>
DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& field_name,
const vespalib::string& command,
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h
index d4f52811687..d98f689fa3f 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.h
@@ -20,6 +20,7 @@ class DocsumFieldWriterFactory : public IDocsumFieldWriterFactory
const IDocsumEnvironment& _env;
const IQueryTermFilterFactory& _query_term_filter_factory;
protected:
+ static void throw_missing_source(const vespalib::string& command);
const IDocsumEnvironment& getEnvironment() const noexcept { return _env; }
bool has_attribute_manager() const noexcept;
public:
diff --git a/streamingvisitors/CMakeLists.txt b/streamingvisitors/CMakeLists.txt
index 2abbfd4a64e..a990f6a4de0 100644
--- a/streamingvisitors/CMakeLists.txt
+++ b/streamingvisitors/CMakeLists.txt
@@ -30,4 +30,5 @@ vespa_define_module(
src/tests/searcher
src/tests/searchvisitor
src/tests/textutil
+ src/tests/tokens_converter
)
diff --git a/streamingvisitors/src/tests/tokens_converter/CMakeLists.txt b/streamingvisitors/src/tests/tokens_converter/CMakeLists.txt
new file mode 100644
index 00000000000..01a1fc965af
--- /dev/null
+++ b/streamingvisitors/src/tests/tokens_converter/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(streamingvisitors_tokens_converter_test_app TEST
+ SOURCES
+ tokens_converter_test.cpp
+ DEPENDS
+ streamingvisitors
+ GTest::gtest
+)
+
+vespa_add_test(NAME streamingvisitors_tokens_converter_test_app COMMAND streamingvisitors_tokens_converter_test_app)
diff --git a/streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp b/streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp
new file mode 100644
index 00000000000..1d9c953519d
--- /dev/null
+++ b/streamingvisitors/src/tests/tokens_converter/tokens_converter_test.cpp
@@ -0,0 +1,91 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/data/simple_buffer.h>
+#include <vespa/vespalib/data/slime/json_format.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vsm/vsm/tokens_converter.h>
+
+using document::StringFieldValue;
+using search::Normalizing;
+using vespalib::SimpleBuffer;
+using vespalib::Slime;
+using vespalib::slime::JsonFormat;
+using vespalib::slime::SlimeInserter;
+using vsm::TokensConverter;
+
+namespace {
+
+// Encodes the given slime structure as JSON and returns it as a string.
+// NOTE(review): the 'true' argument is presumably the "compact" flag of
+// JsonFormat::encode; the expected strings in the tests below contain no
+// whitespace between elements.
+vespalib::string
+slime_to_string(const Slime& slime)
+{
+    SimpleBuffer json;
+    JsonFormat::encode(slime, json, true);
+    auto encoded = json.get();
+    return encoded.make_string();
+}
+
+}
+
+// Test fixture with a helper that runs a TokensConverter over a string
+// field value and returns the produced slime serialized as JSON.
+class TokensConverterTest : public testing::Test
+{
+protected:
+    TokensConverterTest();
+    ~TokensConverterTest() override;
+
+    // Converts 'fv' with a TokensConverter configured with the given
+    // exact match and normalize settings, and returns the result as JSON.
+    vespalib::string convert(const StringFieldValue& fv, bool exact_match, Normalizing normalize_mode);
+};
+
+// The fixture holds no state of its own, so both special members are defaulted.
+TokensConverterTest::TokensConverterTest() = default;
+
+TokensConverterTest::~TokensConverterTest() = default;
+
+// Runs a freshly constructed TokensConverter on 'fv', inserting into the root
+// of a new slime object, and returns that slime serialized as JSON.
+vespalib::string
+TokensConverterTest::convert(const StringFieldValue& fv, bool exact_match, Normalizing normalize_mode)
+{
+    Slime result;
+    SlimeInserter root(result);
+    TokensConverter tokenizer(exact_match, normalize_mode);
+    tokenizer.convert(fv, root);
+    return slime_to_string(result);
+}
+
+TEST_F(TokensConverterTest, convert_empty_string)
+{
+    // An empty value gives an empty token array, both tokenized and exact match.
+    const vespalib::string expected(R"([])");
+    const StringFieldValue empty_value("");
+    for (bool exact_match : {false, true}) {
+        EXPECT_EQ(expected, convert(empty_value, exact_match, Normalizing::NONE));
+    }
+}
+
+TEST_F(TokensConverterTest, convert_exact_match)
+{
+    // With exact match the whole value, punctuation and spaces included,
+    // comes out as a single token; only the case handling differs.
+    const StringFieldValue value(".Foo Bar Baz.");
+    const vespalib::string expected_unchanged(R"([".Foo Bar Baz."])");
+    const vespalib::string expected_lowercased(R"([".foo bar baz."])");
+    EXPECT_EQ(expected_unchanged, convert(value, true, Normalizing::NONE));
+    EXPECT_EQ(expected_lowercased, convert(value, true, Normalizing::LOWERCASE));
+}
+
+TEST_F(TokensConverterTest, convert_tokenized_string)
+{
+    // Without exact match the value is split into word tokens and the
+    // surrounding punctuation is dropped.
+    const StringFieldValue input(".Foo Bar.");
+    const vespalib::string expected_unchanged(R"(["Foo","Bar"])");
+    const vespalib::string expected_lowercased(R"(["foo","bar"])");
+    EXPECT_EQ(expected_unchanged, convert(input, false, Normalizing::NONE));
+    EXPECT_EQ(expected_lowercased, convert(input, false, Normalizing::LOWERCASE));
+}
+
+TEST_F(TokensConverterTest, convert_with_folding)
+{
+    // LOWERCASE_AND_FOLD lowercases and replaces the accented characters
+    // (here 'ø' -> "oe" and 'å' -> "aa") in both matching modes.
+    const StringFieldValue input("Mørk vårkveld");
+    const vespalib::string expected_single_token(R"(["moerk vaarkveld"])");
+    const vespalib::string expected_word_tokens(R"(["moerk","vaarkveld"])");
+    EXPECT_EQ(expected_single_token, convert(input, true, Normalizing::LOWERCASE_AND_FOLD));
+    EXPECT_EQ(expected_word_tokens, convert(input, false, Normalizing::LOWERCASE_AND_FOLD));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
index a50a541f3b7..d13e2e1fd54 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
@@ -8,6 +8,8 @@ vespa_add_library(vsm_vsmbase OBJECT
flattendocsumwriter.cpp
query_term_filter_factory.cpp
snippetmodifier.cpp
+ tokens_converter.cpp
+ tokens_dfw.cpp
vsm-adapter.cpp
DEPENDS
vsm_vconfig
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
index 180200e2eaf..95e0f961d13 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
@@ -1,6 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "docsum_field_writer_factory.h"
+#include "fieldsearchspec.h"
+#include "tokens_dfw.h"
#include <vespa/searchlib/common/matching_elements_fields.h>
#include <vespa/searchsummary/docsummary/copy_dfw.h>
#include <vespa/searchsummary/docsummary/docsum_field_writer.h>
@@ -8,8 +10,10 @@
#include <vespa/searchsummary/docsummary/empty_dfw.h>
#include <vespa/searchsummary/docsummary/matched_elements_filter_dfw.h>
#include <vespa/vsm/config/config-vsmfields.h>
+#include <algorithm>
using search::MatchingElementsFields;
+using search::Normalizing;
using search::docsummary::CopyDFW;
using search::docsummary::DocsumFieldWriter;
using search::docsummary::EmptyDFW;
@@ -35,6 +39,23 @@ void populate_fields(MatchingElementsFields& fields, VsmfieldsConfig& fields_con
}
}
+// Returns true when 'arg1' (the arg1 value of a vsm field spec) is either
+// "exact" or "word", the two values treated as exact matching here.
+bool is_exact_match(vespalib::stringref arg1) {
+    if (arg1 == "exact") {
+        return true;
+    }
+    return arg1 == "word";
+}
+
+// Creates a TokensDFW reading from the field named 'source'.
+// The exact match and normalize settings are taken from the first field spec
+// in 'fields_config' whose name equals 'source'. When no such spec exists the
+// writer falls back to tokenized matching with lowercase normalization.
+std::unique_ptr<DocsumFieldWriter>
+make_tokens_dfw(const vespalib::string& source, VsmfieldsConfig& fields_config)
+{
+    auto& specs = fields_config.fieldspec;
+    auto has_source_name = [&source](auto& spec) { return source == spec.name; };
+    auto match = std::find_if(specs.begin(), specs.end(), has_source_name);
+    if (match == specs.end()) {
+        return std::make_unique<TokensDFW>(source, false, Normalizing::LOWERCASE);
+    }
+    const bool exact_match = is_exact_match(match->arg1);
+    const Normalizing normalize_mode = FieldSearchSpecMap::convert_normalize_mode(match->normalize);
+    return std::make_unique<TokensDFW>(source, exact_match, normalize_mode);
+}
+
}
DocsumFieldWriterFactory::DocsumFieldWriterFactory(bool use_v8_geo_positions, const IDocsumEnvironment& env, const IQueryTermFilterFactory& query_term_filter_factory, const vespa::config::search::vsm::VsmfieldsConfig& vsm_fields_config)
@@ -68,6 +89,13 @@ DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& fie
vespalib::string source_field = source.empty() ? field_name : source;
populate_fields(*matching_elems_fields, _vsm_fields_config, source_field);
fieldWriter = MatchedElementsFilterDFW::create(source_field, matching_elems_fields);
+ } else if ((command == command::tokens) ||
+ (command == command::attribute_tokens)) {
+ if (!source.empty()) {
+ fieldWriter = make_tokens_dfw(source, _vsm_fields_config);
+ } else {
+ throw_missing_source(command);
+ }
} else {
return search::docsummary::DocsumFieldWriterFactory::create_docsum_field_writer(field_name, command, source, matching_elems_fields);
}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
index e32761f38b3..68fcfd6e8eb 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
@@ -8,7 +8,7 @@
namespace vsm {
/*
- * Factory interface class for creating docsum field writers, adjusted for
+ * Factory class for creating docsum field writers, adjusted for
* streaming search.
*/
class DocsumFieldWriterFactory : public search::docsummary::DocsumFieldWriterFactory
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
new file mode 100644
index 00000000000..3534b751356
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
@@ -0,0 +1,59 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokens_converter.h"
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/array.h>
+#include <vespa/vsm/searcher/tokenizereader.h>
+
+using document::StringFieldValue;
+using search::byte;
+using vespalib::Utf8Writer;
+using vespalib::slime::ArrayInserter;
+using vespalib::slime::Cursor;
+using vespalib::slime::Inserter;
+using vsm::TokenizeReader;
+
+namespace vsm {
+
+TokensConverter::TokensConverter(bool exact_match, search::Normalizing normalize_mode)
+ : IStringFieldConverter(),
+ _text(),
+ _exact_match(exact_match),
+ _normalize_mode(normalize_mode)
+{
+}
+
+TokensConverter::~TokensConverter() = default;
+
+// Tokenizes the string value of 'input' and inserts the tokens, in order, as
+// strings into a new array created through 'inserter'. The array is created
+// even when the value yields no tokens.
+void
+TokensConverter::convert(const StringFieldValue &input, Inserter& inserter)
+{
+    _text = input.getValueRef();
+    Cursor& tokens = inserter.insertArray();
+    ArrayInserter token_inserter(tokens);
+    // Scratch area the reader writes each token into as UCS-4 code points;
+    // sized to one element per input byte plus one.
+    vespalib::Array<ucs4_t> ucs4_buf(_text.size() + 1, 0);
+    vespalib::string utf8_token;
+    TokenizeReader reader(reinterpret_cast<const byte *>(_text.data()), _text.size(), ucs4_buf.data());
+    // Fetches the next token into ucs4_buf and returns its length in code points.
+    auto next_token_length = [&]() {
+        return _exact_match ? reader.tokenize_exact_match(_normalize_mode) : reader.tokenize(_normalize_mode);
+    };
+    // A length of zero ends the token stream.
+    for (auto len = next_token_length(); len != 0; len = next_token_length()) {
+        // Re-encode the UCS-4 token as UTF-8 before inserting it.
+        utf8_token.clear();
+        Utf8Writer writer(utf8_token);
+        for (size_t i = 0; i < len; ++i) {
+            writer.putChar(ucs4_buf[i]);
+        }
+        token_inserter.insertString(utf8_token);
+    }
+}
+
+bool
+TokensConverter::render_weighted_set_as_array() const
+{
+ return true; // NOTE(review): presumably asks the caller to render weighted set fields as plain arrays - confirm against IStringFieldConverter
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h
new file mode 100644
index 00000000000..689b0b95b02
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/i_string_field_converter.h>
+#include <vespa/searchlib/query/query_normalization.h>
+
+namespace vsm {
+
+/*
+ * String field converter that tokenizes a string field value and
+ * inserts the tokens as an array of strings. Tokenization is either
+ * exact match (tokenize_exact_match) or regular (tokenize), and the
+ * configured normalize mode is passed on to the tokenizer.
+ */
+class TokensConverter : public search::docsummary::IStringFieldConverter
+{
+private:
+    // View of the value passed to the most recent convert() call.
+    // NOTE(review): it refers to memory owned by that input, so it should
+    // not be relied upon after convert() has returned.
+    vespalib::stringref _text;
+    // True to tokenize with tokenize_exact_match(), false to use tokenize().
+    bool _exact_match;
+    // Normalize mode handed to the tokenizer for every token.
+    search::Normalizing _normalize_mode;
+
+public:
+    TokensConverter(bool exact_match, search::Normalizing normalize_mode);
+    ~TokensConverter() override;
+
+    // Inserts an array with the tokens of 'input' through 'inserter'.
+    void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override;
+
+    // Always returns true.
+    bool render_weighted_set_as_array() const override;
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp
new file mode 100644
index 00000000000..7861c13a179
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp
@@ -0,0 +1,37 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokens_dfw.h"
+#include <vespa/searchsummary/docsummary/i_docsum_store_document.h>
+#include "tokens_converter.h"
+
+using search::docsummary::IDocsumStoreDocument;
+using search::docsummary::GetDocsumsState;
+
+namespace vsm {
+
+TokensDFW::TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode)
+ : DocsumFieldWriter(),
+ _input_field_name(input_field_name),
+ _exact_match(exact_match),
+ _normalize_mode(normalize_mode)
+{
+}
+
+TokensDFW::~TokensDFW() = default;
+
+bool
+TokensDFW::isGenerated() const
+{
+ return false; // insertField() reads the value from the stored document; NOTE(review): presumably "generated" means produced without it - confirm against DocsumFieldWriter
+}
+
+// Inserts the tokens of the configured input field of 'doc' into 'target'.
+// The docid and the docsum state are not used. Nothing is inserted when
+// 'doc' is nullptr.
+void
+TokensDFW::insertField(uint32_t, const IDocsumStoreDocument* doc, GetDocsumsState&, vespalib::slime::Inserter& target) const
+{
+    if (doc == nullptr) {
+        return;
+    }
+    // A converter is created per call and handed to the document.
+    TokensConverter tokenizer(_exact_match, _normalize_mode);
+    doc->insert_summary_field(_input_field_name, target, &tokenizer);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h
new file mode 100644
index 00000000000..4199630a94d
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h
@@ -0,0 +1,29 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/docsum_field_writer.h>
+#include <vespa/searchlib/query/query_normalization.h>
+
+namespace vsm {
+
+/*
+ * Docsum field writer that writes a string field from the document
+ * as an array of its tokens. The field value is tokenized on the fly
+ * for each insertField() call, using the exact_match and
+ * normalize_mode settings given at construction.
+ */
+class TokensDFW : public search::docsummary::DocsumFieldWriter
+{
+    // Name of the document field the tokens are read from.
+    vespalib::string _input_field_name;
+    // Passed on to the TokensConverter created in insertField().
+    bool _exact_match;
+    search::Normalizing _normalize_mode;
+
+public:
+    explicit TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode);
+    ~TokensDFW() override;
+
+    // Always returns false.
+    bool isGenerated() const override;
+
+    // Inserts the token array for the input field of 'doc' into 'target';
+    // does nothing when 'doc' is nullptr.
+    void insertField(uint32_t docid,
+                     const search::docsummary::IDocsumStoreDocument* doc,
+                     search::docsummary::GetDocsumsState& state,
+                     vespalib::slime::Inserter& target) const override;
+};
+
+}