7 files changed, 183 insertions, 1 deletions
diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
index a50a541f3b7..d13e2e1fd54 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
@@ -8,6 +8,8 @@ vespa_add_library(vsm_vsmbase OBJECT
     flattendocsumwriter.cpp
     query_term_filter_factory.cpp
     snippetmodifier.cpp
+    tokens_converter.cpp
+    tokens_dfw.cpp
     vsm-adapter.cpp
     DEPENDS
     vsm_vconfig
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
index 180200e2eaf..95e0f961d13 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
@@ -1,6 +1,8 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "docsum_field_writer_factory.h"
+#include "fieldsearchspec.h"
+#include "tokens_dfw.h"
 #include <vespa/searchlib/common/matching_elements_fields.h>
 #include <vespa/searchsummary/docsummary/copy_dfw.h>
 #include <vespa/searchsummary/docsummary/docsum_field_writer.h>
@@ -8,8 +10,10 @@
 #include <vespa/searchsummary/docsummary/empty_dfw.h>
 #include <vespa/searchsummary/docsummary/matched_elements_filter_dfw.h>
 #include <vespa/vsm/config/config-vsmfields.h>
+#include <algorithm>
 
 using search::MatchingElementsFields;
+using search::Normalizing;
 using search::docsummary::CopyDFW;
 using search::docsummary::DocsumFieldWriter;
 using search::docsummary::EmptyDFW;
@@ -35,6 +39,23 @@ void populate_fields(MatchingElementsFields& fields, VsmfieldsConfig& fields_con
     }
 }
 
+bool is_exact_match(vespalib::stringref arg1) { // true iff arg1 is "exact" or "word"
+    return (arg1 == "exact") ? true : (arg1 == "word");
+}
+
+std::unique_ptr<DocsumFieldWriter>
+make_tokens_dfw(const vespalib::string& source, VsmfieldsConfig& fields_config)
+{
+    // Use the settings of the fieldspec named like source; without one, use non-exact matching and LOWERCASE.
+    const auto& specs = fields_config.fieldspec;
+    const auto match = std::find_if(specs.begin(), specs.end(), [&source](const auto& spec) { return source == spec.name; });
+    if (match == specs.end()) {
+        return std::make_unique<TokensDFW>(source, false, Normalizing::LOWERCASE);
+    }
+    const Normalizing normalize_mode = FieldSearchSpecMap::convert_normalize_mode(match->normalize);
+    return std::make_unique<TokensDFW>(source, is_exact_match(match->arg1), normalize_mode);
+}
+
 }
 
 DocsumFieldWriterFactory::DocsumFieldWriterFactory(bool use_v8_geo_positions, const IDocsumEnvironment& env, const IQueryTermFilterFactory& query_term_filter_factory, const vespa::config::search::vsm::VsmfieldsConfig& vsm_fields_config)
@@ -68,6 +89,13 @@ DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& fie
         vespalib::string source_field = source.empty() ? field_name : source;
         populate_fields(*matching_elems_fields, _vsm_fields_config, source_field);
         fieldWriter = MatchedElementsFilterDFW::create(source_field, matching_elems_fields);
+    } else if ((command == command::tokens) ||
+               (command == command::attribute_tokens)) {
+        if (!source.empty()) {
+            fieldWriter = make_tokens_dfw(source, _vsm_fields_config);
+        } else {
+            throw_missing_source(command);
+        }
     } else {
         return search::docsummary::DocsumFieldWriterFactory::create_docsum_field_writer(field_name, command, source, matching_elems_fields);
     }
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
index e32761f38b3..68fcfd6e8eb 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
@@ -8,7 +8,7 @@
 namespace vsm {
 
 /*
- * Factory interface class for creating docsum field writers, adjusted for
+ * Factory class for creating docsum field writers, adjusted for
  * streaming search.
  */
 class DocsumFieldWriterFactory : public search::docsummary::DocsumFieldWriterFactory
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
new file mode 100644
index 00000000000..3534b751356
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
@@ -0,0 +1,59 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokens_converter.h"
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/array.h>
+#include <vespa/vsm/searcher/tokenizereader.h>
+
+using document::StringFieldValue;
+using search::byte;
+using vespalib::Utf8Writer;
+using vespalib::slime::ArrayInserter;
+using vespalib::slime::Cursor;
+using vespalib::slime::Inserter;
+using vsm::TokenizeReader;
+
+namespace vsm {
+
+TokensConverter::TokensConverter(bool exact_match, search::Normalizing normalize_mode)
+    : IStringFieldConverter(),
+      _text(),
+      _exact_match(exact_match),
+      _normalize_mode(normalize_mode)
+{
+}
+
+TokensConverter::~TokensConverter() = default;
+
+void
+TokensConverter::convert(const StringFieldValue &input, Inserter& inserter)
+{
+    // Tokenize the string value and append each token, written through Utf8Writer, to a newly inserted array.
+    _text = input.getValueRef();
+    ArrayInserter token_inserter(inserter.insertArray());
+    // Zero-filled UCS-4 buffer with one slot per input byte plus one; tokens are read back from its start.
+    vespalib::Array<ucs4_t> ucs4_buf(_text.size() + 1, 0);
+    TokenizeReader reader(reinterpret_cast<const byte *> (_text.data()), _text.size(), ucs4_buf.data());
+    auto next_token_length = [&]() {
+        return _exact_match ? reader.tokenize_exact_match(_normalize_mode) : reader.tokenize(_normalize_mode);
+    };
+    vespalib::string utf8_token;
+    for (auto len = next_token_length(); len != 0; len = next_token_length()) { // length 0 ends the loop
+        utf8_token.clear();
+        Utf8Writer writer(utf8_token);
+        for (size_t i = 0; i < len; ++i) {
+            writer.putChar(ucs4_buf[i]);
+        }
+        token_inserter.insertString(utf8_token);
+    }
+}
+
+bool
+TokensConverter::render_weighted_set_as_array() const
+{
+    return true; // unconditionally true for this converter
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h
new file mode 100644
index 00000000000..689b0b95b02
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/i_string_field_converter.h>
+#include <vespa/searchlib/query/query_normalization.h>
+
+namespace vsm {
+
+/*
+ * Class converting a string field value into an array containing its tokens,
+ * tokenized with the exact_match and normalize_mode settings given at construction.
+ */
+class TokensConverter : public search::docsummary::IStringFieldConverter
+{
+    vespalib::stringref _text;           // value being converted; assigned at the start of convert()
+    bool                _exact_match;    // selects tokenize_exact_match() over tokenize() in convert()
+    search::Normalizing _normalize_mode; // passed to the tokenizer on every tokenize call
+
+public:
+    TokensConverter(bool exact_match, search::Normalizing normalize_mode);
+    ~TokensConverter() override;
+    void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override;
+    bool render_weighted_set_as_array() const override;
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp
new file mode 100644
index 00000000000..7861c13a179
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp
@@ -0,0 +1,37 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokens_dfw.h"
+#include <vespa/searchsummary/docsummary/i_docsum_store_document.h>
+#include "tokens_converter.h"
+
+using search::docsummary::IDocsumStoreDocument;
+using search::docsummary::GetDocsumsState;
+
+namespace vsm {
+
+TokensDFW::TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode)
+    : DocsumFieldWriter(),
+      _input_field_name(input_field_name),
+      _exact_match(exact_match),
+      _normalize_mode(normalize_mode)
+{
+}
+
+TokensDFW::~TokensDFW() = default;
+
+bool
+TokensDFW::isGenerated() const
+{
+    return false; // NOTE(review): presumably means the document is needed (insertField reads doc) - confirm against DocsumFieldWriter
+}
+
+void
+TokensDFW::insertField(uint32_t, const IDocsumStoreDocument* doc, GetDocsumsState&, vespalib::slime::Inserter& target) const
+{
+    // Without a document nothing is written to target; otherwise the field goes through a fresh TokensConverter.
+    if (doc == nullptr) { return; }
+    TokensConverter token_converter(_exact_match, _normalize_mode);
+    doc->insert_summary_field(_input_field_name, target, &token_converter);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h
new file mode 100644
index 00000000000..4199630a94d
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h
@@ -0,0 +1,29 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/docsum_field_writer.h>
+#include <vespa/searchlib/query/query_normalization.h>
+
+namespace vsm {
+
+/*
+ * Class for writing string field values from document as
+ * arrays containing the tokens. Tokenization is performed
+ * on the fly using the exact_match and normalize_mode settings.
+ */
+class TokensDFW : public search::docsummary::DocsumFieldWriter
+{
+private:
+    vespalib::string            _input_field_name; // field name passed to insert_summary_field() in insertField()
+    bool                        _exact_match;      // forwarded to the TokensConverter built per insertField() call
+    search::Normalizing         _normalize_mode;   // forwarded to the TokensConverter built per insertField() call
+
+public:
+    explicit TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode);
+    ~TokensDFW() override;
+    bool isGenerated() const override;
+    void insertField(uint32_t docid, const search::docsummary::IDocsumStoreDocument* doc, search::docsummary::GetDocsumsState& state, vespalib::slime::Inserter& target) const override;
+};
+
+}