Add a converter from an annotated string field value to an array of

strings representing the indexing terms. Multiple indexing terms for the same position are placed in a nested array.
author: Tor Egge <Tor.Egge@online.no> 2023-10-12 14:34:04 +0200
committer: Tor Egge <Tor.Egge@online.no> 2023-10-12 14:34:04 +0200
commit: 486cea93db4e6e4239a1932737611bcfbf541996 (patch)
tree: f4e6e0d7614c3fc867f7f5e7bd1cbb13cb919a5f /searchsummary
parent: 4d4fa30cdb221decb7c1462f31635046748c50b2 (diff)
6 files changed, 302 insertions, 0 deletions
diff --git a/searchsummary/CMakeLists.txt b/searchsummary/CMakeLists.txt
index e82ffa8d2b8..a091f8b5358 100644
--- a/searchsummary/CMakeLists.txt
+++ b/searchsummary/CMakeLists.txt
@@ -20,6 +20,7 @@ vespa_define_module(
     src/tests/docsummary/attribute_combiner
     src/tests/docsummary/attributedfw
     src/tests/docsummary/document_id_dfw
+    src/tests/docsummary/linguistics_tokens_converter
     src/tests/docsummary/matched_elements_filter
     src/tests/docsummary/query_term_filter_factory
     src/tests/docsummary/result_class
diff --git a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/CMakeLists.txt b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/CMakeLists.txt
new file mode 100644
index 00000000000..d9510c3a2b3
--- /dev/null
+++ b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchsummary_linguistics_tokens_converter_test_app TEST
+    SOURCES
+    linguistics_tokens_converter_test.cpp
+    DEPENDS
+    searchsummary
+    GTest::gtest
+)
+
+vespa_add_test(NAME searchsummary_linguistics_tokens_converter_test_app COMMAND searchsummary_linguistics_tokens_converter_test_app)
diff --git a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp
new file mode 100644
index 00000000000..c8d959361ae
--- /dev/null
+++ b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp
@@ -0,0 +1,172 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/document/annotation/annotation.h>
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/annotation/spanlist.h>
+#include <vespa/document/annotation/spantree.h>
+#include <vespa/document/datatype/annotationtype.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/document/repo/configbuilder.h>
+#include <vespa/document/repo/fixedtyperepo.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
+#include <vespa/searchsummary/docsummary/linguistics_tokens_converter.h>
+#include <vespa/vespalib/data/simple_buffer.h>
+#include <vespa/vespalib/data/slime/json_format.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using document::Annotation;
+using document::AnnotationType;
+using document::DocumentType;
+using document::DocumentTypeRepo;
+using document::Span;
+using document::SpanList;
+using document::SpanTree;
+using document::StringFieldValue;
+using search::docsummary::LinguisticsTokensConverter;
+using search::linguistics::SPANTREE_NAME;
+using vespalib::SimpleBuffer;
+using vespalib::Slime;
+using vespalib::slime::JsonFormat;
+using vespalib::slime::SlimeInserter;
+
+namespace {
+
+vespalib::string
+slime_to_string(const Slime& slime)
+{
+    SimpleBuffer json; // receives the encoded slime
+    JsonFormat::encode(slime, json, true); // NOTE(review): `true` is presumably the compact flag - confirm
+    return json.get().make_string();
+}
+
+DocumenttypesConfig
+get_document_types_config()
+{
+    // Config holding one document type, "indexingdocument", given a ".header" and a ".body" struct.
+    using namespace document::config_builder;
+    DocumenttypesConfigBuilderHelper helper;
+    helper.document(42, "indexingdocument",
+                    Struct("indexingdocument.header"), Struct("indexingdocument.body"));
+    return helper.config();
+}
+
+}
+
+class LinguisticsTokensConverterTest : public testing::Test // fixture for the LinguisticsTokensConverter tests
+{
+protected:
+    std::shared_ptr<const DocumentTypeRepo> _repo;          // built from get_document_types_config()
+    const DocumentType*                     _document_type; // "indexingdocument", looked up in _repo
+    document::FixedTypeRepo                 _fixed_repo;    // handed to setSpanTrees() by set_span_tree()
+
+    LinguisticsTokensConverterTest();
+    ~LinguisticsTokensConverterTest() override;
+    void set_span_tree(StringFieldValue& value, std::unique_ptr<SpanTree> tree); // sets tree as the only span tree of value
+    StringFieldValue make_annotated_string(bool alt_tokens); // "foo bar" with TERM annotations; alt_tokens adds an extra TERM on the second span
+    StringFieldValue make_annotated_chinese_string(); // multi-byte text with two TERM spans
+    vespalib::string make_exp_annotated_chinese_string_tokens(); // expected JSON for make_annotated_chinese_string()
+    vespalib::string convert(const StringFieldValue& fv); // runs the converter on fv and returns the result as JSON text
+};
+
+LinguisticsTokensConverterTest::LinguisticsTokensConverterTest()
+    : testing::Test(),
+      _repo(std::make_unique<DocumentTypeRepo>(get_document_types_config())),
+      _document_type(_repo->getDocumentType("indexingdocument")), // reads _repo, which is declared (so initialized) first
+      _fixed_repo(*_repo, *_document_type) // reads both members initialized above
+{
+}
+
+LinguisticsTokensConverterTest::~LinguisticsTokensConverterTest() = default;
+
+void
+LinguisticsTokensConverterTest::set_span_tree(StringFieldValue & value, std::unique_ptr<SpanTree> tree)
+{
+    StringFieldValue::SpanTrees only_tree; // collection holding just the given tree
+    only_tree.push_back(std::move(tree));
+    value.setSpanTrees(only_tree, _fixed_repo);
+}
+
+StringFieldValue
+LinguisticsTokensConverterTest::make_annotated_string(bool alt_tokens)
+{
+    auto owned_spans = std::make_unique<SpanList>();
+    SpanList* spans = owned_spans.get(); // kept for adding spans after the tree takes ownership
+    auto tree = std::make_unique<SpanTree>(SPANTREE_NAME, std::move(owned_spans));
+    tree->annotate(spans->add(std::make_unique<Span>(0, 3)), *AnnotationType::TERM);
+    if (alt_tokens) { // extra TERM on the same (4, 3) span that gets the "baz" TERM below
+        tree->annotate(spans->add(std::make_unique<Span>(4, 3)), *AnnotationType::TERM);
+    }
+    auto baz = std::make_unique<StringFieldValue>("baz");
+    tree->annotate(spans->add(std::make_unique<Span>(4, 3)), Annotation(*AnnotationType::TERM, std::move(baz)));
+    StringFieldValue annotated("foo bar");
+    set_span_tree(annotated, std::move(tree));
+    return annotated;
+}
+
+StringFieldValue
+LinguisticsTokensConverterTest::make_annotated_chinese_string()
+{
+    // Each of these Chinese characters is 3 bytes in UTF-8, so the spans below cover 5 and 3 characters.
+    auto owned_spans = std::make_unique<SpanList>();
+    SpanList* spans = owned_spans.get();
+    auto tree = std::make_unique<SpanTree>(SPANTREE_NAME, std::move(owned_spans));
+    tree->annotate(spans->add(std::make_unique<Span>(0, 15)), *AnnotationType::TERM);
+    tree->annotate(spans->add(std::make_unique<Span>(15, 9)), *AnnotationType::TERM);
+    StringFieldValue annotated("我就是那个大灰狼");
+    set_span_tree(annotated, std::move(tree));
+    return annotated;
+}
+
+vespalib::string
+LinguisticsTokensConverterTest::make_exp_annotated_chinese_string_tokens()
+{
+    return R"(["我就是那个","大灰狼"])"; // the 5- and 3-character terms annotated by make_annotated_chinese_string()
+}
+
+vespalib::string
+LinguisticsTokensConverterTest::convert(const StringFieldValue& fv)
+{
+    Slime result;
+    SlimeInserter result_inserter(result);
+    LinguisticsTokensConverter converter; // fresh converter for every call
+    converter.convert(fv, result_inserter);
+    return slime_to_string(result);
+}
+
+TEST_F(LinguisticsTokensConverterTest, convert_empty_string)
+{
+    vespalib::string exp(R"([])"); // no terms expected
+    StringFieldValue plain_string(""); // empty text, no span tree attached
+    EXPECT_EQ(exp, convert(plain_string));
+}
+
+TEST_F(LinguisticsTokensConverterTest, convert_plain_string)
+{
+    vespalib::string exp(R"(["Foo Bar Baz"])"); // whole text expected as a single term, case unchanged
+    StringFieldValue plain_string("Foo Bar Baz"); // no span tree attached
+    EXPECT_EQ(exp, convert(plain_string));
+}
+
+TEST_F(LinguisticsTokensConverterTest, convert_annotated_string)
+{
+    vespalib::string exp(R"(["foo","baz"])"); // second span yields its annotation value "baz"
+    auto annotated_string = make_annotated_string(false); // TERM on (0, 3), TERM carrying "baz" on (4, 3)
+    EXPECT_EQ(exp, convert(annotated_string));
+}
+
+TEST_F(LinguisticsTokensConverterTest, convert_annotated_string_with_alternatives)
+{
+    vespalib::string exp(R"(["foo",["bar","baz"]])"); // both terms on the (4, 3) span end up in a nested array
+    auto annotated_string = make_annotated_string(true); // true adds a second TERM on the (4, 3) span
+    EXPECT_EQ(exp, convert(annotated_string));
+}
+
+TEST_F(LinguisticsTokensConverterTest, convert_annotated_chinese_string)
+{
+    auto exp = make_exp_annotated_chinese_string_tokens();
+    auto annotated_chinese_string = make_annotated_chinese_string(); // spans of 15 and 9 bytes over 3-byte UTF-8 characters
+    EXPECT_EQ(exp, convert(annotated_chinese_string));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
index 32df047c27f..e5ae47593e5 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
+++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
@@ -23,6 +23,7 @@ vespa_add_library(searchsummary_docsummary OBJECT
     juniper_dfw_term_visitor.cpp
     juniper_query_adapter.cpp
     juniperproperties.cpp
+    linguistics_tokens_converter.cpp
     matched_elements_filter_dfw.cpp
     positionsdfw.cpp
     query_term_filter.cpp
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp
new file mode 100644
index 00000000000..838b0234cdb
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp
@@ -0,0 +1,81 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "linguistics_tokens_converter.h"
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/searchlib/memoryindex/field_inverter.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
+#include <vespa/searchlib/util/token_extractor.h>
+#include <vespa/vespalib/data/slime/slime.h>
+
+using document::StringFieldValue;
+using search::linguistics::TokenExtractor;
+using search::memoryindex::FieldInverter;
+using vespalib::Memory;
+using vespalib::slime::ArrayInserter;
+using vespalib::slime::Cursor;
+using vespalib::slime::Inserter;
+
+namespace search::docsummary {
+
+namespace {
+
+vespalib::string dummy_field_name; // empty name passed to the TokenExtractor constructor in handle_indexing_terms()
+
+}
+
+LinguisticsTokensConverter::LinguisticsTokensConverter()
+    : IStringFieldConverter(),
+      _text() // empty until convert() assigns the input's text
+{
+}
+
+LinguisticsTokensConverter::~LinguisticsTokensConverter() = default;
+
+template <typename ForwardIt>
+void
+LinguisticsTokensConverter::handle_alternative_index_terms(ForwardIt it, ForwardIt last, Inserter& inserter)
+{
+    ArrayInserter nested(inserter.insertArray()); // the words of [it, last) go into one nested array
+    while (it != last) {
+        handle_index_term(it->word, nested);
+        ++it;
+    }
+}
+
+void
+LinguisticsTokensConverter::handle_index_term(vespalib::stringref word, Inserter& inserter)
+{
+    inserter.insertString(Memory(word)); // one string entry per term
+}
+
+void
+LinguisticsTokensConverter::handle_indexing_terms(const StringFieldValue& value, vespalib::slime::Inserter& inserter)
+{
+    ArrayInserter out(inserter.insertArray()); // outer array; runs of terms with equal span become nested arrays
+    std::vector<TokenExtractor::SpanTerm> terms;
+    auto span_trees = value.getSpanTrees();
+    TokenExtractor extractor(dummy_field_name, FieldInverter::max_word_len);
+    extractor.extract(terms, span_trees, _text, nullptr);
+    const auto terms_end = terms.end();
+    for (auto group_begin = terms.begin(); group_begin != terms_end; ) {
+        auto group_end = group_begin; // advanced past the run of terms whose span equals group_begin's
+        while (group_end != terms_end && group_end->span == group_begin->span) {
+            ++group_end;
+        }
+        if ((group_end - group_begin) > 1) {
+            handle_alternative_index_terms(group_begin, group_end, out);
+        } else {
+            handle_index_term(group_begin->word, out);
+        }
+        group_begin = group_end;
+    }
+}
+
+void
+LinguisticsTokensConverter::convert(const StringFieldValue &input, vespalib::slime::Inserter& inserter)
+{
+    _text = input.getValueRef(); // read by handle_indexing_terms(); NOTE(review): stringref presumably refers to input's storage - confirm lifetime
+    handle_indexing_terms(input, inserter);
+}
+
+}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h
new file mode 100644
index 00000000000..74d1e2ab1f9
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h
@@ -0,0 +1,37 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "i_string_field_converter.h"
+
+namespace document {
+
+class FieldValue;
+class Span;
+
+}
+
+namespace search::docsummary {
+
+class IJuniperConverter;
+
+/*
+ * Class converting a string field value with annotations into an array
+ * containing the index terms. Multiple index terms at the same position
+ * are placed in a nested array.
+ */
+class LinguisticsTokensConverter : public IStringFieldConverter
+{
+    vespalib::stringref    _text; // text of the value being converted; assigned in convert()
+
+    template <typename ForwardIt>
+    void handle_alternative_index_terms(ForwardIt it, ForwardIt last, vespalib::slime::Inserter& inserter); // nested array with the words of [it, last)
+    void handle_index_term(vespalib::stringref word, vespalib::slime::Inserter& inserter); // inserts word as a string
+    void handle_indexing_terms(const document::StringFieldValue& value, vespalib::slime::Inserter& inserter); // array of all terms extracted from value
+public:
+    LinguisticsTokensConverter();
+    ~LinguisticsTokensConverter() override;
+    void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; // entry point: emits the terms of input through inserter
+};
+
+}
author	Tor Egge <Tor.Egge@online.no>	2023-10-12 14:34:04 +0200
committer	Tor Egge <Tor.Egge@online.no>	2023-10-12 14:34:04 +0200
commit	486cea93db4e6e4239a1932737611bcfbf541996 (patch)
tree	f4e6e0d7614c3fc867f7f5e7bd1cbb13cb919a5f /searchsummary
parent	4d4fa30cdb221decb7c1462f31635046748c50b2 (diff)