Merge pull request #28894 from vespa-engine/toregge/move-checks-to-token-extractor

Move more checks to TokenExtractor.
author: Geir Storli <geirst@yahooinc.com> 2023-10-12 13:35:55 +0200
committer: GitHub <noreply@github.com> 2023-10-12 13:35:55 +0200
commit: 4d4fa30cdb221decb7c1462f31635046748c50b2 (patch)
tree: 198dbeeefcb9ae3c3d86c790e89228bddb6880b7
parent: 724d1120e45dd09c8089c7f6b75a6be0f355e8b3 (diff)
parent: 706bf2929c840606efba2763b177ae435579c1d7 (diff)
8 files changed, 337 insertions, 117 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index e9817497904..6510808760c 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -244,6 +244,7 @@ vespa_define_module(
     src/tests/util/folded_string_compare
     src/tests/util/searchable_stats
     src/tests/util/slime_output_raw_buf_adapter
+    src/tests/util/token_extractor
     src/tests/vespa-fileheader-inspect
 )
 
diff --git a/searchlib/src/tests/util/token_extractor/CMakeLists.txt b/searchlib/src/tests/util/token_extractor/CMakeLists.txt
new file mode 100644
index 00000000000..adfe579243c
--- /dev/null
+++ b/searchlib/src/tests/util/token_extractor/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_token_extractor_test_app TEST # test executable built from the single source file below
+    SOURCES
+    token_extractor_test.cpp
+    DEPENDS
+    searchlib_test
+    GTest::gtest
+)
+vespa_add_test(NAME searchlib_token_extractor_test_app COMMAND searchlib_token_extractor_test_app) # the test is run by invoking the executable without arguments
diff --git a/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp
new file mode 100644
index 00000000000..e6944e257e9
--- /dev/null
+++ b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp
@@ -0,0 +1,164 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/document/repo/configbuilder.h>
+#include <vespa/searchlib/test/doc_builder.h>
+#include <vespa/searchlib/test/string_field_builder.h>
+#include <vespa/searchlib/util/token_extractor.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vespalib/objects/nbostream.h>
+#include <variant>
+
+using document::DataType;
+using document::Document;
+using document::StringFieldValue;
+using search::linguistics::TokenExtractor;
+using search::test::DocBuilder;
+using search::test::StringFieldBuilder;
+
+using AlternativeWords = std::vector<vespalib::string>;
+using AlternativeWordsOrWord = std::variant<AlternativeWords, vespalib::string>;
+using Words = std::vector<AlternativeWordsOrWord>;
+
+namespace {
+
+vespalib::string corrupt_word = "corruptWord";
+
+vespalib::string field_name("stringfield");
+
+std::unique_ptr<Document>
+make_corrupted_document(DocBuilder &b, size_t wordOffset)
+{ // Returns a document rebuilt from serialized bytes in which the byte at wordOffset inside corrupt_word was replaced by NUL.
+    StringFieldBuilder sfb(b);
+    auto doc = b.make_document("id:ns:searchdocument::18");
+    doc->setValue(field_name, sfb.tokenize("before ").word(corrupt_word).tokenize(" after").build());
+    vespalib::nbostream stream;
+    doc->serialize(stream);
+    std::vector<char> raw; // mutable copy of the serialized document bytes
+    raw.resize(stream.size());
+    stream.read(&raw[0], stream.size());
+    assert(wordOffset < corrupt_word.size()); // the byte to overwrite must lie inside the word
+    for (size_t i = 0; i + corrupt_word.size() <= raw.size(); ++i) { // scan the raw bytes for corrupt_word
+        if (memcmp(&raw[i], corrupt_word.c_str(), corrupt_word.size()) == 0) {
+            raw[i + wordOffset] = '\0'; // inject the NUL byte
+            break; // only the first occurrence is modified
+        }
+    }
+    vespalib::nbostream badstream;
+    badstream.write(&raw[0], raw.size());
+    return std::make_unique<Document>(b.get_repo(), badstream); // construct a Document from the modified bytes
+}
+
+}
+
+class TokenExtractorTest : public ::testing::Test { // fixture: feeds string field values to a TokenExtractor and collects the extracted words
+protected:
+    using SpanTerm = TokenExtractor::SpanTerm;
+    DocBuilder                _doc_builder;
+    std::unique_ptr<Document> _doc; // passed to extract() as the document argument
+    TokenExtractor            _token_extractor;
+    std::vector<SpanTerm>     _terms; // output buffer, cleared at the start of every process() call
+
+    static constexpr size_t max_word_len = 20; // word length limit given to _token_extractor
+
+    TokenExtractorTest();
+    ~TokenExtractorTest() override;
+
+    static DocBuilder::AddFieldsType
+    make_add_fields()
+    {
+        return [](auto& header) { header.addField(field_name, DataType::T_STRING); }; // adds the one string field the tests use
+    }
+
+    Words process(const StringFieldValue& value); // extracts the words of value; terms sharing a span are grouped as AlternativeWords
+};
+
+TokenExtractorTest::TokenExtractorTest()
+    : _doc_builder(make_add_fields()),
+      _doc(_doc_builder.make_document("id:ns:searchdocument::0")),
+      _token_extractor(field_name, max_word_len), // field_name is the namespace-scope string above; TokenExtractor stores a reference to it
+      _terms()
+{
+}
+
+TokenExtractorTest::~TokenExtractorTest() = default;
+
+Words
+TokenExtractorTest::process(const StringFieldValue& value)
+{ // Runs the extractor on value and converts the resulting terms into Words.
+    Words result;
+    _terms.clear();
+    auto span_trees = value.getSpanTrees();
+    vespalib::stringref text = value.getValueRef();
+    _token_extractor.extract(_terms, span_trees, text, _doc.get());
+    auto it  = _terms.begin();
+    auto ite = _terms.end();
+    auto itn = it; // will be moved one past the last term that has the same span as *it
+    for (; it != ite; ) {
+        for (; itn != ite && itn->span == it->span; ++itn); // empty body: only advances itn past the terms with equal span
+        if ((itn - it) > 1) { // several terms for one span: report them together
+            auto& alternatives = std::get<0>(result.emplace_back()); // a default-constructed variant holds its first alternative, AlternativeWords
+            for (;it != itn; ++it) {
+                alternatives.emplace_back(it->word);
+            }
+        } else { // exactly one term for this span: report it as a plain word
+            result.emplace_back(vespalib::string(it->word));
+            ++it;
+        }
+    }
+
+    return result;
+}
+
+TEST_F(TokenExtractorTest, empty_string)
+{
+    EXPECT_EQ((Words{}), process(StringFieldValue(""))); // empty text is expected to yield no words
+}
+
+TEST_F(TokenExtractorTest, plain_string)
+{
+    EXPECT_EQ((Words{"Plain string"}), process(StringFieldValue("Plain string"))); // a value built without StringFieldBuilder is expected back as one single word
+}
+
+TEST_F(TokenExtractorTest, normal_string)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    EXPECT_EQ((Words{"Hello", "world"}), process(sfb.tokenize("Hello world").build())); // tokenized text is expected back as two separate words
+}
+
+TEST_F(TokenExtractorTest, normalized_tokens)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    auto value = sfb.token("Hello", false).alt_word("hello").tokenize(" world").build();
+    EXPECT_EQ("Hello world", value.getValue()); // the stored text keeps the original casing
+    EXPECT_EQ((Words{"hello", "world"}), process(value)); // only the alt_word is expected for the first token, not the text "Hello"
+}
+
+TEST_F(TokenExtractorTest, alternative_tokens)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    auto value = sfb.word("Hello").alt_word("hello").tokenize(" world").build();
+    EXPECT_EQ("Hello world", value.getValue());
+    EXPECT_EQ((Words{AlternativeWords{"Hello", "hello"}, "world"}), process(value)); // word() plus alt_word() are expected together as AlternativeWords
+}
+
+TEST_F(TokenExtractorTest, word_with_nul_byte_is_truncated)
+{
+    auto doc = make_corrupted_document(_doc_builder, 7); // NUL replaces the byte at offset 7 of "corruptWord"
+    EXPECT_EQ((Words{"before", "corrupt", "after"}), process(dynamic_cast<const StringFieldValue&>(*doc->getValue(field_name)))); // only the 7-byte prefix before the NUL is expected
+}
+
+TEST_F(TokenExtractorTest, word_with_nul_byte_at_start_is_dropped)
+{
+    auto doc = make_corrupted_document(_doc_builder, 0); // NUL replaces the first byte of "corruptWord"
+    EXPECT_EQ((Words{"before", "after"}), process(dynamic_cast<const StringFieldValue&>(*doc->getValue(field_name)))); // nothing precedes the NUL, so the word is expected to be omitted
+}
+
+TEST_F(TokenExtractorTest, too_long_word_is_dropped)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    EXPECT_EQ((Words{"before", "after"}), process(sfb.tokenize("before veryverylongwordthatwillbedropped after").build())); // the middle word exceeds max_word_len (20) and is expected to be omitted
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
index 2a54859352d..a69260c6f45 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
@@ -21,9 +21,6 @@
 #include <vespa/vespalib/stllike/hash_map.hpp>
 #include <stdexcept>
 
-#include <vespa/log/log.h>
-LOG_SETUP(".searchlib.memoryindex.fieldinverter");
-
 namespace search::memoryindex {
 
 using document::Annotation;
@@ -51,45 +48,17 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document&
 {
     _terms.clear();
     auto span_trees = value.getSpanTrees();
-    if (!TokenExtractor::extract(false, _terms, span_trees)) {
-        /* This is wrong unless field is exact match */
-        const vespalib::string &text = value.getValue();
-        if (text.empty()) {
-            return;
-        }
-        uint32_t wordRef = saveWord(text, &doc);
-        if (wordRef != 0u) {
-            add(wordRef);
-            stepWordPos();
-        }
-        return;
-    }
-    const vespalib::string &text = value.getValue();
+    vespalib::stringref text = value.getValueRef();
+    _token_extractor.extract(_terms, span_trees, text, &doc);
     auto it  = _terms.begin();
     auto ite = _terms.end();
-    uint32_t wordRef;
-    bool mustStep = false;
     for (; it != ite; ) {
         auto it_begin = it;
-        for (; it != ite && it->first == it_begin->first; ++it) {
-            if (it->second) {  // it->second is a const FieldValue *.
-                wordRef = saveWord(*it->second, doc);
-            } else {
-                const Span &iSpan = it->first;
-                assert(iSpan.from() >= 0);
-                assert(iSpan.length() > 0);
-                wordRef = saveWord(vespalib::stringref(&text[iSpan.from()],
-                                                       iSpan.length()), &doc);
-            }
-            if (wordRef != 0u) {
-                add(wordRef);
-                mustStep = true;
-            }
-        }
-        if (mustStep) {
-            stepWordPos();
-            mustStep = false;
+        for (; it != ite && it->span == it_begin->span; ++it) {
+            uint32_t wordRef = saveWord(it->word);
+            add(wordRef);
         }
+        stepWordPos();
     }
 }
 
@@ -170,33 +139,19 @@ FieldInverter::endElement()
 }
 
 uint32_t
-FieldInverter::saveWord(const vespalib::stringref word, const Document* doc)
+FieldInverter::saveWord(vespalib::stringref word)
 {
     const size_t wordsSize = _words.size();
     // assert((wordsSize & 3) == 0); // Check alignment
-    size_t len = strnlen(word.data(), word.size());
-    if (len < word.size()) {
-        const Schema::IndexField &field = _schema.getIndexField(_fieldId);
-        LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, lid is %u, field is %s, truncated word is %s", word.size(), len, _docId, field.getName().c_str(), word.data());
-    }
-    if (len > max_word_len && doc != nullptr) {
-        const Schema::IndexField& field = _schema.getIndexField(_fieldId);
-        LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.100s", len, max_word_len, doc->getId().toString().c_str(), field.getName().c_str(), word.data());
-        return 0u;
-    }
-    if (len == 0) {
-        return 0u;
-    }
-
-    const size_t unpadded_size = wordsSize + 4 + len + 1;
+    const size_t unpadded_size = wordsSize + 4 + word.size() + 1;
     const size_t fullyPaddedSize = Aligner<4>::align(unpadded_size);
     _words.reserve(vespalib::roundUp2inN(fullyPaddedSize));
     _words.resize(fullyPaddedSize);
 
     char * buf = &_words[0] + wordsSize;
     memset(buf, 0, 4);
-    memcpy(buf + 4, word.data(), len);
-    memset(buf + 4 + len, 0, fullyPaddedSize - unpadded_size + 1);
+    memcpy(buf + 4, word.data(), word.size());
+    memset(buf + 4 + word.size(), 0, fullyPaddedSize - unpadded_size + 1);
 
     uint32_t wordRef = (wordsSize + 4) >> 2;
     // assert(wordRef != 0);
@@ -204,20 +159,10 @@ FieldInverter::saveWord(const vespalib::stringref word, const Document* doc)
     return wordRef;
 }
 
-uint32_t
-FieldInverter::saveWord(const document::FieldValue &fv, const Document& doc)
-{
-    assert(fv.isA(FieldValue::Type::STRING));
-    using RawRef = std::pair<const char*, size_t>;
-    RawRef sRef = fv.getAsRaw();
-    return saveWord(vespalib::stringref(sRef.first, sRef.second), &doc);
-}
-
 void
 FieldInverter::remove(const vespalib::stringref word, uint32_t docId)
 {
-    uint32_t wordRef = saveWord(word, nullptr);
-    assert(wordRef != 0);
+    uint32_t wordRef = saveWord(word); // word is saved as given; sanitize_word() is not applied on this path
     _positions.emplace_back(wordRef, docId);
 }
 
@@ -245,6 +190,17 @@ FieldInverter::endDoc()
 }
 
 void
+FieldInverter::addWord(vespalib::stringref word, const document::Document& doc)
+{ // Sanitizes word and, if anything is left, saves it, adds it and steps the word position.
+    word = _token_extractor.sanitize_word(word, &doc); // may shorten the word or return an empty one
+    if (!word.empty()) {
+        uint32_t wordRef = saveWord(word);
+        add(wordRef);
+        stepWordPos();
+    }
+}
+
+void
 FieldInverter::processNormalDocTextField(const StringFieldValue &field, const Document& doc)
 {
     startElement(1);
@@ -293,6 +249,7 @@ FieldInverter::FieldInverter(const Schema &schema, uint32_t fieldId,
       _docId(0),
       _oldPosSize(0),
       _schema(schema),
+      _token_extractor(_schema.getIndexField(_fieldId).getName(), max_word_len),
       _words(),
       _elems(),
       _positions(),
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
index 23e3f9ddfd8..4e3934ba322 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
@@ -173,6 +173,7 @@ private:
     uint32_t                       _oldPosSize;
 
     const index::Schema           &_schema;
+    linguistics::TokenExtractor    _token_extractor;
 
     WordBuffer                     _words;
     ElemInfoVec                    _elems;
@@ -202,12 +203,7 @@ private:
     /**
      * Save the given word in the word buffer and return the word reference.
      */
-    VESPA_DLL_LOCAL uint32_t saveWord(const vespalib::stringref word, const document::Document* doc);
-
-    /**
-     * Save the field value as a word in the word buffer and return the word reference.
-     */
-    VESPA_DLL_LOCAL uint32_t saveWord(const document::FieldValue &fv, const document::Document& doc);
+    VESPA_DLL_LOCAL uint32_t saveWord(vespalib::stringref word);
 
     /**
      * Get pointer to saved word from a word reference.
@@ -326,13 +322,7 @@ public:
 
     void endDoc();
 
-    void addWord(const vespalib::stringref word, const document::Document& doc) {
-        uint32_t wordRef = saveWord(word, &doc);
-        if (wordRef != 0u) {
-            add(wordRef);
-            stepWordPos();
-        }
-    }
+    void addWord(vespalib::stringref word, const document::Document& doc); // sanitizes word first; nothing is added when the sanitized word is empty
 };
 
 }
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
index 555ea86d299..a78f30afe21 100644
--- a/searchlib/src/vespa/searchlib/util/token_extractor.cpp
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
@@ -6,16 +6,25 @@
 #include <vespa/document/annotation/span.h>
 #include <vespa/document/annotation/spanlist.h>
 #include <vespa/document/annotation/spantreevisitor.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/exceptions.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".searchlib.util.token_extractor");
 
 using document::AlternateSpanList;
 using document::Annotation;
 using document::AnnotationType;
+using document::Document;
+using document::FieldValue;
 using document::SimpleSpanList;
 using document::Span;
 using document::SpanList;
 using document::SpanNode;
 using document::SpanTreeVisitor;
 using document::StringFieldValue;
+using vespalib::Utf8Reader;
 
 namespace search::linguistics {
 
@@ -58,14 +67,85 @@ getSpan(const SpanNode &span_node)
     return finder.span();
 }
 
+vespalib::stringref
+get_span_string_or_alternative(vespalib::stringref s, const Span &span, const FieldValue* fv)
+{ // Picks the word for a term: the raw content of fv when fv is given, otherwise the part of s covered by span.
+    if (fv != nullptr) {
+        auto raw = fv->getAsRaw();
+        return {raw.first, raw.second}; // stringref over the bytes returned by getAsRaw(), not a copy
+    } else {
+        return {s.data() + span.from(), static_cast<size_t>(span.length())}; // no bounds check here; consider_word() checks the span against the text before calling
+    }
+}
+
+size_t
+truncated_word_len(vespalib::stringref word, size_t max_byte_len)
+{ // Returns a length of at most max_byte_len at which word can be cut without splitting a character as read by Utf8Reader.
+    Utf8Reader reader(word);
+    while (reader.hasMore()) {
+        auto last_pos = reader.getPos(); // position where the next character starts
+        (void) reader.getChar(); // only the advance of the read position matters, the decoded value is unused
+        if (reader.getPos() > max_byte_len) {
+            return last_pos; // this character would cross the limit, so cut just before it
+        }
+    }
+    return reader.getPos(); // No truncation
+}
+
+constexpr size_t max_fmt_len = 100; // Max length of word in logs
+
+}
+
+TokenExtractor::TokenExtractor(const vespalib::string& field_name, size_t max_word_len)
+    : _field_name(field_name), // _field_name is a reference member: it binds to the caller's string, no copy is made
+      _max_word_len(max_word_len)
+{
+}
+
+TokenExtractor::~TokenExtractor() = default;
+
+vespalib::stringref
+TokenExtractor::sanitize_word(vespalib::stringref word, const document::Document* doc) const
+{ // Returns word cut before its first NUL byte, or an empty stringref when the result is longer than _max_word_len; logs only when doc is non-null.
+    size_t len = strnlen(word.data(), word.size()); // length up to the first NUL byte, at most word.size()
+    if (len < word.size()) { // word contains a NUL byte
+        size_t old_len = word.size();
+        len = truncated_word_len(word, len); // keep the cut on a character boundary
+        word = word.substr(0, len);
+        if (doc != nullptr) {
+            LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, document %s field %s, truncated word prefix is %.*s", old_len, word.size(), doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data());
+        }
+    }
+    if (word.size() > _max_word_len) {
+        if (doc != nullptr) {
+            LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.*s", word.size(), _max_word_len, doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data());
+        }
+        return {}; // a too long word is dropped entirely, not shortened
+    }
+    return word; // may be empty when the NUL byte was the first byte
+}
+
+void
+TokenExtractor::consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const Span& span, const FieldValue* fv, const Document* doc) const
+{ // Appends a SpanTerm for span to terms unless the span is empty, lies outside text, or its sanitized word is empty.
+    if (span.length() > 0 && span.from() >= 0 &&
+        static_cast<size_t>(span.from()) + static_cast<size_t>(span.length()) <= text.size()) { // the span must lie within text, also when fv supplies the word
+        auto word = get_span_string_or_alternative(text, span, fv);
+        word = sanitize_word(word, doc);
+        if (!word.empty()) {
+            terms.emplace_back(span, word, fv != nullptr); // the third argument becomes SpanTerm::altered
+        }
+    }
 }
 
-bool
-TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees)
+void
+TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const Document* doc) const
 {
     auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME);
     if (tree == nullptr) {
-        return false;
+        /* field might not be annotated if match type is exact */
+        consider_word(terms, text, Span(0, text.size()), nullptr, doc);
+        return;
     }
     for (const Annotation & annotation : *tree) {
         const SpanNode *span = annotation.getSpanNode();
@@ -73,13 +153,10 @@ TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& te
             (annotation.getType() == *AnnotationType::TERM))
         {
             Span sp = getSpan(*span);
-            if (sp.length() != 0 || allow_zero_length_tokens) {
-                terms.emplace_back(sp, annotation.getFieldValue());
-            }
+            consider_word(terms, text, sp, annotation.getFieldValue(), doc);
         }
     }
     std::sort(terms.begin(), terms.end());
-    return true;
 }
 
 }
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h
index 5796aaa7482..4955448b0c2 100644
--- a/searchlib/src/vespa/searchlib/util/token_extractor.h
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.h
@@ -2,14 +2,16 @@
 
 #pragma once
 
+#include <vespa/document/annotation/span.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/stllike/string.h>
 #include <vector>
 
 namespace document {
 
-class FieldValue;
-class StringFieldValue;
+class Document;
 class Span;
+class StringFieldValue;
 
 }
 
@@ -19,9 +21,43 @@ namespace search::linguistics {
  * Class used to extract tokens from annotated string field value.
  */
 class TokenExtractor {
+    const vespalib::string& _field_name; // not owned: the string passed to the constructor must outlive this object
+    size_t                  _max_word_len; // words whose size exceeds this are dropped by sanitize_word()
+
+public:
+    struct SpanTerm { // one extracted word together with the span it belongs to
+        document::Span      span;
+        vespalib::stringref word; // refers to bytes owned elsewhere (the text or an annotation field value)
+        bool                altered; // true when the word came from an annotation field value instead of the span text
+
+        SpanTerm(const document::Span& span_, vespalib::stringref word_, bool altered_) noexcept
+            : span(span_),
+              word(word_),
+              altered(altered_)
+        {
+        }
+        SpanTerm() noexcept
+            : span(),
+              word(),
+              altered(false)
+        {
+        }
+        bool operator<(const SpanTerm& rhs) const noexcept { // orders by span first, then by word
+            if (span != rhs.span) {
+                return span < rhs.span;
+            }
+            return word < rhs.word;
+        }
+    };
+
+private:
+    void consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const document::Span& span, const document::FieldValue* fv, const document::Document* doc) const;
+
 public:
-    using SpanTerm = std::pair<document::Span, const document::FieldValue*>;
-    static bool extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees);
+    TokenExtractor(const vespalib::string& field_name, size_t max_word_len);
+    ~TokenExtractor();
+    void extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const document::Document* doc) const; // appends extracted terms to terms; doc may be null, in which case sanitize_word() does not log
+    vespalib::stringref sanitize_word(vespalib::stringref word, const document::Document* doc) const; // cuts word before a NUL byte; returns an empty stringref for too long words
 };
 
 }
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
index b4f76d8e39f..bf267ab9e27 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
@@ -6,6 +6,7 @@
 #include <vespa/document/annotation/span.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
 #include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/memoryindex/field_inverter.h>
 #include <vespa/searchlib/util/linguisticsannotation.h>
 #include <vespa/searchlib/util/token_extractor.h>
 #include <vespa/vespalib/stllike/asciistream.h>
@@ -17,6 +18,7 @@ using document::FieldValue;
 using document::Span;
 using document::StringFieldValue;
 using search::linguistics::TokenExtractor;
+using search::memoryindex::FieldInverter;
 
 namespace search::docsummary {
 
@@ -28,14 +30,7 @@ getSpanString(vespalib::stringref s, const Span &span)
     return {s.data() + span.from(), static_cast<size_t>(span.length())};
 }
 
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline));
-
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) {
-    if (!value.isA(FieldValue::Type::STRING)) {
-        throw vespalib::IllegalArgumentException("Illegal field type. " + value.toString(), VESPA_STRLOC);
-    }
-    return static_cast<const StringFieldValue &>(value);
-}
+vespalib::string dummy_field_name; // empty placeholder name for the TokenExtractor created in handleIndexingTerms(), which stores a reference to it
 
 }
 
@@ -53,7 +48,7 @@ template <typename ForwardIt>
 void
 AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) {
     int annCnt = (last - it);
-    if (annCnt > 1 || (annCnt == 1 && it->second)) {
+    if (annCnt > 1 || (annCnt == 1 && it->altered)) {
         annotateSpans(span, it, last);
     } else {
         _out << getSpanString(_text, span) << juniper::separators::unit_separator_string;
@@ -67,11 +62,7 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For
          << (getSpanString(_text, span))
          << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR
     while (it != last) {
-        if (it->second) {
-            _out << ensureStringFieldValue(*it->second).getValue();
-        } else {
-            _out << getSpanString(_text, span);
-        }
+        _out << it->word;
         if (++it != last) {
             _out << " ";
         }
@@ -86,26 +77,21 @@ AnnotationConverter::handleIndexingTerms(const StringFieldValue& value)
     using SpanTerm = TokenExtractor::SpanTerm;
     std::vector<SpanTerm> terms;
     auto span_trees = value.getSpanTrees();
-    if (!TokenExtractor::extract(true, terms, span_trees)) {
-        // Treat a string without annotations as a single span.
-        SpanTerm str(Span(0, _text.size()),
-                     static_cast<const FieldValue*>(nullptr));
-        handleAnnotations(str.first, &str, &str + 1);
-        return;
-    }
+    TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len);
+    token_extractor.extract(terms, span_trees, _text, nullptr);
     auto it = terms.begin();
     auto ite = terms.end();
     int32_t endPos = 0;
     for (; it != ite; ) {
         auto it_begin = it;
-        if (it_begin->first.from() >  endPos) {
-            Span tmpSpan(endPos, it_begin->first.from() - endPos);
+        if (it_begin->span.from() >  endPos) {
+            Span tmpSpan(endPos, it_begin->span.from() - endPos);
             handleAnnotations(tmpSpan, it, it);
-            endPos = it_begin->first.from();
+            endPos = it_begin->span.from();
         }
-        for (; it != ite && it->first == it_begin->first; ++it);
-        handleAnnotations(it_begin->first, it_begin, it);
-        endPos = it_begin->first.from() + it_begin->first.length();
+        for (; it != ite && it->span == it_begin->span; ++it);
+        handleAnnotations(it_begin->span, it_begin, it);
+        endPos = it_begin->span.from() + it_begin->span.length();
     }
     int32_t wantEndPos = _text.size();
     if (endPos < wantEndPos) {
author	Geir Storli <geirst@yahooinc.com>	2023-10-12 13:35:55 +0200
committer	GitHub <noreply@github.com>	2023-10-12 13:35:55 +0200
commit	4d4fa30cdb221decb7c1462f31635046748c50b2 (patch)
tree	198dbeeefcb9ae3c3d86c790e89228bddb6880b7
parent	724d1120e45dd09c8089c7f6b75a6be0f355e8b3 (diff)
parent	706bf2929c840606efba2763b177ae435579c1d7 (diff)