diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-10-12 13:35:55 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-12 13:35:55 +0200 |
commit | 4d4fa30cdb221decb7c1462f31635046748c50b2 (patch) | |
tree | 198dbeeefcb9ae3c3d86c790e89228bddb6880b7 | |
parent | 724d1120e45dd09c8089c7f6b75a6be0f355e8b3 (diff) | |
parent | 706bf2929c840606efba2763b177ae435579c1d7 (diff) |
Merge pull request #28894 from vespa-engine/toregge/move-checks-to-token-extractor
Move more checks to TokenExtractor.
8 files changed, 337 insertions, 117 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index e9817497904..6510808760c 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -244,6 +244,7 @@ vespa_define_module( src/tests/util/folded_string_compare src/tests/util/searchable_stats src/tests/util/slime_output_raw_buf_adapter + src/tests/util/token_extractor src/tests/vespa-fileheader-inspect ) diff --git a/searchlib/src/tests/util/token_extractor/CMakeLists.txt b/searchlib/src/tests/util/token_extractor/CMakeLists.txt new file mode 100644 index 00000000000..adfe579243c --- /dev/null +++ b/searchlib/src/tests/util/token_extractor/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_token_extractor_test_app TEST + SOURCES + token_extractor_test.cpp + DEPENDS + searchlib_test + GTest::gtest +) +vespa_add_test(NAME searchlib_token_extractor_test_app COMMAND searchlib_token_extractor_test_app) diff --git a/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp new file mode 100644 index 00000000000..e6944e257e9 --- /dev/null +++ b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp @@ -0,0 +1,164 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> +#include <vespa/searchlib/test/doc_builder.h> +#include <vespa/searchlib/test/string_field_builder.h> +#include <vespa/searchlib/util/token_extractor.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <vespa/vespalib/objects/nbostream.h> +#include <variant> + +using document::DataType; +using document::Document; +using document::StringFieldValue; +using search::linguistics::TokenExtractor; +using search::test::DocBuilder; +using search::test::StringFieldBuilder; + +using AlternativeWords = std::vector<vespalib::string>; +using AlternativeWordsOrWord = std::variant<AlternativeWords, vespalib::string>; +using Words = std::vector<AlternativeWordsOrWord>; + +namespace { + +vespalib::string corrupt_word = "corruptWord"; + +vespalib::string field_name("stringfield"); + +std::unique_ptr<Document> +make_corrupted_document(DocBuilder &b, size_t wordOffset) +{ + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::18"); + doc->setValue(field_name, sfb.tokenize("before ").word(corrupt_word).tokenize(" after").build()); + vespalib::nbostream stream; + doc->serialize(stream); + std::vector<char> raw; + raw.resize(stream.size()); + stream.read(&raw[0], stream.size()); + assert(wordOffset < corrupt_word.size()); + for (size_t i = 0; i + corrupt_word.size() <= raw.size(); ++i) { + if (memcmp(&raw[i], corrupt_word.c_str(), corrupt_word.size()) == 0) { + raw[i + wordOffset] = '\0'; + break; + } + } + vespalib::nbostream badstream; + badstream.write(&raw[0], raw.size()); + return std::make_unique<Document>(b.get_repo(), badstream); +} + +} + +class TokenExtractorTest : public ::testing::Test { +protected: + using SpanTerm = TokenExtractor::SpanTerm; + DocBuilder _doc_builder; + std::unique_ptr<Document> _doc; + TokenExtractor _token_extractor; + std::vector<SpanTerm> _terms; + + static constexpr size_t max_word_len = 20; + + TokenExtractorTest(); + ~TokenExtractorTest() override; + + static DocBuilder::AddFieldsType + make_add_fields() + { + return [](auto& header) { header.addField(field_name, DataType::T_STRING); }; + } + + Words process(const StringFieldValue& value); +}; + +TokenExtractorTest::TokenExtractorTest() + : _doc_builder(make_add_fields()), + _doc(_doc_builder.make_document("id:ns:searchdocument::0")), + _token_extractor(field_name, max_word_len), + _terms() +{ +} + +TokenExtractorTest::~TokenExtractorTest() = default; + +Words +TokenExtractorTest::process(const StringFieldValue& value) +{ + Words result; + _terms.clear(); + auto span_trees = value.getSpanTrees(); + vespalib::stringref text = value.getValueRef(); + _token_extractor.extract(_terms, span_trees, text, _doc.get()); + auto it = _terms.begin(); + auto ite = _terms.end(); + auto itn = it; + for (; it != ite; ) { + for (; itn != ite && itn->span == it->span; ++itn); + if ((itn - it) > 1) { + auto& alternatives = std::get<0>(result.emplace_back()); + for (;it != itn; ++it) { + alternatives.emplace_back(it->word); + } + } else { + result.emplace_back(vespalib::string(it->word)); + ++it; + } + } + + return result; +} + +TEST_F(TokenExtractorTest, empty_string) +{ + EXPECT_EQ((Words{}), process(StringFieldValue(""))); +} + +TEST_F(TokenExtractorTest, plain_string) +{ + EXPECT_EQ((Words{"Plain string"}), process(StringFieldValue("Plain string"))); +} + +TEST_F(TokenExtractorTest, normal_string) +{ + StringFieldBuilder sfb(_doc_builder); + EXPECT_EQ((Words{"Hello", "world"}), process(sfb.tokenize("Hello world").build())); +} + +TEST_F(TokenExtractorTest, normalized_tokens) +{ + StringFieldBuilder sfb(_doc_builder); + auto value = sfb.token("Hello", false).alt_word("hello").tokenize(" world").build(); + EXPECT_EQ("Hello world", value.getValue()); + EXPECT_EQ((Words{"hello", "world"}), process(value)); +} + +TEST_F(TokenExtractorTest, alternative_tokens) +{ + StringFieldBuilder sfb(_doc_builder); + auto value = sfb.word("Hello").alt_word("hello").tokenize(" world").build(); + EXPECT_EQ("Hello world", value.getValue()); + EXPECT_EQ((Words{AlternativeWords{"Hello", "hello"}, "world"}), process(value)); +} + +TEST_F(TokenExtractorTest, word_with_nul_byte_is_truncated) +{ + auto doc = make_corrupted_document(_doc_builder, 7); + EXPECT_EQ((Words{"before", "corrupt", "after"}), process(dynamic_cast<const StringFieldValue&>(*doc->getValue(field_name)))); +} + +TEST_F(TokenExtractorTest, word_with_nul_byte_at_start_is_dropped) +{ + auto doc = make_corrupted_document(_doc_builder, 0); + EXPECT_EQ((Words{"before", "after"}), process(dynamic_cast<const StringFieldValue&>(*doc->getValue(field_name)))); +} + +TEST_F(TokenExtractorTest, too_long_word_is_dropped) +{ + StringFieldBuilder sfb(_doc_builder); + EXPECT_EQ((Words{"before", "after"}), process(sfb.tokenize("before veryverylongwordthatwillbedropped after").build())); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp index 2a54859352d..a69260c6f45 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp @@ -21,9 +21,6 @@ #include <vespa/vespalib/stllike/hash_map.hpp> #include <stdexcept> -#include <vespa/log/log.h> -LOG_SETUP(".searchlib.memoryindex.fieldinverter"); - namespace search::memoryindex { using document::Annotation; @@ -51,45 +48,17 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document& { _terms.clear(); auto span_trees = value.getSpanTrees(); - if (!TokenExtractor::extract(false, _terms, span_trees)) { - /* This is wrong unless field is exact match */ - const vespalib::string &text = value.getValue(); - if (text.empty()) { - return; - } - uint32_t wordRef = saveWord(text, &doc); - if (wordRef != 0u) { - add(wordRef); - stepWordPos(); - } - return; - } - const vespalib::string &text = value.getValue(); + vespalib::stringref text = value.getValueRef(); + _token_extractor.extract(_terms, span_trees, text, &doc); auto it = _terms.begin(); auto ite = _terms.end(); - uint32_t wordRef; - bool mustStep = false; for (; it != ite; ) { auto it_begin = it; - for (; it != ite && it->first == it_begin->first; ++it) { - if (it->second) { // it->second is a const FieldValue *. - wordRef = saveWord(*it->second, doc); - } else { - const Span &iSpan = it->first; - assert(iSpan.from() >= 0); - assert(iSpan.length() > 0); - wordRef = saveWord(vespalib::stringref(&text[iSpan.from()], - iSpan.length()), &doc); - } - if (wordRef != 0u) { - add(wordRef); - mustStep = true; - } - } - if (mustStep) { - stepWordPos(); - mustStep = false; + for (; it != ite && it->span == it_begin->span; ++it) { + uint32_t wordRef = saveWord(it->word); + add(wordRef); } + stepWordPos(); } } @@ -170,33 +139,19 @@ FieldInverter::endElement() } uint32_t -FieldInverter::saveWord(const vespalib::stringref word, const Document* doc) +FieldInverter::saveWord(vespalib::stringref word) { const size_t wordsSize = _words.size(); // assert((wordsSize & 3) == 0); // Check alignment - size_t len = strnlen(word.data(), word.size()); - if (len < word.size()) { - const Schema::IndexField &field = _schema.getIndexField(_fieldId); - LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, lid is %u, field is %s, truncated word is %s", word.size(), len, _docId, field.getName().c_str(), word.data()); - } - if (len > max_word_len && doc != nullptr) { - const Schema::IndexField& field = _schema.getIndexField(_fieldId); - LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.100s", len, max_word_len, doc->getId().toString().c_str(), field.getName().c_str(), word.data()); - return 0u; - } - if (len == 0) { - return 0u; - } - - const size_t unpadded_size = wordsSize + 4 + len + 1; + const size_t unpadded_size = wordsSize + 4 + word.size() + 1; const size_t fullyPaddedSize = Aligner<4>::align(unpadded_size); _words.reserve(vespalib::roundUp2inN(fullyPaddedSize)); _words.resize(fullyPaddedSize); char * buf = &_words[0] + wordsSize; memset(buf, 0, 4); - memcpy(buf + 4, word.data(), len); - memset(buf + 4 + len, 0, fullyPaddedSize - unpadded_size + 1); + memcpy(buf + 4, word.data(), word.size()); + memset(buf + 4 + word.size(), 0, fullyPaddedSize - unpadded_size + 1); uint32_t wordRef = (wordsSize + 4) >> 2; // assert(wordRef != 0); @@ -204,20 +159,10 @@ FieldInverter::saveWord(const vespalib::stringref word, const Document* doc) return wordRef; } -uint32_t -FieldInverter::saveWord(const document::FieldValue &fv, const Document& doc) -{ - assert(fv.isA(FieldValue::Type::STRING)); - using RawRef = std::pair<const char*, size_t>; - RawRef sRef = fv.getAsRaw(); - return saveWord(vespalib::stringref(sRef.first, sRef.second), &doc); -} - void FieldInverter::remove(const vespalib::stringref word, uint32_t docId) { - uint32_t wordRef = saveWord(word, nullptr); - assert(wordRef != 0); + uint32_t wordRef = saveWord(word); _positions.emplace_back(wordRef, docId); } @@ -245,6 +190,17 @@ FieldInverter::endDoc() } void +FieldInverter::addWord(vespalib::stringref word, const document::Document& doc) +{ + word = _token_extractor.sanitize_word(word, &doc); + if (!word.empty()) { + uint32_t wordRef = saveWord(word); + add(wordRef); + stepWordPos(); + } +} + +void FieldInverter::processNormalDocTextField(const StringFieldValue &field, const Document& doc) { startElement(1); @@ -293,6 +249,7 @@ FieldInverter::FieldInverter(const Schema &schema, uint32_t fieldId, _docId(0), _oldPosSize(0), _schema(schema), + _token_extractor(_schema.getIndexField(_fieldId).getName(), max_word_len), _words(), _elems(), _positions(), diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h index 23e3f9ddfd8..4e3934ba322 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h @@ -173,6 +173,7 @@ private: uint32_t _oldPosSize; const index::Schema &_schema; + linguistics::TokenExtractor _token_extractor; WordBuffer _words; ElemInfoVec _elems; @@ -202,12 +203,7 @@ private: /** * Save the given word in the word buffer and return the word reference. */ - VESPA_DLL_LOCAL uint32_t saveWord(const vespalib::stringref word, const document::Document* doc); - - /** - * Save the field value as a word in the word buffer and return the word reference. - */ - VESPA_DLL_LOCAL uint32_t saveWord(const document::FieldValue &fv, const document::Document& doc); + VESPA_DLL_LOCAL uint32_t saveWord(vespalib::stringref word); /** * Get pointer to saved word from a word reference. @@ -326,13 +322,7 @@ public: void endDoc(); - void addWord(const vespalib::stringref word, const document::Document& doc) { - uint32_t wordRef = saveWord(word, &doc); - if (wordRef != 0u) { - add(wordRef); - stepWordPos(); - } - } + void addWord(vespalib::stringref word, const document::Document& doc); }; } diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp index 555ea86d299..a78f30afe21 100644 --- a/searchlib/src/vespa/searchlib/util/token_extractor.cpp +++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp @@ -6,16 +6,25 @@ #include <vespa/document/annotation/span.h> #include <vespa/document/annotation/spanlist.h> #include <vespa/document/annotation/spantreevisitor.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/util/exceptions.h> + +#include <vespa/log/log.h> +LOG_SETUP(".searchlib.util.token_extractor"); using document::AlternateSpanList; using document::Annotation; using document::AnnotationType; +using document::Document; +using document::FieldValue; using document::SimpleSpanList; using document::Span; using document::SpanList; using document::SpanNode; using document::SpanTreeVisitor; using document::StringFieldValue; +using vespalib::Utf8Reader; namespace search::linguistics { @@ -58,14 +67,85 @@ getSpan(const SpanNode &span_node) return finder.span(); } +vespalib::stringref +get_span_string_or_alternative(vespalib::stringref s, const Span &span, const FieldValue* fv) +{ + if (fv != nullptr) { + auto raw = fv->getAsRaw(); + return {raw.first, raw.second}; + } else { + return {s.data() + span.from(), static_cast<size_t>(span.length())}; + } +} + +size_t +truncated_word_len(vespalib::stringref word, size_t max_byte_len) +{ + Utf8Reader reader(word); + while (reader.hasMore()) { + auto last_pos = reader.getPos(); + (void) reader.getChar(); + if (reader.getPos() > max_byte_len) { + return last_pos; + } + } + return reader.getPos(); // No truncation +} + +constexpr size_t max_fmt_len = 100; // Max length of word in logs + +} + +TokenExtractor::TokenExtractor(const vespalib::string& field_name, size_t max_word_len) + : _field_name(field_name), + _max_word_len(max_word_len) +{ +} + +TokenExtractor::~TokenExtractor() = default; + +vespalib::stringref +TokenExtractor::sanitize_word(vespalib::stringref word, const document::Document* doc) const +{ + size_t len = strnlen(word.data(), word.size()); + if (len < word.size()) { + size_t old_len = word.size(); + len = truncated_word_len(word, len); + word = word.substr(0, len); + if (doc != nullptr) { + LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, document %s field %s, truncated word prefix is %.*s", old_len, word.size(), doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data()); + } + } + if (word.size() > _max_word_len) { + if (doc != nullptr) { + LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.*s", word.size(), _max_word_len, doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data()); + } + return {}; + } + return word; +} + +void +TokenExtractor::consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const Span& span, const FieldValue* fv, const Document* doc) const +{ + if (span.length() > 0 && span.from() >= 0 && + static_cast<size_t>(span.from()) + static_cast<size_t>(span.length()) <= text.size()) { + auto word = get_span_string_or_alternative(text, span, fv); + word = sanitize_word(word, doc); + if (!word.empty()) { + terms.emplace_back(span, word, fv != nullptr); + } + } } -bool -TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees) +void +TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const Document* doc) const { auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME); if (tree == nullptr) { - return false; + /* field might not be annotated if match type is exact */ + consider_word(terms, text, Span(0, text.size()), nullptr, doc); + return; } for (const Annotation & annotation : *tree) { const SpanNode *span = annotation.getSpanNode(); @@ -73,13 +153,10 @@ TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& te (annotation.getType() == *AnnotationType::TERM)) { Span sp = getSpan(*span); - if (sp.length() != 0 || allow_zero_length_tokens) { - terms.emplace_back(sp, annotation.getFieldValue()); - } + consider_word(terms, text, sp, annotation.getFieldValue(), doc); } } std::sort(terms.begin(), terms.end()); - return true; } } diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h index 5796aaa7482..4955448b0c2 100644 --- a/searchlib/src/vespa/searchlib/util/token_extractor.h +++ b/searchlib/src/vespa/searchlib/util/token_extractor.h @@ -2,14 +2,16 @@ #pragma once +#include <vespa/document/annotation/span.h> #include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/vespalib/stllike/string.h> #include <vector> namespace document { -class FieldValue; -class StringFieldValue; +class Document; class Span; +class StringFieldValue; } @@ -19,9 +21,43 @@ namespace search::linguistics { * Class used to extract tokens from annotated string field value. */ class TokenExtractor { + const vespalib::string& _field_name; + size_t _max_word_len; + +public: + struct SpanTerm { + document::Span span; + vespalib::stringref word; + bool altered; + + SpanTerm(const document::Span& span_, vespalib::stringref word_, bool altered_) noexcept + : span(span_), + word(word_), + altered(altered_) + { + } + SpanTerm() noexcept + : span(), + word(), + altered(false) + { + } + bool operator<(const SpanTerm& rhs) const noexcept { + if (span != rhs.span) { + return span < rhs.span; + } + return word < rhs.word; + } + }; + +private: + void consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const document::Span& span, const document::FieldValue* fv, const document::Document* doc) const; + public: - using SpanTerm = std::pair<document::Span, const document::FieldValue*>; - static bool extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees); + TokenExtractor(const vespalib::string& field_name, size_t max_word_len); + ~TokenExtractor(); + void extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const document::Document* doc) const; + vespalib::stringref sanitize_word(vespalib::stringref word, const document::Document* doc) const; }; } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp index b4f76d8e39f..bf267ab9e27 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp @@ -6,6 +6,7 @@ #include <vespa/document/annotation/span.h> #include <vespa/document/fieldvalue/stringfieldvalue.h> #include <vespa/juniper/juniper_separators.h> +#include <vespa/searchlib/memoryindex/field_inverter.h> #include <vespa/searchlib/util/linguisticsannotation.h> #include <vespa/searchlib/util/token_extractor.h> #include <vespa/vespalib/stllike/asciistream.h> @@ -17,6 +18,7 @@ using document::FieldValue; using document::Span; using document::StringFieldValue; using search::linguistics::TokenExtractor; +using search::memoryindex::FieldInverter; namespace search::docsummary { @@ -28,14 +30,7 @@ getSpanString(vespalib::stringref s, const Span &span) return {s.data() + span.from(), static_cast<size_t>(span.length())}; } -const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline)); - -const StringFieldValue &ensureStringFieldValue(const FieldValue &value) { - if (!value.isA(FieldValue::Type::STRING)) { - throw vespalib::IllegalArgumentException("Illegal field type. " + value.toString(), VESPA_STRLOC); - } - return static_cast<const StringFieldValue &>(value); -} +vespalib::string dummy_field_name; } @@ -53,7 +48,7 @@ template <typename ForwardIt> void AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) { int annCnt = (last - it); - if (annCnt > 1 || (annCnt == 1 && it->second)) { + if (annCnt > 1 || (annCnt == 1 && it->altered)) { annotateSpans(span, it, last); } else { _out << getSpanString(_text, span) << juniper::separators::unit_separator_string; @@ -67,11 +62,7 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For << (getSpanString(_text, span)) << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR while (it != last) { - if (it->second) { - _out << ensureStringFieldValue(*it->second).getValue(); - } else { - _out << getSpanString(_text, span); - } + _out << it->word; if (++it != last) { _out << " "; } @@ -86,26 +77,21 @@ AnnotationConverter::handleIndexingTerms(const StringFieldValue& value) using SpanTerm = TokenExtractor::SpanTerm; std::vector<SpanTerm> terms; auto span_trees = value.getSpanTrees(); - if (!TokenExtractor::extract(true, terms, span_trees)) { - // Treat a string without annotations as a single span. - SpanTerm str(Span(0, _text.size()), - static_cast<const FieldValue*>(nullptr)); - handleAnnotations(str.first, &str, &str + 1); - return; - } + TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len); + token_extractor.extract(terms, span_trees, _text, nullptr); auto it = terms.begin(); auto ite = terms.end(); int32_t endPos = 0; for (; it != ite; ) { auto it_begin = it; - if (it_begin->first.from() > endPos) { - Span tmpSpan(endPos, it_begin->first.from() - endPos); + if (it_begin->span.from() > endPos) { + Span tmpSpan(endPos, it_begin->span.from() - endPos); handleAnnotations(tmpSpan, it, it); - endPos = it_begin->first.from(); + endPos = it_begin->span.from(); } - for (; it != ite && it->first == it_begin->first; ++it); - handleAnnotations(it_begin->first, it_begin, it); - endPos = it_begin->first.from() + it_begin->first.length(); + for (; it != ite && it->span == it_begin->span; ++it); + handleAnnotations(it_begin->span, it_begin, it); + endPos = it_begin->span.from() + it_begin->span.length(); } int32_t wantEndPos = _text.size(); if (endPos < wantEndPos) { |