summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Geir Storli <geirst@yahooinc.com> 2023-10-12 13:35:55 +0200
committer GitHub <noreply@github.com> 2023-10-12 13:35:55 +0200
commit 4d4fa30cdb221decb7c1462f31635046748c50b2 (patch)
tree 198dbeeefcb9ae3c3d86c790e89228bddb6880b7
parent 724d1120e45dd09c8089c7f6b75a6be0f355e8b3 (diff)
parent 706bf2929c840606efba2763b177ae435579c1d7 (diff)
Merge pull request #28894 from vespa-engine/toregge/move-checks-to-token-extractor
Move more checks to TokenExtractor.
-rw-r--r-- searchlib/CMakeLists.txt 1
-rw-r--r-- searchlib/src/tests/util/token_extractor/CMakeLists.txt 9
-rw-r--r-- searchlib/src/tests/util/token_extractor/token_extractor_test.cpp 164
-rw-r--r-- searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp 89
-rw-r--r-- searchlib/src/vespa/searchlib/memoryindex/field_inverter.h 16
-rw-r--r-- searchlib/src/vespa/searchlib/util/token_extractor.cpp 91
-rw-r--r-- searchlib/src/vespa/searchlib/util/token_extractor.h 44
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp 40
8 files changed, 337 insertions, 117 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index e9817497904..6510808760c 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -244,6 +244,7 @@ vespa_define_module(
src/tests/util/folded_string_compare
src/tests/util/searchable_stats
src/tests/util/slime_output_raw_buf_adapter
+ src/tests/util/token_extractor
src/tests/vespa-fileheader-inspect
)
diff --git a/searchlib/src/tests/util/token_extractor/CMakeLists.txt b/searchlib/src/tests/util/token_extractor/CMakeLists.txt
new file mode 100644
index 00000000000..adfe579243c
--- /dev/null
+++ b/searchlib/src/tests/util/token_extractor/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_token_extractor_test_app TEST
+ SOURCES
+ token_extractor_test.cpp
+ DEPENDS
+ searchlib_test
+ GTest::gtest
+)
+vespa_add_test(NAME searchlib_token_extractor_test_app COMMAND searchlib_token_extractor_test_app)
diff --git a/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp
new file mode 100644
index 00000000000..e6944e257e9
--- /dev/null
+++ b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp
@@ -0,0 +1,164 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/document/repo/configbuilder.h>
+#include <vespa/searchlib/test/doc_builder.h>
+#include <vespa/searchlib/test/string_field_builder.h>
+#include <vespa/searchlib/util/token_extractor.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vespalib/objects/nbostream.h>
+#include <variant>
+
+using document::DataType;
+using document::Document;
+using document::StringFieldValue;
+using search::linguistics::TokenExtractor;
+using search::test::DocBuilder;
+using search::test::StringFieldBuilder;
+
+using AlternativeWords = std::vector<vespalib::string>;
+using AlternativeWordsOrWord = std::variant<AlternativeWords, vespalib::string>;
+using Words = std::vector<AlternativeWordsOrWord>;
+
+namespace {
+
+// Word that make_corrupted_document() searches for in the serialized document
+// and damages by overwriting one of its bytes with NUL.
+vespalib::string corrupt_word = "corruptWord";
+
+// Name of the string field added to the test document type and used by all tests.
+vespalib::string field_name = "stringfield";
+
+/*
+ * Build a document whose string field contains "before ", corrupt_word and
+ * " after", serialize it, overwrite the byte at wordOffset within the first
+ * occurrence of corrupt_word in the serialized bytes with a NUL byte, and
+ * return a document deserialized from the modified bytes.
+ *
+ * wordOffset must be less than corrupt_word.size() (checked with assert).
+ * If corrupt_word is not found in the serialized bytes, nothing is modified.
+ */
+std::unique_ptr<Document>
+make_corrupted_document(DocBuilder &b, size_t wordOffset)
+{
+    StringFieldBuilder sfb(b);
+    auto doc = b.make_document("id:ns:searchdocument::18");
+    doc->setValue(field_name, sfb.tokenize("before ").word(corrupt_word).tokenize(" after").build());
+    vespalib::nbostream serialized;
+    doc->serialize(serialized);
+    // Pull the serialized form out of the stream so it can be patched in place.
+    std::vector<char> bytes(serialized.size());
+    serialized.read(bytes.data(), bytes.size());
+    assert(wordOffset < corrupt_word.size());
+    const size_t needle_len = corrupt_word.size();
+    size_t pos = 0;
+    while (pos + needle_len <= bytes.size()) {
+        if (memcmp(bytes.data() + pos, corrupt_word.c_str(), needle_len) == 0) {
+            // Only the first occurrence is damaged.
+            bytes[pos + wordOffset] = '\0';
+            break;
+        }
+        ++pos;
+    }
+    vespalib::nbostream corrupted;
+    corrupted.write(bytes.data(), bytes.size());
+    return std::make_unique<Document>(b.get_repo(), corrupted);
+}
+
+}
+
+/*
+ * Test fixture holding a document builder (with a single string field), a
+ * document handed to the extractor, the TokenExtractor under test
+ * and a reusable vector receiving the extracted terms.
+ */
+class TokenExtractorTest : public ::testing::Test {
+protected:
+    using SpanTerm = TokenExtractor::SpanTerm;
+
+    // Max word length passed to the TokenExtractor under test.
+    static constexpr size_t max_word_len = 20;
+
+    // Declaration order matters: _doc is created from _doc_builder in the constructor.
+    DocBuilder                _doc_builder;
+    std::unique_ptr<Document> _doc;
+    TokenExtractor            _token_extractor;
+    std::vector<SpanTerm>     _terms;
+
+    TokenExtractorTest();
+    ~TokenExtractorTest() override;
+
+    // Returns the callback that adds the string field to the document type header.
+    static DocBuilder::AddFieldsType
+    make_add_fields()
+    {
+        auto add_string_field = [](auto& header) { header.addField(field_name, DataType::T_STRING); };
+        return add_string_field;
+    }
+
+    // Extracts terms from value and groups them per span, see definition.
+    Words process(const StringFieldValue& value);
+};
+
+// Sets up the builder with the string field, creates the document passed to
+// the extractor, and creates the extractor for field_name with max_word_len.
+TokenExtractorTest::TokenExtractorTest()
+    : _doc_builder(make_add_fields()),
+      _doc(_doc_builder.make_document("id:ns:searchdocument::0")),
+      _token_extractor(field_name, max_word_len),
+      _terms()
+{
+}
+
+// Defined out of line; nothing beyond member destruction is needed.
+TokenExtractorTest::~TokenExtractorTest() = default;
+
+/*
+ * Run the token extractor on the given string field value and convert the
+ * extracted terms into a Words vector: consecutive terms sharing the same
+ * span become one AlternativeWords entry when there is more than one of
+ * them, otherwise the single term becomes a plain word entry.
+ */
+Words
+TokenExtractorTest::process(const StringFieldValue& value)
+{
+    _terms.clear();
+    auto span_trees = value.getSpanTrees();
+    vespalib::stringref text = value.getValueRef();
+    _token_extractor.extract(_terms, span_trees, text, _doc.get());
+    Words result;
+    const auto last = _terms.end();
+    auto group_begin = _terms.begin();
+    while (group_begin != last) {
+        // Find the end of the run of terms that share group_begin's span.
+        auto group_end = group_begin;
+        while (group_end != last && group_end->span == group_begin->span) {
+            ++group_end;
+        }
+        if ((group_end - group_begin) > 1) {
+            // A default-constructed variant holds the first alternative (AlternativeWords).
+            auto& alternatives = std::get<0>(result.emplace_back());
+            for (auto cur = group_begin; cur != group_end; ++cur) {
+                alternatives.emplace_back(cur->word);
+            }
+        } else {
+            result.emplace_back(vespalib::string(group_begin->word));
+        }
+        group_begin = group_end;
+    }
+
+    return result;
+}
+
+// An empty value without annotations is expected to yield no words.
+TEST_F(TokenExtractorTest, empty_string)
+{
+    StringFieldValue value("");
+    EXPECT_EQ((Words{}), process(value));
+}
+
+// A value without annotations is expected to come back as one single word.
+TEST_F(TokenExtractorTest, plain_string)
+{
+    StringFieldValue value("Plain string");
+    EXPECT_EQ((Words{"Plain string"}), process(value));
+}
+
+// A tokenized value is expected to yield one word per token.
+TEST_F(TokenExtractorTest, normal_string)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    auto value = sfb.tokenize("Hello world").build();
+    EXPECT_EQ((Words{"Hello", "world"}), process(value));
+}
+
+// Only the alternative word "hello" is expected for the "Hello" token here.
+TEST_F(TokenExtractorTest, normalized_tokens)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    auto value = sfb.token("Hello", false).alt_word("hello").tokenize(" world").build();
+    EXPECT_EQ("Hello world", value.getValue());
+    Words expected{"hello", "world"};
+    EXPECT_EQ(expected, process(value));
+}
+
+// Both the original word and its alternative are expected for the same span.
+TEST_F(TokenExtractorTest, alternative_tokens)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    auto value = sfb.word("Hello").alt_word("hello").tokenize(" world").build();
+    EXPECT_EQ("Hello world", value.getValue());
+    Words expected{AlternativeWords{"Hello", "hello"}, "world"};
+    EXPECT_EQ(expected, process(value));
+}
+
+// A NUL byte at offset 7 in "corruptWord" is expected to cut the word down to "corrupt".
+TEST_F(TokenExtractorTest, word_with_nul_byte_is_truncated)
+{
+    auto doc = make_corrupted_document(_doc_builder, 7);
+    auto field_value = doc->getValue(field_name);
+    const auto& string_value = dynamic_cast<const StringFieldValue&>(*field_value);
+    EXPECT_EQ((Words{"before", "corrupt", "after"}), process(string_value));
+}
+
+// A NUL byte at offset 0 leaves nothing of the word, so it is expected to be dropped.
+TEST_F(TokenExtractorTest, word_with_nul_byte_at_start_is_dropped)
+{
+    auto doc = make_corrupted_document(_doc_builder, 0);
+    auto field_value = doc->getValue(field_name);
+    const auto& string_value = dynamic_cast<const StringFieldValue&>(*field_value);
+    EXPECT_EQ((Words{"before", "after"}), process(string_value));
+}
+
+// The middle word is longer than the fixture's max_word_len and is expected to be dropped.
+TEST_F(TokenExtractorTest, too_long_word_is_dropped)
+{
+    StringFieldBuilder sfb(_doc_builder);
+    auto value = sfb.tokenize("before veryverylongwordthatwillbedropped after").build();
+    EXPECT_EQ((Words{"before", "after"}), process(value));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
index 2a54859352d..a69260c6f45 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
@@ -21,9 +21,6 @@
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <stdexcept>
-#include <vespa/log/log.h>
-LOG_SETUP(".searchlib.memoryindex.fieldinverter");
-
namespace search::memoryindex {
using document::Annotation;
@@ -51,45 +48,17 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document&
{
_terms.clear();
auto span_trees = value.getSpanTrees();
- if (!TokenExtractor::extract(false, _terms, span_trees)) {
- /* This is wrong unless field is exact match */
- const vespalib::string &text = value.getValue();
- if (text.empty()) {
- return;
- }
- uint32_t wordRef = saveWord(text, &doc);
- if (wordRef != 0u) {
- add(wordRef);
- stepWordPos();
- }
- return;
- }
- const vespalib::string &text = value.getValue();
+ vespalib::stringref text = value.getValueRef();
+ _token_extractor.extract(_terms, span_trees, text, &doc);
auto it = _terms.begin();
auto ite = _terms.end();
- uint32_t wordRef;
- bool mustStep = false;
for (; it != ite; ) {
auto it_begin = it;
- for (; it != ite && it->first == it_begin->first; ++it) {
- if (it->second) { // it->second is a const FieldValue *.
- wordRef = saveWord(*it->second, doc);
- } else {
- const Span &iSpan = it->first;
- assert(iSpan.from() >= 0);
- assert(iSpan.length() > 0);
- wordRef = saveWord(vespalib::stringref(&text[iSpan.from()],
- iSpan.length()), &doc);
- }
- if (wordRef != 0u) {
- add(wordRef);
- mustStep = true;
- }
- }
- if (mustStep) {
- stepWordPos();
- mustStep = false;
+ for (; it != ite && it->span == it_begin->span; ++it) {
+ uint32_t wordRef = saveWord(it->word);
+ add(wordRef);
}
+ stepWordPos();
}
}
@@ -170,33 +139,19 @@ FieldInverter::endElement()
}
uint32_t
-FieldInverter::saveWord(const vespalib::stringref word, const Document* doc)
+FieldInverter::saveWord(vespalib::stringref word)
{
const size_t wordsSize = _words.size();
// assert((wordsSize & 3) == 0); // Check alignment
- size_t len = strnlen(word.data(), word.size());
- if (len < word.size()) {
- const Schema::IndexField &field = _schema.getIndexField(_fieldId);
- LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, lid is %u, field is %s, truncated word is %s", word.size(), len, _docId, field.getName().c_str(), word.data());
- }
- if (len > max_word_len && doc != nullptr) {
- const Schema::IndexField& field = _schema.getIndexField(_fieldId);
- LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.100s", len, max_word_len, doc->getId().toString().c_str(), field.getName().c_str(), word.data());
- return 0u;
- }
- if (len == 0) {
- return 0u;
- }
-
- const size_t unpadded_size = wordsSize + 4 + len + 1;
+ const size_t unpadded_size = wordsSize + 4 + word.size() + 1;
const size_t fullyPaddedSize = Aligner<4>::align(unpadded_size);
_words.reserve(vespalib::roundUp2inN(fullyPaddedSize));
_words.resize(fullyPaddedSize);
char * buf = &_words[0] + wordsSize;
memset(buf, 0, 4);
- memcpy(buf + 4, word.data(), len);
- memset(buf + 4 + len, 0, fullyPaddedSize - unpadded_size + 1);
+ memcpy(buf + 4, word.data(), word.size());
+ memset(buf + 4 + word.size(), 0, fullyPaddedSize - unpadded_size + 1);
uint32_t wordRef = (wordsSize + 4) >> 2;
// assert(wordRef != 0);
@@ -204,20 +159,10 @@ FieldInverter::saveWord(const vespalib::stringref word, const Document* doc)
return wordRef;
}
-uint32_t
-FieldInverter::saveWord(const document::FieldValue &fv, const Document& doc)
-{
- assert(fv.isA(FieldValue::Type::STRING));
- using RawRef = std::pair<const char*, size_t>;
- RawRef sRef = fv.getAsRaw();
- return saveWord(vespalib::stringref(sRef.first, sRef.second), &doc);
-}
-
void
FieldInverter::remove(const vespalib::stringref word, uint32_t docId)
{
- uint32_t wordRef = saveWord(word, nullptr);
- assert(wordRef != 0);
+ uint32_t wordRef = saveWord(word);
_positions.emplace_back(wordRef, docId);
}
@@ -245,6 +190,17 @@ FieldInverter::endDoc()
}
void
+FieldInverter::addWord(vespalib::stringref word, const document::Document& doc)
+{
+    // Let the token extractor sanitize the word first; an empty result
+    // means there is nothing to add for this word.
+    vespalib::stringref sanitized = _token_extractor.sanitize_word(word, &doc);
+    if (sanitized.empty()) {
+        return;
+    }
+    uint32_t wordRef = saveWord(sanitized);
+    add(wordRef);
+    stepWordPos();
+}
+
+void
FieldInverter::processNormalDocTextField(const StringFieldValue &field, const Document& doc)
{
startElement(1);
@@ -293,6 +249,7 @@ FieldInverter::FieldInverter(const Schema &schema, uint32_t fieldId,
_docId(0),
_oldPosSize(0),
_schema(schema),
+ _token_extractor(_schema.getIndexField(_fieldId).getName(), max_word_len),
_words(),
_elems(),
_positions(),
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
index 23e3f9ddfd8..4e3934ba322 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
@@ -173,6 +173,7 @@ private:
uint32_t _oldPosSize;
const index::Schema &_schema;
+ linguistics::TokenExtractor _token_extractor;
WordBuffer _words;
ElemInfoVec _elems;
@@ -202,12 +203,7 @@ private:
/**
* Save the given word in the word buffer and return the word reference.
*/
- VESPA_DLL_LOCAL uint32_t saveWord(const vespalib::stringref word, const document::Document* doc);
-
- /**
- * Save the field value as a word in the word buffer and return the word reference.
- */
- VESPA_DLL_LOCAL uint32_t saveWord(const document::FieldValue &fv, const document::Document& doc);
+ VESPA_DLL_LOCAL uint32_t saveWord(vespalib::stringref word);
/**
* Get pointer to saved word from a word reference.
@@ -326,13 +322,7 @@ public:
void endDoc();
- void addWord(const vespalib::stringref word, const document::Document& doc) {
- uint32_t wordRef = saveWord(word, &doc);
- if (wordRef != 0u) {
- add(wordRef);
- stepWordPos();
- }
- }
+ void addWord(vespalib::stringref word, const document::Document& doc);
};
}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
index 555ea86d299..a78f30afe21 100644
--- a/searchlib/src/vespa/searchlib/util/token_extractor.cpp
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
@@ -6,16 +6,25 @@
#include <vespa/document/annotation/span.h>
#include <vespa/document/annotation/spanlist.h>
#include <vespa/document/annotation/spantreevisitor.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/exceptions.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".searchlib.util.token_extractor");
using document::AlternateSpanList;
using document::Annotation;
using document::AnnotationType;
+using document::Document;
+using document::FieldValue;
using document::SimpleSpanList;
using document::Span;
using document::SpanList;
using document::SpanNode;
using document::SpanTreeVisitor;
using document::StringFieldValue;
+using vespalib::Utf8Reader;
namespace search::linguistics {
@@ -58,14 +67,85 @@ getSpan(const SpanNode &span_node)
return finder.span();
}
+/*
+ * Return the word for a term: the raw content of fv when an alternative
+ * field value is supplied, otherwise the part of s covered by span.
+ * No bounds checking of span against s is performed here.
+ */
+vespalib::stringref
+get_span_string_or_alternative(vespalib::stringref s, const Span &span, const FieldValue* fv)
+{
+    if (fv == nullptr) {
+        return {s.data() + span.from(), static_cast<size_t>(span.length())};
+    }
+    auto raw = fv->getAsRaw();
+    return {raw.first, raw.second};
+}
+
+/*
+ * Step through word one decoded character at a time and return the last
+ * reader position (as reported by Utf8Reader::getPos()) that does not
+ * exceed max_byte_len. If the whole word is consumed without passing
+ * max_byte_len, the final reader position is returned.
+ */
+size_t
+truncated_word_len(vespalib::stringref word, size_t max_byte_len)
+{
+    Utf8Reader reader(word);
+    for (auto boundary = reader.getPos(); reader.hasMore(); boundary = reader.getPos()) {
+        (void) reader.getChar();
+        if (reader.getPos() > max_byte_len) {
+            // The character just read crossed the limit; stop before it.
+            return boundary;
+        }
+    }
+    return reader.getPos(); // No truncation
+}
+
+constexpr size_t max_fmt_len = 100; // Max length of word in logs
+
+}
+
+// field_name is used in log messages; max_word_len is the limit above which
+// sanitize_word() drops a word.
+TokenExtractor::TokenExtractor(const vespalib::string& field_name, size_t max_word_len)
+    : _field_name(field_name),
+      _max_word_len(max_word_len)
+{
+}
+
+// Defined out of line; nothing beyond member destruction is needed.
+TokenExtractor::~TokenExtractor() = default;
+
+/*
+ * Sanitize a word before it is used as a term.
+ *
+ * - If the word contains a NUL byte, it is cut before the first NUL byte
+ *   (the cut position is computed with truncated_word_len()). An error is
+ *   logged when doc is non-null.
+ * - If the (possibly cut) word is longer than the configured max word
+ *   length, an empty stringref is returned. A warning is logged when doc
+ *   is non-null.
+ *
+ * Otherwise the (possibly cut) word is returned; it may be empty.
+ */
+vespalib::stringref
+TokenExtractor::sanitize_word(vespalib::stringref word, const document::Document* doc) const
+{
+    const size_t original_size = word.size();
+    const size_t nul_pos = strnlen(word.data(), original_size);
+    if (nul_pos < original_size) {
+        word = word.substr(0, truncated_word_len(word, nul_pos));
+        if (doc != nullptr) {
+            LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, document %s field %s, truncated word prefix is %.*s",
+                original_size, word.size(), doc->getId().toString().c_str(), _field_name.c_str(),
+                static_cast<int>(truncated_word_len(word, max_fmt_len)), word.data());
+        }
+    }
+    if (word.size() <= _max_word_len) {
+        return word;
+    }
+    if (doc != nullptr) {
+        LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.*s",
+            word.size(), _max_word_len, doc->getId().toString().c_str(), _field_name.c_str(),
+            static_cast<int>(truncated_word_len(word, max_fmt_len)), word.data());
+    }
+    return {};
+}
+
+/*
+ * Append a term for span to terms if the span is usable and the word
+ * survives sanitizing. A span is usable when it has positive length, a
+ * non-negative start and lies within text; this is checked even when the
+ * word itself is taken from fv. The term is flagged as altered when fv is
+ * non-null.
+ */
+void
+TokenExtractor::consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const Span& span, const FieldValue* fv, const Document* doc) const
+{
+    if (span.length() <= 0 || span.from() < 0) {
+        return;
+    }
+    const size_t span_end = static_cast<size_t>(span.from()) + static_cast<size_t>(span.length());
+    if (span_end > text.size()) {
+        return;
+    }
+    auto word = sanitize_word(get_span_string_or_alternative(text, span, fv), doc);
+    if (word.empty()) {
+        return;
+    }
+    terms.emplace_back(span, word, fv != nullptr);
}
-bool
-TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees)
+void
+TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const Document* doc) const
{
auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME);
if (tree == nullptr) {
- return false;
+ /* field might not be annotated if match type is exact */
+ consider_word(terms, text, Span(0, text.size()), nullptr, doc);
+ return;
}
for (const Annotation & annotation : *tree) {
const SpanNode *span = annotation.getSpanNode();
@@ -73,13 +153,10 @@ TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& te
(annotation.getType() == *AnnotationType::TERM))
{
Span sp = getSpan(*span);
- if (sp.length() != 0 || allow_zero_length_tokens) {
- terms.emplace_back(sp, annotation.getFieldValue());
- }
+ consider_word(terms, text, sp, annotation.getFieldValue(), doc);
}
}
std::sort(terms.begin(), terms.end());
- return true;
}
}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h
index 5796aaa7482..4955448b0c2 100644
--- a/searchlib/src/vespa/searchlib/util/token_extractor.h
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.h
@@ -2,14 +2,16 @@
#pragma once
+#include <vespa/document/annotation/span.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/stllike/string.h>
#include <vector>
namespace document {
-class FieldValue;
-class StringFieldValue;
+class Document;
class Span;
+class StringFieldValue;
}
@@ -19,9 +21,43 @@ namespace search::linguistics {
* Class used to extract tokens from annotated string field value.
*/
class TokenExtractor {
+    // Stored by value: the constructor takes a const reference, which would
+    // also bind to a temporary, so keeping only a reference could dangle.
+    vespalib::string _field_name;
+    size_t           _max_word_len;
+
+public:
+    /*
+     * A term extracted from a string field value.
+     *
+     * word is a non-owning view; it refers either to the text passed to
+     * extract() or to the content of an annotation field value, so it is
+     * only valid as long as those are alive. altered is true when the word
+     * came from an annotation field value instead of the span text.
+     */
+    struct SpanTerm {
+        document::Span      span;
+        vespalib::stringref word;
+        bool                altered;
+
+        SpanTerm(const document::Span& span_, vespalib::stringref word_, bool altered_) noexcept
+            : span(span_),
+              word(word_),
+              altered(altered_)
+        {
+        }
+        SpanTerm() noexcept
+            : span(),
+              word(),
+              altered(false)
+        {
+        }
+        // Orders by span first, then by word; altered is not part of the ordering.
+        bool operator<(const SpanTerm& rhs) const noexcept {
+            if (span != rhs.span) {
+                return span < rhs.span;
+            }
+            return word < rhs.word;
+        }
+    };
+
+private:
+    // Appends a term for span to terms if the span and the sanitized word are acceptable.
+    void consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const document::Span& span, const document::FieldValue* fv, const document::Document* doc) const;
+
public:
- using SpanTerm = std::pair<document::Span, const document::FieldValue*>;
- static bool extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees);
+    // field_name is used in log messages; words longer than max_word_len are dropped.
+    TokenExtractor(const vespalib::string& field_name, size_t max_word_len);
+    ~TokenExtractor();
+    // Appends the terms found in trees / text to terms. doc may be nullptr and is
+    // forwarded to sanitize_word().
+    void extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const document::Document* doc) const;
+    // Returns the sanitized word, or an empty stringref if the word is dropped.
+    // doc may be nullptr, in which case nothing is logged.
+    vespalib::stringref sanitize_word(vespalib::stringref word, const document::Document* doc) const;
};
}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
index b4f76d8e39f..bf267ab9e27 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
@@ -6,6 +6,7 @@
#include <vespa/document/annotation/span.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
#include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/memoryindex/field_inverter.h>
#include <vespa/searchlib/util/linguisticsannotation.h>
#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/vespalib/stllike/asciistream.h>
@@ -17,6 +18,7 @@ using document::FieldValue;
using document::Span;
using document::StringFieldValue;
using search::linguistics::TokenExtractor;
+using search::memoryindex::FieldInverter;
namespace search::docsummary {
@@ -28,14 +30,7 @@ getSpanString(vespalib::stringref s, const Span &span)
return {s.data() + span.from(), static_cast<size_t>(span.length())};
}
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline));
-
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) {
- if (!value.isA(FieldValue::Type::STRING)) {
- throw vespalib::IllegalArgumentException("Illegal field type. " + value.toString(), VESPA_STRLOC);
- }
- return static_cast<const StringFieldValue &>(value);
-}
+vespalib::string dummy_field_name;
}
@@ -53,7 +48,7 @@ template <typename ForwardIt>
void
AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) {
int annCnt = (last - it);
- if (annCnt > 1 || (annCnt == 1 && it->second)) {
+ if (annCnt > 1 || (annCnt == 1 && it->altered)) {
annotateSpans(span, it, last);
} else {
_out << getSpanString(_text, span) << juniper::separators::unit_separator_string;
@@ -67,11 +62,7 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For
<< (getSpanString(_text, span))
<< juniper::separators::interlinear_annotation_separator_string; // SEPARATOR
while (it != last) {
- if (it->second) {
- _out << ensureStringFieldValue(*it->second).getValue();
- } else {
- _out << getSpanString(_text, span);
- }
+ _out << it->word;
if (++it != last) {
_out << " ";
}
@@ -86,26 +77,21 @@ AnnotationConverter::handleIndexingTerms(const StringFieldValue& value)
using SpanTerm = TokenExtractor::SpanTerm;
std::vector<SpanTerm> terms;
auto span_trees = value.getSpanTrees();
- if (!TokenExtractor::extract(true, terms, span_trees)) {
- // Treat a string without annotations as a single span.
- SpanTerm str(Span(0, _text.size()),
- static_cast<const FieldValue*>(nullptr));
- handleAnnotations(str.first, &str, &str + 1);
- return;
- }
+ TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len);
+ token_extractor.extract(terms, span_trees, _text, nullptr);
auto it = terms.begin();
auto ite = terms.end();
int32_t endPos = 0;
for (; it != ite; ) {
auto it_begin = it;
- if (it_begin->first.from() > endPos) {
- Span tmpSpan(endPos, it_begin->first.from() - endPos);
+ if (it_begin->span.from() > endPos) {
+ Span tmpSpan(endPos, it_begin->span.from() - endPos);
handleAnnotations(tmpSpan, it, it);
- endPos = it_begin->first.from();
+ endPos = it_begin->span.from();
}
- for (; it != ite && it->first == it_begin->first; ++it);
- handleAnnotations(it_begin->first, it_begin, it);
- endPos = it_begin->first.from() + it_begin->first.length();
+ for (; it != ite && it->span == it_begin->span; ++it);
+ handleAnnotations(it_begin->span, it_begin, it);
+ endPos = it_begin->span.from() + it_begin->span.length();
}
int32_t wantEndPos = _text.size();
if (endPos < wantEndPos) {