about summary refs log tree commit diff stats
path: root/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp')
-rw-r--r-- searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp 169
1 files changed, 26 insertions, 143 deletions
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
index 8d23b235b07..a69260c6f45 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
@@ -1,13 +1,9 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "field_inverter.h"
#include "ordered_field_index_inserter.h"
-#include <vespa/document/annotation/alternatespanlist.h>
#include <vespa/document/annotation/annotation.h>
#include <vespa/document/annotation/span.h>
-#include <vespa/document/annotation/spanlist.h>
-#include <vespa/document/annotation/spantree.h>
-#include <vespa/document/annotation/spantreevisitor.h>
#include <vespa/document/fieldvalue/arrayfieldvalue.h>
#include <vespa/document/fieldvalue/document.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
@@ -25,14 +21,9 @@
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <stdexcept>
-#include <vespa/log/log.h>
-LOG_SETUP(".searchlib.memoryindex.fieldinverter");
-
namespace search::memoryindex {
-using document::AlternateSpanList;
using document::Annotation;
-using document::AnnotationType;
using document::ArrayFieldValue;
using document::DataType;
using document::Document;
@@ -40,130 +31,34 @@ using document::DocumentType;
using document::Field;
using document::FieldValue;
using document::IntFieldValue;
-using document::SimpleSpanList;
using document::Span;
-using document::SpanList;
-using document::SpanNode;
-using document::SpanTree;
-using document::SpanTreeVisitor;
using document::StringFieldValue;
using document::StructFieldValue;
using document::WeightedSetFieldValue;
using index::DocIdAndPosOccFeatures;
using index::Schema;
using search::index::schema::CollectionType;
+using search::linguistics::TokenExtractor;
using search::util::URL;
using vespalib::make_string;
using vespalib::datastore::Aligner;
-namespace documentinverterkludge::linguistics {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
-
-using namespace documentinverterkludge;
-
-namespace {
-
-class SpanFinder : public SpanTreeVisitor {
-public:
- int32_t begin_pos;
- int32_t end_pos;
-
- SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
- Span span() { return Span(begin_pos, end_pos - begin_pos); }
-
- void visit(const Span &node) override {
- begin_pos = std::min(begin_pos, node.from());
- end_pos = std::max(end_pos, node.from() + node.length());
- }
- void visit(const SpanList &node) override {
- for (const auto & span_ : node) {
- const_cast<SpanNode *>(span_)->accept(*this);
- }
- }
- void visit(const SimpleSpanList &node) override {
- for (const auto & span_ : node) {
- const_cast<Span &>(span_).accept(*this);
- }
- }
- void visit(const AlternateSpanList &node) override {
- for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
- visit(node.getSubtree(i));
- }
- }
-};
-
-Span
-getSpan(const SpanNode &span_node)
-{
- SpanFinder finder;
- // The SpanNode will not be changed.
- const_cast<SpanNode &>(span_node).accept(finder);
- return finder.span();
-}
-
-}
-
void
FieldInverter::processAnnotations(const StringFieldValue &value, const Document& doc)
{
_terms.clear();
- StringFieldValue::SpanTrees spanTrees = value.getSpanTrees();
- const SpanTree *tree = StringFieldValue::findTree(spanTrees, linguistics::SPANTREE_NAME);
- if (tree == nullptr) {
- /* This is wrong unless field is exact match */
- const vespalib::string &text = value.getValue();
- if (text.empty()) {
- return;
- }
- uint32_t wordRef = saveWord(text, &doc);
- if (wordRef != 0u) {
- add(wordRef);
- stepWordPos();
- }
- return;
- }
- const vespalib::string &text = value.getValue();
- for (const Annotation & annotation : *tree) {
- const SpanNode *span = annotation.getSpanNode();
- if ((span != nullptr) && annotation.valid() &&
- (annotation.getType() == *AnnotationType::TERM))
- {
- Span sp = getSpan(*span);
- if (sp.length() != 0) {
- _terms.push_back(std::make_pair(sp,
- annotation.getFieldValue()));
- }
- }
- }
- std::sort(_terms.begin(), _terms.end());
+ auto span_trees = value.getSpanTrees();
+ vespalib::stringref text = value.getValueRef();
+ _token_extractor.extract(_terms, span_trees, text, &doc);
auto it = _terms.begin();
auto ite = _terms.end();
- uint32_t wordRef;
- bool mustStep = false;
for (; it != ite; ) {
auto it_begin = it;
- for (; it != ite && it->first == it_begin->first; ++it) {
- if (it->second) { // it->second is a const FieldValue *.
- wordRef = saveWord(*it->second, doc);
- } else {
- const Span &iSpan = it->first;
- assert(iSpan.from() >= 0);
- assert(iSpan.length() > 0);
- wordRef = saveWord(vespalib::stringref(&text[iSpan.from()],
- iSpan.length()), &doc);
- }
- if (wordRef != 0u) {
- add(wordRef);
- mustStep = true;
- }
- }
- if (mustStep) {
- stepWordPos();
- mustStep = false;
+ for (; it != ite && it->span == it_begin->span; ++it) {
+ uint32_t wordRef = saveWord(it->word);
+ add(wordRef);
}
+ stepWordPos();
}
}
@@ -244,33 +139,19 @@ FieldInverter::endElement()
}
uint32_t
-FieldInverter::saveWord(const vespalib::stringref word, const Document* doc)
+FieldInverter::saveWord(vespalib::stringref word)
{
const size_t wordsSize = _words.size();
// assert((wordsSize & 3) == 0); // Check alignment
- size_t len = strnlen(word.data(), word.size());
- if (len < word.size()) {
- const Schema::IndexField &field = _schema.getIndexField(_fieldId);
- LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, lid is %u, field is %s, truncated word is %s", word.size(), len, _docId, field.getName().c_str(), word.data());
- }
- if (len > max_word_len && doc != nullptr) {
- const Schema::IndexField& field = _schema.getIndexField(_fieldId);
- LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.100s", len, max_word_len, doc->getId().toString().c_str(), field.getName().c_str(), word.data());
- return 0u;
- }
- if (len == 0) {
- return 0u;
- }
-
- const size_t unpadded_size = wordsSize + 4 + len + 1;
+ const size_t unpadded_size = wordsSize + 4 + word.size() + 1;
const size_t fullyPaddedSize = Aligner<4>::align(unpadded_size);
_words.reserve(vespalib::roundUp2inN(fullyPaddedSize));
_words.resize(fullyPaddedSize);
char * buf = &_words[0] + wordsSize;
memset(buf, 0, 4);
- memcpy(buf + 4, word.data(), len);
- memset(buf + 4 + len, 0, fullyPaddedSize - unpadded_size + 1);
+ memcpy(buf + 4, word.data(), word.size());
+ memset(buf + 4 + word.size(), 0, fullyPaddedSize - unpadded_size + 1);
uint32_t wordRef = (wordsSize + 4) >> 2;
// assert(wordRef != 0);
@@ -278,20 +159,10 @@ FieldInverter::saveWord(const vespalib::stringref word, const Document* doc)
return wordRef;
}
-uint32_t
-FieldInverter::saveWord(const document::FieldValue &fv, const Document& doc)
-{
- assert(fv.isA(FieldValue::Type::STRING));
- using RawRef = std::pair<const char*, size_t>;
- RawRef sRef = fv.getAsRaw();
- return saveWord(vespalib::stringref(sRef.first, sRef.second), &doc);
-}
-
void
FieldInverter::remove(const vespalib::stringref word, uint32_t docId)
{
- uint32_t wordRef = saveWord(word, nullptr);
- assert(wordRef != 0);
+ uint32_t wordRef = saveWord(word);
_positions.emplace_back(wordRef, docId);
}
@@ -319,6 +190,17 @@ FieldInverter::endDoc()
}
void
+FieldInverter::addWord(vespalib::stringref word, const document::Document& doc)
+{
+ word = _token_extractor.sanitize_word(word, &doc);
+ if (!word.empty()) {
+ uint32_t wordRef = saveWord(word);
+ add(wordRef);
+ stepWordPos();
+ }
+}
+
+void
FieldInverter::processNormalDocTextField(const StringFieldValue &field, const Document& doc)
{
startElement(1);
@@ -367,6 +249,7 @@ FieldInverter::FieldInverter(const Schema &schema, uint32_t fieldId,
_docId(0),
_oldPosSize(0),
_schema(schema),
+ _token_extractor(_schema.getIndexField(_fieldId).getName(), max_word_len),
_words(),
_elems(),
_positions(),