diff options
Diffstat (limited to 'searchlib')
7 files changed, 101 insertions, 72 deletions
diff --git a/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp b/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp index 2e3e56a123b..af1824475a8 100644 --- a/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp +++ b/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp @@ -18,7 +18,7 @@ #include <vespa/vespalib/objects/nbostream.h> #include <vespa/vespalib/gtest/gtest.h> -namespace search { +namespace search::memoryindex { using document::ArrayFieldValue; using document::DataType; @@ -31,8 +31,6 @@ using search::test::StringFieldBuilder; using namespace index; -namespace memoryindex { - namespace { Document::UP @@ -138,6 +136,16 @@ makeCorruptDocument(DocBuilder &b, size_t wordOffset) return std::make_unique<Document>(b.get_repo(), badstream); } +std::unique_ptr<Document> +make_very_long_word_document(DocBuilder& b) +{ + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::19"); + vespalib::string long_word(FieldInverter::max_word_len + 1, 'z'); + doc->setValue("f0", sfb.tokenize("before ").word(long_word).tokenize(" after").build()); + return doc; +} + } struct FieldInverterTest : public ::testing::Test { @@ -169,7 +177,7 @@ struct FieldInverterTest : public ::testing::Test { for (auto &inverter : _inverters) { vespalib::stringref fieldName = _schema.getIndexField(fieldId).getName(); - inverter->invertField(docId, doc.getValue(fieldName)); + inverter->invertField(docId, doc.getValue(fieldName), doc); ++fieldId; } } @@ -421,7 +429,16 @@ TEST_F(FieldInverterTest, require_that_word_with_NUL_byte_is_dropped_when_trunca _inserter_backend.toStr()); } +TEST_F(FieldInverterTest, very_long_words_are_dropped) +{ + invertDocument(1, *make_very_long_word_document(_b)); + pushDocuments(); + EXPECT_EQ("f=0," + "w=after,a=1," + "w=before,a=1", + _inserter_backend.toStr()); } + } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp b/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp index 9543b109dc4..e94bf8c9850 100644 --- a/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp +++ b/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp @@ -132,7 +132,7 @@ struct UrlFieldInverterTest : public ::testing::Test { ~UrlFieldInverterTest() override; void invertDocument(uint32_t docId, const Document &doc) { - _urlInverter->invertField(docId, doc.getValue(url)); + _urlInverter->invertField(docId, doc.getValue(url), doc); } void pushDocuments() { diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp index 206b92c85d0..b95a89957a2 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp @@ -9,6 +9,7 @@ #include <vespa/document/annotation/spantree.h> #include <vespa/document/annotation/spantreevisitor.h> #include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/document.h> #include <vespa/document/fieldvalue/stringfieldvalue.h> #include <vespa/document/fieldvalue/weightedsetfieldvalue.h> #include <vespa/searchlib/bitcompression/compression.h> @@ -106,7 +107,7 @@ getSpan(const SpanNode &span_node) } void -FieldInverter::processAnnotations(const StringFieldValue &value) +FieldInverter::processAnnotations(const StringFieldValue &value, const Document& doc) { _terms.clear(); StringFieldValue::SpanTrees spanTrees = value.getSpanTrees(); @@ -117,7 +118,7 @@ FieldInverter::processAnnotations(const StringFieldValue &value) if (text.empty()) { return; } - uint32_t wordRef = saveWord(text); + uint32_t wordRef = saveWord(text, &doc); if (wordRef != 0u) { add(wordRef); stepWordPos(); @@ -146,13 +147,13 @@ FieldInverter::processAnnotations(const StringFieldValue &value) auto it_begin = it; for (; it != ite && it->first == it_begin->first; ++it) { if (it->second) { // it->second is a const FieldValue *. - wordRef = saveWord(*it->second); + wordRef = saveWord(*it->second, doc); } else { const Span &iSpan = it->first; assert(iSpan.from() >= 0); assert(iSpan.length() > 0); wordRef = saveWord(vespalib::stringref(&text[iSpan.from()], - iSpan.length())); + iSpan.length()), &doc); } if (wordRef != 0u) { add(wordRef); @@ -243,7 +244,7 @@ FieldInverter::endElement() } uint32_t -FieldInverter::saveWord(const vespalib::stringref word) +FieldInverter::saveWord(const vespalib::stringref word, const Document* doc) { const size_t wordsSize = _words.size(); // assert((wordsSize & 3) == 0); // Check alignment @@ -252,6 +253,11 @@ FieldInverter::saveWord(const vespalib::stringref word) const Schema::IndexField &field = _schema.getIndexField(_fieldId); LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, lid is %u, field is %s, truncated word is %s", word.size(), len, _docId, field.getName().c_str(), word.data()); } + if (len > max_word_len && doc != nullptr) { + const Schema::IndexField& field = _schema.getIndexField(_fieldId); + LOG(error, "Dropped too long word (len=%zu) from document %s field %s, word prefix is %.100s", len, doc->getId().toString().c_str(), field.getName().c_str(), word.data()); + return 0u; + } if (len == 0) { return 0u; } @@ -273,18 +279,18 @@ FieldInverter::saveWord(const vespalib::stringref word) } uint32_t -FieldInverter::saveWord(const document::FieldValue &fv) +FieldInverter::saveWord(const document::FieldValue &fv, const Document& doc) { assert(fv.isA(FieldValue::Type::STRING)); using RawRef = std::pair<const char*, size_t>; RawRef sRef = fv.getAsRaw(); - return saveWord(vespalib::stringref(sRef.first, sRef.second)); + return saveWord(vespalib::stringref(sRef.first, sRef.second), &doc); } void FieldInverter::remove(const vespalib::stringref word, uint32_t docId) { - uint32_t wordRef = saveWord(word); + uint32_t wordRef = saveWord(word, nullptr); assert(wordRef != 0); _positions.emplace_back(wordRef, docId); } @@ -313,15 +319,15 @@ FieldInverter::endDoc() } void -FieldInverter::processNormalDocTextField(const StringFieldValue &field) +FieldInverter::processNormalDocTextField(const StringFieldValue &field, const Document& doc) { startElement(1); - processAnnotations(field); + processAnnotations(field, doc); endElement(); } void -FieldInverter::processNormalDocArrayTextField(const ArrayFieldValue &field) +FieldInverter::processNormalDocArrayTextField(const ArrayFieldValue &field, const Document& doc) { uint32_t el = 0; uint32_t ele = field.size(); @@ -330,13 +336,13 @@ FieldInverter::processNormalDocArrayTextField(const ArrayFieldValue &field) assert(elfv.isA(FieldValue::Type::STRING)); const auto &element = static_cast<const StringFieldValue &>(elfv); startElement(1); - processAnnotations(element); + processAnnotations(element, doc); endElement(); } } void -FieldInverter::processNormalDocWeightedSetTextField(const WeightedSetFieldValue &field) +FieldInverter::processNormalDocWeightedSetTextField(const WeightedSetFieldValue &field, const Document& doc) { for (const auto & el : field) { const FieldValue &key = *el.first; @@ -346,7 +352,7 @@ FieldInverter::processNormalDocWeightedSetTextField(const WeightedSetFieldValue const auto &element = static_cast<const StringFieldValue &>(key); int32_t weight = xweight.getAsInt(); startElement(weight); - processAnnotations(element); + processAnnotations(element, doc); endElement(); } } @@ -437,11 +443,12 @@ FieldInverter::trimAbortedDocs() } void -FieldInverter::invertField(uint32_t docId, const FieldValue::UP &val) +FieldInverter::invertField(uint32_t docId, const FieldValue::UP &val, const Document& doc) { + (void) doc; if (val) { startDoc(docId); - invertNormalDocTextField(*val); + invertNormalDocTextField(*val, doc); endDoc(); } else { removeDocument(docId); @@ -460,13 +467,13 @@ FieldInverter::startDoc(uint32_t docId) { } void -FieldInverter::invertNormalDocTextField(const FieldValue &val) +FieldInverter::invertNormalDocTextField(const FieldValue &val, const Document& doc) { const Schema::IndexField &field = _schema.getIndexField(_fieldId); switch (field.getCollectionType()) { case CollectionType::SINGLE: if (val.isA(FieldValue::Type::STRING)) { - processNormalDocTextField(static_cast<const StringFieldValue &>(val)); + processNormalDocTextField(static_cast<const StringFieldValue &>(val), doc); } else { throw std::runtime_error(make_string("Expected DataType::STRING, got '%s'", val.getDataType()->getName().c_str())); } @@ -475,7 +482,7 @@ FieldInverter::invertNormalDocTextField(const FieldValue &val) if (val.isA(FieldValue::Type::WSET)) { const auto &wset = static_cast<const WeightedSetFieldValue &>(val); if (wset.getNestedType() == *DataType::STRING) { - processNormalDocWeightedSetTextField(wset); + processNormalDocWeightedSetTextField(wset, doc); } else { throw std::runtime_error(make_string("Expected DataType::STRING, got '%s'", wset.getNestedType().getName().c_str())); } @@ -487,7 +494,7 @@ FieldInverter::invertNormalDocTextField(const FieldValue &val) if (val.isA(FieldValue::Type::ARRAY)) { const auto &arr = static_cast<const ArrayFieldValue&>(val); if (arr.getNestedType() == *DataType::STRING) { - processNormalDocArrayTextField(arr); + processNormalDocArrayTextField(arr, doc); } else { throw std::runtime_error(make_string("Expected DataType::STRING, got '%s'", arr.getNestedType().getName().c_str())); } diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h index 1a582bf8099..2178efc31bf 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h @@ -15,6 +15,7 @@ namespace search::index { } namespace document { + class Document; class FieldValue; class StringFieldValue; class ArrayFieldValue; @@ -92,6 +93,9 @@ public: } }; + // Max length of an indexed word. Longer words are dropped. + static constexpr size_t max_word_len = 1_Mi; + private: using WordBuffer = std::vector<char, vespalib::allocator_large<char>>; @@ -188,7 +192,7 @@ private: IOrderedFieldIndexInserter &_inserter; index::FieldLengthCalculator &_calculator; - void invertNormalDocTextField(const document::FieldValue &val); + void invertNormalDocTextField(const document::FieldValue &val, const document::Document& doc); public: void startElement(int32_t weight); @@ -198,12 +202,12 @@ private: /** * Save the given word in the word buffer and return the word reference. */ - VESPA_DLL_LOCAL uint32_t saveWord(const vespalib::stringref word); + VESPA_DLL_LOCAL uint32_t saveWord(const vespalib::stringref word, const document::Document* doc); /** * Save the field value as a word in the word buffer and return the word reference. */ - VESPA_DLL_LOCAL uint32_t saveWord(const document::FieldValue &fv); + VESPA_DLL_LOCAL uint32_t saveWord(const document::FieldValue &fv, const document::Document& doc); /** * Get pointer to saved word from a word reference. @@ -246,14 +250,14 @@ private: public: VESPA_DLL_LOCAL void - processAnnotations(const document::StringFieldValue &value); + processAnnotations(const document::StringFieldValue &value, const document::Document& doc); void push_documents_internal(); private: - void processNormalDocTextField(const document::StringFieldValue &field); - void processNormalDocArrayTextField(const document::ArrayFieldValue &field); - void processNormalDocWeightedSetTextField(const document::WeightedSetFieldValue &field); + void processNormalDocTextField(const document::StringFieldValue &field, const document::Document& doc); + void processNormalDocArrayTextField(const document::ArrayFieldValue &field, const document::Document& doc); + void processNormalDocWeightedSetTextField(const document::WeightedSetFieldValue &field, const document::Document& doc); const index::Schema &getSchema() const { return _schema; } @@ -306,7 +310,7 @@ public: /** * Invert a normal text field, based on annotations. */ - void invertField(uint32_t docId, const std::unique_ptr<document::FieldValue> &val); + void invertField(uint32_t docId, const std::unique_ptr<document::FieldValue> &val, const document::Document& doc); /** * Setup remove of word in old version of document. @@ -322,8 +326,8 @@ public: void endDoc(); - void addWord(const vespalib::stringref word) { - uint32_t wordRef = saveWord(word); + void addWord(const vespalib::stringref word, const document::Document& doc) { + uint32_t wordRef = saveWord(word, &doc); if (wordRef != 0u) { add(wordRef); stepWordPos(); diff --git a/searchlib/src/vespa/searchlib/memoryindex/invert_task.cpp b/searchlib/src/vespa/searchlib/memoryindex/invert_task.cpp index 8fa9de7da74..13fb1d726b4 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/invert_task.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/invert_task.cpp @@ -44,12 +44,12 @@ InvertTask::run() _context.set_data_type(_inv_context, _doc); auto document_field_itr = _context.get_document_fields().begin(); for (auto field_id : _context.get_fields()) { - _inverters[field_id]->invertField(_lid, get_field_value(_doc, *document_field_itr)); + _inverters[field_id]->invertField(_lid, get_field_value(_doc, *document_field_itr), _doc); ++document_field_itr; } auto document_uri_field_itr = _context.get_document_uri_fields().begin(); for (auto uri_field_id : _context.get_uri_fields()) { - _uri_inverters[uri_field_id]->invertField(_lid, get_field_value(_doc, *document_uri_field_itr)); + _uri_inverters[uri_field_id]->invertField(_lid, get_field_value(_doc, *document_uri_field_itr), _doc); ++document_uri_field_itr; } } diff --git a/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.cpp index c1537c6b290..32a2ab733fd 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.cpp @@ -46,6 +46,7 @@ lowercaseToken(vespalib::string &dest, const char *src, size_t srcSize) using document::ArrayFieldValue; using document::DataType; +using document::Document; using document::FieldValue; using document::IntFieldValue; using document::SpanTree; @@ -110,21 +111,21 @@ UrlFieldInverter::endElement() } void -UrlFieldInverter::processUrlField(const FieldValue &url_field) +UrlFieldInverter::processUrlField(const FieldValue &url_field, const Document& doc) { assert(url_field.isA(FieldValue::Type::STRING)); const vespalib::string &url_str = static_cast<const StringFieldValue &>(url_field).getValue(); - processUrlOldStyle(url_str); + processUrlOldStyle(url_str, doc); return; } void -UrlFieldInverter::processUrlOldStyle(const vespalib::string &s) +UrlFieldInverter::processUrlOldStyle(const vespalib::string &s, const Document& doc) { URL url(reinterpret_cast<const unsigned char *>(s.data()), s.size()); - _hostname->addWord(HOSTNAME_BEGIN); + _hostname->addWord(HOSTNAME_BEGIN, doc); vespalib::string lowToken; const unsigned char *t; @@ -137,60 +138,60 @@ UrlFieldInverter::processUrlOldStyle(const vespalib::string &s) vespalib::stringref tokenRef(token, tokenLen); switch (url_context) { case URL::URL_SCHEME: - _scheme->addWord(tokenRef); - _all->addWord(tokenRef); + _scheme->addWord(tokenRef, doc); + _all->addWord(tokenRef, doc); break; case URL::URL_HOST: case URL::URL_DOMAIN: case URL::URL_MAINTLD: - _host->addWord(tokenRef); - _hostname->addWord(tokenRef); - _all->addWord(tokenRef); + _host->addWord(tokenRef, doc); + _hostname->addWord(tokenRef, doc); + _all->addWord(tokenRef, doc); break; case URL::URL_PORT: if (strcmp(token, "80") && strcmp(token, "443")) { - _port->addWord(tokenRef); - _all->addWord(tokenRef); + _port->addWord(tokenRef, doc); + _all->addWord(tokenRef, doc); } break; case URL::URL_PATH: case URL::URL_FILENAME: case URL::URL_EXTENSION: case URL::URL_PARAMS: - _path->addWord(tokenRef); - _all->addWord(tokenRef); + _path->addWord(tokenRef, doc); + _all->addWord(tokenRef, doc); break; case URL::URL_QUERY: - _query->addWord(tokenRef); - _all->addWord(tokenRef); + _query->addWord(tokenRef, doc); + _all->addWord(tokenRef, doc); break; case URL::URL_FRAGMENT: - _fragment->addWord(tokenRef); - _all->addWord(tokenRef); + _fragment->addWord(tokenRef, doc); + _all->addWord(tokenRef, doc); break; case URL::URL_ADDRESS: - _all->addWord(tokenRef); + _all->addWord(tokenRef, doc); break; default: LOG(warning, "Ignoring unknown Uri token '%s'.", token); } } - _hostname->addWord(HOSTNAME_END); + _hostname->addWord(HOSTNAME_END, doc); } void -UrlFieldInverter::processArrayUrlField(const ArrayFieldValue &field) +UrlFieldInverter::processArrayUrlField(const ArrayFieldValue &field, const Document& doc) { for (uint32_t el(0), ele(field.size());el < ele; ++el) { const FieldValue &element = field[el]; startElement(1); - processUrlField(element); + processUrlField(element, doc); endElement(); } } void -UrlFieldInverter::processWeightedSetUrlField(const WeightedSetFieldValue &field) +UrlFieldInverter::processWeightedSetUrlField(const WeightedSetFieldValue &field, const Document& doc) { for (const auto & el : field) { const FieldValue &key = *el.first; @@ -198,7 +199,7 @@ UrlFieldInverter::processWeightedSetUrlField(const WeightedSetFieldValue &field) assert(xweight.isA(FieldValue::Type::INT)); int32_t weight = xweight.getAsInt(); startElement(weight); - processUrlField(key); + processUrlField(key, doc); endElement(); } } @@ -214,13 +215,13 @@ isUriType(const DataType &type) } void -UrlFieldInverter::invertUrlField(const FieldValue &val) +UrlFieldInverter::invertUrlField(const FieldValue &val, const Document& doc) { switch (_collectionType) { case CollectionType::SINGLE: if (isUriType(*val.getDataType())) { startElement(1); - processUrlField(val); + processUrlField(val, doc); endElement(); } else { throw std::runtime_error(make_string("Expected URI field, got '%s'", val.getDataType()->getName().c_str())); @@ -230,7 +231,7 @@ UrlFieldInverter::invertUrlField(const FieldValue &val) assert(val.isA(FieldValue::Type::WSET)); const auto &wset = static_cast<const WeightedSetFieldValue &>(val); if (isUriType(wset.getNestedType())) { - processWeightedSetUrlField(wset); + processWeightedSetUrlField(wset, doc); } else { throw std::runtime_error( make_string("Expected wset of URI struct, got '%s'", wset.getNestedType().getName().c_str())); @@ -241,7 +242,7 @@ UrlFieldInverter::invertUrlField(const FieldValue &val) assert(val.isA(FieldValue::Type::ARRAY)); const auto &arr = static_cast<const ArrayFieldValue &>(val); if (isUriType(arr.getNestedType())) { - processArrayUrlField(arr); + processArrayUrlField(arr, doc); } else { throw std::runtime_error( make_string("Expected array of URI struct, got '%s' (%s)", arr.getNestedType().getName().c_str(), @@ -255,11 +256,11 @@ UrlFieldInverter::invertUrlField(const FieldValue &val) } void -UrlFieldInverter::invertField(uint32_t docId, const FieldValue::UP &val) +UrlFieldInverter::invertField(uint32_t docId, const FieldValue::UP &val, const Document& doc) { if (val) { startDoc(docId); - invertUrlField(*val); + invertUrlField(*val, doc); endDoc(); } else { removeDocument(docId); diff --git a/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.h index 0a38985dac4..45247c630e6 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/url_field_inverter.h @@ -29,15 +29,15 @@ class UrlFieldInverter { void endElement(); - void processUrlField(const document::FieldValue &url_field); + void processUrlField(const document::FieldValue &url_field, const document::Document& doc); - void processUrlOldStyle(const vespalib::string &s); + void processUrlOldStyle(const vespalib::string &s, const document::Document& doc); - void processArrayUrlField(const document::ArrayFieldValue &field); + void processArrayUrlField(const document::ArrayFieldValue &field, const document::Document& doc); - void processWeightedSetUrlField(const document::WeightedSetFieldValue &field); + void processWeightedSetUrlField(const document::WeightedSetFieldValue &field, const document::Document& doc); - void invertUrlField(const document::FieldValue &field); + void invertUrlField(const document::FieldValue &field, const document::Document& doc); public: UrlFieldInverter(index::schema::CollectionType collectionType, FieldInverter *all, @@ -49,7 +49,7 @@ public: FieldInverter *fragment, FieldInverter *hostname); - void invertField(uint32_t docId, const document::FieldValue::UP &field); + void invertField(uint32_t docId, const document::FieldValue::UP &field, const document::Document& doc); void removeDocument(uint32_t docId); void applyRemoves(); |