diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2019-06-20 11:03:26 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@broadpark.no> | 2019-06-20 12:03:59 +0200 |
commit | aade9eb66ed483a05405b102a10d52a5a4c58f07 (patch) | |
tree | 383278ee759eeba68a8e9889c303827878e3b4ef /searchlib | |
parent | 38de0304985d85dc9da58e15ad494054bff5d5dc (diff) |
Add field length scanner, to get accurate field lengths for
multivalue fields when regenerating interleaved features.
Diffstat (limited to 'searchlib')
11 files changed, 267 insertions, 9 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index f032bbe9c30..07ac6bb699c 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -106,6 +106,7 @@ vespa_define_module( src/tests/diskindex/bitvector src/tests/diskindex/diskindex src/tests/diskindex/fieldwriter + src/tests/diskindex/field_length_scanner src/tests/diskindex/fusion src/tests/diskindex/pagedict4 src/tests/docstore/chunk diff --git a/searchlib/src/tests/diskindex/field_length_scanner/CMakeLists.txt b/searchlib/src/tests/diskindex/field_length_scanner/CMakeLists.txt new file mode 100644 index 00000000000..985aaa38401 --- /dev/null +++ b/searchlib/src/tests/diskindex/field_length_scanner/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +find_package(GTest REQUIRED) +vespa_add_executable(searchlib_field_length_scanner_test_app TEST + SOURCES + field_length_scanner_test.cpp + DEPENDS + searchlib + searchlib_test + GTest::GTest +) +vespa_add_test(NAME searchlib_field_length_scanner_test_app COMMAND searchlib_field_length_scanner_test_app) diff --git a/searchlib/src/tests/diskindex/field_length_scanner/field_length_scanner_test.cpp b/searchlib/src/tests/diskindex/field_length_scanner/field_length_scanner_test.cpp new file mode 100644 index 00000000000..1b8a4c9655d --- /dev/null +++ b/searchlib/src/tests/diskindex/field_length_scanner/field_length_scanner_test.cpp @@ -0,0 +1,73 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/searchlib/diskindex/field_length_scanner.h> +#include <vespa/searchlib/index/docidandfeatures.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::index::DocIdAndFeatures; + + +namespace search::diskindex { + + +class FieldLengthScannerTest : public ::testing::Test +{ +protected: + FieldLengthScanner _scanner; + FieldLengthScannerTest() + : _scanner(3) + { + } +}; + +TEST_F(FieldLengthScannerTest, require_that_no_scan_gives_empty_length) +{ + EXPECT_EQ(0, _scanner.get_field_length(1)); +} + +TEST_F(FieldLengthScannerTest, require_that_single_length_is_registered) +{ + DocIdAndFeatures features; + features.set_doc_id(1); + features.elements().emplace_back(0, 1, 5); + _scanner.scan_features(features); + EXPECT_EQ(5u, _scanner.get_field_length(1)); +} + +TEST_F(FieldLengthScannerTest, require_that_duplicate_element_is_ignored) +{ + DocIdAndFeatures features; + features.set_doc_id(1); + features.elements().emplace_back(10, 1, 5); + features.elements().emplace_back(100, 1, 23); + _scanner.scan_features(features); + EXPECT_EQ(28u, _scanner.get_field_length(1)); + _scanner.scan_features(features); // elements 10 and 100 already scanned + EXPECT_EQ(28u, _scanner.get_field_length(1)); + features.elements()[0].setElementId(11); + _scanner.scan_features(features); // element 100 already scanned + EXPECT_EQ(33u, _scanner.get_field_length(1)); + features.elements()[1].setElementId(101); + _scanner.scan_features(features); // element 11 already scanned + EXPECT_EQ(56u, _scanner.get_field_length(1)); +} + +TEST_F(FieldLengthScannerTest, require_that_documents_are_not_mixed) +{ + DocIdAndFeatures features1; + DocIdAndFeatures features2; + features1.set_doc_id(1); + features1.elements().emplace_back(10, 1, 5); + features1.elements().emplace_back(100, 1, 23); + features2.set_doc_id(2); + features2.elements().emplace_back(10, 1, 7); + features2.elements().emplace_back(100, 1, 9); + _scanner.scan_features(features1); + _scanner.scan_features(features2); + 
EXPECT_EQ(28u, _scanner.get_field_length(1)); + EXPECT_EQ(16u, _scanner.get_field_length(2)); +} + +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp index da29918a5fb..93a3fde22dd 100644 --- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp +++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp @@ -545,7 +545,7 @@ TEST_F(FusionTest, require_that_interleaved_features_can_be_reconstructed) assert_interleaved_features(disk_index, "f0", "a", 10, 1, 7); assert_interleaved_features(disk_index, "f1", "w", 10, 1, 4); assert_interleaved_features(disk_index, "f2", "ax", 10, 2, 4); - assert_interleaved_features(disk_index, "f2", "ay", 10, 1, 3); + assert_interleaved_features(disk_index, "f2", "ay", 10, 1, 4); assert_interleaved_features(disk_index, "f3", "wx", 10, 1, 2); clean_field_length_testdirs(); } diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt index ba608467c8a..15678382741 100644 --- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt @@ -12,6 +12,7 @@ vespa_add_library(searchlib_diskindex OBJECT extposocc.cpp fieldreader.cpp fieldwriter.cpp + field_length_scanner.cpp fileheader.cpp fusion.cpp indexbuilder.cpp diff --git a/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.cpp b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.cpp new file mode 100644 index 00000000000..b7aad2d1996 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.cpp @@ -0,0 +1,51 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "field_length_scanner.h" +#include <vespa/searchlib/index/docidandfeatures.h> + +namespace search::diskindex { + +FieldLengthScanner::FieldLengthScanner(uint32_t doc_id_limit) + : _field_length_vector(doc_id_limit), + _scanned_elements_map() +{ +} + +FieldLengthScanner::~FieldLengthScanner() = default; + +void +FieldLengthScanner::scan_features(const index::DocIdAndFeatures &features) +{ + if (features.elements().empty()) { + return; + } + auto &entry = _field_length_vector[features.doc_id()]; + if (features.elements().back().getElementId() < element_id_bias) { + for (const auto &element : features.elements()) { + entry.add_element_length(element.getElementLen(), element.getElementId()); + } + } else { + auto element = features.elements().cbegin(); + while (element->getElementId() < element_id_bias) { + entry.add_element_length(element->getElementLen(), element->getElementId()); + ++element; + } + auto &scanned_elements = _scanned_elements_map[features.doc_id()]; + auto size_needed = features.elements().back().getElementId() + 1 - element_id_bias; + if (size_needed > scanned_elements.size()) { + if (size_needed > scanned_elements.capacity()) { + scanned_elements.reserve(std::max(size_needed + (size_needed / 4), 32u)); + } + scanned_elements.resize(size_needed); + } + while (element != features.elements().cend()) { + if (!scanned_elements[element->getElementId() - element_id_bias]) { + scanned_elements[element->getElementId() - element_id_bias] = true; + entry.add_element_length(element->getElementLen()); + } + ++element; + } + } +} + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.h b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.h new file mode 100644 index 00000000000..e282a85b64f --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.h @@ -0,0 +1,63 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vector> +#include <unordered_map> +#include <limits> + +namespace search::index { class DocIdAndFeatures; } + +namespace search::diskindex { + +/* + * Class used to reconstruct field lengths based on element lengths in + * posting list file. + */ +class FieldLengthScanner { + class FieldLengthEntry { + uint16_t _field_length; + uint16_t _elements; // first 16 elements + + static uint16_t make_element_mask(uint32_t element_id) { return (1u << element_id); } + + public: + FieldLengthEntry() + : _field_length(0), + _elements(0) + { + } + + void add_element_length(uint32_t element_length) { + // Cap field length + if (element_length < std::numeric_limits<uint16_t>::max()) { + uint32_t field_length32 = _field_length + element_length; + _field_length = std::min(field_length32, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); + } else { + _field_length = std::numeric_limits<uint16_t>::max(); + } + } + + void add_element_length(uint32_t element_length, uint32_t element_id) { + uint16_t element_mask = make_element_mask(element_id); + if (!(_elements & element_mask)) { + _elements |= element_mask; + add_element_length(element_length); + } + } + + uint16_t get_field_length() const { return _field_length; } + }; + std::vector<FieldLengthEntry> _field_length_vector; + static constexpr uint32_t element_id_bias = 16; + // bit vectors for element >= element_id_bias + std::unordered_map<uint32_t, std::vector<bool>> _scanned_elements_map; + +public: + FieldLengthScanner(uint32_t doc_id_limit); + ~FieldLengthScanner(); + void scan_features(const index::DocIdAndFeatures &features); + uint16_t get_field_length(uint32_t doc_id) const { return _field_length_vector[doc_id].get_field_length(); } +}; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp index 68d37c43cb2..a6208cee970 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp +++ 
b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp @@ -4,6 +4,7 @@ #include "zcposocc.h" #include "extposocc.h" #include "pagedict4file.h" +#include "field_length_scanner.h" #include <vespa/vespalib/util/error.h> #include <vespa/log/log.h> @@ -189,7 +190,8 @@ FieldReader::get_field_length_info() const std::unique_ptr<FieldReader> FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index, - const Schema &oldSchema) + const Schema &oldSchema, + std::shared_ptr<FieldLengthScanner> field_length_scanner) { assert(index.isValid()); if (index.hasMatchingOldFields(oldSchema)) { @@ -203,7 +205,7 @@ FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index, } // field exists in old schema with different collection type setting // or old field is missing wanted interleaved features. - return std::make_unique<FieldReaderStripInfo>(index); // degraded + return std::make_unique<FieldReaderStripInfo>(index, field_length_scanner); // degraded } @@ -233,11 +235,12 @@ FieldReaderEmpty::getFeatureParams(PostingListParams &params) } -FieldReaderStripInfo::FieldReaderStripInfo(const IndexIterator &index) +FieldReaderStripInfo::FieldReaderStripInfo(const IndexIterator &index, std::shared_ptr<FieldLengthScanner> field_length_scanner) : _hasElements(false), _hasElementWeights(false), _want_interleaved_features(index.use_experimental_posting_list_format()), - _regenerate_interleaved_features(false) + _regenerate_interleaved_features(false), + _field_length_scanner(std::move(field_length_scanner)) { PosOccFieldsParams fieldsParams; fieldsParams.setSchemaParams(index.getSchema(), index.getIndex()); @@ -272,10 +275,31 @@ FieldReaderStripInfo::open(const vespalib::string &prefix, const TuneFileSeqRead _regenerate_interleaved_features = true; } } + if (_regenerate_interleaved_features && _hasElements && _field_length_scanner) { + scan_element_lengths(); + close(); + if (!FieldReader::open(prefix, tuneFileRead)) { + return false; + } + } return true; } void 
+FieldReaderStripInfo::scan_element_lengths() +{ + for (;;) { + FieldReader::read(); + if (_wordNum == noWordNumHigh()) { + break; + } + DocIdAndFeatures &features = _docIdAndFeatures; + assert(!features.has_raw_data()); + _field_length_scanner->scan_features(features); + } +} + +void FieldReaderStripInfo::read() { typedef search::index::WordDocElementFeatures Element; @@ -319,7 +343,9 @@ FieldReaderStripInfo::read() field_length += element.getElementLen(); num_occs += element.getNumOccs(); } - // Note: Length of elements without occurrences is not included. + if (_hasElements && _field_length_scanner) { + field_length = _field_length_scanner->get_field_length(features.doc_id()); + } features.set_field_length(field_length); features.set_num_occs(num_occs); } diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h index 106e10d2e80..899b3708bf9 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h @@ -13,6 +13,8 @@ namespace search::diskindex { +class FieldLengthScanner; + /* * FieldReader is used to read a dictionary and posting list file * together, and get a sequential view of the stored data. 
@@ -95,7 +97,7 @@ public: uint32_t getDocIdLimit() const { return _docIdLimit; } const index::FieldLengthInfo &get_field_length_info() const; - static std::unique_ptr<FieldReader> allocFieldReader(const IndexIterator &index, const Schema &oldSchema); + static std::unique_ptr<FieldReader> allocFieldReader(const IndexIterator &index, const Schema &oldSchema, std::shared_ptr<FieldLengthScanner> field_length_scanner); }; @@ -126,11 +128,13 @@ private: bool _hasElementWeights; bool _want_interleaved_features; bool _regenerate_interleaved_features; + std::shared_ptr<FieldLengthScanner> _field_length_scanner; public: - FieldReaderStripInfo(const IndexIterator &index); + FieldReaderStripInfo(const IndexIterator &index, std::shared_ptr<FieldLengthScanner>); bool allowRawFeatures() override; bool open(const vespalib::string &prefix, const TuneFileSeqRead &tuneFileRead) override; void read() override; + void scan_element_lengths(); void getFeatureParams(PostingListParams &params) override; }; diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp index 1ace5969b6b..2f302650d37 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp @@ -3,7 +3,9 @@ #include "fusion.h" #include "fieldreader.h" #include "dictionarywordreader.h" +#include "field_length_scanner.h" #include <vespa/vespalib/util/stringfmt.h> +#include <vespa/searchlib/bitcompression/posocc_fields_params.h> #include <vespa/searchlib/index/field_length_info.h> #include <vespa/searchlib/util/filekit.h> #include <vespa/searchlib/util/dirtraverse.h> @@ -28,6 +30,8 @@ using search::diskindex::DocIdMapping; using search::diskindex::WordNumMapping; using search::docsummary::DocumentSummary; using search::index::FieldLengthInfo; +using search::bitcompression::PosOccFieldParams; +using search::bitcompression::PosOccFieldsParams; using search::index::PostingListParams; using search::index::Schema; using 
search::index::SchemaUtil; @@ -304,17 +308,39 @@ Fusion::selectCookedOrRawFeatures(Reader &reader, Writer &writer) } +std::shared_ptr<FieldLengthScanner> +Fusion::allocate_field_length_scanner(const SchemaUtil::IndexIterator &index) +{ + if (index.use_experimental_posting_list_format()) { + PosOccFieldsParams fieldsParams; + fieldsParams.setSchemaParams(index.getSchema(), index.getIndex()); + assert(fieldsParams.getNumFields() > 0); + const PosOccFieldParams &fieldParams = fieldsParams.getFieldParams()[0]; + if (fieldParams._hasElements) { + for (const auto &old_index : _oldIndexes) { + const Schema &old_schema = old_index.getSchema(); + if (index.hasOldFields(old_schema) && + !index.has_matching_experimental_posting_list_format(old_schema)) { + return std::make_shared<FieldLengthScanner>(_docIdLimit); + } + } + } + } + return std::shared_ptr<FieldLengthScanner>(); +} + bool Fusion::openInputFieldReaders(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list, std::vector<std::unique_ptr<FieldReader> > & readers) { + auto field_length_scanner = allocate_field_length_scanner(index); vespalib::string indexName = index.getName(); for (const auto &oi : _oldIndexes) { const Schema &oldSchema = oi.getSchema(); if (!index.hasOldFields(oldSchema)) { continue; // drop data } - auto reader = FieldReader::allocFieldReader(index, oldSchema); + auto reader = FieldReader::allocFieldReader(index, oldSchema, field_length_scanner); reader->setup(list[oi.getIndex()], oi.getDocIdMapping()); if (!reader->open(oi.getPath() + "/" + indexName + "/", _tuneFileIndexing._read)) { return false; diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.h b/searchlib/src/vespa/searchlib/diskindex/fusion.h index 28060a9c4be..d532384f6e9 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fusion.h +++ b/searchlib/src/vespa/searchlib/diskindex/fusion.h @@ -15,6 +15,7 @@ namespace search::index { class FieldLengthInfo; } namespace search::diskindex { +class FieldLengthScanner; 
class FieldReader; class FieldWriter; class DictionaryWordReader; @@ -49,6 +50,7 @@ private: bool mergeFields(vespalib::ThreadExecutor & executor); bool mergeField(uint32_t id); + std::shared_ptr<FieldLengthScanner> allocate_field_length_scanner(const SchemaUtil::IndexIterator &index); bool openInputFieldReaders(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list, std::vector<std::unique_ptr<FieldReader> > & readers); bool openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter & writer, const index::FieldLengthInfo &field_length_info); |