diff options
author | Tor Egge <Tor.Egge@online.no> | 2024-04-18 14:27:10 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-18 14:27:10 +0200 |
commit | 5d0763e481e1d8908ba848e80f1bfa0d0fa0069a (patch) | |
tree | b3845bb31622186ea84437994246e4477a9d1bfe /searchlib | |
parent | 4ef45a0ea202f0fc6654c7cf32ba8be2ca82efd6 (diff) |
Revert "Revert "Use memory mapped disk index dictionary .ssdat file when large""
Diffstat (limited to 'searchlib')
7 files changed, 127 insertions, 32 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp index 951d6f61980..3b7ec00211d 100644 --- a/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp +++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp @@ -15,8 +15,9 @@ #include <vespa/searchlib/diskindex/pagedict4randread.h> #include <vespa/searchlib/common/tunefileinfo.h> #include <vespa/vespalib/util/signalhandler.h> -#include <sstream> #include <cinttypes> +#include <optional> +#include <sstream> #include <vespa/log/log.h> LOG_SETUP("pagedict4test"); @@ -357,6 +358,7 @@ checkCounts(const std::string &word, void testWords(const std::string &logname, vespalib::Rand48 &rnd, + std::optional<uint32_t> mmap_file_size_threshold, uint64_t numWordIds, uint32_t tupleCount, uint32_t chunkSize, @@ -495,7 +497,14 @@ testWords(const std::string &logname, LOG(info, "%s: pagedict4 written", logname.c_str()); } { - std::unique_ptr<DictionaryFileSeqRead> dr(new PageDict4FileSeqRead); + std::unique_ptr<DictionaryFileSeqRead> dr; + { + auto my_dr = std::make_unique<PageDict4FileSeqRead>(); + if (mmap_file_size_threshold.has_value()) { + my_dr->set_mmap_file_size_threshold(mmap_file_size_threshold.value()); + } + dr = std::move(my_dr); + } search::TuneFileSeqRead tuneFileRead; bool openres = dr->open("fakedict", @@ -535,7 +544,14 @@ testWords(const std::string &logname, LOG(info, "%s: pagedict4 seqverify OK", logname.c_str()); } { - std::unique_ptr<DictionaryFileRandRead> drr(new PageDict4RandRead); + std::unique_ptr<DictionaryFileRandRead> drr; + { + auto my_drr = std::make_unique<PageDict4RandRead>(); + if (mmap_file_size_threshold.has_value()) { + my_drr->set_mmap_file_size_threshold(mmap_file_size_threshold.value()); + } + drr = std::move(my_drr); + } search::TuneFileRandRead tuneFileRead; bool openres = drr->open("fakedict", tuneFileRead); @@ -649,46 +665,50 @@ testWords(const std::string &logname, void 
PageDict4TestApp::testWords() { - ::testWords("smallchunkwordsempty", _rnd, + ::testWords("smallchunkwordsempty", _rnd, std::nullopt, 1000000, 0, 64, 80, 72, 64, false, false, false); - ::testWords("smallchunkwordsempty2", _rnd, + ::testWords("smallchunkwordsempty2", _rnd, std::nullopt, 0, 0, 64, 80, 72, 64, false, false, false); - ::testWords("smallchunkwords", _rnd, + ::testWords("smallchunkwords", _rnd, std::nullopt, 1000000, 100, 64, 80, 72, 64, false, false, false); - ::testWords("smallchunkwordswithemptyword", _rnd, + ::testWords("smallchunkwordswithemptyword", _rnd, std::nullopt, 1000000, 100, 64, 80, 72, 64, true, false, false); - ::testWords("smallchunkwordswithcommonfirstword", _rnd, + ::testWords("smallchunkwordswithcommonfirstword", _rnd, std::nullopt, 1000000, 100, 64, 80, 72, 64, false, true, false); - ::testWords("smallchunkwordswithcommonemptyfirstword", _rnd, + ::testWords("smallchunkwordswithcommonemptyfirstword", _rnd, std::nullopt, 1000000, 100, 64, 80, 72, 64, true, true, false); - ::testWords("smallchunkwordswithcommonlastword", _rnd, + ::testWords("smallchunkwordswithcommonlastword", _rnd, std::nullopt, 1000000, 100, 64, 80, 72, 64, false, false, true); -#if 1 - ::testWords("smallchunkwords2", _rnd, + ::testWords("smallchunkwords2", _rnd, std::nullopt, 1000000, _stress ? 10000 : 100, 64, 80, 72, 64, _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon); -#endif -#if 1 - ::testWords("stdwords", _rnd, + ::testWords("stdwords", _rnd, std::nullopt, 1000000, _stress ? 
10000 : 100, 262144, 80, 72, 64, _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon); -#endif + ::testWords("stdwordsnommapssdat", _rnd, 500_Mi, + 1000000, 100, + 262144, 80, 72, 64, + _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon); + ::testWords("stdwordsmmapssdat", _rnd, 1, + 1000000, 100, + 262144, 80, 72, 64, + _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon); } int main(int argc, char **argv) { diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp index f3fc31ac8b1..e5ce886f499 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp @@ -6,7 +6,9 @@ #include <vespa/searchlib/index/postinglistparams.h> #include <vespa/vespalib/data/fileheader.h> #include <vespa/vespalib/data/databuffer.h> +#include <vespa/vespalib/datastore/aligner.h> #include <vespa/vespalib/util/arrayref.h> +#include <vespa/vespalib/util/round_up_to_page_size.h> #include <vespa/vespalib/util/size_literals.h> namespace search::bitcompression { @@ -181,6 +183,12 @@ readHeader(vespalib::GenericHeader &header, int64_t fileSize) return headerLen; } +bool +DecodeContext64Base::is_padded_for_memory_map(uint64_t file_bit_size, uint64_t file_size) noexcept +{ + using Aligner = vespalib::datastore::Aligner<64>; + return (Aligner::align(file_bit_size) + 128 <= (vespalib::round_up_to_page_size(file_size) * 8)); +} template <bool bigEndian> void diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h index 4124f1f659f..b1e13a9d96b 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h @@ -1261,6 +1261,13 @@ public: virtual uint64_t decode_exp_golomb(int k) = 0; void readBytes(uint8_t *buf, size_t len); uint32_t readHeader(vespalib::GenericHeader &header, 
int64_t fileSize); + + /* + * Check if file is padding at end for decompression readahead. + */ + static bool is_padded_for_memory_map(uint64_t file_bit_size, uint64_t file_size) noexcept; + + static uint64_t file_units(uint64_t file_size) noexcept { return (file_size + sizeof(uint64_t) - 1) / sizeof(uint64_t); } }; diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp index bceeb1e7bc1..89b5ffb84f8 100644 --- a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp @@ -51,7 +51,7 @@ using vespalib::getLastErrorString; namespace search::diskindex { struct PageDict4FileSeqRead::DictFileReadContext { - DictFileReadContext(vespalib::stringref id, const vespalib::string & name, const TuneFileSeqRead &tune, bool read_all_upfront); + DictFileReadContext(vespalib::stringref id, const vespalib::string & name, const TuneFileSeqRead &tune, uint32_t mmap_file_size_threshold, bool read_all_upfront); ~DictFileReadContext(); vespalib::FileHeader readHeader(); void readExtendedHeader(); @@ -66,7 +66,7 @@ struct PageDict4FileSeqRead::DictFileReadContext { }; PageDict4FileSeqRead::DictFileReadContext::DictFileReadContext(vespalib::stringref id, const vespalib::string & name, - const TuneFileSeqRead &tune, bool read_all_upfront) + const TuneFileSeqRead &tune, uint32_t mmap_file_size_threshold, bool read_all_upfront) : _id(id), _fileBitSize(0u), _headerLen(0u), @@ -79,23 +79,49 @@ PageDict4FileSeqRead::DictFileReadContext::DictFileReadContext(vespalib::stringr if (tune.getWantDirectIO()) { _file.EnableDirectIO(); } + if (read_all_upfront) { + _file.enableMemoryMap(0); + } if (!_file.OpenReadOnly(name.c_str())) { LOG(error, "could not open %s: %s", _file.GetFileName(), getLastErrorString().c_str()); return; } uint64_t fileSize = _file.getSize(); + uint64_t file_units = DC::file_units(fileSize); _readContext.setFile(&_file); 
_readContext.setFileSize(fileSize); + bool use_mmap = false; + /* + * Limit memory usage spike by using memory mapped .ssdat file if + * file size is greater than 32 MiB with padding at end of file. + */ + if (read_all_upfront && _file.MemoryMapPtr(0) != nullptr && fileSize >= mmap_file_size_threshold) { + _readContext.reference_compressed_buffer(_file.MemoryMapPtr(0), file_units); + vespalib::FileHeader header; + _dc.readHeader(header, _file.getSize()); + assert(header.hasTag("fileBitSize")); + int64_t file_bit_size = header.getTag("fileBitSize").asInteger(); + use_mmap = DC::is_padded_for_memory_map(file_bit_size, fileSize); + _readContext.setBitOffset(0); + _readContext.setBufferEndFilePos(0); + } if (read_all_upfront) { - _readContext.allocComprBuf((fileSize + sizeof(uint64_t) - 1) / sizeof(uint64_t), 32_Ki); + if (use_mmap) { + _readContext.reference_compressed_buffer(_file.MemoryMapPtr(0), file_units); + } else { + _readContext.allocComprBuf(file_units, 32_Ki); + } } else { _readContext.allocComprBuf(64_Ki, 32_Ki); } - _dc.emptyBuffer(0); - _readContext.readComprBuffer(); + if (!use_mmap) { + _dc.emptyBuffer(0); + _readContext.readComprBuffer(); + } if (read_all_upfront) { assert(_readContext.getBufferEndFilePos() >= fileSize); } + assert(_dc.getBitPosV() == 0); _valid = true; } @@ -121,7 +147,8 @@ PageDict4FileSeqRead::PageDict4FileSeqRead() _ss(), _sp(), _p(), - _wordNum(0u) + _wordNum(0u), + _mmap_file_size_threshold(32_Mi) { } PageDict4FileSeqRead::~PageDict4FileSeqRead() = default; @@ -166,9 +193,9 @@ bool PageDict4FileSeqRead::open(const vespalib::string &name, const TuneFileSeqRead &tuneFileRead) { - _ss = std::make_unique<DictFileReadContext>(mySSId, name + ".ssdat", tuneFileRead, true); - _sp = std::make_unique<DictFileReadContext>(mySPId, name + ".spdat", tuneFileRead, false); - _p = std::make_unique<DictFileReadContext>(myPId, name + ".pdat", tuneFileRead, false); + _ss = std::make_unique<DictFileReadContext>(mySSId, name + ".ssdat", tuneFileRead, 
_mmap_file_size_threshold, true); + _sp = std::make_unique<DictFileReadContext>(mySPId, name + ".spdat", tuneFileRead, _mmap_file_size_threshold, false); + _p = std::make_unique<DictFileReadContext>(myPId, name + ".pdat", tuneFileRead, _mmap_file_size_threshold, false); if ( !_ss->_valid || !_sp->_valid || !_p->_valid ) { return false; } diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h index 404f85e9088..40540cd458e 100644 --- a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h +++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h @@ -26,6 +26,7 @@ class PageDict4FileSeqRead : public index::DictionaryFileSeqRead std::unique_ptr<DictFileReadContext> _sp; std::unique_ptr<DictFileReadContext> _p; uint64_t _wordNum; + uint32_t _mmap_file_size_threshold; public: PageDict4FileSeqRead(); ~PageDict4FileSeqRead() override; @@ -38,6 +39,7 @@ public: bool open(const vespalib::string &name, const TuneFileSeqRead &tuneFileRead) override; bool close() override; void getParams(index::PostingListParams &params) override; + void set_mmap_file_size_threshold(uint32_t v) { _mmap_file_size_threshold = v; } }; /** diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp index 3654b703648..a513a18ae5d 100644 --- a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp @@ -1,8 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "pagedict4randread.h" -#include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vespalib/data/fileheader.h> +#include <vespa/vespalib/stllike/asciistream.h> #include <vespa/fastos/file.h> #include <vespa/log/log.h> @@ -33,7 +33,8 @@ PageDict4RandRead::PageDict4RandRead() _pFileBitSize(0u), _ssHeaderLen(0u), _spHeaderLen(0u), - _pHeaderLen(0u) + _pHeaderLen(0u), + _mmap_file_size_threshold(32_Mi) { _ssd.setReadContext(&_ssReadContext); } @@ -229,14 +230,42 @@ PageDict4RandRead::open(const vespalib::string &name, } uint64_t fileSize = _ssfile->getSize(); + uint64_t file_units = DC::file_units(fileSize); _ssReadContext.setFile(_ssfile.get()); _ssReadContext.setFileSize(fileSize); - _ssReadContext.allocComprBuf((fileSize + sizeof(uint64_t) - 1) / sizeof(uint64_t), 32768u); - _ssd.emptyBuffer(0); - _ssReadContext.readComprBuffer(); - assert(_ssReadContext.getBufferEndFilePos() >= fileSize); + /* + * Limit memory usage spike by using memory mapped .ssdat file if + * file size is greater than 32 MiB with padding at end of file. + * Note: It might cause higher dictionary lookup latencies when + * system is under memory pressure due to pageins. + */ + bool has_read_ss_header = false; + if (_ssfile->MemoryMapPtr(0) != nullptr && fileSize >= _mmap_file_size_threshold) { + _ssReadContext.reference_compressed_buffer(_ssfile->MemoryMapPtr(0), file_units); + assert(_ssd.getReadOffset() == 0u); + readSSHeader(); + has_read_ss_header = true; + } + if (!has_read_ss_header || !DC::is_padded_for_memory_map(_ssFileBitSize, fileSize)) { + /* + * Insufficient padding or small .sdat file. Read whole file into + * memory. 
+ */ + _ssReadContext.allocComprBuf(file_units, 32768u); + _ssd.emptyBuffer(0); + _ssReadContext.setBitOffset(0); + _ssReadContext.setBufferEndFilePos(0); + _ssfile->SetPosition(0); + _ssReadContext.readComprBuffer(); + assert(_ssReadContext.getBufferEndFilePos() >= fileSize); + assert(_ssd.getReadOffset() == 0u); + if (has_read_ss_header) { + _ssReadContext.setPosition(_ssHeaderLen * 8); + } else { + readSSHeader(); + } + } - readSSHeader(); readSPHeader(); readPHeader(); diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h index 051efa486dd..1c2e538cc48 100644 --- a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h +++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h @@ -36,6 +36,7 @@ class PageDict4RandRead : public index::DictionaryFileRandRead uint32_t _ssHeaderLen; uint32_t _spHeaderLen; uint32_t _pHeaderLen; + uint32_t _mmap_file_size_threshold; void readSSHeader(); void readSPHeader(); @@ -51,6 +52,7 @@ public: bool close() override; uint64_t getNumWordIds() const override; + void set_mmap_file_size_threshold(uint32_t v) { _mmap_file_size_threshold = v; } }; } |