summary refs log tree commit diff stats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2024-04-18 14:27:10 +0200
committer GitHub <noreply@github.com> 2024-04-18 14:27:10 +0200
commit 5d0763e481e1d8908ba848e80f1bfa0d0fa0069a (patch)
tree b3845bb31622186ea84437994246e4477a9d1bfe /searchlib
parent 4ef45a0ea202f0fc6654c7cf32ba8be2ca82efd6 (diff)
Revert "Revert "Use memory mapped disk index dictionary .ssdat file when large""
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp 52
-rw-r--r-- searchlib/src/vespa/searchlib/bitcompression/compression.cpp 8
-rw-r--r-- searchlib/src/vespa/searchlib/bitcompression/compression.h 7
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp 45
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/pagedict4file.h 2
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp 43
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h 2
7 files changed, 127 insertions, 32 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp
index 951d6f61980..3b7ec00211d 100644
--- a/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp
+++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4_test.cpp
@@ -15,8 +15,9 @@
#include <vespa/searchlib/diskindex/pagedict4randread.h>
#include <vespa/searchlib/common/tunefileinfo.h>
#include <vespa/vespalib/util/signalhandler.h>
-#include <sstream>
#include <cinttypes>
+#include <optional>
+#include <sstream>
#include <vespa/log/log.h>
LOG_SETUP("pagedict4test");
@@ -357,6 +358,7 @@ checkCounts(const std::string &word,
void
testWords(const std::string &logname,
vespalib::Rand48 &rnd,
+ std::optional<uint32_t> mmap_file_size_threshold,
uint64_t numWordIds,
uint32_t tupleCount,
uint32_t chunkSize,
@@ -495,7 +497,14 @@ testWords(const std::string &logname,
LOG(info, "%s: pagedict4 written", logname.c_str());
}
{
- std::unique_ptr<DictionaryFileSeqRead> dr(new PageDict4FileSeqRead);
+ std::unique_ptr<DictionaryFileSeqRead> dr;
+ {
+ auto my_dr = std::make_unique<PageDict4FileSeqRead>();
+ if (mmap_file_size_threshold.has_value()) {
+ my_dr->set_mmap_file_size_threshold(mmap_file_size_threshold.value());
+ }
+ dr = std::move(my_dr);
+ }
search::TuneFileSeqRead tuneFileRead;
bool openres = dr->open("fakedict",
@@ -535,7 +544,14 @@ testWords(const std::string &logname,
LOG(info, "%s: pagedict4 seqverify OK", logname.c_str());
}
{
- std::unique_ptr<DictionaryFileRandRead> drr(new PageDict4RandRead);
+ std::unique_ptr<DictionaryFileRandRead> drr;
+ {
+ auto my_drr = std::make_unique<PageDict4RandRead>();
+ if (mmap_file_size_threshold.has_value()) {
+ my_drr->set_mmap_file_size_threshold(mmap_file_size_threshold.value());
+ }
+ drr = std::move(my_drr);
+ }
search::TuneFileRandRead tuneFileRead;
bool openres = drr->open("fakedict",
tuneFileRead);
@@ -649,46 +665,50 @@ testWords(const std::string &logname,
void
PageDict4TestApp::testWords()
{
- ::testWords("smallchunkwordsempty", _rnd,
+ ::testWords("smallchunkwordsempty", _rnd, std::nullopt,
1000000, 0,
64, 80, 72, 64,
false, false, false);
- ::testWords("smallchunkwordsempty2", _rnd,
+ ::testWords("smallchunkwordsempty2", _rnd, std::nullopt,
0, 0,
64, 80, 72, 64,
false, false, false);
- ::testWords("smallchunkwords", _rnd,
+ ::testWords("smallchunkwords", _rnd, std::nullopt,
1000000, 100,
64, 80, 72, 64,
false, false, false);
- ::testWords("smallchunkwordswithemptyword", _rnd,
+ ::testWords("smallchunkwordswithemptyword", _rnd, std::nullopt,
1000000, 100,
64, 80, 72, 64,
true, false, false);
- ::testWords("smallchunkwordswithcommonfirstword", _rnd,
+ ::testWords("smallchunkwordswithcommonfirstword", _rnd, std::nullopt,
1000000, 100,
64, 80, 72, 64,
false, true, false);
- ::testWords("smallchunkwordswithcommonemptyfirstword", _rnd,
+ ::testWords("smallchunkwordswithcommonemptyfirstword", _rnd, std::nullopt,
1000000, 100,
64, 80, 72, 64,
true, true, false);
- ::testWords("smallchunkwordswithcommonlastword", _rnd,
+ ::testWords("smallchunkwordswithcommonlastword", _rnd, std::nullopt,
1000000, 100,
64, 80, 72, 64,
false, false, true);
-#if 1
- ::testWords("smallchunkwords2", _rnd,
+ ::testWords("smallchunkwords2", _rnd, std::nullopt,
1000000, _stress ? 10000 : 100,
64, 80, 72, 64,
_emptyWord, _firstWordForcedCommon, _lastWordForcedCommon);
-#endif
-#if 1
- ::testWords("stdwords", _rnd,
+ ::testWords("stdwords", _rnd, std::nullopt,
1000000, _stress ? 10000 : 100,
262144, 80, 72, 64,
_emptyWord, _firstWordForcedCommon, _lastWordForcedCommon);
-#endif
+ ::testWords("stdwordsnommapssdat", _rnd, 500_Mi,
+ 1000000, 100,
+ 262144, 80, 72, 64,
+ _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon);
+ ::testWords("stdwordsmmapssdat", _rnd, 1,
+ 1000000, 100,
+ 262144, 80, 72, 64,
+ _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon);
}
int main(int argc, char **argv) {
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp
index f3fc31ac8b1..e5ce886f499 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp
+++ b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp
@@ -6,7 +6,9 @@
#include <vespa/searchlib/index/postinglistparams.h>
#include <vespa/vespalib/data/fileheader.h>
#include <vespa/vespalib/data/databuffer.h>
+#include <vespa/vespalib/datastore/aligner.h>
#include <vespa/vespalib/util/arrayref.h>
+#include <vespa/vespalib/util/round_up_to_page_size.h>
#include <vespa/vespalib/util/size_literals.h>
namespace search::bitcompression {
@@ -181,6 +183,12 @@ readHeader(vespalib::GenericHeader &header, int64_t fileSize)
return headerLen;
}
+bool
+DecodeContext64Base::is_padded_for_memory_map(uint64_t file_bit_size, uint64_t file_size) noexcept
+{
+ using Aligner = vespalib::datastore::Aligner<64>;
+ return (Aligner::align(file_bit_size) + 128 <= (vespalib::round_up_to_page_size(file_size) * 8));
+}
template <bool bigEndian>
void
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h
index 4124f1f659f..b1e13a9d96b 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/compression.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h
@@ -1261,6 +1261,13 @@ public:
virtual uint64_t decode_exp_golomb(int k) = 0;
void readBytes(uint8_t *buf, size_t len);
uint32_t readHeader(vespalib::GenericHeader &header, int64_t fileSize);
+
+ /*
+ * Check if file is padded at end for decompression readahead.
+ */
+ static bool is_padded_for_memory_map(uint64_t file_bit_size, uint64_t file_size) noexcept;
+
+ static uint64_t file_units(uint64_t file_size) noexcept { return (file_size + sizeof(uint64_t) - 1) / sizeof(uint64_t); }
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp
index bceeb1e7bc1..89b5ffb84f8 100644
--- a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.cpp
@@ -51,7 +51,7 @@ using vespalib::getLastErrorString;
namespace search::diskindex {
struct PageDict4FileSeqRead::DictFileReadContext {
- DictFileReadContext(vespalib::stringref id, const vespalib::string & name, const TuneFileSeqRead &tune, bool read_all_upfront);
+ DictFileReadContext(vespalib::stringref id, const vespalib::string & name, const TuneFileSeqRead &tune, uint32_t mmap_file_size_threshold, bool read_all_upfront);
~DictFileReadContext();
vespalib::FileHeader readHeader();
void readExtendedHeader();
@@ -66,7 +66,7 @@ struct PageDict4FileSeqRead::DictFileReadContext {
};
PageDict4FileSeqRead::DictFileReadContext::DictFileReadContext(vespalib::stringref id, const vespalib::string & name,
- const TuneFileSeqRead &tune, bool read_all_upfront)
+ const TuneFileSeqRead &tune, uint32_t mmap_file_size_threshold, bool read_all_upfront)
: _id(id),
_fileBitSize(0u),
_headerLen(0u),
@@ -79,23 +79,49 @@ PageDict4FileSeqRead::DictFileReadContext::DictFileReadContext(vespalib::stringr
if (tune.getWantDirectIO()) {
_file.EnableDirectIO();
}
+ if (read_all_upfront) {
+ _file.enableMemoryMap(0);
+ }
if (!_file.OpenReadOnly(name.c_str())) {
LOG(error, "could not open %s: %s", _file.GetFileName(), getLastErrorString().c_str());
return;
}
uint64_t fileSize = _file.getSize();
+ uint64_t file_units = DC::file_units(fileSize);
_readContext.setFile(&_file);
_readContext.setFileSize(fileSize);
+ bool use_mmap = false;
+ /*
+ * Limit memory usage spike by using memory mapped .ssdat file if file size
+ * is at least the mmap threshold (default 32 MiB) with padding at end of file.
+ */
+ if (read_all_upfront && _file.MemoryMapPtr(0) != nullptr && fileSize >= mmap_file_size_threshold) {
+ _readContext.reference_compressed_buffer(_file.MemoryMapPtr(0), file_units);
+ vespalib::FileHeader header;
+ _dc.readHeader(header, _file.getSize());
+ assert(header.hasTag("fileBitSize"));
+ int64_t file_bit_size = header.getTag("fileBitSize").asInteger();
+ use_mmap = DC::is_padded_for_memory_map(file_bit_size, fileSize);
+ _readContext.setBitOffset(0);
+ _readContext.setBufferEndFilePos(0);
+ }
if (read_all_upfront) {
- _readContext.allocComprBuf((fileSize + sizeof(uint64_t) - 1) / sizeof(uint64_t), 32_Ki);
+ if (use_mmap) {
+ _readContext.reference_compressed_buffer(_file.MemoryMapPtr(0), file_units);
+ } else {
+ _readContext.allocComprBuf(file_units, 32_Ki);
+ }
} else {
_readContext.allocComprBuf(64_Ki, 32_Ki);
}
- _dc.emptyBuffer(0);
- _readContext.readComprBuffer();
+ if (!use_mmap) {
+ _dc.emptyBuffer(0);
+ _readContext.readComprBuffer();
+ }
if (read_all_upfront) {
assert(_readContext.getBufferEndFilePos() >= fileSize);
}
+ assert(_dc.getBitPosV() == 0);
_valid = true;
}
@@ -121,7 +147,8 @@ PageDict4FileSeqRead::PageDict4FileSeqRead()
_ss(),
_sp(),
_p(),
- _wordNum(0u)
+ _wordNum(0u),
+ _mmap_file_size_threshold(32_Mi)
{ }
PageDict4FileSeqRead::~PageDict4FileSeqRead() = default;
@@ -166,9 +193,9 @@ bool
PageDict4FileSeqRead::open(const vespalib::string &name,
const TuneFileSeqRead &tuneFileRead)
{
- _ss = std::make_unique<DictFileReadContext>(mySSId, name + ".ssdat", tuneFileRead, true);
- _sp = std::make_unique<DictFileReadContext>(mySPId, name + ".spdat", tuneFileRead, false);
- _p = std::make_unique<DictFileReadContext>(myPId, name + ".pdat", tuneFileRead, false);
+ _ss = std::make_unique<DictFileReadContext>(mySSId, name + ".ssdat", tuneFileRead, _mmap_file_size_threshold, true);
+ _sp = std::make_unique<DictFileReadContext>(mySPId, name + ".spdat", tuneFileRead, _mmap_file_size_threshold, false);
+ _p = std::make_unique<DictFileReadContext>(myPId, name + ".pdat", tuneFileRead, _mmap_file_size_threshold, false);
if ( !_ss->_valid || !_sp->_valid || !_p->_valid ) {
return false;
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h
index 404f85e9088..40540cd458e 100644
--- a/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h
+++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4file.h
@@ -26,6 +26,7 @@ class PageDict4FileSeqRead : public index::DictionaryFileSeqRead
std::unique_ptr<DictFileReadContext> _sp;
std::unique_ptr<DictFileReadContext> _p;
uint64_t _wordNum;
+ uint32_t _mmap_file_size_threshold;
public:
PageDict4FileSeqRead();
~PageDict4FileSeqRead() override;
@@ -38,6 +39,7 @@ public:
bool open(const vespalib::string &name, const TuneFileSeqRead &tuneFileRead) override;
bool close() override;
void getParams(index::PostingListParams &params) override;
+ void set_mmap_file_size_threshold(uint32_t v) { _mmap_file_size_threshold = v; }
};
/**
diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp
index 3654b703648..a513a18ae5d 100644
--- a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.cpp
@@ -1,8 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "pagedict4randread.h"
-#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/vespalib/data/fileheader.h>
+#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/fastos/file.h>
#include <vespa/log/log.h>
@@ -33,7 +33,8 @@ PageDict4RandRead::PageDict4RandRead()
_pFileBitSize(0u),
_ssHeaderLen(0u),
_spHeaderLen(0u),
- _pHeaderLen(0u)
+ _pHeaderLen(0u),
+ _mmap_file_size_threshold(32_Mi)
{
_ssd.setReadContext(&_ssReadContext);
}
@@ -229,14 +230,42 @@ PageDict4RandRead::open(const vespalib::string &name,
}
uint64_t fileSize = _ssfile->getSize();
+ uint64_t file_units = DC::file_units(fileSize);
_ssReadContext.setFile(_ssfile.get());
_ssReadContext.setFileSize(fileSize);
- _ssReadContext.allocComprBuf((fileSize + sizeof(uint64_t) - 1) / sizeof(uint64_t), 32768u);
- _ssd.emptyBuffer(0);
- _ssReadContext.readComprBuffer();
- assert(_ssReadContext.getBufferEndFilePos() >= fileSize);
+ /*
+ * Limit memory usage spike by using memory mapped .ssdat file if file size
+ * is at least the mmap threshold (default 32 MiB) with padding at end of file.
+ * Note: It might cause higher dictionary lookup latencies when
+ * system is under memory pressure due to pageins.
+ */
+ bool has_read_ss_header = false;
+ if (_ssfile->MemoryMapPtr(0) != nullptr && fileSize >= _mmap_file_size_threshold) {
+ _ssReadContext.reference_compressed_buffer(_ssfile->MemoryMapPtr(0), file_units);
+ assert(_ssd.getReadOffset() == 0u);
+ readSSHeader();
+ has_read_ss_header = true;
+ }
+ if (!has_read_ss_header || !DC::is_padded_for_memory_map(_ssFileBitSize, fileSize)) {
+ /*
+ * Insufficient padding or small .ssdat file. Read whole file into
+ * memory.
+ */
+ _ssReadContext.allocComprBuf(file_units, 32768u);
+ _ssd.emptyBuffer(0);
+ _ssReadContext.setBitOffset(0);
+ _ssReadContext.setBufferEndFilePos(0);
+ _ssfile->SetPosition(0);
+ _ssReadContext.readComprBuffer();
+ assert(_ssReadContext.getBufferEndFilePos() >= fileSize);
+ assert(_ssd.getReadOffset() == 0u);
+ if (has_read_ss_header) {
+ _ssReadContext.setPosition(_ssHeaderLen * 8);
+ } else {
+ readSSHeader();
+ }
+ }
- readSSHeader();
readSPHeader();
readPHeader();
diff --git a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h
index 051efa486dd..1c2e538cc48 100644
--- a/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h
+++ b/searchlib/src/vespa/searchlib/diskindex/pagedict4randread.h
@@ -36,6 +36,7 @@ class PageDict4RandRead : public index::DictionaryFileRandRead
uint32_t _ssHeaderLen;
uint32_t _spHeaderLen;
uint32_t _pHeaderLen;
+ uint32_t _mmap_file_size_threshold;
void readSSHeader();
void readSPHeader();
@@ -51,6 +52,7 @@ public:
bool close() override;
uint64_t getNumWordIds() const override;
+ void set_mmap_file_size_threshold(uint32_t v) { _mmap_file_size_threshold = v; }
};
}