summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
author: Henning Baldersheim <balder@yahoo-inc.com> 2023-10-04 11:24:56 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2023-10-04 11:31:59 +0000
commit: dedd71b89db9bda12ad994a1b47288e6d7d73d5d (patch)
tree: 3b33734a3b654f0bd2b9954a926ca8442b9492be /searchlib
parent: eaf69ecaf979aad11850b6260db33a68d4cbcbb3 (diff)
Process the idx file in a streaming fashion instead of first reading everything and then processing it.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/vespa/searchlib/docstore/filechunk.cpp 115
-rw-r--r-- searchlib/src/vespa/searchlib/docstore/filechunk.h 6
2 files changed, 48 insertions, 73 deletions
diff --git a/searchlib/src/vespa/searchlib/docstore/filechunk.cpp b/searchlib/src/vespa/searchlib/docstore/filechunk.cpp
index a8b84fbeac4..5a1bcc733bc 100644
--- a/searchlib/src/vespa/searchlib/docstore/filechunk.cpp
+++ b/searchlib/src/vespa/searchlib/docstore/filechunk.cpp
@@ -126,15 +126,6 @@ FileChunk::TmpChunkMeta::fill(vespalib::nbostream & is) {
}
void
-FileChunk::verifyOrAssert(const TmpChunkMetaV & v)
-{
- for (auto prev(v.begin()), it(prev); it != v.end(); ++it) {
- assert(prev->getLastSerial() <= it->getLastSerial());
- prev = it;
- }
-}
-
-void
FileChunk::erase()
{
_file.reset();
@@ -149,69 +140,56 @@ FileChunk::updateLidMap(const unique_lock &guard, ISetLid &ds, uint64_t serialNu
FastOS_File idxFile(_idxFileName.c_str());
idxFile.enableMemoryMap(0);
- if (idxFile.OpenReadOnly()) {
- if (idxFile.IsMemoryMapped()) {
- const int64_t fileSize = idxFile.getSize();
- if (_idxHeaderLen == 0) {
- _idxHeaderLen = readIdxHeader(idxFile, _docIdLimit);
- }
- vespalib::nbostream is(static_cast<const char *>(idxFile.MemoryMapPtr(0)) + _idxHeaderLen,
- fileSize - _idxHeaderLen);
- TmpChunkMetaV tempVector;
- tempVector.reserve(fileSize/(sizeof(ChunkMeta)+sizeof(LidMeta)));
- while ( ! is.empty() && is.good()) {
- const int64_t lastKnownGoodPos = _idxHeaderLen + is.rp();
- tempVector.emplace_back();
- TmpChunkMeta & chunkMeta(tempVector.back());
- try {
- chunkMeta.deserialize(is);
- chunkMeta.fill(is);
- } catch (const vespalib::IllegalStateException & e) {
- LOG(warning, "Exception deserializing idx file : %s", e.what());
- LOG(warning, "File '%s' seems to be partially truncated. Will truncate from size=%" PRId64 " to %" PRId64,
- _idxFileName.c_str(), fileSize, lastKnownGoodPos);
- FastOS_File toTruncate(_idxFileName.c_str());
- if ( toTruncate.OpenReadWrite()) {
- if (toTruncate.SetSize(lastKnownGoodPos)) {
- tempVector.resize(tempVector.size() - 1);
- } else {
- throw SummaryException("SetSize() failed.", toTruncate, VESPA_STRLOC);
- }
- } else {
- throw SummaryException("Open for truncation failed.", toTruncate, VESPA_STRLOC);
- }
- break;
- }
+ if ( ! idxFile.OpenReadOnly()) {
+ LOG_ABORT("should not reach here");
+ }
+ if ( ! idxFile.IsMemoryMapped()) {
+ assert(idxFile.getSize() == 0);
+ return;
+ }
+ const int64_t fileSize = idxFile.getSize();
+ if (_idxHeaderLen == 0) {
+ _idxHeaderLen = readIdxHeader(idxFile, _docIdLimit);
+ }
+ BucketDensityComputer globalBucketMap(_bucketizer);
+ // Guard comes from the same bucketizer so the same guard can be used
+ // for both local and global BucketDensityComputer
+ vespalib::GenerationHandler::Guard bucketizerGuard = globalBucketMap.getGuard();
+ vespalib::nbostream is(static_cast<const char *>(idxFile.MemoryMapPtr(0)) + _idxHeaderLen,
+ fileSize - _idxHeaderLen);
+ for (size_t count=0; ! is.empty() && is.good(); count++) {
+ const int64_t lastKnownGoodPos = _idxHeaderLen + is.rp();
+ TmpChunkMeta chunkMeta;
+ try {
+ chunkMeta.deserialize(is);
+ chunkMeta.fill(is);
+ if ((count == 0) && (chunkMeta.getLastSerial() < serialNum)) {
+ LOG(warning, "last serial num(%" PRIu64 ") from previous file is bigger than my first(%" PRIu64
+ "). That is odd.Current filename is '%s'",
+ serialNum, chunkMeta.getLastSerial(), _idxFileName.c_str());
+ serialNum = chunkMeta.getLastSerial();
}
- if ( ! tempVector.empty()) {
- verifyOrAssert(tempVector);
- if (tempVector[0].getLastSerial() < serialNum) {
- LOG(warning,
- "last serial num(%" PRIu64 ") from previous file is "
- "bigger than my first(%" PRIu64 "). That is odd."
- "Current filename is '%s'",
- serialNum, tempVector[0].getLastSerial(),
- _idxFileName.c_str());
- serialNum = tempVector[0].getLastSerial();
+ assert(serialNum <= chunkMeta.getLastSerial());
+ serialNum = handleChunk(guard, ds, docIdLimit, bucketizerGuard, globalBucketMap, chunkMeta);
+ assert(serialNum >= _lastPersistedSerialNum.load(std::memory_order_relaxed));
+ _lastPersistedSerialNum.store(serialNum, std::memory_order_relaxed);
+ } catch (const vespalib::IllegalStateException & e) {
+ LOG(warning, "Exception deserializing idx file : %s", e.what());
+ LOG(warning, "File '%s' seems to be partially truncated. Will truncate from size=%" PRId64 " to %" PRId64,
+ _idxFileName.c_str(), fileSize, lastKnownGoodPos);
+ FastOS_File toTruncate(_idxFileName.c_str());
+ if ( toTruncate.OpenReadWrite()) {
+ if (toTruncate.SetSize(lastKnownGoodPos)) {
+ } else {
+ throw SummaryException("SetSize() failed.", toTruncate, VESPA_STRLOC);
}
- BucketDensityComputer globalBucketMap(_bucketizer);
- // Guard comes from the same bucketizer so the same guard can be used
- // for both local and global BucketDensityComputer
- vespalib::GenerationHandler::Guard bucketizerGuard = globalBucketMap.getGuard();
- for (const TmpChunkMeta & chunkMeta : tempVector) {
- assert(serialNum <= chunkMeta.getLastSerial());
- serialNum = handleChunk(guard, ds, docIdLimit, bucketizerGuard, globalBucketMap, chunkMeta);
- assert(serialNum >= _lastPersistedSerialNum.load(std::memory_order_relaxed));
- _lastPersistedSerialNum.store(serialNum, std::memory_order_relaxed);
- }
- _numUniqueBuckets = globalBucketMap.getNumBuckets();
+ } else {
+ throw SummaryException("Open for truncation failed.", toTruncate, VESPA_STRLOC);
}
- } else {
- assert(idxFile.getSize() == 0);
+ break;
}
- } else {
- LOG_ABORT("should not reach here");
}
+ _numUniqueBuckets = globalBucketMap.getNumBuckets();
}
uint64_t
@@ -578,8 +556,7 @@ FileChunk::getStats() const
uint64_t serialNum = getLastPersistedSerialNum();
uint32_t docIdLimit = getDocIdLimit();
uint64_t nameId = getNameId().getId();
- return DataStoreFileChunkStats(diskFootprint, diskBloat, bucketSpread,
- serialNum, serialNum, docIdLimit, nameId);
+ return {diskFootprint, diskBloat, bucketSpread, serialNum, serialNum, docIdLimit, nameId};
}
} // namespace search
diff --git a/searchlib/src/vespa/searchlib/docstore/filechunk.h b/searchlib/src/vespa/searchlib/docstore/filechunk.h
index b87dd819ac9..446a53de446 100644
--- a/searchlib/src/vespa/searchlib/docstore/filechunk.h
+++ b/searchlib/src/vespa/searchlib/docstore/filechunk.h
@@ -47,7 +47,7 @@ public:
class BucketDensityComputer
{
public:
- BucketDensityComputer(const IBucketizer * bucketizer) : _bucketizer(bucketizer), _count(0) { }
+ explicit BucketDensityComputer(const IBucketizer * bucketizer) : _bucketizer(bucketizer), _count(0) { }
void recordLid(const vespalib::GenerationHandler::Guard & guard, uint32_t lid, uint32_t dataSize) {
if (_bucketizer && (dataSize > 0)) {
recordLid(_bucketizer->getBucketOf(guard, lid));
@@ -118,7 +118,7 @@ public:
virtual size_t getMemoryMetaFootprint() const;
virtual vespalib::MemoryUsage getMemoryUsage() const;
- virtual size_t getDiskHeaderFootprint(void) const { return _dataHeaderLen + _idxHeaderLen; }
+ virtual size_t getDiskHeaderFootprint() const { return _dataHeaderLen + _idxHeaderLen; }
size_t getDiskBloat() const {
return (_addedBytes == 0)
? getDiskFootprint()
@@ -205,9 +205,7 @@ private:
public:
void fill(vespalib::nbostream & is);
};
- using TmpChunkMetaV = std::vector<TmpChunkMeta, vespalib::allocator_large<TmpChunkMeta>>;
using BucketizerGuard = vespalib::GenerationHandler::Guard;
- static void verifyOrAssert(const TmpChunkMetaV & v);
uint64_t handleChunk(const unique_lock &guard, ISetLid &lidMap, uint32_t docIdLimit,
const BucketizerGuard & bucketizerGuard, BucketDensityComputer & global,
const TmpChunkMeta & chunkMeta);