diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-10-04 11:24:56 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-10-04 11:31:59 +0000 |
commit | dedd71b89db9bda12ad994a1b47288e6d7d73d5d (patch) | |
tree | 3b33734a3b654f0bd2b9954a926ca8442b9492be /searchlib | |
parent | eaf69ecaf979aad11850b6260db33a68d4cbcbb3 (diff) |
Process the idx file in a streaming fashion instead of first reading all of it and then processing it.
Diffstat (limited to 'searchlib')
-rw-r--r-- | searchlib/src/vespa/searchlib/docstore/filechunk.cpp | 115 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/docstore/filechunk.h | 6 |
2 files changed, 48 insertions, 73 deletions
diff --git a/searchlib/src/vespa/searchlib/docstore/filechunk.cpp b/searchlib/src/vespa/searchlib/docstore/filechunk.cpp index a8b84fbeac4..5a1bcc733bc 100644 --- a/searchlib/src/vespa/searchlib/docstore/filechunk.cpp +++ b/searchlib/src/vespa/searchlib/docstore/filechunk.cpp @@ -126,15 +126,6 @@ FileChunk::TmpChunkMeta::fill(vespalib::nbostream & is) { } void -FileChunk::verifyOrAssert(const TmpChunkMetaV & v) -{ - for (auto prev(v.begin()), it(prev); it != v.end(); ++it) { - assert(prev->getLastSerial() <= it->getLastSerial()); - prev = it; - } -} - -void FileChunk::erase() { _file.reset(); @@ -149,69 +140,56 @@ FileChunk::updateLidMap(const unique_lock &guard, ISetLid &ds, uint64_t serialNu FastOS_File idxFile(_idxFileName.c_str()); idxFile.enableMemoryMap(0); - if (idxFile.OpenReadOnly()) { - if (idxFile.IsMemoryMapped()) { - const int64_t fileSize = idxFile.getSize(); - if (_idxHeaderLen == 0) { - _idxHeaderLen = readIdxHeader(idxFile, _docIdLimit); - } - vespalib::nbostream is(static_cast<const char *>(idxFile.MemoryMapPtr(0)) + _idxHeaderLen, - fileSize - _idxHeaderLen); - TmpChunkMetaV tempVector; - tempVector.reserve(fileSize/(sizeof(ChunkMeta)+sizeof(LidMeta))); - while ( ! is.empty() && is.good()) { - const int64_t lastKnownGoodPos = _idxHeaderLen + is.rp(); - tempVector.emplace_back(); - TmpChunkMeta & chunkMeta(tempVector.back()); - try { - chunkMeta.deserialize(is); - chunkMeta.fill(is); - } catch (const vespalib::IllegalStateException & e) { - LOG(warning, "Exception deserializing idx file : %s", e.what()); - LOG(warning, "File '%s' seems to be partially truncated. 
Will truncate from size=%" PRId64 " to %" PRId64, - _idxFileName.c_str(), fileSize, lastKnownGoodPos); - FastOS_File toTruncate(_idxFileName.c_str()); - if ( toTruncate.OpenReadWrite()) { - if (toTruncate.SetSize(lastKnownGoodPos)) { - tempVector.resize(tempVector.size() - 1); - } else { - throw SummaryException("SetSize() failed.", toTruncate, VESPA_STRLOC); - } - } else { - throw SummaryException("Open for truncation failed.", toTruncate, VESPA_STRLOC); - } - break; - } + if ( ! idxFile.OpenReadOnly()) { + LOG_ABORT("should not reach here"); + } + if ( ! idxFile.IsMemoryMapped()) { + assert(idxFile.getSize() == 0); + return; + } + const int64_t fileSize = idxFile.getSize(); + if (_idxHeaderLen == 0) { + _idxHeaderLen = readIdxHeader(idxFile, _docIdLimit); + } + BucketDensityComputer globalBucketMap(_bucketizer); + // Guard comes from the same bucketizer so the same guard can be used + // for both local and global BucketDensityComputer + vespalib::GenerationHandler::Guard bucketizerGuard = globalBucketMap.getGuard(); + vespalib::nbostream is(static_cast<const char *>(idxFile.MemoryMapPtr(0)) + _idxHeaderLen, + fileSize - _idxHeaderLen); + for (size_t count=0; ! is.empty() && is.good(); count++) { + const int64_t lastKnownGoodPos = _idxHeaderLen + is.rp(); + TmpChunkMeta chunkMeta; + try { + chunkMeta.deserialize(is); + chunkMeta.fill(is); + if ((count == 0) && (chunkMeta.getLastSerial() < serialNum)) { + LOG(warning, "last serial num(%" PRIu64 ") from previous file is bigger than my first(%" PRIu64 + "). That is odd.Current filename is '%s'", + serialNum, chunkMeta.getLastSerial(), _idxFileName.c_str()); + serialNum = chunkMeta.getLastSerial(); } - if ( ! tempVector.empty()) { - verifyOrAssert(tempVector); - if (tempVector[0].getLastSerial() < serialNum) { - LOG(warning, - "last serial num(%" PRIu64 ") from previous file is " - "bigger than my first(%" PRIu64 "). That is odd." 
- "Current filename is '%s'", - serialNum, tempVector[0].getLastSerial(), - _idxFileName.c_str()); - serialNum = tempVector[0].getLastSerial(); + assert(serialNum <= chunkMeta.getLastSerial()); + serialNum = handleChunk(guard, ds, docIdLimit, bucketizerGuard, globalBucketMap, chunkMeta); + assert(serialNum >= _lastPersistedSerialNum.load(std::memory_order_relaxed)); + _lastPersistedSerialNum.store(serialNum, std::memory_order_relaxed); + } catch (const vespalib::IllegalStateException & e) { + LOG(warning, "Exception deserializing idx file : %s", e.what()); + LOG(warning, "File '%s' seems to be partially truncated. Will truncate from size=%" PRId64 " to %" PRId64, + _idxFileName.c_str(), fileSize, lastKnownGoodPos); + FastOS_File toTruncate(_idxFileName.c_str()); + if ( toTruncate.OpenReadWrite()) { + if (toTruncate.SetSize(lastKnownGoodPos)) { + } else { + throw SummaryException("SetSize() failed.", toTruncate, VESPA_STRLOC); } - BucketDensityComputer globalBucketMap(_bucketizer); - // Guard comes from the same bucketizer so the same guard can be used - // for both local and global BucketDensityComputer - vespalib::GenerationHandler::Guard bucketizerGuard = globalBucketMap.getGuard(); - for (const TmpChunkMeta & chunkMeta : tempVector) { - assert(serialNum <= chunkMeta.getLastSerial()); - serialNum = handleChunk(guard, ds, docIdLimit, bucketizerGuard, globalBucketMap, chunkMeta); - assert(serialNum >= _lastPersistedSerialNum.load(std::memory_order_relaxed)); - _lastPersistedSerialNum.store(serialNum, std::memory_order_relaxed); - } - _numUniqueBuckets = globalBucketMap.getNumBuckets(); + } else { + throw SummaryException("Open for truncation failed.", toTruncate, VESPA_STRLOC); } - } else { - assert(idxFile.getSize() == 0); + break; } - } else { - LOG_ABORT("should not reach here"); } + _numUniqueBuckets = globalBucketMap.getNumBuckets(); } uint64_t @@ -578,8 +556,7 @@ FileChunk::getStats() const uint64_t serialNum = getLastPersistedSerialNum(); uint32_t docIdLimit 
= getDocIdLimit(); uint64_t nameId = getNameId().getId(); - return DataStoreFileChunkStats(diskFootprint, diskBloat, bucketSpread, - serialNum, serialNum, docIdLimit, nameId); + return {diskFootprint, diskBloat, bucketSpread, serialNum, serialNum, docIdLimit, nameId}; } } // namespace search diff --git a/searchlib/src/vespa/searchlib/docstore/filechunk.h b/searchlib/src/vespa/searchlib/docstore/filechunk.h index b87dd819ac9..446a53de446 100644 --- a/searchlib/src/vespa/searchlib/docstore/filechunk.h +++ b/searchlib/src/vespa/searchlib/docstore/filechunk.h @@ -47,7 +47,7 @@ public: class BucketDensityComputer { public: - BucketDensityComputer(const IBucketizer * bucketizer) : _bucketizer(bucketizer), _count(0) { } + explicit BucketDensityComputer(const IBucketizer * bucketizer) : _bucketizer(bucketizer), _count(0) { } void recordLid(const vespalib::GenerationHandler::Guard & guard, uint32_t lid, uint32_t dataSize) { if (_bucketizer && (dataSize > 0)) { recordLid(_bucketizer->getBucketOf(guard, lid)); @@ -118,7 +118,7 @@ public: virtual size_t getMemoryMetaFootprint() const; virtual vespalib::MemoryUsage getMemoryUsage() const; - virtual size_t getDiskHeaderFootprint(void) const { return _dataHeaderLen + _idxHeaderLen; } + virtual size_t getDiskHeaderFootprint() const { return _dataHeaderLen + _idxHeaderLen; } size_t getDiskBloat() const { return (_addedBytes == 0) ? getDiskFootprint() @@ -205,9 +205,7 @@ private: public: void fill(vespalib::nbostream & is); }; - using TmpChunkMetaV = std::vector<TmpChunkMeta, vespalib::allocator_large<TmpChunkMeta>>; using BucketizerGuard = vespalib::GenerationHandler::Guard; - static void verifyOrAssert(const TmpChunkMetaV & v); uint64_t handleChunk(const unique_lock &guard, ISetLid &lidMap, uint32_t docIdLimit, const BucketizerGuard & bucketizerGuard, BucketDensityComputer & global, const TmpChunkMeta & chunkMeta); |