diff options
author | Geir Storli <geirst@vespa.ai> | 2024-04-16 11:36:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-16 11:36:01 +0200 |
commit | fca990d5ed32c408df42bbe178b174711fa54a08 (patch) | |
tree | d10b4ba60468f379b64f94d04fa5ad7efd3eccdb | |
parent | 4212a0b72398dbd5c555f23f3501d265b9a8724e (diff) | |
parent | 5e6fca96505e0a181f528be2eb7f25ec7673b758 (diff) |
Merge pull request #30926 from vespa-engine/toregge/test-disk-index-dictionary-with-many-long-words
Test disk index dictionary file with many long words MERGEOK
6 files changed, 168 insertions, 25 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/.gitignore b/searchlib/src/tests/diskindex/pagedict4/.gitignore index 2381ed57229..8aa95a29b63 100644 --- a/searchlib/src/tests/diskindex/pagedict4/.gitignore +++ b/searchlib/src/tests/diskindex/pagedict4/.gitignore @@ -3,3 +3,4 @@ Makefile pagedict4_test fakedict.* searchlib_pagedict4_test_app +/long_words_dir/ diff --git a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt index 6be544db829..34114e195bc 100644 --- a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt +++ b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt @@ -16,3 +16,11 @@ vespa_add_executable(searchlib_pagedict4_hugeword_cornercase_test_app TEST searchlib ) vespa_add_test(NAME searchlib_pagedict4_hugeword_cornercase_test_app COMMAND searchlib_pagedict4_hugeword_cornercase_test_app) + +vespa_add_executable(searchlib_pagedict4_long_words_test_app TEST + SOURCES + pagedict4_long_words_test.cpp + DEPENDS + searchlib_test + searchlib +) diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp new file mode 100644 index 00000000000..dba7980a4b7 --- /dev/null +++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp @@ -0,0 +1,131 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/common/tunefileinfo.h> +#include <vespa/searchlib/diskindex/pagedict4file.h> +#include <vespa/searchlib/diskindex/pagedict4randread.h> +#include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <filesystem> + +using search::diskindex::PageDict4FileSeqRead; +using search::diskindex::PageDict4FileSeqWrite; +using search::diskindex::PageDict4RandRead; +using search::index::DummyFileHeaderContext; +using search::index::PostingListCounts; +using search::index::PostingListOffsetAndCounts; +using search::index::PostingListParams; + + +namespace { + +vespalib::string test_dir("long_words_dir"); +vespalib::string dict(test_dir + "/dict"); + +PostingListCounts make_counts() +{ + PostingListCounts counts; + counts._bitLength = 100; + counts._numDocs = 1; + counts._segments.clear(); + return counts; +} + +vespalib::string +make_word(int i) +{ + vespalib::asciistream os; + vespalib::string word(5_Ki, 'a'); + os << vespalib::setfill('0') << vespalib::setw(8) << i; + word.append(os.str()); + return word; +} + +} + +/* + * A long word that don't fit into a 4 KiB 'page' causes a fallback to + * overflow handling where the word is put in the .ssdat file. + * + * Many long words causes excessive growth of the .ssdat file, with + * overflow potentials when the whole file is read into a buffer. + * + * 4 GiB size: Overflow in ComprFileReadBase::ReadComprBuffer for expression + * readUnits * cbuf.getUnitSize() when both are 32-bits. + * Testable by setting num_words to 900_Ki + * + * 16 GiB size: Overflow in ComprFileReadBase::ReadComprBuffer when + * readUnits is 32-bit signed. + * Some overflows in ComprFileDecodeContext API. + * Overflow in DecodeContext64Base::getBitPos + * Testable by setting num_words to 4_Mi + * + * 32 GiB size: Overflow when calling ComprFileReadContext::allocComprBuf when + * comprBufSize is 32-bit unsigned. + * Overflow in DecodeContext64Base::setEnd. + * Testable by setting num_words to 9_Mi + * + * These overflows are fixed. + */ +TEST(PageDict4LongWordsTest, test_many_long_words) +{ + int num_words = 9_Mi; + auto counts = make_counts(); + std::filesystem::remove_all(std::filesystem::path(test_dir)); + std::filesystem::create_directories(std::filesystem::path(test_dir)); + + auto dw = std::make_unique<PageDict4FileSeqWrite>(); + DummyFileHeaderContext file_header_context; + PostingListParams params; + search::TuneFileSeqWrite tune_file_write; + params.set("numWordIds", num_words); + params.set("minChunkDocs", 256_Ki); + dw->setParams(params); + EXPECT_TRUE(dw->open(dict, tune_file_write, file_header_context)); + for (int i = 0; i < num_words; ++i) { + auto word = make_word(i); + dw->writeWord(word, counts); + } + EXPECT_TRUE(dw->close()); + dw.reset(); + + auto drr = std::make_unique<PageDict4RandRead>(); + search::TuneFileRandRead tune_file_rand_read; + EXPECT_TRUE(drr->open(dict, tune_file_rand_read)); + PostingListOffsetAndCounts offset_and_counts; + uint64_t exp_offset = 0; + uint64_t exp_acc_num_docs = 0; + for (int i = 0; i < num_words; ++i) { + auto word = make_word(i); + uint64_t check_word_num = 0; + EXPECT_TRUE(drr->lookup(word, check_word_num, offset_and_counts)); + EXPECT_EQ(i + 1, (int) check_word_num); + EXPECT_EQ(exp_offset, offset_and_counts._offset); + EXPECT_EQ(exp_acc_num_docs, offset_and_counts._accNumDocs); + EXPECT_EQ(counts, offset_and_counts._counts); + exp_offset += offset_and_counts._counts._bitLength; + exp_acc_num_docs += offset_and_counts._counts._numDocs; + } + EXPECT_TRUE(drr->close()); + drr.reset(); + + auto dr = std::make_unique<PageDict4FileSeqRead>(); + search::TuneFileSeqRead tune_file_read; + EXPECT_TRUE(dr->open(dict, tune_file_read)); + vespalib::string check_word; + PostingListCounts check_counts; + for (int i = 0; i < num_words; ++i) { + uint64_t check_word_num = 0; + check_word.clear(); + dr->readWord(check_word, check_word_num, check_counts); + EXPECT_EQ(i + 1, (int) check_word_num); + EXPECT_EQ(make_word(i), check_word); + EXPECT_EQ(counts, check_counts); + } + EXPECT_TRUE(dr->close()); + dr.reset(); + + std::filesystem::remove_all(std::filesystem::path(test_dir)); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h index 232572a6314..9d4ca38eed3 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h @@ -1167,7 +1167,7 @@ public: * Get remaining units in buffer (e.g. _realValE - _valI) */ - int32_t remainingUnits() const override { return _realValE - _valI; } + int64_t remainingUnits() const override { return _realValE - _valI; } /** * Get unit ptr (e.g. _valI) from decode context. @@ -1181,7 +1181,7 @@ public: } uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const override { - int intOffset = _realValE - _valI; + int64_t intOffset = _realValE - _valI; if (bitOffset == -1) { bitOffset = -64 - _preRead; } @@ -1200,7 +1200,7 @@ public: uint64_t getBitPosV() const override { return getReadOffset(); } - void adjUnitPtr(int newRemainingUnits) override { + void adjUnitPtr(int64_t newRemainingUnits) override { _valI = _realValE - newRemainingUnits; } @@ -1219,7 +1219,7 @@ public: * @param unitCount Number of bytes in buffer * @param moreData Set if there is more data available */ - void setEnd(unsigned int unitCount, bool moreData) { + void setEnd(uint64_t unitCount, bool moreData) { _valE = _realValE = _valI + unitCount; if (moreData) { _valE -= END_BUFFER_SAFETY; diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp index ff74dc7a0e0..db8fe14d658 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.cpp +++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp @@ -23,14 +23,17 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, bool isretryread = false; retry: - if (decodeContext.lastChunk()) + if (decodeContext.lastChunk()) { return; // Already reached end of file. - int remainingUnits = decodeContext.remainingUnits(); + } + int64_t remainingUnits = decodeContext.remainingUnits(); + assert(remainingUnits >= 0); // There's a good amount of data here already. if (remainingUnits > - static_cast<ssize_t>(ComprBuffer::minimumPadding())) //FIX! Tune + static_cast<ssize_t>(ComprBuffer::minimumPadding())) { //FIX! Tune return; + } // Assert that file read offset is aligned on unit boundary assert((static_cast<size_t>(fileReadByteOffset) & @@ -47,9 +50,9 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, // Continuation reads starts at aligned boundary. assert(remainingUnits == 0 || padBeforeUnits == 0); - if (readAll) + if (readAll) { stopOffset = fileSize << 3; - else if (!isretryread) { + } else if (!isretryread) { stopOffset += 8 * cbuf.getUnitBitSize(); // XXX: Magic integer // Realign stop offset to direct IO alignment boundary uint64_t fileDirectIOBitAlign = @@ -93,20 +96,19 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, fileReadByteOffset -= padBeforeUnits * cbuf.getUnitSize(); file.SetPosition(fileReadByteOffset); } - int readUnits0 = 0; - if (readBits > 0) - readUnits0 = static_cast<int>((readBits + cbuf.getUnitBitSize() - 1) / - cbuf.getUnitBitSize()); + size_t readUnits0 = 0; + if (readBits > 0) { + readUnits0 = (readBits + cbuf.getUnitBitSize() - 1) / cbuf.getUnitBitSize(); + } // Try to align end of read to an alignment boundary - int readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset / - cbuf.getUnitSize(), readUnits0); - if (readUnits < readUnits0) + size_t readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset / cbuf.getUnitSize(), readUnits0); + if (readUnits < readUnits0) { isMore = true; + } if (readUnits > 0) { - int64_t padBytes = fileReadByteOffset + - static_cast<int64_t>(readUnits) * cbuf.getUnitSize() - fileSize; + int64_t padBytes = fileReadByteOffset + readUnits * cbuf.getUnitSize() - fileSize; if (!isMore && padBytes > 0) { // Pad reading of file written with smaller unit size with // NUL bytes. @@ -115,17 +117,18 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, readUnits * cbuf.getUnitSize() - padBytes, 0, padBytes); - } else + } else { file.ReadBuf(cbuf.getComprBuf(), readUnits * cbuf.getUnitSize()); + } } // If at end of file then add units of zero bits as padding - if (!isMore) + if (!isMore) { memset(reinterpret_cast<char *>(cbuf.getComprBuf()) + readUnits * cbuf.getUnitSize(), 0, cbuf.getUnitSize() * ComprBuffer::minimumPadding()); + } - assert(remainingUnits + readUnits >= 0); decodeContext.afterRead(reinterpret_cast<char *>(cbuf.getComprBuf()) + (padBeforeUnits - remainingUnits) * static_cast<int32_t>(cbuf.getUnitSize()), @@ -343,7 +346,7 @@ ComprFileReadContext::setPosition(uint64_t newPosition) } void -ComprFileReadContext::allocComprBuf(unsigned int comprBufSize, size_t preferredFileAlignment) +ComprFileReadContext::allocComprBuf(size_t comprBufSize, size_t preferredFileAlignment) { ComprBuffer::allocComprBuf(comprBufSize, preferredFileAlignment, _file, true); } diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h index 8f8cffaffd6..6a0b72ce7e5 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.h +++ b/searchlib/src/vespa/searchlib/util/comprfile.h @@ -32,7 +32,7 @@ public: * Get remaining units in buffer (e.g. _realValE - _valI) */ - virtual int32_t remainingUnits() const = 0; + virtual int64_t remainingUnits() const = 0; /** * Get unit ptr (e.g. _valI) from decode context. @@ -51,7 +51,7 @@ public: virtual uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const = 0; virtual uint64_t getBitPosV() const = 0; virtual void skipBits(int bits) = 0; - virtual void adjUnitPtr(int newRemainingUnits) = 0; + virtual void adjUnitPtr(int64_t newRemainingUnits) = 0; virtual void emptyBuffer(uint64_t newBitPosition) = 0; /** @@ -105,7 +105,7 @@ public: void readComprBuffer(uint64_t stopOffset, bool readAll); void readComprBuffer(); void setPosition(uint64_t newPosition); - void allocComprBuf(unsigned int comprBufSize, size_t preferredFileAlignment); + void allocComprBuf(size_t comprBufSize, size_t preferredFileAlignment); void setDecodeContext(ComprFileDecodeContext *decodeContext) { _decodeContext = decodeContext; } ComprFileDecodeContext *getDecodeContext() const { return _decodeContext; } void setFile(FastOS_FileInterface *file) { _file = file; } |