diff options
author | Tor Egge <Tor.Egge@online.no> | 2024-04-15 16:45:00 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2024-04-15 21:28:00 +0200 |
commit | 0e569bed995a09bcb4da5265182c517d89f1f6cf (patch) | |
tree | ca8076ed5a0daf873c14543c22318c0fc1fc9f3d | |
parent | 44a866c0d648543c04567503990c03c36403d86d (diff) |
Test disk index dictionary file with many long words.
6 files changed, 146 insertions, 23 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/.gitignore b/searchlib/src/tests/diskindex/pagedict4/.gitignore index 2381ed57229..8aa95a29b63 100644 --- a/searchlib/src/tests/diskindex/pagedict4/.gitignore +++ b/searchlib/src/tests/diskindex/pagedict4/.gitignore @@ -3,3 +3,4 @@ Makefile pagedict4_test fakedict.* searchlib_pagedict4_test_app +/long_words_dir/ diff --git a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt index 6be544db829..34114e195bc 100644 --- a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt +++ b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt @@ -16,3 +16,11 @@ vespa_add_executable(searchlib_pagedict4_hugeword_cornercase_test_app TEST searchlib ) vespa_add_test(NAME searchlib_pagedict4_hugeword_cornercase_test_app COMMAND searchlib_pagedict4_hugeword_cornercase_test_app) + +vespa_add_executable(searchlib_pagedict4_long_words_test_app TEST + SOURCES + pagedict4_long_words_test.cpp + DEPENDS + searchlib_test + searchlib +) diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp new file mode 100644 index 00000000000..ec822e58fd7 --- /dev/null +++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp @@ -0,0 +1,111 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/common/tunefileinfo.h> +#include <vespa/searchlib/diskindex/pagedict4file.h> +#include <vespa/searchlib/diskindex/pagedict4randread.h> +#include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <filesystem> + +using search::diskindex::PageDict4FileSeqRead; +using search::diskindex::PageDict4FileSeqWrite; +using search::diskindex::PageDict4RandRead; +using search::index::DummyFileHeaderContext; +using search::index::PostingListCounts; +using search::index::PostingListOffsetAndCounts; +using search::index::PostingListParams; + + +namespace { + +vespalib::string test_dir("long_words_dir"); +vespalib::string dict(test_dir + "/dict"); + +PostingListCounts make_counts() +{ + PostingListCounts counts; + counts._bitLength = 100; + counts._numDocs = 1; + counts._segments.clear(); + return counts; +} + +vespalib::string +make_word(int i) +{ + vespalib::asciistream os; + vespalib::string word(5_Ki, 'a'); + os << vespalib::setfill('0') << vespalib::setw(8) << i; + word.append(os.str()); + return word; +} + +} + +/* + * A long word that don't fit into a 4 KiB 'page' causes a fallback to + * overflow handling where the word is put in the .ssdat file. + */ +TEST(PageDict4LongWordsTest, test_many_long_words) +{ + int num_words = 4_Mi; + auto counts = make_counts(); + std::filesystem::remove_all(std::filesystem::path(test_dir)); + std::filesystem::create_directories(std::filesystem::path(test_dir)); + + auto dw = std::make_unique<PageDict4FileSeqWrite>(); + DummyFileHeaderContext file_header_context; + PostingListParams params; + search::TuneFileSeqWrite tune_file_write; + params.set("numWordIds", num_words); + params.set("minChunkDocs", 256_Ki); + dw->setParams(params); + EXPECT_TRUE(dw->open(dict, tune_file_write, file_header_context)); + for (int i = 0; i < num_words; ++i) { + auto word = make_word(i); + dw->writeWord(word, counts); + } + EXPECT_TRUE(dw->close()); + dw.reset(); + + auto drr = std::make_unique<PageDict4RandRead>(); + search::TuneFileRandRead tune_file_rand_read; + EXPECT_TRUE(drr->open(dict, tune_file_rand_read)); + PostingListOffsetAndCounts offset_and_counts; + uint64_t exp_offset = 0; + uint64_t exp_acc_num_docs = 0; + for (int i = 0; i < num_words; ++i) { + auto word = make_word(i); + uint64_t check_word_num = 0; + EXPECT_TRUE(drr->lookup(word, check_word_num, offset_and_counts)); + EXPECT_EQ(i + 1, (int) check_word_num); + EXPECT_EQ(exp_offset, offset_and_counts._offset); + EXPECT_EQ(exp_acc_num_docs, offset_and_counts._accNumDocs); + EXPECT_EQ(counts, offset_and_counts._counts); + exp_offset += offset_and_counts._counts._bitLength; + exp_acc_num_docs += offset_and_counts._counts._numDocs; + } + EXPECT_TRUE(drr->close()); + drr.reset(); + + auto dr = std::make_unique<PageDict4FileSeqRead>(); + search::TuneFileSeqRead tune_file_read; + EXPECT_TRUE(dr->open(dict, tune_file_read)); + vespalib::string check_word; + PostingListCounts check_counts; + for (int i = 0; i < num_words; ++i) { + uint64_t check_word_num = 0; + check_word.clear(); + dr->readWord(check_word, check_word_num, check_counts); + EXPECT_EQ(i + 1, (int) check_word_num); + EXPECT_EQ(make_word(i), check_word); + EXPECT_EQ(counts, check_counts); + } + EXPECT_TRUE(dr->close()); + dr.reset(); + + std::filesystem::remove_all(std::filesystem::path(test_dir)); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h index 232572a6314..9d4ca38eed3 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h @@ -1167,7 +1167,7 @@ public: * Get remaining units in buffer (e.g. _realValE - _valI) */ - int32_t remainingUnits() const override { return _realValE - _valI; } + int64_t remainingUnits() const override { return _realValE - _valI; } /** * Get unit ptr (e.g. _valI) from decode context. @@ -1181,7 +1181,7 @@ public: } uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const override { - int intOffset = _realValE - _valI; + int64_t intOffset = _realValE - _valI; if (bitOffset == -1) { bitOffset = -64 - _preRead; } @@ -1200,7 +1200,7 @@ public: uint64_t getBitPosV() const override { return getReadOffset(); } - void adjUnitPtr(int newRemainingUnits) override { + void adjUnitPtr(int64_t newRemainingUnits) override { _valI = _realValE - newRemainingUnits; } @@ -1219,7 +1219,7 @@ public: * @param unitCount Number of bytes in buffer * @param moreData Set if there is more data available */ - void setEnd(unsigned int unitCount, bool moreData) { + void setEnd(uint64_t unitCount, bool moreData) { _valE = _realValE = _valI + unitCount; if (moreData) { _valE -= END_BUFFER_SAFETY; diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp index ff74dc7a0e0..9d917918dab 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.cpp +++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp @@ -23,14 +23,17 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, bool isretryread = false; retry: - if (decodeContext.lastChunk()) + if (decodeContext.lastChunk()) { return; // Already reached end of file. - int remainingUnits = decodeContext.remainingUnits(); + } + int64_t remainingUnits = decodeContext.remainingUnits(); + assert(remainingUnits >= 0); // There's a good amount of data here already. if (remainingUnits > - static_cast<ssize_t>(ComprBuffer::minimumPadding())) //FIX! Tune + static_cast<ssize_t>(ComprBuffer::minimumPadding())) { //FIX! Tune return; + } // Assert that file read offset is aligned on unit boundary assert((static_cast<size_t>(fileReadByteOffset) & @@ -47,9 +50,9 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, // Continuation reads starts at aligned boundary. assert(remainingUnits == 0 || padBeforeUnits == 0); - if (readAll) + if (readAll) { stopOffset = fileSize << 3; - else if (!isretryread) { + } else if (!isretryread) { stopOffset += 8 * cbuf.getUnitBitSize(); // XXX: Magic integer // Realign stop offset to direct IO alignment boundary uint64_t fileDirectIOBitAlign = @@ -93,20 +96,19 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, fileReadByteOffset -= padBeforeUnits * cbuf.getUnitSize(); file.SetPosition(fileReadByteOffset); } - int readUnits0 = 0; - if (readBits > 0) - readUnits0 = static_cast<int>((readBits + cbuf.getUnitBitSize() - 1) / - cbuf.getUnitBitSize()); + size_t readUnits0 = 0; + if (readBits > 0) { + readUnits0 = (readBits + cbuf.getUnitBitSize() - 1) / cbuf.getUnitBitSize(); + } // Try to align end of read to an alignment boundary - int readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset / - cbuf.getUnitSize(), readUnits0); - if (readUnits < readUnits0) + size_t readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset / cbuf.getUnitSize(), readUnits0); + if (readUnits < readUnits0) { isMore = true; + } if (readUnits > 0) { - int64_t padBytes = fileReadByteOffset + - static_cast<int64_t>(readUnits) * cbuf.getUnitSize() - fileSize; + int64_t padBytes = fileReadByteOffset + readUnits * cbuf.getUnitSize() - fileSize; if (!isMore && padBytes > 0) { // Pad reading of file written with smaller unit size with // NUL bytes. @@ -115,17 +117,18 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset, readUnits * cbuf.getUnitSize() - padBytes, 0, padBytes); - } else + } else { file.ReadBuf(cbuf.getComprBuf(), readUnits * cbuf.getUnitSize()); + } } // If at end of file then add units of zero bits as padding - if (!isMore) + if (!isMore) { memset(reinterpret_cast<char *>(cbuf.getComprBuf()) + readUnits * cbuf.getUnitSize(), 0, cbuf.getUnitSize() * ComprBuffer::minimumPadding()); + } - assert(remainingUnits + readUnits >= 0); decodeContext.afterRead(reinterpret_cast<char *>(cbuf.getComprBuf()) + (padBeforeUnits - remainingUnits) * static_cast<int32_t>(cbuf.getUnitSize()), diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h index 8f8cffaffd6..4140425dc68 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.h +++ b/searchlib/src/vespa/searchlib/util/comprfile.h @@ -32,7 +32,7 @@ public: * Get remaining units in buffer (e.g. _realValE - _valI) */ - virtual int32_t remainingUnits() const = 0; + virtual int64_t remainingUnits() const = 0; /** * Get unit ptr (e.g. _valI) from decode context. @@ -51,7 +51,7 @@ public: virtual uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const = 0; virtual uint64_t getBitPosV() const = 0; virtual void skipBits(int bits) = 0; - virtual void adjUnitPtr(int newRemainingUnits) = 0; + virtual void adjUnitPtr(int64_t newRemainingUnits) = 0; virtual void emptyBuffer(uint64_t newBitPosition) = 0; /** |