summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@vespa.ai>2024-04-16 11:36:01 +0200
committerGitHub <noreply@github.com>2024-04-16 11:36:01 +0200
commitfca990d5ed32c408df42bbe178b174711fa54a08 (patch)
treed10b4ba60468f379b64f94d04fa5ad7efd3eccdb
parent4212a0b72398dbd5c555f23f3501d265b9a8724e (diff)
parent5e6fca96505e0a181f528be2eb7f25ec7673b758 (diff)
Merge pull request #30926 from vespa-engine/toregge/test-disk-index-dictionary-with-many-long-words
Test disk index dictionary file with many long words MERGEOK
-rw-r--r--searchlib/src/tests/diskindex/pagedict4/.gitignore1
-rw-r--r--searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt8
-rw-r--r--searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp131
-rw-r--r--searchlib/src/vespa/searchlib/bitcompression/compression.h8
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.cpp39
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.h6
6 files changed, 168 insertions, 25 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/.gitignore b/searchlib/src/tests/diskindex/pagedict4/.gitignore
index 2381ed57229..8aa95a29b63 100644
--- a/searchlib/src/tests/diskindex/pagedict4/.gitignore
+++ b/searchlib/src/tests/diskindex/pagedict4/.gitignore
@@ -3,3 +3,4 @@ Makefile
pagedict4_test
fakedict.*
searchlib_pagedict4_test_app
+/long_words_dir/
diff --git a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
index 6be544db829..34114e195bc 100644
--- a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
+++ b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
@@ -16,3 +16,11 @@ vespa_add_executable(searchlib_pagedict4_hugeword_cornercase_test_app TEST
searchlib
)
vespa_add_test(NAME searchlib_pagedict4_hugeword_cornercase_test_app COMMAND searchlib_pagedict4_hugeword_cornercase_test_app)
+
+vespa_add_executable(searchlib_pagedict4_long_words_test_app TEST
+ SOURCES
+ pagedict4_long_words_test.cpp
+ DEPENDS
+ searchlib_test
+ searchlib
+)
diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp
new file mode 100644
index 00000000000..dba7980a4b7
--- /dev/null
+++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp
@@ -0,0 +1,131 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchlib/common/tunefileinfo.h>
+#include <vespa/searchlib/diskindex/pagedict4file.h>
+#include <vespa/searchlib/diskindex/pagedict4randread.h>
+#include <vespa/searchlib/index/dummyfileheadercontext.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <filesystem>
+
+using search::diskindex::PageDict4FileSeqRead;
+using search::diskindex::PageDict4FileSeqWrite;
+using search::diskindex::PageDict4RandRead;
+using search::index::DummyFileHeaderContext;
+using search::index::PostingListCounts;
+using search::index::PostingListOffsetAndCounts;
+using search::index::PostingListParams;
+
+
+namespace {
+
+vespalib::string test_dir("long_words_dir");
+vespalib::string dict(test_dir + "/dict");
+
+PostingListCounts make_counts()
+{
+ PostingListCounts counts;
+ counts._bitLength = 100;
+ counts._numDocs = 1;
+ counts._segments.clear();
+ return counts;
+}
+
+vespalib::string
+make_word(int i)
+{
+ vespalib::asciistream os;
+ vespalib::string word(5_Ki, 'a');
+ os << vespalib::setfill('0') << vespalib::setw(8) << i;
+ word.append(os.str());
+ return word;
+}
+
+}
+
+/*
+ * A long word that don't fit into a 4 KiB 'page' causes a fallback to
+ * overflow handling where the word is put in the .ssdat file.
+ *
+ * Many long words causes excessive growth of the .ssdat file, with
+ * overflow potentials when the whole file is read into a buffer.
+ *
+ * 4 GiB size: Overflow in ComprFileReadBase::ReadComprBuffer for expression
+ * readUnits * cbuf.getUnitSize() when both are 32-bits.
+ * Testable by setting num_words to 900_Ki
+ *
+ * 16 GiB size: Overflow in ComprFileReadBase::ReadComprBuffer when
+ * readUnits is 32-bit signed.
+ * Some overflows in ComprFileDecodeContext API.
+ * Overflow in DecodeContext64Base::getBitPos
+ * Testable by setting num_words to 4_Mi
+ *
+ * 32 GiB size: Overflow when calling ComprFileReadContext::allocComprBuf when
+ * comprBufSize is 32-bit unsigned.
+ * Overflow in DecodeContext64Base::setEnd.
+ * Testable by setting num_words to 9_Mi
+ *
+ * These overflows are fixed.
+ */
+TEST(PageDict4LongWordsTest, test_many_long_words)
+{
+ int num_words = 9_Mi;
+ auto counts = make_counts();
+ std::filesystem::remove_all(std::filesystem::path(test_dir));
+ std::filesystem::create_directories(std::filesystem::path(test_dir));
+
+ auto dw = std::make_unique<PageDict4FileSeqWrite>();
+ DummyFileHeaderContext file_header_context;
+ PostingListParams params;
+ search::TuneFileSeqWrite tune_file_write;
+ params.set("numWordIds", num_words);
+ params.set("minChunkDocs", 256_Ki);
+ dw->setParams(params);
+ EXPECT_TRUE(dw->open(dict, tune_file_write, file_header_context));
+ for (int i = 0; i < num_words; ++i) {
+ auto word = make_word(i);
+ dw->writeWord(word, counts);
+ }
+ EXPECT_TRUE(dw->close());
+ dw.reset();
+
+ auto drr = std::make_unique<PageDict4RandRead>();
+ search::TuneFileRandRead tune_file_rand_read;
+ EXPECT_TRUE(drr->open(dict, tune_file_rand_read));
+ PostingListOffsetAndCounts offset_and_counts;
+ uint64_t exp_offset = 0;
+ uint64_t exp_acc_num_docs = 0;
+ for (int i = 0; i < num_words; ++i) {
+ auto word = make_word(i);
+ uint64_t check_word_num = 0;
+ EXPECT_TRUE(drr->lookup(word, check_word_num, offset_and_counts));
+ EXPECT_EQ(i + 1, (int) check_word_num);
+ EXPECT_EQ(exp_offset, offset_and_counts._offset);
+ EXPECT_EQ(exp_acc_num_docs, offset_and_counts._accNumDocs);
+ EXPECT_EQ(counts, offset_and_counts._counts);
+ exp_offset += offset_and_counts._counts._bitLength;
+ exp_acc_num_docs += offset_and_counts._counts._numDocs;
+ }
+ EXPECT_TRUE(drr->close());
+ drr.reset();
+
+ auto dr = std::make_unique<PageDict4FileSeqRead>();
+ search::TuneFileSeqRead tune_file_read;
+ EXPECT_TRUE(dr->open(dict, tune_file_read));
+ vespalib::string check_word;
+ PostingListCounts check_counts;
+ for (int i = 0; i < num_words; ++i) {
+ uint64_t check_word_num = 0;
+ check_word.clear();
+ dr->readWord(check_word, check_word_num, check_counts);
+ EXPECT_EQ(i + 1, (int) check_word_num);
+ EXPECT_EQ(make_word(i), check_word);
+ EXPECT_EQ(counts, check_counts);
+ }
+ EXPECT_TRUE(dr->close());
+ dr.reset();
+
+ std::filesystem::remove_all(std::filesystem::path(test_dir));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h
index 232572a6314..9d4ca38eed3 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/compression.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h
@@ -1167,7 +1167,7 @@ public:
* Get remaining units in buffer (e.g. _realValE - _valI)
*/
- int32_t remainingUnits() const override { return _realValE - _valI; }
+ int64_t remainingUnits() const override { return _realValE - _valI; }
/**
* Get unit ptr (e.g. _valI) from decode context.
@@ -1181,7 +1181,7 @@ public:
}
uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const override {
- int intOffset = _realValE - _valI;
+ int64_t intOffset = _realValE - _valI;
if (bitOffset == -1) {
bitOffset = -64 - _preRead;
}
@@ -1200,7 +1200,7 @@ public:
uint64_t getBitPosV() const override { return getReadOffset(); }
- void adjUnitPtr(int newRemainingUnits) override {
+ void adjUnitPtr(int64_t newRemainingUnits) override {
_valI = _realValE - newRemainingUnits;
}
@@ -1219,7 +1219,7 @@ public:
* @param unitCount Number of bytes in buffer
* @param moreData Set if there is more data available
*/
- void setEnd(unsigned int unitCount, bool moreData) {
+ void setEnd(uint64_t unitCount, bool moreData) {
_valE = _realValE = _valI + unitCount;
if (moreData) {
_valE -= END_BUFFER_SAFETY;
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp
index ff74dc7a0e0..db8fe14d658 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.cpp
+++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp
@@ -23,14 +23,17 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
bool isretryread = false;
retry:
- if (decodeContext.lastChunk())
+ if (decodeContext.lastChunk()) {
return; // Already reached end of file.
- int remainingUnits = decodeContext.remainingUnits();
+ }
+ int64_t remainingUnits = decodeContext.remainingUnits();
+ assert(remainingUnits >= 0);
// There's a good amount of data here already.
if (remainingUnits >
- static_cast<ssize_t>(ComprBuffer::minimumPadding())) //FIX! Tune
+ static_cast<ssize_t>(ComprBuffer::minimumPadding())) { //FIX! Tune
return;
+ }
// Assert that file read offset is aligned on unit boundary
assert((static_cast<size_t>(fileReadByteOffset) &
@@ -47,9 +50,9 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
// Continuation reads starts at aligned boundary.
assert(remainingUnits == 0 || padBeforeUnits == 0);
- if (readAll)
+ if (readAll) {
stopOffset = fileSize << 3;
- else if (!isretryread) {
+ } else if (!isretryread) {
stopOffset += 8 * cbuf.getUnitBitSize(); // XXX: Magic integer
// Realign stop offset to direct IO alignment boundary
uint64_t fileDirectIOBitAlign =
@@ -93,20 +96,19 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
fileReadByteOffset -= padBeforeUnits * cbuf.getUnitSize();
file.SetPosition(fileReadByteOffset);
}
- int readUnits0 = 0;
- if (readBits > 0)
- readUnits0 = static_cast<int>((readBits + cbuf.getUnitBitSize() - 1) /
- cbuf.getUnitBitSize());
+ size_t readUnits0 = 0;
+ if (readBits > 0) {
+ readUnits0 = (readBits + cbuf.getUnitBitSize() - 1) / cbuf.getUnitBitSize();
+ }
// Try to align end of read to an alignment boundary
- int readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset /
- cbuf.getUnitSize(), readUnits0);
- if (readUnits < readUnits0)
+ size_t readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset / cbuf.getUnitSize(), readUnits0);
+ if (readUnits < readUnits0) {
isMore = true;
+ }
if (readUnits > 0) {
- int64_t padBytes = fileReadByteOffset +
- static_cast<int64_t>(readUnits) * cbuf.getUnitSize() - fileSize;
+ int64_t padBytes = fileReadByteOffset + readUnits * cbuf.getUnitSize() - fileSize;
if (!isMore && padBytes > 0) {
// Pad reading of file written with smaller unit size with
// NUL bytes.
@@ -115,17 +117,18 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
readUnits * cbuf.getUnitSize() - padBytes,
0,
padBytes);
- } else
+ } else {
file.ReadBuf(cbuf.getComprBuf(), readUnits * cbuf.getUnitSize());
+ }
}
// If at end of file then add units of zero bits as padding
- if (!isMore)
+ if (!isMore) {
memset(reinterpret_cast<char *>(cbuf.getComprBuf()) +
readUnits * cbuf.getUnitSize(),
0,
cbuf.getUnitSize() * ComprBuffer::minimumPadding());
+ }
- assert(remainingUnits + readUnits >= 0);
decodeContext.afterRead(reinterpret_cast<char *>(cbuf.getComprBuf()) +
(padBeforeUnits - remainingUnits) *
static_cast<int32_t>(cbuf.getUnitSize()),
@@ -343,7 +346,7 @@ ComprFileReadContext::setPosition(uint64_t newPosition)
}
void
-ComprFileReadContext::allocComprBuf(unsigned int comprBufSize, size_t preferredFileAlignment)
+ComprFileReadContext::allocComprBuf(size_t comprBufSize, size_t preferredFileAlignment)
{
ComprBuffer::allocComprBuf(comprBufSize, preferredFileAlignment, _file, true);
}
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h
index 8f8cffaffd6..6a0b72ce7e5 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.h
+++ b/searchlib/src/vespa/searchlib/util/comprfile.h
@@ -32,7 +32,7 @@ public:
* Get remaining units in buffer (e.g. _realValE - _valI)
*/
- virtual int32_t remainingUnits() const = 0;
+ virtual int64_t remainingUnits() const = 0;
/**
* Get unit ptr (e.g. _valI) from decode context.
@@ -51,7 +51,7 @@ public:
virtual uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const = 0;
virtual uint64_t getBitPosV() const = 0;
virtual void skipBits(int bits) = 0;
- virtual void adjUnitPtr(int newRemainingUnits) = 0;
+ virtual void adjUnitPtr(int64_t newRemainingUnits) = 0;
virtual void emptyBuffer(uint64_t newBitPosition) = 0;
/**
@@ -105,7 +105,7 @@ public:
void readComprBuffer(uint64_t stopOffset, bool readAll);
void readComprBuffer();
void setPosition(uint64_t newPosition);
- void allocComprBuf(unsigned int comprBufSize, size_t preferredFileAlignment);
+ void allocComprBuf(size_t comprBufSize, size_t preferredFileAlignment);
void setDecodeContext(ComprFileDecodeContext *decodeContext) { _decodeContext = decodeContext; }
ComprFileDecodeContext *getDecodeContext() const { return _decodeContext; }
void setFile(FastOS_FileInterface *file) { _file = file; }