summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2024-04-15 16:45:00 +0200
committerTor Egge <Tor.Egge@online.no>2024-04-15 21:28:00 +0200
commit0e569bed995a09bcb4da5265182c517d89f1f6cf (patch)
treeca8076ed5a0daf873c14543c22318c0fc1fc9f3d
parent44a866c0d648543c04567503990c03c36403d86d (diff)
Test disk index dictionary file with many long words.
-rw-r--r--searchlib/src/tests/diskindex/pagedict4/.gitignore1
-rw-r--r--searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt8
-rw-r--r--searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp111
-rw-r--r--searchlib/src/vespa/searchlib/bitcompression/compression.h8
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.cpp37
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.h4
6 files changed, 146 insertions, 23 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/.gitignore b/searchlib/src/tests/diskindex/pagedict4/.gitignore
index 2381ed57229..8aa95a29b63 100644
--- a/searchlib/src/tests/diskindex/pagedict4/.gitignore
+++ b/searchlib/src/tests/diskindex/pagedict4/.gitignore
@@ -3,3 +3,4 @@ Makefile
pagedict4_test
fakedict.*
searchlib_pagedict4_test_app
+/long_words_dir/
diff --git a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
index 6be544db829..34114e195bc 100644
--- a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
+++ b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
@@ -16,3 +16,11 @@ vespa_add_executable(searchlib_pagedict4_hugeword_cornercase_test_app TEST
searchlib
)
vespa_add_test(NAME searchlib_pagedict4_hugeword_cornercase_test_app COMMAND searchlib_pagedict4_hugeword_cornercase_test_app)
+
+vespa_add_executable(searchlib_pagedict4_long_words_test_app TEST
+ SOURCES
+ pagedict4_long_words_test.cpp
+ DEPENDS
+ searchlib_test
+ searchlib
+)
diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp
new file mode 100644
index 00000000000..ec822e58fd7
--- /dev/null
+++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4_long_words_test.cpp
@@ -0,0 +1,111 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchlib/common/tunefileinfo.h>
+#include <vespa/searchlib/diskindex/pagedict4file.h>
+#include <vespa/searchlib/diskindex/pagedict4randread.h>
+#include <vespa/searchlib/index/dummyfileheadercontext.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <filesystem>
+
+using search::diskindex::PageDict4FileSeqRead;
+using search::diskindex::PageDict4FileSeqWrite;
+using search::diskindex::PageDict4RandRead;
+using search::index::DummyFileHeaderContext;
+using search::index::PostingListCounts;
+using search::index::PostingListOffsetAndCounts;
+using search::index::PostingListParams;
+
+
+namespace {
+
+vespalib::string test_dir("long_words_dir");
+vespalib::string dict(test_dir + "/dict");
+
+PostingListCounts make_counts()
+{
+ PostingListCounts counts;
+ counts._bitLength = 100;
+ counts._numDocs = 1;
+ counts._segments.clear();
+ return counts;
+}
+
+vespalib::string
+make_word(int i)
+{
+ vespalib::asciistream os;
+ vespalib::string word(5_Ki, 'a');
+ os << vespalib::setfill('0') << vespalib::setw(8) << i;
+ word.append(os.str());
+ return word;
+}
+
+}
+
+/*
+ * A long word that don't fit into a 4 KiB 'page' causes a fallback to
+ * overflow handling where the word is put in the .ssdat file.
+ */
+TEST(PageDict4LongWordsTest, test_many_long_words)
+{
+ int num_words = 4_Mi;
+ auto counts = make_counts();
+ std::filesystem::remove_all(std::filesystem::path(test_dir));
+ std::filesystem::create_directories(std::filesystem::path(test_dir));
+
+ auto dw = std::make_unique<PageDict4FileSeqWrite>();
+ DummyFileHeaderContext file_header_context;
+ PostingListParams params;
+ search::TuneFileSeqWrite tune_file_write;
+ params.set("numWordIds", num_words);
+ params.set("minChunkDocs", 256_Ki);
+ dw->setParams(params);
+ EXPECT_TRUE(dw->open(dict, tune_file_write, file_header_context));
+ for (int i = 0; i < num_words; ++i) {
+ auto word = make_word(i);
+ dw->writeWord(word, counts);
+ }
+ EXPECT_TRUE(dw->close());
+ dw.reset();
+
+ auto drr = std::make_unique<PageDict4RandRead>();
+ search::TuneFileRandRead tune_file_rand_read;
+ EXPECT_TRUE(drr->open(dict, tune_file_rand_read));
+ PostingListOffsetAndCounts offset_and_counts;
+ uint64_t exp_offset = 0;
+ uint64_t exp_acc_num_docs = 0;
+ for (int i = 0; i < num_words; ++i) {
+ auto word = make_word(i);
+ uint64_t check_word_num = 0;
+ EXPECT_TRUE(drr->lookup(word, check_word_num, offset_and_counts));
+ EXPECT_EQ(i + 1, (int) check_word_num);
+ EXPECT_EQ(exp_offset, offset_and_counts._offset);
+ EXPECT_EQ(exp_acc_num_docs, offset_and_counts._accNumDocs);
+ EXPECT_EQ(counts, offset_and_counts._counts);
+ exp_offset += offset_and_counts._counts._bitLength;
+ exp_acc_num_docs += offset_and_counts._counts._numDocs;
+ }
+ EXPECT_TRUE(drr->close());
+ drr.reset();
+
+ auto dr = std::make_unique<PageDict4FileSeqRead>();
+ search::TuneFileSeqRead tune_file_read;
+ EXPECT_TRUE(dr->open(dict, tune_file_read));
+ vespalib::string check_word;
+ PostingListCounts check_counts;
+ for (int i = 0; i < num_words; ++i) {
+ uint64_t check_word_num = 0;
+ check_word.clear();
+ dr->readWord(check_word, check_word_num, check_counts);
+ EXPECT_EQ(i + 1, (int) check_word_num);
+ EXPECT_EQ(make_word(i), check_word);
+ EXPECT_EQ(counts, check_counts);
+ }
+ EXPECT_TRUE(dr->close());
+ dr.reset();
+
+ std::filesystem::remove_all(std::filesystem::path(test_dir));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h
index 232572a6314..9d4ca38eed3 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/compression.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h
@@ -1167,7 +1167,7 @@ public:
* Get remaining units in buffer (e.g. _realValE - _valI)
*/
- int32_t remainingUnits() const override { return _realValE - _valI; }
+ int64_t remainingUnits() const override { return _realValE - _valI; }
/**
* Get unit ptr (e.g. _valI) from decode context.
@@ -1181,7 +1181,7 @@ public:
}
uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const override {
- int intOffset = _realValE - _valI;
+ int64_t intOffset = _realValE - _valI;
if (bitOffset == -1) {
bitOffset = -64 - _preRead;
}
@@ -1200,7 +1200,7 @@ public:
uint64_t getBitPosV() const override { return getReadOffset(); }
- void adjUnitPtr(int newRemainingUnits) override {
+ void adjUnitPtr(int64_t newRemainingUnits) override {
_valI = _realValE - newRemainingUnits;
}
@@ -1219,7 +1219,7 @@ public:
* @param unitCount Number of bytes in buffer
* @param moreData Set if there is more data available
*/
- void setEnd(unsigned int unitCount, bool moreData) {
+ void setEnd(uint64_t unitCount, bool moreData) {
_valE = _realValE = _valI + unitCount;
if (moreData) {
_valE -= END_BUFFER_SAFETY;
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp
index ff74dc7a0e0..9d917918dab 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.cpp
+++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp
@@ -23,14 +23,17 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
bool isretryread = false;
retry:
- if (decodeContext.lastChunk())
+ if (decodeContext.lastChunk()) {
return; // Already reached end of file.
- int remainingUnits = decodeContext.remainingUnits();
+ }
+ int64_t remainingUnits = decodeContext.remainingUnits();
+ assert(remainingUnits >= 0);
// There's a good amount of data here already.
if (remainingUnits >
- static_cast<ssize_t>(ComprBuffer::minimumPadding())) //FIX! Tune
+ static_cast<ssize_t>(ComprBuffer::minimumPadding())) { //FIX! Tune
return;
+ }
// Assert that file read offset is aligned on unit boundary
assert((static_cast<size_t>(fileReadByteOffset) &
@@ -47,9 +50,9 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
// Continuation reads starts at aligned boundary.
assert(remainingUnits == 0 || padBeforeUnits == 0);
- if (readAll)
+ if (readAll) {
stopOffset = fileSize << 3;
- else if (!isretryread) {
+ } else if (!isretryread) {
stopOffset += 8 * cbuf.getUnitBitSize(); // XXX: Magic integer
// Realign stop offset to direct IO alignment boundary
uint64_t fileDirectIOBitAlign =
@@ -93,20 +96,19 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
fileReadByteOffset -= padBeforeUnits * cbuf.getUnitSize();
file.SetPosition(fileReadByteOffset);
}
- int readUnits0 = 0;
- if (readBits > 0)
- readUnits0 = static_cast<int>((readBits + cbuf.getUnitBitSize() - 1) /
- cbuf.getUnitBitSize());
+ size_t readUnits0 = 0;
+ if (readBits > 0) {
+ readUnits0 = (readBits + cbuf.getUnitBitSize() - 1) / cbuf.getUnitBitSize();
+ }
// Try to align end of read to an alignment boundary
- int readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset /
- cbuf.getUnitSize(), readUnits0);
- if (readUnits < readUnits0)
+ size_t readUnits = cbuf.getAligner().adjustElements(fileReadByteOffset / cbuf.getUnitSize(), readUnits0);
+ if (readUnits < readUnits0) {
isMore = true;
+ }
if (readUnits > 0) {
- int64_t padBytes = fileReadByteOffset +
- static_cast<int64_t>(readUnits) * cbuf.getUnitSize() - fileSize;
+ int64_t padBytes = fileReadByteOffset + readUnits * cbuf.getUnitSize() - fileSize;
if (!isMore && padBytes > 0) {
// Pad reading of file written with smaller unit size with
// NUL bytes.
@@ -115,17 +117,18 @@ ComprFileReadBase::ReadComprBuffer(uint64_t stopOffset,
readUnits * cbuf.getUnitSize() - padBytes,
0,
padBytes);
- } else
+ } else {
file.ReadBuf(cbuf.getComprBuf(), readUnits * cbuf.getUnitSize());
+ }
}
// If at end of file then add units of zero bits as padding
- if (!isMore)
+ if (!isMore) {
memset(reinterpret_cast<char *>(cbuf.getComprBuf()) +
readUnits * cbuf.getUnitSize(),
0,
cbuf.getUnitSize() * ComprBuffer::minimumPadding());
+ }
- assert(remainingUnits + readUnits >= 0);
decodeContext.afterRead(reinterpret_cast<char *>(cbuf.getComprBuf()) +
(padBeforeUnits - remainingUnits) *
static_cast<int32_t>(cbuf.getUnitSize()),
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h
index 8f8cffaffd6..4140425dc68 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.h
+++ b/searchlib/src/vespa/searchlib/util/comprfile.h
@@ -32,7 +32,7 @@ public:
* Get remaining units in buffer (e.g. _realValE - _valI)
*/
- virtual int32_t remainingUnits() const = 0;
+ virtual int64_t remainingUnits() const = 0;
/**
* Get unit ptr (e.g. _valI) from decode context.
@@ -51,7 +51,7 @@ public:
virtual uint64_t getBitPos(int bitOffset, uint64_t bufferEndFilePos) const = 0;
virtual uint64_t getBitPosV() const = 0;
virtual void skipBits(int bits) = 0;
- virtual void adjUnitPtr(int newRemainingUnits) = 0;
+ virtual void adjUnitPtr(int64_t newRemainingUnits) = 0;
virtual void emptyBuffer(uint64_t newBitPosition) = 0;
/**