diff options
author | Geir Storli <geirstorli@yahoo.no> | 2018-04-16 11:18:33 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-04-16 11:18:33 +0200 |
commit | 0151c5266ff31979ab79b3d13dfac96ca0a2676b (patch) | |
tree | a5fb3481aacce34930a3f69ac65b43c46dc060ec | |
parent | 8765c58c09b2699f8414d51f9bdec93aecf8dff1 (diff) | |
parent | ee859d3effc13d7b3e5e4a412930266a9a114ab9 (diff) |
Merge pull request #5574 from vespa-engine/toregge/validate-pagedict4-pages-during-sequential-read
Validate pagedict4 pages during sequential read
16 files changed, 865 insertions, 456 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt index 0ef5805c85a..eceb961bcfc 100644 --- a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt +++ b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt @@ -7,3 +7,12 @@ vespa_add_executable(searchlib_pagedict4_test_app TEST searchlib ) vespa_add_test(NAME searchlib_pagedict4_test_app COMMAND searchlib_pagedict4_test_app) + +vespa_add_executable(searchlib_pagedict4_hugeword_cornercase_test_app TEST + SOURCES + pagedict4_hugeword_cornercase_test.cpp + DEPENDS + searchlib_test + searchlib +) +vespa_add_test(NAME searchlib_pagedict4_hugeword_cornercase_test_app COMMAND searchlib_pagedict4_hugeword_cornercase_test_app) diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4_hugeword_cornercase_test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4_hugeword_cornercase_test.cpp new file mode 100644 index 00000000000..400108e91b0 --- /dev/null +++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4_hugeword_cornercase_test.cpp @@ -0,0 +1,190 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/searchlib/bitcompression/compression.h> +#include <vespa/searchlib/bitcompression/countcompression.h> +#include <vespa/searchlib/bitcompression/pagedict4.h> +#include <vespa/searchlib/test/diskindex/threelevelcountbuffers.h> +#include <vespa/searchlib/test/diskindex/pagedict4_mem_writer.h> +#include <vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.h> +#include <vespa/searchlib/index/postinglistcounts.h> + +#include <vespa/log/log.h> +LOG_SETUP("pagedict4_hugeword_cornercase_test"); + +using search::index::PostingListCounts; +using search::ComprFileWriteContext; + +constexpr uint32_t minChunkDocs = 262144; +constexpr uint32_t numWordIds = 65536; + +struct BitBuffer +{ + using EncodeContext = search::bitcompression::PostingListCountFileEncodeContext; + + EncodeContext encodeCtx; + ComprFileWriteContext writeCtx; + + BitBuffer() + : encodeCtx(), + writeCtx(encodeCtx) + { + encodeCtx._minChunkDocs = minChunkDocs; + encodeCtx._numWordIds = numWordIds; + writeCtx.allocComprBuf(); + encodeCtx.setWriteContext(&writeCtx); + encodeCtx.setupWrite(writeCtx); + assert(encodeCtx.getWriteOffset() == 0); + } + + void clear() { encodeCtx.setupWrite(writeCtx); } + + uint64_t getSize(const PostingListCounts &counts) + { + clear(); + encodeCtx.writeCounts(counts); + return encodeCtx.getWriteOffset(); + } + + ~BitBuffer() = default; +}; + +void addSegment(PostingListCounts &counts) +{ + PostingListCounts::Segment lastseg = counts._segments.back(); + PostingListCounts::Segment fillseg; + fillseg._bitLength = 4000000; + fillseg._numDocs = minChunkDocs; + fillseg._lastDoc = minChunkDocs; + counts._bitLength += fillseg._bitLength; + counts._numDocs += fillseg._numDocs; + counts._segments.back() = fillseg; + uint32_t lastDoc = counts._segments.size() * fillseg._numDocs; + counts._segments.back()._lastDoc = lastDoc; + counts._segments.push_back(lastseg); + counts._segments.back()._lastDoc = lastDoc + lastseg._numDocs; +} 
+ +PostingListCounts makeBaseCounts() +{ + PostingListCounts counts; + PostingListCounts::Segment lastseg; + lastseg._bitLength = 100; + lastseg._numDocs = 10; + lastseg._lastDoc = 10; + counts._bitLength = lastseg._bitLength; + counts._numDocs = lastseg._numDocs; + counts._segments.push_back(lastseg); + addSegment(counts); + return counts; +} + +PostingListCounts makeSegmentedCounts(uint32_t segments) +{ + PostingListCounts counts = makeBaseCounts(); + while (counts._segments.size() < segments) { + addSegment(counts); + } + return counts; +} + +uint32_t +calcSegments(uint32_t maxLen) +{ + BitBuffer bb; + PostingListCounts counts = makeBaseCounts(); + uint32_t len = bb.getSize(counts); + unsigned int i = 0; + while (len <= maxLen) { + addSegment(counts); + ++i; + len = bb.getSize(counts); + } + return counts._segments.size() - 1; +} + +/* + * Calculate posting list counts that compresses to wantLen bits. + */ +PostingListCounts makeCounts(uint32_t wantLen) +{ + BitBuffer bb; + uint32_t segments = calcSegments(wantLen); + PostingListCounts counts = makeSegmentedCounts(segments); + PostingListCounts counts2 = makeSegmentedCounts(segments - 1); + uint32_t len = bb.getSize(counts); + uint32_t len2 = bb.getSize(counts2); + for (uint32_t i = 1; i + 2 < counts._segments.size(); ++i) { + counts._bitLength += counts._segments[0]._bitLength; + counts._segments[i]._bitLength += counts._segments[0]._bitLength; + counts2._bitLength += counts2._segments[0]._bitLength; + counts2._segments[i]._bitLength += counts2._segments[0]._bitLength; + len = bb.getSize(counts); + len2 = bb.getSize(counts2); + if (len == wantLen) { + return counts; + } + if (len2 == wantLen) { + return counts2; + } + } + LOG(info, "Could not calculate counts with wanted compressed length"); + abort(); +} + +using StartOffset = search::bitcompression::PageDict4StartOffset; +using Writer = search::diskindex::test::PageDict4MemWriter; +using SeqReader = search::diskindex::test::PageDict4MemSeqReader; + +/* + * 
Test corner case where a dictionary page has a single word, and the + * page header and compressed counts completely fills the page. + */ +void testPageSizedCounts() +{ + uint32_t pageBitSize = 32768; + uint32_t startBits = 15 * 3 + 12; + + uint32_t ssPad = 64; + uint32_t spPad = 64; + uint32_t pPad = 64; + Writer w(minChunkDocs, numWordIds, ssPad, spPad, pPad); + PostingListCounts baseCounts = makeBaseCounts(); + PostingListCounts largeCounts = makeCounts(pageBitSize - startBits); + w.addCounts("a", baseCounts); + w.addCounts("b", baseCounts); + w.addCounts("c", largeCounts); + w.addCounts("d", baseCounts); + w.addCounts("e", baseCounts); + w.flush(); + + SeqReader r(minChunkDocs, numWordIds, w._buffers); + + uint64_t checkWordNum = 0; + PostingListCounts counts; + for (uint64_t wordNum = 1; wordNum < 7; ++wordNum) { + vespalib::string word; + counts.clear(); + r.readCounts(word, checkWordNum, counts); + if (wordNum < 6) { + EXPECT_EQUAL(checkWordNum, wordNum); + if (wordNum == 3) { + EXPECT_TRUE(counts == largeCounts); + } else { + EXPECT_TRUE(counts == baseCounts); + } + } else { + EXPECT_GREATER(checkWordNum, 100u); + } + } +} + + + +TEST("require that counts exactly filling dictionary page works") +{ + testPageSizedCounts(); +} + + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp index cf90356c949..e914fe7c559 100644 --- a/searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp +++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp @@ -6,6 +6,9 @@ #include <vespa/searchlib/bitcompression/countcompression.h> #include <vespa/searchlib/bitcompression/pagedict4.h> #include <vespa/searchlib/test/diskindex/threelevelcountbuffers.h> +#include <vespa/searchlib/test/diskindex/pagedict4_mem_writer.h> +#include <vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.h> +#include <vespa/searchlib/test/diskindex/pagedict4_mem_rand_reader.h> 
#include <vespa/searchlib/index/postinglistcounts.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> #include <vespa/searchlib/diskindex/pagedict4file.h> @@ -44,164 +47,9 @@ using search::index::schema::DataType; using namespace search::index; using StartOffset = search::bitcompression::PageDict4StartOffset; - -namespace -{ - - -class Writer : public search::diskindex::ThreeLevelCountWriteBuffers -{ -public: - PageDict4SSWriter *_ssw; - PageDict4SPWriter *_spw; - PageDict4PWriter *_pw; - - Writer(EC &sse, - EC &spe, - EC &pe) - : ThreeLevelCountWriteBuffers(sse, spe, pe), - _ssw(NULL), - _spw(NULL), - _pw(NULL) - { - } - - ~Writer() - { - delete _ssw; - delete _spw; - delete _pw; - } - - void allocWriters() - { - _ssw = new PageDict4SSWriter(_sse); - _spw = new PageDict4SPWriter(*_ssw, _spe); - _pw = new PageDict4PWriter(*_spw, _pe); - _spw->setup(); - _pw->setup(); - } - - void flush() - { - _pw->flush(); - ThreeLevelCountWriteBuffers::flush(); - } - - void addCounts(const std::string &word, - const PostingListCounts &counts) - { - _pw->addCounts(word, counts); - } -}; - - -class SeqReader : public search::diskindex::ThreeLevelCountReadBuffers -{ -public: - PageDict4SSReader _ssr; - PageDict4Reader _pr; - - SeqReader(DC &ssd, - DC &spd, - DC &pd, - search::diskindex::ThreeLevelCountWriteBuffers &wb) - : ThreeLevelCountReadBuffers(ssd, spd, pd, wb), - _ssr(_rcssd, - wb._ssHeaderLen, wb._ssFileBitSize, - wb._spHeaderLen, wb._spFileBitSize, - wb._pHeaderLen, wb._pFileBitSize), - _pr(_ssr, spd, pd) - { - _ssr.setup(ssd); - _pr.setup(); - } - - void readCounts(vespalib::string &word, - uint64_t &wordNum, - PostingListCounts &counts) - { - _pr.readCounts(word, wordNum, counts); - } -}; - -class RandReader : public search::diskindex::ThreeLevelCountReadBuffers -{ -public: - PageDict4SSReader _ssr; - const char *_spData; - const char *_pData; - size_t _pageSize; - - RandReader(DC &ssd, - DC &spd, - DC &pd, - search::diskindex::ThreeLevelCountWriteBuffers &wb) - 
: ThreeLevelCountReadBuffers(ssd, spd, pd, wb), - _ssr(_rcssd, - wb._ssHeaderLen, wb._ssFileBitSize, - wb._spHeaderLen, wb._spFileBitSize, - wb._pHeaderLen, wb._pFileBitSize), - _spData(static_cast<const char *>(_rcspd._comprBuf)), - _pData(static_cast<const char *>(_rcpd._comprBuf)), - _pageSize(search::bitcompression::PageDict4PageParams::getPageByteSize()) - { - _ssr.setup(ssd); - } - - bool - lookup(const std::string &key, - uint64_t &wordNum, - PostingListCounts &counts, - StartOffset &offsets) - { - PageDict4SSLookupRes sslr; - - sslr = _ssr.lookup(key); - if (!sslr._res) { - counts.clear(); - offsets = sslr._l6StartOffset; - wordNum = sslr._l6WordNum; - return false; - } - - if (sslr._overflow) { - wordNum = sslr._l6WordNum; - counts = sslr._counts; - offsets = sslr._startOffset; - return true; - } - PageDict4SPLookupRes splr; - splr.lookup(_ssr, - _spData + - _pageSize * sslr._sparsePageNum, - key, - sslr._l6Word, - sslr._lastWord, - sslr._l6StartOffset, - sslr._l6WordNum, - sslr._pageNum); - - PageDict4PLookupRes plr; - plr.lookup(_ssr, - _pData + _pageSize * splr._pageNum, - key, - splr._l3Word, - splr._lastWord, - splr._l3StartOffset, - splr._l3WordNum); - wordNum = plr._wordNum; - offsets = plr._startOffset; - if (plr._res) { - counts = plr._counts; - return true; - } - counts.clear(); - return false; - } -}; - -} +using Writer = search::diskindex::test::PageDict4MemWriter; +using SeqReader = search::diskindex::test::PageDict4MemSeqReader; +using RandReader = search::diskindex::test::PageDict4MemRandReader; class PageDict4TestApp : public FastOS_Application { @@ -518,9 +366,6 @@ testWords(const std::string &logname, bool firstWordForcedCommon, bool lastWordForcedCommon) { - typedef search::bitcompression::PostingListCountFileEncodeContext EC; - typedef search::bitcompression::PostingListCountFileDecodeContext DC; - LOG(info, "%s: word test start", logname.c_str()); std::vector<WordCounts> myrand; makeWords(myrand, rnd, numWordIds, tupleCount, @@ -536,17 
+381,7 @@ testWords(const std::string &logname, } LOG(info, "%s: word counts generated", logname.c_str()); - EC pe; - EC spe; - EC sse; - - sse._minChunkDocs = chunkSize; - sse._numWordIds = numWordIds; - spe.copyParams(sse); - pe.copyParams(sse); - Writer w(sse, spe, pe); - w.startPad(ssPad, spPad, pPad); - w.allocWriters(); + Writer w(chunkSize, numWordIds, ssPad, spPad, pPad); PostingListCounts counts; for (std::vector<WordCounts>::const_iterator @@ -563,23 +398,15 @@ testWords(const std::string &logname, "%s: Used %" PRIu64 "+%" PRIu64 "+%" PRIu64 " bits for %d words", logname.c_str(), - w._pFileBitSize, - w._spFileBitSize, - w._ssFileBitSize, + w._buffers._pFileBitSize, + w._buffers._spFileBitSize, + w._buffers._ssFileBitSize, (int) myrand.size()); StartOffset checkOffset; { - DC ssd; - ssd._minChunkDocs = chunkSize; - ssd._numWordIds = numWordIds; - DC spd; - spd.copyParams(ssd); - DC pd; - pd.copyParams(ssd); - - SeqReader r(ssd, spd, pd, w); + SeqReader r(chunkSize, numWordIds, w._buffers); uint64_t wordNum = 1; uint64_t checkWordNum = 0; @@ -596,20 +423,12 @@ testWords(const std::string &logname, checkOffset._fileOffset += counts._bitLength; checkOffset._accNumDocs += counts._numDocs; } - assert(pd.getReadOffset() == w._pFileBitSize); + assert(r._decoders.pd.getReadOffset() == w._buffers._pFileBitSize); LOG(info, "%s: words seqRead test OK", logname.c_str()); } { - DC ssd; - ssd._minChunkDocs = chunkSize; - ssd._numWordIds = numWordIds; - DC spd; - spd.copyParams(ssd); - DC pd; - pd.copyParams(ssd); - - RandReader rr(ssd, spd, pd, w); + RandReader rr(chunkSize, numWordIds, w._buffers); uint64_t wordNum = 1; uint64_t checkWordNum = 0; diff --git a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp index 82110d354d3..50abcca96a7 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.cpp @@ -63,13 +63,6 @@ 
PageDict4PageParams::getFileHeaderPad(uint32_t offset) } -std::ostream & -operator<<(std::ostream &stream, const index::PostingListCounts &counts) -{ - stream << "(d=" << counts._numDocs << ",b=" << counts._bitLength << ")"; - return stream; -} - typedef index::PostingListCounts Counts; typedef PageDict4StartOffset StartOffset; @@ -202,19 +195,6 @@ PageDict4SSWriter::addL6Skip(const vespalib::stringref &word, uint64_t pageNum, uint32_t sparsePageNum) { -#if 0 - LOG(info, - "addL6SKip, \"%s\" -> wordnum %d, page (%d,%d) startOffset %" PRId64 - ", SS bitOffset %" PRIu64, - word.c_str(), - (int) wordNum, - (int) pageNum, - (int) sparsePageNum, - startOffset.empty() ? - static_cast<int64_t>(0) : - startOffset[0]._fileOffset, - _eL6.getWriteOffset()); -#endif _eL6.writeBits(0, 1); // Selector bit writeStartOffset(_eL6, startOffset, @@ -227,12 +207,6 @@ PageDict4SSWriter::addL6Skip(const vespalib::stringref &word, size_t lcp = getLCP(word, _l6Word); vespalib::stringref wordSuffix = word.substr(lcp); _eL6.smallAlign(8); -#if 0 - LOG(info, - "lcp=%d, at offset %" PRIu64 , - (int) lcp, - _eL6.getWriteOffset()); -#endif _eL6.writeBits(lcp, 8); _eL6.writeComprBufferIfNeeded(); _eL6.writeString(wordSuffix); @@ -247,10 +221,6 @@ PageDict4SSWriter::addL6Skip(const vespalib::stringref &word, _l6StartOffset = startOffset; _l6Word = word; _l6WordNum = wordNum; -#if 0 - LOG(info, "after .. 
SS bit Offset %" PRId64, - _eL6.getWriteOffset()); -#endif } @@ -261,21 +231,6 @@ addOverflowCounts(const vespalib::stringref &word, const StartOffset &startOffset, uint64_t wordNum) { -#if 0 - std::ostringstream txtCounts; - std::ostringstream txtStartOffset; - std::ostringstream txtL6StartOffset; - txtCounts << counts; - txtStartOffset << startOffset; - txtL6StartOffset << _l6StartOffset; - LOG(info, - "addL6Overflow, \"%s\" wordNum %d, counts %s fileoffset %s l6startOffset %s", - word.c_str(), - (int) wordNum, - txtCounts.str().c_str(), - txtStartOffset.str().c_str(), - txtL6StartOffset.str().c_str()); -#endif _eL6.writeBits(1, 1); // Selector bit writeStartOffset(_eL6, startOffset, @@ -504,11 +459,6 @@ PageDict4SPWriter::addL3Skip(const vespalib::stringref &word, uint64_t wordNum, uint64_t pageNum) { -#if 0 - LOG(info, - "addL3Skip(\"%s\"), wordNum=%d pageNum=%d", - word.c_str(), (int) wordNum, (int) pageNum); -#endif assert(_l3WordOffset == _words.size()); /* * Update notion of previous size, converting tentative writes to @@ -530,10 +480,6 @@ PageDict4SPWriter::addL3Skip(const vespalib::stringref &word, _l3StartOffset, K_VALUE_COUNTFILE_L3_FILEOFFSET, K_VALUE_COUNTFILE_L3_ACCNUMDOCS); -#if 0 - LOG(info, - "Adding l3 delta %d", (int) (wordNum - _l3WordNum)); -#endif _eL3.encodeExpGolomb(wordNum - _l3WordNum, K_VALUE_COUNTFILE_L3_WORDNUM); _eL3.writeComprBufferIfNeeded(); @@ -555,7 +501,7 @@ PageDict4SPWriter::addL3Skip(const vespalib::stringref &word, // Flush existing full writes. flushPage(); - // Compensate for elided entry. 
+ // Promote elided L3 entry to L6 entry _l6Word = word; _l6StartOffset = startOffset; _l6WordNum = wordNum; @@ -572,11 +518,6 @@ PageDict4SPWriter::addL3Skip(const vespalib::stringref &word, void PageDict4SPWriter::addL4Skip(size_t &lcp) { -#if 0 - LOG(info, - "addL4Skip(\"%s\")", - _l3Word.c_str()); -#endif size_t tlcp = getLCP(_l3Word, _l4Word); assert(tlcp <= lcp); if (tlcp < lcp) @@ -613,11 +554,6 @@ PageDict4SPWriter::addL4Skip(size_t &lcp) void PageDict4SPWriter::addL5Skip(size_t &lcp) { -#if 0 - LOG(info, - "addL5Skip(\"%s\")", - _l3Word.c_str()); -#endif size_t tlcp = getLCP(_l3Word, _l5Word); assert(tlcp <= lcp); if (tlcp < lcp) @@ -838,15 +774,6 @@ PageDict4PWriter:: addCounts(const vespalib::stringref &word, const Counts &counts) { -#if 0 - std::ostringstream txtcounts; - txtcounts << counts; - LOG(info, - "addCounts(\"%s\", %s), wordNum=%d", - word.c_str(), - txtcounts.str().c_str(), - (int) _wordNum); -#endif assert(_countsWordOffset == _words.size()); size_t lcp = getLCP(_pendingCountsWord, _countsWord); if (_l1StrideCheck >= getL1SkipStride()) @@ -858,14 +785,6 @@ addCounts(const vespalib::stringref &word, if (eCountsOffset + _l1Size + _l2Size + _headerSize + 8 * (_countsWordOffset + 2 + _pendingCountsWord.size() - lcp) > getPageBitSize()) { -#if 0 - LOG(info, - "Backtrack: eCountsOffset=%d, l1size=%d, l2size=%d, hdrsize=%d", - (int) eCountsOffset, - (int) _l1Size, - (int) _l2Size, - (int) _headerSize); -#endif if (_l1StrideCheck == 0u) { _l1Size = _prevL1Size; // Undo L1 _l2Size = _prevL2Size; // Undo L2 @@ -890,11 +809,6 @@ addCounts(const vespalib::stringref &word, _l3WordNum, getPageNum()); resetPage(); -#if 0 - std::ostringstream txtoffsets; - txtoffsets << _countsStartOffset; - LOG(info, "countsStartOffsets=%s", txtoffsets.str().c_str()); -#endif return; } } @@ -902,11 +816,6 @@ addCounts(const vespalib::stringref &word, ++_countsEntries; ++_l1StrideCheck; _countsStartOffset.adjust(counts); -#if 0 - std::ostringstream txtoffsets; - txtoffsets 
<< _countsStartOffset; - LOG(info, "countsStartOffsets=%s", txtoffsets.str().c_str()); -#endif _countsWord = _pendingCountsWord; _countsWordOffset = _words.size(); _pendingCountsWord = word; @@ -936,10 +845,6 @@ PageDict4PWriter::addOverflowCounts(const vespalib::stringref &word, e.smallAlign(64); e.writeComprBufferIfNeeded(); e.writeBits(_wordNum, 64); // Identifies overflow for later read -#if 0 - LOG(info, - "AddOverflowCounts wordnum %d", (int) _wordNum); -#endif uint32_t alignedHeaderSize = (_headerSize + 63) & -64; uint32_t padding = getPageBitSize() - alignedHeaderSize - 64; e.padBits(padding); @@ -963,14 +868,6 @@ PageDict4PWriter::addL1Skip(size_t &lcp) if (tlcp < lcp) lcp = tlcp; _l1StrideCheck = 0u; -#if 0 - LOG(info, - "addL1SKip(\"%s\"), lcp=%d, offset=%d -> %d", - _pendingCountsWord.c_str(), - (int) lcp, - (int) _l1WordOffset, - (int) _countsWordOffset); -#endif _eL1.encodeExpGolomb(_countsWordOffset - _l1WordOffset, K_VALUE_COUNTFILE_L1_WORDOFFSET); _eL1.writeComprBufferIfNeeded(); @@ -1000,14 +897,6 @@ PageDict4PWriter::addL2Skip(size_t &lcp) if (tlcp < lcp) lcp = tlcp; _l2StrideCheck = 0; -#if 0 - LOG(info, - "addL2SKip(\"%s\"), lcp=%d, offset=%d -> %d", - _pendingCountsWord.c_str(), - (int) lcp, - (int) _l2WordOffset, - (int) _countsWordOffset); -#endif _eL2.encodeExpGolomb(_countsWordOffset - _l2WordOffset, K_VALUE_COUNTFILE_L2_WORDOFFSET); _eL2.writeComprBufferIfNeeded(); @@ -1101,12 +990,6 @@ PageDict4SSReader::setup(DC &ssd) DC dL6; -#if 0 - LOG(info, - "comprBuf=%p, comprBufSize=%d", - static_cast<const void *>(_cb._comprBuf), - (int) _cb._comprBufSize); -#endif setDecoderPosition(dL6, _cb, _ssStartOffset); dL6.copyParams(_ssd); @@ -1127,12 +1010,6 @@ PageDict4SSReader::setup(DC &ssd) bool overflow = false; while (l6Offset < _ssFileBitLen) { -#if 0 - LOG(info, - "L6Offset=%" PRIu32 ", bitLen=%" PRIu64, - l6Offset, - _ssFileBitLen); -#endif UC64_DECODECONTEXT(o); uint32_t length; uint64_t val64; @@ -1166,11 +1043,6 @@ 
PageDict4SSReader::setup(DC &ssd) UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L6_WORDNUM, EC); -#if 0 - LOG(info, - "Bumping l6wordnum from %d to %d (delta %d)", - (int) l6WordNum, (int) (l6WordNum + val64) , (int) val64); -#endif l6WordNum += val64; UC64_DECODECONTEXT_STORE(o, dL6._); dL6.smallAlign(8); @@ -1182,10 +1054,6 @@ PageDict4SSReader::setup(DC &ssd) word += reinterpret_cast<const char *>(bytes); dL6.setByteCompr(bytes + word.size() + 1 - lcp); if (overflow) { -#if 0 - LOG(info, - "AddOverflowRef2 wordnum %d", (int) (l6WordNum - 1)); -#endif _overflows.push_back(OverflowRef(l6WordNum - 1, _l7.size())); dL6.readCounts(counts); startOffset.adjust(counts); @@ -1199,18 +1067,6 @@ PageDict4SSReader::setup(DC &ssd) ++sparsePageNum; UC64_DECODECONTEXT_STORE(o, dL6._); } -#if 0 - std::ostringstream txtfileoffset; - txtfileoffset << startOffset; - LOG(info, - "ssreader::setup " - "word=%s, l6offset=%d->%d, startOffsets=%s overflow=%s", - word.c_str(), - (int) l6Offset, - (int) dL6.getReadOffset(), - txtfileoffset.str().c_str(), - overflow ? 
"true" : "false"); -#endif ++l7StrideCheck; l6Offset = dL6.getReadOffset(); } @@ -1281,14 +1137,6 @@ lookup(const vespalib::stringref &key) l6WordNum = l7e._l7WordNum; } -#if 0 - LOG(info, - "sslookup1: l6WordNum=%d, l6Word=\"%s\", key=\"%s\", l6Offset=%d", - (int) l6WordNum, - l6Word.c_str(), - key.c_str(), - (int) l6Offset); -#endif setDecoderPosition(dL6, _cb, l6Offset); @@ -1325,13 +1173,6 @@ lookup(const vespalib::stringref &key) word += reinterpret_cast<const char *>(bytes); dL6.setByteCompr(bytes + word.size() + 1 - lcp); if (overflow) { -#if 0 - LOG(info, - "sslookup: wordNum=%d, word=\"%s\", key=\"%s\"", - (int) wordNum, - word.c_str(), - key.c_str()); -#endif bool l6NotLessThanKey = !(word < key); if (l6NotLessThanKey) { if (key == word) { @@ -1398,13 +1239,6 @@ lookupOverflow(uint64_t wordNum) const assert(l7Ref < _l7.size()); const vespalib::string &word = _l7[l7Ref]._l7Word; -#if 0 - LOG(info, - "lookupOverflow: wordNum %d -> word %s, next l7 Pos is %d", - (int) wordNum, - word.c_str(), - (int) l7Ref); -#endif uint64_t l6Offset = _ssStartOffset; StartOffset startOffset; if (l7Ref > 0) { @@ -1428,18 +1262,6 @@ lookupOverflow(uint64_t wordNum) const dL6.copyParams(_ssd); setDecoderPosition(dL6, _cb, l6Offset); -#if 0 - std::ostringstream txtStartOffset; - std::ostringstream txtL6StartOffset; - txtStartOffset << startOffset; - txtL6StartOffset << l6StartOffset; - LOG(info, - "Lookupoverflow l6Offset=%d, l6fileoffset=%s, fileoffset=%s", - (int) l6Offset, - txtL6StartOffset.str().c_str(), - txtStartOffset.str().c_str()); -#endif - UC64_DECODECONTEXT(o); uint32_t length; const bool bigEndian = true; @@ -1474,16 +1296,6 @@ lookupOverflow(uint64_t wordNum) const (void) lcp; Counts counts; dL6.readCounts(counts); -#if 0 - std::ostringstream txtCounts; - txtStartOffset.str(""); - txtStartOffset << startOffset; - txtCounts << counts; - LOG(info, - "Lookupoverflow fileoffset=%s, counts=%s", - txtStartOffset.str().c_str(), - txtCounts.str().c_str()); -#endif 
res._overflow = true; res._counts = counts; res._startOffset = startOffset; @@ -1911,7 +1723,12 @@ PageDict4Reader::PageDict4Reader(const SSReader &ssReader, _spwc(), _spwe(), _ssd(), - _wordNum(1u) + _wordNum(1u), + _l1SkipChecks(), + _l2SkipChecks(), + _l3SkipChecks(), + _l4SkipChecks(), + _l5SkipChecks() { } @@ -1929,8 +1746,8 @@ PageDict4Reader::setup() _spd.skipBits(getFileHeaderPad(_ssReader._spStartOffset)); assert(_pFileBitLen >= _pd.getReadOffset()); if (_pFileBitLen > _pd.getReadOffset()) { - setupPage(); setupSPage(); + setupPage(); } const ComprBuffer &sscb = _ssReader._cb; @@ -1943,25 +1760,62 @@ PageDict4Reader::~PageDict4Reader() { } +namespace +{ + +template <typename CheckVector> +void checkWordOffset(CheckVector &skip, uint32_t &skipAdjust, uint32_t wordOffset, uint32_t wordEntryLen) +{ + if (skip.valid() && skip->wordOffset + skipAdjust <= wordOffset) { + assert(skip->wordOffset + skipAdjust == wordOffset); + skipAdjust += wordEntryLen; + skip.step(); + } +} + +} + + +template <typename Entry1, typename Entry2> +void +PageDict4Reader::checkWordOffsets(const std::vector<char> &words, + CheckVector<Entry1> &skip1, + CheckVector<Entry2> &skip2) +{ + skip1.setup(); + skip2.setup(); + uint32_t wordOffset = 0; + uint32_t skip1Adjust = 0; + uint32_t skip2Adjust = 0; + auto c = words.cbegin(); + auto ce = words.cend(); + while (c != ce) { + wordOffset = c - words.cbegin(); + ++c; // skip lcp + assert(c != ce); + while (*c != '\0') { + ++c; + assert(c != ce); + } + assert(c != ce); + ++c; + uint32_t wordEntryLen = c - words.cbegin() - wordOffset; + checkWordOffset(skip1, skip1Adjust, wordOffset, wordEntryLen); + checkWordOffset(skip2, skip2Adjust, wordOffset, wordEntryLen); + } + assert(!skip1.valid()); + assert(!skip2.valid()); +} void PageDict4Reader::setupPage() { -#if 0 - LOG(info, - "setupPage(%ld), " - (long int) _pd.getReadOffset()); -#endif uint32_t l2Size = _pd.readBits(15); uint32_t l1Size = _pd.readBits(15); uint32_t countsEntries = 
_pd.readBits(15); uint32_t wordsSize = _pd.readBits(12); _countsResidue = countsEntries; -#if 0 - _pd.skipBits(l2Size + l1Size); - Counts counts; -#else if (countsEntries == 0 && l1Size == 0 && l2Size == 0) { _pd.smallAlign(64); _overflowPage = true; @@ -1974,138 +1828,214 @@ PageDict4Reader::setupPage() uint64_t beforePos = _pd.getReadOffset(); Counts counts; - StartOffset startOffset; + _l2SkipChecks.clear(); + L2SkipCheck l2SkipCheck(_startOffset); while (l2Residue > 0) { UC64_DECODECONTEXT(o); uint32_t length; + uint64_t val64; const bool bigEndian = true; UC64_DECODECONTEXT_LOAD(o, _pd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L2_WORDOFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L2_WORDOFFSET, EC); + l2SkipCheck.wordOffset += val64; UC64_DECODECONTEXT_STORE(o, _pd._); readStartOffset(_pd, - startOffset, + l2SkipCheck.startOffset, K_VALUE_COUNTFILE_L2_FILEOFFSET, K_VALUE_COUNTFILE_L2_ACCNUMDOCS); UC64_DECODECONTEXT_LOAD(o, _pd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L2_COUNTOFFSET, EC); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L2_L1OFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L2_COUNTOFFSET, EC); + l2SkipCheck.countOffset += val64; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L2_L1OFFSET, EC); + l2SkipCheck.l1Offset += val64; UC64_DECODECONTEXT_STORE(o, _pd._); --l2Residue; + _l2SkipChecks.push_back(l2SkipCheck); } + _l2SkipChecks.setup(); assert(_pd.getReadOffset() == beforePos + l2Size); + _l1SkipChecks.clear(); + L1SkipCheck l1SkipCheck(_startOffset); while (l1Residue > 0) { UC64_DECODECONTEXT(o); uint32_t length; + uint64_t val64; const bool bigEndian = true; UC64_DECODECONTEXT_LOAD(o, _pd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L1_WORDOFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L1_WORDOFFSET, EC); + l1SkipCheck.wordOffset += val64; UC64_DECODECONTEXT_STORE(o, _pd._); readStartOffset(_pd, - startOffset, + l1SkipCheck.startOffset, K_VALUE_COUNTFILE_L1_FILEOFFSET, 
K_VALUE_COUNTFILE_L1_ACCNUMDOCS); UC64_DECODECONTEXT_LOAD(o, _pd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L1_COUNTOFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L1_COUNTOFFSET, EC); + l1SkipCheck.countOffset += val64; UC64_DECODECONTEXT_STORE(o, _pd._); --l1Residue; + _l1SkipChecks.push_back(l1SkipCheck); + if (_l2SkipChecks.valid()) { + uint64_t l1CheckOffset = beforePos + l2Size + _l2SkipChecks->l1Offset; + uint64_t l1Offset = _pd.getReadOffset(); + if (l1Offset >= l1CheckOffset) { + assert(l1Offset == l1CheckOffset); + assert(_l2SkipChecks->check(l1SkipCheck)); + _l2SkipChecks.step(); + } + } } + assert(!_l2SkipChecks.valid()); + _l1SkipChecks.setup(); assert(_pd.getReadOffset() == beforePos + l2Size + l1Size); (void) beforePos; -#endif _counts.clear(); + StartOffset startOffset(_startOffset); while (countsEntries > 0) { _pd.readCounts(counts); _counts.push_back(counts); + startOffset.adjust(counts); --countsEntries; + if (_l1SkipChecks.valid()) { + uint64_t countsCheckOffset = beforePos + l2Size + l1Size + _l1SkipChecks->countOffset; + uint64_t countsOffset = _pd.getReadOffset(); + if (countsOffset >= countsCheckOffset) { + assert(countsOffset == countsCheckOffset); + assert(startOffset == _l1SkipChecks->startOffset); + _l1SkipChecks.step(); + } + } + } + assert(!_l1SkipChecks.valid()); + if (_l3SkipChecks.valid()) { + assert(_l3SkipChecks->startOffset == startOffset); + assert(_l3SkipChecks->wordNum == _wordNum + _countsResidue); + _l3SkipChecks.step(); } _cc = _counts.begin(); _ce = _counts.end(); uint32_t pageOffset = _pd.getReadOffset() & (getPageBitSize() - 1); - uint32_t padding = getPageBitSize() - wordsSize * 8 - pageOffset; + uint32_t padding = (getPageBitSize() - wordsSize * 8 - pageOffset) & (getPageBitSize() - 1); _pd.skipBits(padding); _words.resize(wordsSize); _pd.readBytes(reinterpret_cast<uint8_t *>(&_words[0]), wordsSize); _wc = _words.begin(); _we = _words.end(); + checkWordOffsets(_words, _l1SkipChecks, _l2SkipChecks); } 
void PageDict4Reader::setupSPage() { -#if 0 - LOG(info, "setupSPage(%d),", (int) _spd.getReadOffset()); -#endif uint32_t l5Size = _spd.readBits(15); uint32_t l4Size = _spd.readBits(15); uint32_t l3Entries = _spd.readBits(15); uint32_t wordsSize = _spd.readBits(12); _l3Residue = l3Entries; - -#if 0 - _spd.skipBits(l5Size + l4Size); -#else + assert(!_l3SkipChecks.valid()); assert(l3Entries > 0); uint32_t l4Residue = getL4Entries(l3Entries); uint32_t l5Residue = getL5Entries(l4Residue); uint64_t beforePos = _spd.getReadOffset(); - StartOffset startOffset; + _l5SkipChecks.clear(); + L5SkipCheck l5SkipCheck(_startOffset, _wordNum); while (l5Residue > 0) { UC64_DECODECONTEXT(o); uint32_t length; + uint64_t val64; const bool bigEndian = true; UC64_DECODECONTEXT_LOAD(o, _spd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_WORDOFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_WORDOFFSET, EC); + l5SkipCheck.wordOffset += val64; UC64_DECODECONTEXT_STORE(o, _spd._); readStartOffset(_spd, - startOffset, + l5SkipCheck.startOffset, K_VALUE_COUNTFILE_L5_FILEOFFSET, K_VALUE_COUNTFILE_L5_ACCNUMDOCS); UC64_DECODECONTEXT_LOAD(o, _spd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_WORDNUM, EC); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_L3OFFSET, EC); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_L4OFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_WORDNUM, EC); + l5SkipCheck.wordNum += val64; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_L3OFFSET, EC); + l5SkipCheck.l3Offset += val64; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L5_L4OFFSET, EC); + l5SkipCheck.l4Offset += val64; UC64_DECODECONTEXT_STORE(o, _spd._); --l5Residue; + _l5SkipChecks.push_back(l5SkipCheck); } + _l5SkipChecks.setup(); assert(_spd.getReadOffset() == beforePos + l5Size); + _l4SkipChecks.clear(); + L4SkipCheck l4SkipCheck(_startOffset, _wordNum); while (l4Residue > 0) { UC64_DECODECONTEXT(o); uint32_t length; + uint64_t val64; const bool bigEndian = true; 
UC64_DECODECONTEXT_LOAD(o, _spd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L4_WORDOFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L4_WORDOFFSET, EC); + l4SkipCheck.wordOffset += val64; UC64_DECODECONTEXT_STORE(o, _spd._); readStartOffset(_spd, - startOffset, + l4SkipCheck.startOffset, K_VALUE_COUNTFILE_L4_FILEOFFSET, K_VALUE_COUNTFILE_L4_ACCNUMDOCS); UC64_DECODECONTEXT_LOAD(o, _spd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L4_WORDNUM, EC); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L4_L3OFFSET, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L4_WORDNUM, EC); + l4SkipCheck.wordNum += val64; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L4_L3OFFSET, EC); + l4SkipCheck.l3Offset += val64; UC64_DECODECONTEXT_STORE(o, _spd._); --l4Residue; + _l4SkipChecks.push_back(l4SkipCheck); + if (_l5SkipChecks.valid()) { + uint64_t l4CheckOffset = beforePos + l5Size + _l5SkipChecks->l4Offset; + uint64_t l4Offset = _spd.getReadOffset(); + if (l4Offset >= l4CheckOffset) { + assert(l4Offset == l4CheckOffset); + assert(_l5SkipChecks->check(l4SkipCheck)); + _l5SkipChecks.step(); + } + } } + assert(!_l5SkipChecks.valid()); + _l4SkipChecks.setup(); assert(_spd.getReadOffset() == beforePos + l5Size + l4Size); (void) l4Size; (void) l5Size; (void) beforePos; -#endif + _l3SkipChecks.clear(); + L3SkipCheck l3SkipCheck(_startOffset, _wordNum); while (l3Entries > 1) { readStartOffset(_spd, - startOffset, + l3SkipCheck.startOffset, K_VALUE_COUNTFILE_L3_FILEOFFSET, K_VALUE_COUNTFILE_L3_ACCNUMDOCS); UC64_DECODECONTEXT(o); uint32_t length; + uint64_t val64; const bool bigEndian = true; UC64_DECODECONTEXT_LOAD(o, _spd._); - UC64_SKIPEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L3_WORDNUM, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_COUNTFILE_L3_WORDNUM, EC); + l3SkipCheck.wordNum += val64; UC64_DECODECONTEXT_STORE(o, _spd._); --l3Entries; + _l3SkipChecks.push_back(l3SkipCheck); + if (_l4SkipChecks.valid()) { + uint64_t l3CheckOffset = beforePos + l5Size + l4Size + 
_l4SkipChecks->l3Offset; + uint64_t l3Offset = _spd.getReadOffset(); + if (l3Offset >= l3CheckOffset) { + assert(l3Offset == l3CheckOffset); + assert(_l4SkipChecks->check(l3SkipCheck)); + _l4SkipChecks.step(); + } + } } + assert(!_l4SkipChecks.valid()); + _l3SkipChecks.setup(); uint32_t pageOffset = _spd.getReadOffset() & (getPageBitSize() - 1); uint32_t padding = getPageBitSize() - wordsSize * 8 - pageOffset; _spd.skipBits(padding); @@ -2113,6 +2043,7 @@ PageDict4Reader::setupSPage() _spd.readBytes(reinterpret_cast<uint8_t *>(&_spwords[0]), wordsSize); _spwc = _spwords.begin(); _spwe = _spwords.end(); + checkWordOffsets(_spwords, _l4SkipChecks, _l5SkipChecks); } @@ -2188,13 +2119,6 @@ PageDict4Reader::decodeSSWord(vespalib::string &word) word += reinterpret_cast<const char *>(bytes); _ssd.setByteCompr(bytes + word.size() + 1 - lcp); _lastSSWord = word; -#if 0 - LOG(info, - "word is %s LCP %d, overflow=%s", - word.c_str(), - (int) lcp, - overflow ? "true" : "false"); -#endif if (overflow) { Counts counts; _ssd.readCounts(counts); @@ -2243,17 +2167,18 @@ PageDict4Reader::readCounts(vespalib::string &word, --_l3Residue; } --_countsResidue; + wordNum = _wordNum++; if (_countsResidue == 0) { assert((_pd.getReadOffset() & (getPageBitSize() - 1)) == 0); if (_pd.getReadOffset() < _pFileBitLen) { - setupPage(); - if (_l3Residue == 0) + if (_l3Residue == 0) { setupSPage(); + } + setupPage(); } else { assert(_pd.getReadOffset() == _pFileBitLen); } } - wordNum = _wordNum++; } else if (_overflowPage) { readOverflowCounts(word, counts); _overflowPage = false; @@ -2266,15 +2191,16 @@ PageDict4Reader::readCounts(vespalib::string &word, assert(tword == word); --_l3Residue; _lastWord = word; + wordNum = _wordNum++; _pd.align(getPageBitSize()); if (_pd.getReadOffset() < _pFileBitLen) { - setupPage(); - if (_l3Residue == 0) + if (_l3Residue == 0) { setupSPage(); + } + setupPage(); } else { assert(_pd.getReadOffset() == _pFileBitLen); } - wordNum = _wordNum++; } else { // Mark end of 
file. word.clear(); @@ -2298,25 +2224,14 @@ PageDict4Reader::readOverflowCounts(vespalib::string &word, word = wtsslr._lastWord; counts = wtsslr._counts; -#if 0 - std::ostringstream txtCounts; - std::ostringstream txtStartOffset; - std::ostringstream txtLRStartOffset; - - txtCounts << counts; - txtStartOffset << _startOffset; - txtLRStartOffset << wtsslr._startOffset; - LOG(info, - "readOverflowCounts _wordNum=%" PRIu64 - ", counts=%s, startOffset=%s (should be %s)", - _wordNum, - txtCounts.str().c_str(), - txtLRStartOffset.str().c_str(), - txtStartOffset.str().c_str()); -#endif - + assert(wordNum == _wordNum); assert(wtsslr._startOffset == _startOffset); _startOffset.adjust(counts); + if (_l3SkipChecks.valid()) { + assert(_l3SkipChecks->startOffset == _startOffset); + assert(_l3SkipChecks->wordNum == _wordNum + 1); + _l3SkipChecks.step(); + } } } // namespace bitcompression diff --git a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h index 47f2354bcc6..717702d4ef7 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h +++ b/searchlib/src/vespa/searchlib/bitcompression/pagedict4.h @@ -201,7 +201,7 @@ private: uint32_t _l4WordOffset; // Offset for last L4 word written uint32_t _l5WordOffset; // Offset for last L5 word written - // file offsets + // Offsets in data files for last L3 entry StartOffset _l3StartOffset; // Offsets in data files for last L4 entry @@ -213,7 +213,7 @@ private: // Offsets in data files for last L6 entry StartOffset _l6StartOffset; - uint64_t _l3WordNum; // word number next L3 entry to write + uint64_t _l3WordNum; // word number last L3 entry uint64_t _l4WordNum; // word number last L4 entry uint64_t _l5WordNum; // word number last L5 entry uint64_t _l6WordNum; // word number last L6 entry @@ -663,6 +663,106 @@ public: StartOffset _startOffset; bool _overflowPage; typedef std::vector<Counts> PCV; + struct L1SkipCheck + { + uint32_t wordOffset; + StartOffset 
startOffset; + uint32_t countOffset; + L1SkipCheck(const StartOffset &startOffset_) + : wordOffset(0), + startOffset(startOffset_), + countOffset(0) + { + } + }; + struct L2SkipCheck : public L1SkipCheck + { + uint32_t l1Offset; + L2SkipCheck(const StartOffset &startOffset_) + : L1SkipCheck(startOffset_), + l1Offset(0) + { + } + + bool check(const L1SkipCheck &rhs) const { + return startOffset == rhs.startOffset && + countOffset == rhs.countOffset; + } + }; + struct L3SkipCheck + { + StartOffset startOffset; + uint64_t wordNum; + L3SkipCheck(const StartOffset &startOffset_, uint64_t wordNum_) + : startOffset(startOffset_), + wordNum(wordNum_) + { + } + }; + struct L4SkipCheck : public L3SkipCheck + { + uint32_t wordOffset; + uint32_t l3Offset; + L4SkipCheck(const StartOffset &startOffset_, uint64_t wordNum_) + : L3SkipCheck(startOffset_, wordNum_), + wordOffset(0), + l3Offset(0) + { + } + bool check(const L3SkipCheck &rhs) const { + return startOffset == rhs.startOffset && + wordNum == rhs.wordNum; + } + }; + struct L5SkipCheck : public L4SkipCheck + { + uint32_t l4Offset; + L5SkipCheck(const StartOffset &startOffset_, uint64_t wordNum_) + : L4SkipCheck(startOffset_, wordNum_), + l4Offset(0) + { + } + bool check(const L4SkipCheck &rhs) const { + return startOffset == rhs.startOffset && + wordNum == rhs.wordNum && + l3Offset == rhs.l3Offset; + } + }; + template <typename Element> + class CheckVector + { + using Vector = std::vector<Element>; + Vector _vector; + typename Vector::const_iterator _cur; + typename Vector::const_iterator _end; + public: + CheckVector() + : _vector(), + _cur(), + _end() + { + } + void clear() { + _vector.clear(); + } + void setup() { + _cur = _vector.cbegin(); + _end = _vector.cend(); + } + bool valid() const { return _cur != _end; } + const Element *operator->() const { return _cur.operator->(); } + void step() { + ++_cur; + } + void push_back(const Element &element) { + _vector.push_back(element); + } + }; + template <typename Entry1, 
typename Entry2> + void + checkWordOffsets(const std::vector<char> &words, + CheckVector<Entry1> &skip1, + CheckVector<Entry2> &skip2); PCV _counts; PCV::const_iterator _cc; PCV::const_iterator _ce; @@ -681,6 +781,11 @@ public: DC _ssd; uint64_t _wordNum; + CheckVector<L1SkipCheck> _l1SkipChecks; + CheckVector<L2SkipCheck> _l2SkipChecks; + CheckVector<L3SkipCheck> _l3SkipChecks; + CheckVector<L4SkipCheck> _l4SkipChecks; + CheckVector<L5SkipCheck> _l5SkipChecks; PageDict4Reader(const SSReader &ssReader, diff --git a/searchlib/src/vespa/searchlib/test/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/test/diskindex/CMakeLists.txt index 5b698234b90..f4d8853aeca 100644 --- a/searchlib/src/vespa/searchlib/test/diskindex/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/test/diskindex/CMakeLists.txt @@ -1,6 +1,11 @@ # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_searchlib_test_diskindex OBJECT SOURCES + pagedict4_decoders.cpp + pagedict4_encoders.cpp + pagedict4_mem_seq_reader.cpp + pagedict4_mem_rand_reader.cpp + pagedict4_mem_writer.cpp threelevelcountbuffers.cpp testdiskindex.cpp DEPENDS diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_decoders.cpp b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_decoders.cpp new file mode 100644 index 00000000000..122ce582850 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_decoders.cpp @@ -0,0 +1,20 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "pagedict4_decoders.h" + +namespace search::diskindex::test { + +PageDict4Decoders::PageDict4Decoders(uint32_t chunkSize, uint64_t numWordIds) + : ssd(), + spd(), + pd() +{ + ssd._minChunkDocs = chunkSize; + ssd._numWordIds = numWordIds; + spd.copyParams(ssd); + pd.copyParams(ssd); +} + +PageDict4Decoders::~PageDict4Decoders() = default; + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_decoders.h b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_decoders.h new file mode 100644 index 00000000000..c92364ba585 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_decoders.h @@ -0,0 +1,24 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/bitcompression/countcompression.h> + +namespace search::diskindex::test { + +/* + * Decode contexts used when reading memory based pagedict4 structure + */ +struct PageDict4Decoders +{ + using DC = search::bitcompression::PostingListCountFileDecodeContext; + + DC ssd; + DC spd; + DC pd; + + PageDict4Decoders(uint32_t chunkSize, uint64_t numWordIds); + ~PageDict4Decoders(); +}; + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_encoders.cpp b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_encoders.cpp new file mode 100644 index 00000000000..9d5cc7be9d4 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_encoders.cpp @@ -0,0 +1,20 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "pagedict4_encoders.h" + +namespace search::diskindex::test { + +PageDict4Encoders::PageDict4Encoders(uint32_t chunkSize, uint64_t numWordIds) + : sse(), + spe(), + pe() +{ + sse._minChunkDocs = chunkSize; + sse._numWordIds = numWordIds; + spe.copyParams(sse); + pe.copyParams(sse); +} + +PageDict4Encoders::~PageDict4Encoders() = default; + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_encoders.h b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_encoders.h new file mode 100644 index 00000000000..78885150eff --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_encoders.h @@ -0,0 +1,24 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/bitcompression/countcompression.h> + +namespace search::diskindex::test { + +/* + * Encode contexts used when writing memory based pagedict4 structure + */ +struct PageDict4Encoders +{ + using EC = search::bitcompression::PostingListCountFileEncodeContext; + + EC sse; + EC spe; + EC pe; + + PageDict4Encoders(uint32_t chunkSize, uint64_t numWordIds); + ~PageDict4Encoders(); +}; + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_rand_reader.cpp b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_rand_reader.cpp new file mode 100644 index 00000000000..fdee620d12d --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_rand_reader.cpp @@ -0,0 +1,73 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "pagedict4_mem_rand_reader.h" + +namespace search::diskindex::test { + +PageDict4MemRandReader::PageDict4MemRandReader(uint32_t chunkSize, uint64_t numWordIds, + ThreeLevelCountWriteBuffers &wb) + : _decoders(chunkSize, numWordIds), + _buffers(_decoders.ssd, _decoders.spd, _decoders.pd, wb), + _ssr(_buffers._rcssd, + wb._ssHeaderLen, wb._ssFileBitSize, + wb._spHeaderLen, wb._spFileBitSize, + wb._pHeaderLen, wb._pFileBitSize), + _spData(static_cast<const char *>(_buffers._rcspd._comprBuf)), + _pData(static_cast<const char *>(_buffers._rcpd._comprBuf)), + _pageSize(search::bitcompression::PageDict4PageParams::getPageByteSize()) +{ + _ssr.setup(_decoders.ssd); +} + +PageDict4MemRandReader::~PageDict4MemRandReader() = default; + +bool +PageDict4MemRandReader::lookup(const std::string &key, uint64_t &wordNum, + PostingListCounts &counts, StartOffset &offsets) +{ + PageDict4SSLookupRes sslr; + + sslr = _ssr.lookup(key); + if (!sslr._res) { + counts.clear(); + offsets = sslr._l6StartOffset; + wordNum = sslr._l6WordNum; + return false; + } + + if (sslr._overflow) { + wordNum = sslr._l6WordNum; + counts = sslr._counts; + offsets = sslr._startOffset; + return true; + } + PageDict4SPLookupRes splr; + splr.lookup(_ssr, + _spData + + _pageSize * sslr._sparsePageNum, + key, + sslr._l6Word, + sslr._lastWord, + sslr._l6StartOffset, + sslr._l6WordNum, + sslr._pageNum); + + PageDict4PLookupRes plr; + plr.lookup(_ssr, + _pData + _pageSize * splr._pageNum, + key, + splr._l3Word, + splr._lastWord, + splr._l3StartOffset, + splr._l3WordNum); + wordNum = plr._wordNum; + offsets = plr._startOffset; + if (plr._res) { + counts = plr._counts; + return true; + } + counts.clear(); + return false; +} + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_rand_reader.h b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_rand_reader.h new file mode 100644 index 00000000000..f51c6bfb6da --- /dev/null +++ 
b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_rand_reader.h @@ -0,0 +1,38 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "pagedict4_decoders.h" +#include "threelevelcountbuffers.h" +#include <vespa/searchlib/bitcompression/pagedict4.h> + +namespace search::diskindex::test { + +/* + * Class for performing random lookups in memory based pagedict4 structure + */ +class PageDict4MemRandReader +{ +public: + using PageDict4SSReader = search::bitcompression::PageDict4SSReader; + using PageDict4SSLookupRes = search::bitcompression::PageDict4SSLookupRes; + using PageDict4SPLookupRes = search::bitcompression::PageDict4SPLookupRes; + using PageDict4PLookupRes = search::bitcompression::PageDict4PLookupRes; + using StartOffset = search::bitcompression::PageDict4StartOffset; + using PostingListCounts = search::index::PostingListCounts; + + PageDict4Decoders _decoders; + ThreeLevelCountReadBuffers _buffers; + PageDict4SSReader _ssr; + const char *_spData; + const char *_pData; + size_t _pageSize; + + PageDict4MemRandReader(uint32_t chunkSize, uint64_t numWordIds, + ThreeLevelCountWriteBuffers &wb); + ~PageDict4MemRandReader(); + bool lookup(const std::string &key, uint64_t &wordNum, + PostingListCounts &counts, StartOffset &offsets); +}; + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.cpp b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.cpp new file mode 100644 index 00000000000..e33a0a1af0e --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.cpp @@ -0,0 +1,31 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "pagedict4_mem_seq_reader.h" + +namespace search::diskindex::test { + +PageDict4MemSeqReader::PageDict4MemSeqReader(uint32_t chunkSize, uint64_t numWordIds, + ThreeLevelCountWriteBuffers &wb) + : _decoders(chunkSize, numWordIds), + _buffers(_decoders.ssd, _decoders.spd, _decoders.pd, wb), + _ssr(_buffers._rcssd, + wb._ssHeaderLen, wb._ssFileBitSize, + wb._spHeaderLen, wb._spFileBitSize, + wb._pHeaderLen, wb._pFileBitSize), + _pr(_ssr, _decoders.spd, _decoders.pd) +{ + _ssr.setup(_decoders.ssd); + _pr.setup(); +} + +PageDict4MemSeqReader::~PageDict4MemSeqReader() = default; + +void +PageDict4MemSeqReader::readCounts(vespalib::string &word, + uint64_t &wordNum, + PostingListCounts &counts) +{ + _pr.readCounts(word, wordNum, counts); +} + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.h b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.h new file mode 100644 index 00000000000..c9709f63796 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_seq_reader.h @@ -0,0 +1,34 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include "pagedict4_decoders.h" +#include "threelevelcountbuffers.h" +#include <vespa/searchlib/bitcompression/pagedict4.h> + +namespace search::diskindex::test { + +/* + * Class for performing sequential reads in memory based pagedict4 structure + */ +class PageDict4MemSeqReader +{ +public: + using PageDict4SSReader = search::bitcompression::PageDict4SSReader; + using PageDict4Reader = search::bitcompression::PageDict4Reader; + using PostingListCounts = search::index::PostingListCounts; + + PageDict4Decoders _decoders; + ThreeLevelCountReadBuffers _buffers; + PageDict4SSReader _ssr; + PageDict4Reader _pr; + + PageDict4MemSeqReader(uint32_t chunkSize, uint64_t numWordIds, + ThreeLevelCountWriteBuffers &wb); + ~PageDict4MemSeqReader(); + void readCounts(vespalib::string &word, + uint64_t &wordNum, + PostingListCounts &counts); +}; + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_writer.cpp b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_writer.cpp new file mode 100644 index 00000000000..d82f2967a0b --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_writer.cpp @@ -0,0 +1,50 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "pagedict4_mem_writer.h" +#include <vespa/searchlib/bitcompression/pagedict4.h> + +namespace search::diskindex::test { + +PageDict4MemWriter::PageDict4MemWriter(uint32_t chunkSize, uint64_t numWordIds, uint32_t ssPad, uint32_t spPad, uint32_t pPad) + : _encoders(chunkSize, numWordIds), + _buffers(_encoders.sse, _encoders.spe, _encoders.pe), + _ssw(NULL), + _spw(NULL), + _pw(NULL) +{ + _buffers.startPad(ssPad, spPad, pPad); + allocWriters(); +} + +PageDict4MemWriter::~PageDict4MemWriter() +{ + delete _ssw; + delete _spw; + delete _pw; +} + +void +PageDict4MemWriter::allocWriters() +{ + _ssw = new PageDict4SSWriter(_buffers._sse); + _spw = new PageDict4SPWriter(*_ssw, _buffers._spe); + _pw = new PageDict4PWriter(*_spw, _buffers._pe); + _spw->setup(); + _pw->setup(); +} + +void +PageDict4MemWriter::flush() +{ + _pw->flush(); + _buffers.flush(); +} + +void +PageDict4MemWriter::addCounts(const std::string &word, + const PostingListCounts &counts) +{ + _pw->addCounts(word, counts); +} + +} diff --git a/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_writer.h b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_writer.h new file mode 100644 index 00000000000..ae36883f844 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/diskindex/pagedict4_mem_writer.h @@ -0,0 +1,52 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include "pagedict4_encoders.h" +#include "threelevelcountbuffers.h" + +namespace search::bitcompression { + +class PageDict4SSWriter; +class PageDict4SPWriter; +class PageDict4PWriter; + +} + +namespace search::index { class PostingListCounts; } + +namespace search::diskindex::test { + +/* + * Class for writing to memory based pagedict4 structure + */ +class PageDict4MemWriter +{ +public: + using PageDict4SSWriter = search::bitcompression::PageDict4SSWriter; + using PageDict4SPWriter = search::bitcompression::PageDict4SPWriter; + using PageDict4PWriter = search::bitcompression::PageDict4PWriter; + using PostingListCounts = search::index::PostingListCounts; + +private: + PageDict4Encoders _encoders; +public: + ThreeLevelCountWriteBuffers _buffers; +private: + PageDict4SSWriter *_ssw; + PageDict4SPWriter *_spw; + PageDict4PWriter *_pw; + + void allocWriters(); +public: + PageDict4MemWriter(uint32_t chunkSize, uint64_t numWordIds, uint32_t ssPad, uint32_t spPad, uint32_t pPad); + ~PageDict4MemWriter(); + void flush(); + void addCounts(const std::string &word, const PostingListCounts &counts); + void startPad(uint32_t ssHeaderLen, uint32_t spHeaderLen, uint32_t pHeaderLen) + { + _buffers.startPad(ssHeaderLen, spHeaderLen, pHeaderLen); + } +}; + +} |