Diffstat (limited to 'searchlib/src/tests/diskindex/pagedict4')
-rw-r--r--  searchlib/src/tests/diskindex/pagedict4/.gitignore         5
-rw-r--r--  searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt     9
-rw-r--r--  searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp  876
3 files changed, 890 insertions, 0 deletions
diff --git a/searchlib/src/tests/diskindex/pagedict4/.gitignore b/searchlib/src/tests/diskindex/pagedict4/.gitignore
new file mode 100644
index 00000000000..2381ed57229
--- /dev/null
+++ b/searchlib/src/tests/diskindex/pagedict4/.gitignore
@@ -0,0 +1,5 @@
+.depend
+Makefile
+pagedict4_test
+fakedict.*
+searchlib_pagedict4_test_app
diff --git a/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
new file mode 100644
index 00000000000..f8aef573c9a
--- /dev/null
+++ b/searchlib/src/tests/diskindex/pagedict4/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_pagedict4_test_app
+ SOURCES
+ pagedict4test.cpp
+ DEPENDS
+ searchlib_test
+ searchlib
+)
+vespa_add_test(NAME searchlib_pagedict4_test_app COMMAND searchlib_pagedict4_test_app)
diff --git a/searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp b/searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp
new file mode 100644
index 00000000000..03d73e84b42
--- /dev/null
+++ b/searchlib/src/tests/diskindex/pagedict4/pagedict4test.cpp
@@ -0,0 +1,876 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+LOG_SETUP("pagedict4test");
+#include <vespa/searchlib/bitcompression/compression.h>
+#include <vector>
+#include <vespa/searchlib/util/rand48.h>
+#include <vespa/searchlib/index/schemautil.h>
+#include <vespa/searchlib/bitcompression/countcompression.h>
+#include <vespa/searchlib/bitcompression/pagedict4.h>
+#include <vespa/searchlib/test/diskindex/threelevelcountbuffers.h>
+#include <vespa/searchlib/index/postinglistcounts.h>
+#include <vespa/searchlib/index/dummyfileheadercontext.h>
+#include <vespa/searchlib/diskindex/pagedict4file.h>
+#include <vespa/searchlib/diskindex/pagedict4randread.h>
+#include <vespa/searchlib/common/tunefileinfo.h>
+
+using search::bitcompression::PostingListCountFileEncodeContext;
+using search::bitcompression::PostingListCountFileDecodeContext;
+using search::index::PostingListCounts;
+using search::index::PostingListOffsetAndCounts;
+using search::index::PostingListParams;
+using search::bitcompression::PageDict4SSWriter;
+using search::bitcompression::PageDict4SPWriter;
+using search::bitcompression::PageDict4PWriter;
+using search::bitcompression::PageDict4Reader;
+using search::bitcompression::PageDict4SSReader;
+using search::bitcompression::PageDict4SSLookupRes;
+using search::bitcompression::PageDict4SPLookupRes;
+using search::bitcompression::PageDict4PLookupRes;
+using search::index::Schema;
+using search::index::DictionaryFileSeqRead;
+using search::index::DictionaryFileSeqWrite;
+using search::index::DictionaryFileRandRead;
+using search::diskindex::PageDict4FileSeqRead;
+using search::diskindex::PageDict4FileSeqWrite;
+using search::diskindex::PageDict4RandRead;
+using search::index::DummyFileHeaderContext;
+
+typedef search::bitcompression::PageDict4StartOffset StartOffset;
+
+namespace
+{
+
+
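+// Test helper owning the chained pagedict4 writers.  Words added through
+// addCounts() go to the page writer (_pw), which feeds the sparse page
+// writer (_spw), which in turn feeds the sparse sparse writer (_ssw).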
+class Writer : public search::diskindex::ThreeLevelCountWriteBuffers
+{
+public:
+ PageDict4SSWriter *_ssw;
+ PageDict4SPWriter *_spw;
+ PageDict4PWriter *_pw;
+
+ Writer(EC &sse,
+ EC &spe,
+ EC &pe)
+ : ThreeLevelCountWriteBuffers(sse, spe, pe),
+ _ssw(NULL),
+ _spw(NULL),
+ _pw(NULL)
+ {
+ }
+
+ ~Writer(void)
+ {
+ delete _ssw;
+ delete _spw;
+ delete _pw;
+ }
+
+ void
+ allocWriters()
+ {
+ _ssw = new PageDict4SSWriter(_sse);
+ _spw = new PageDict4SPWriter(*_ssw, _spe);
+ _pw = new PageDict4PWriter(*_spw, _pe);
+ _spw->setup();
+ _pw->setup();
+ }
+
+ void
+ flush(void)
+ {
+ _pw->flush();
+ ThreeLevelCountWriteBuffers::flush();
+ }
+
+ void
+ addCounts(const std::string &word,
+ const PostingListCounts &counts)
+ {
+ _pw->addCounts(word, counts);
+ }
+};
+
+
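+// Test helper for sequential reads: PageDict4Reader returns every
+// (word, wordNum, counts) tuple in the order the writer added them.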
+class SeqReader : public search::diskindex::ThreeLevelCountReadBuffers
+{
+public:
+ PageDict4SSReader _ssr;
+ PageDict4Reader _pr;
+
+ SeqReader(DC &ssd,
+ DC &spd,
+ DC &pd,
+ search::diskindex::ThreeLevelCountWriteBuffers &wb)
+ : ThreeLevelCountReadBuffers(ssd, spd, pd, wb),
+ _ssr(_rcssd,
+ wb._ssHeaderLen, wb._ssFileBitSize,
+ wb._spHeaderLen, wb._spFileBitSize,
+ wb._pHeaderLen, wb._pFileBitSize),
+ _pr(_ssr, spd, pd)
+ {
+ _ssr.setup(ssd);
+ _pr.setup();
+ }
+
+ void
+ readCounts(vespalib::string &word,
+ uint64_t &wordNum,
+ PostingListCounts &counts)
+ {
+ _pr.readCounts(word, wordNum, counts);
+ }
+};
+
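+// Test helper for random lookups: a word is resolved by consulting the
+// sparse sparse level first, then (unless the word overflowed) the sparse
+// page and page levels.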
+class RandReader : public search::diskindex::ThreeLevelCountReadBuffers
+{
+public:
+ PageDict4SSReader _ssr;
+ const char *_spData;
+ const char *_pData;
+ size_t _pageSize;
+
+ RandReader(DC &ssd,
+ DC &spd,
+ DC &pd,
+ search::diskindex::ThreeLevelCountWriteBuffers &wb)
+ : ThreeLevelCountReadBuffers(ssd, spd, pd, wb),
+ _ssr(_rcssd,
+ wb._ssHeaderLen, wb._ssFileBitSize,
+ wb._spHeaderLen, wb._spFileBitSize,
+ wb._pHeaderLen, wb._pFileBitSize),
+ _spData(static_cast<const char *>(_rcspd._comprBuf)),
+ _pData(static_cast<const char *>(_rcpd._comprBuf)),
+ _pageSize(search::bitcompression::PageDict4PageParams::getPageByteSize())
+ {
+ _ssr.setup(ssd);
+ }
+
+ bool
+ lookup(const std::string &key,
+ uint64_t &wordNum,
+ PostingListCounts &counts,
+ StartOffset &offsets)
+ {
+        PageDict4SSLookupRes sslr(_ssr.lookup(key));
+ if (!sslr._res) {
+ counts.clear();
+ offsets = sslr._l6StartOffset;
+ wordNum = sslr._l6WordNum;
+ return false;
+ }
+
+ if (sslr._overflow) {
+ wordNum = sslr._l6WordNum;
+ counts = sslr._counts;
+ offsets = sslr._startOffset;
+ return true;
+ }
+ PageDict4SPLookupRes splr;
+ splr.lookup(_ssr,
+ _spData +
+ _pageSize * sslr._sparsePageNum,
+ key,
+ sslr._l6Word,
+ sslr._lastWord,
+ sslr._l6StartOffset,
+ sslr._l6WordNum,
+ sslr._pageNum);
+
+ PageDict4PLookupRes plr;
+ plr.lookup(_ssr,
+ _pData + _pageSize * splr._pageNum,
+ key,
+ splr._l3Word,
+ splr._lastWord,
+ splr._l3StartOffset,
+ splr._l3WordNum);
+ wordNum = plr._wordNum;
+ offsets = plr._startOffset;
+ if (plr._res) {
+ counts = plr._counts;
+ return true;
+ }
+ counts.clear();
+ return false;
+ }
+};
+
+}
+
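+// Application driver: parses the optional test flags and runs the word tests.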
+class PageDict4TestApp : public FastOS_Application
+{
+public:
+ search::Rand48 _rnd;
+ bool _stress;
+ bool _emptyWord;
+ bool _firstWordForcedCommon;
+ bool _lastWordForcedCommon;
+
+ void
+ usage(void);
+
+ int
+ Main(void);
+
+ void
+ testWords(void);
+
+ PageDict4TestApp(void)
+ : _rnd(),
+ _stress(false),
+ _emptyWord(false),
+ _firstWordForcedCommon(false),
+ _lastWordForcedCommon(false)
+ {
+ }
+};
+
+
+void
+PageDict4TestApp::usage(void)
+{
+ printf("Usage: wordnumbers\n");
+ fflush(stdout);
+}
+
+
+int
+PageDict4TestApp::Main(void)
+{
+ if (_argc > 0) {
+ DummyFileHeaderContext::setCreator(_argv[0]);
+ }
+ _rnd.srand48(32);
+ for (int32_t i = 1; i < _argc; ++i) {
+ if (strcmp(_argv[i], "stress") == 0)
+ _stress = true;
+ if (strcmp(_argv[i], "emptyword") == 0)
+ _emptyWord = true;
+ if (strcmp(_argv[i], "firstwordforcedcommon") == 0)
+ _firstWordForcedCommon = true;
+ if (strcmp(_argv[i], "lastwordforcedcommon") == 0)
+ _lastWordForcedCommon = true;
+ }
+ testWords();
+
+ LOG(info,
+ "_stress is %s",
+ _stress ? "true" : "false");
+ LOG(info,
+ "_emptyWord is %s",
+ _emptyWord ? "true" : "false");
+ LOG(info,
+ "_firstWordForcedCommon is %s",
+ _firstWordForcedCommon ? "true" : "false");
+ LOG(info,
+ "_lastWordForcedCommon is %s",
+ _lastWordForcedCommon ? "true" : "false");
+
+ LOG(info, "SUCCESS");
+ return 0;
+}
+
+
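+// Reference posting list statistics for one generated word.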
+class WordIndexCounts
+{
+public:
+ uint32_t _numDocs;
+ uint64_t _fileOffset;
+ uint64_t _bitLength;
+ uint64_t _accNumDocs;
+
+ WordIndexCounts(uint64_t bitLength,
+ uint32_t numDocs)
+ : _numDocs(numDocs),
+ _fileOffset(0),
+ _bitLength(bitLength),
+ _accNumDocs(0)
+ {
+ }
+
+ WordIndexCounts()
+ : _numDocs(0),
+ _fileOffset(0),
+ _bitLength(0),
+ _accNumDocs(0)
+ {
+ }
+};
+
+class WordCounts
+{
+public:
+ std::string _word;
+ WordIndexCounts _counts;
+
+ bool
+ operator!=(const WordCounts &rhs) const
+ {
+ return _word != rhs._word;
+ }
+
+ WordCounts(const std::string &word)
+ : _word(word),
+ _counts()
+ {
+ }
+
+ bool
+ operator<(const WordCounts &rhs) const
+ {
+ return _word < rhs._word;
+ }
+};
+
+
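+// Sort and remove duplicates, keeping one entry per distinct word.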
+void
+deDup(std::vector<WordCounts> &v)
+{
+ std::vector<WordCounts> v2;
+ std::sort(v.begin(), v.end());
+ for (std::vector<WordCounts>::const_iterator
+ i = v.begin(),
+ ie = v.end();
+ i != ie;
+ ++i) {
+ if (v2.empty() || v2.back() != *i)
+ v2.push_back(*i);
+ }
+ std::swap(v, v2);
+}
+
+
+void
+deDup(std::vector<uint32_t> &v)
+{
+ std::vector<uint32_t> v2;
+ std::sort(v.begin(), v.end());
+ for (std::vector<uint32_t>::const_iterator
+ i = v.begin(),
+ ie = v.end();
+ i != ie;
+ ++i) {
+ if (v2.empty() || v2.back() != *i)
+ v2.push_back(*i);
+ }
+ std::swap(v, v2);
+}
+
+
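+// Generate counts for a single word.  Roughly 1 in 150 words (and any word
+// forced common) gets a large posting list that must be split into chunks.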
+static WordIndexCounts
+makeIndex(search::Rand48 &rnd, bool forceCommon)
+{
+ uint64_t bitLength = 10;
+ uint32_t numDocs = 1;
+ if ((rnd.lrand48() % 150) == 0 || forceCommon) {
+ bitLength = 1000000000;
+ numDocs = 500000;
+ }
+ return WordIndexCounts(bitLength, numDocs);
+}
+
+
+void
+makeIndexes(search::Rand48 &rnd,
+ WordIndexCounts &counts,
+ bool forceCommon)
+{
+ counts = makeIndex(rnd, forceCommon);
+}
+
+
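+// Build a sorted, deduplicated vector of synthetic "a-b-c" words, assign
+// counts to each, and accumulate per-word file offsets and document counts.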
+static void
+makeWords(std::vector<WordCounts> &v,
+ search::Rand48 &rnd,
+ uint32_t numWordIds,
+ uint32_t tupleCount,
+ bool emptyWord,
+ bool firstWordForcedCommon,
+ bool lastWordForcedCommon)
+{
+ v.clear();
+ for (unsigned int i = 0; i < tupleCount; ++i) {
+ uint64_t word = rnd.lrand48() % numWordIds;
+ uint64_t wordCount = (rnd.lrand48() % 10) + 1;
+ for (unsigned int j = 0; j < wordCount; ++j) {
+ uint64_t nextWord = rnd.lrand48() % numWordIds;
+            uint64_t nextWordCount = rnd.lrand48() % 10;
+            bool incomplete = (rnd.lrand48() % 3) == 0 ||
+                              nextWordCount == 0;
+ for (unsigned int k = 0; k < nextWordCount; ++k) {
+ uint64_t nextNextWord = rnd.lrand48() % numWordIds;
+ std::ostringstream w;
+ w << word;
+ w << "-";
+ w << nextWord;
+ w << "-";
+ w << nextNextWord;
+ v.push_back(WordCounts(w.str()));
+ }
+ if (incomplete) {
+ std::ostringstream w;
+ w << word;
+ w << "-";
+ w << nextWord;
+ w << "-";
+ w << "9999999999999999";
+ v.push_back(WordCounts(w.str()));
+ }
+ }
+ }
+ deDup(v);
+ if (!v.empty() && emptyWord)
+ v.front()._word = "";
+ for (std::vector<WordCounts>::iterator
+ i = v.begin(), ib = v.begin(), ie = v.end();
+ i != ie; ++i) {
+ std::vector<WordIndexCounts> indexes;
+ makeIndexes(rnd, i->_counts,
+ (i == ib && firstWordForcedCommon) ||
+ (i + 1 == ie && lastWordForcedCommon));
+ }
+ uint64_t fileOffset = 0;
+ uint64_t accNumDocs = 0;
+ for (std::vector<WordCounts>::iterator
+ i = v.begin(),
+ ie = v.end();
+ i != ie;
+ ++i) {
+ WordIndexCounts *f = &i->_counts;
+ assert(f->_numDocs > 0);
+ assert(f->_bitLength > 0);
+ f->_fileOffset = fileOffset;
+ f->_accNumDocs = accNumDocs;
+ fileOffset += f->_bitLength;
+ accNumDocs += f->_numDocs;
+ }
+}
+
+
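+// Expand the reference statistics for one word into PostingListCounts,
+// splitting posting lists spanning more than one chunk into segments.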
+void
+makeCounts(PostingListCounts &counts,
+ const WordCounts &i,
+ uint32_t chunkSize)
+{
+ PostingListCounts c;
+ const WordIndexCounts *j = &i._counts;
+ c._bitLength = j->_bitLength;
+ c._numDocs = j->_numDocs;
+ c._segments.clear();
+ assert(j->_numDocs > 0);
+ uint32_t numChunks = (j->_numDocs + chunkSize - 1) / chunkSize;
+ if (numChunks > 1) {
+ uint32_t chunkBits = j->_bitLength / numChunks;
+ for (uint32_t chunkNo = 0; chunkNo < numChunks; ++chunkNo) {
+ PostingListCounts::Segment seg;
+ seg._bitLength = chunkBits;
+ seg._numDocs = chunkSize;
+ seg._lastDoc = (chunkNo + 1) * chunkSize - 1;
+ if (chunkNo + 1 == numChunks) {
+ seg._bitLength = c._bitLength -
+ (numChunks - 1) * chunkBits;
+ seg._lastDoc = c._numDocs - 1;
+ seg._numDocs = c._numDocs - (numChunks - 1) * chunkSize;
+ }
+ c._segments.push_back(seg);
+ }
+ }
+ counts = c;
+}
+
+
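+// Compare counts and offsets read back from the dictionary against the
+// generated reference for the same word.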
+void
+checkCounts(const std::string &word,
+ const PostingListCounts &counts,
+ const StartOffset &fileOffset,
+ const WordCounts &i,
+ uint32_t chunkSize)
+{
+ PostingListCounts answer;
+
+ makeCounts(answer, i, chunkSize);
+ assert(word == i._word);
+ (void) word;
+ (void) fileOffset;
+ const WordIndexCounts *j = &i._counts;
+ assert(counts._bitLength == j->_bitLength);
+ assert(counts._numDocs == j->_numDocs);
+ assert(fileOffset._fileOffset == j->_fileOffset);
+ assert(fileOffset._accNumDocs == j->_accNumDocs);
+ assert(counts._segments == answer._segments);
+ assert(counts == answer);
+ (void) counts;
+}
+
+
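+// End-to-end test: write all generated words through the pagedict4 writers,
+// then verify them via sequential read, in-memory random lookup, and
+// file-based (PageDict4FileSeqWrite/SeqRead/RandRead) round trips.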
+void
+testWords(const std::string &logname,
+ search::Rand48 &rnd,
+ uint64_t numWordIds,
+ uint32_t tupleCount,
+ uint32_t chunkSize,
+ uint32_t ssPad,
+ uint32_t spPad,
+ uint32_t pPad,
+ bool emptyWord,
+ bool firstWordForcedCommon,
+ bool lastWordForcedCommon)
+{
+ typedef search::bitcompression::PostingListCountFileEncodeContext EC;
+ typedef search::bitcompression::PostingListCountFileDecodeContext DC;
+
+ LOG(info, "%s: word test start", logname.c_str());
+ std::vector<WordCounts> myrand;
+ makeWords(myrand, rnd, numWordIds, tupleCount,
+ emptyWord, firstWordForcedCommon, lastWordForcedCommon);
+
+ PostingListCounts xcounts;
+ for (std::vector<WordCounts>::const_iterator
+ i = myrand.begin(),
+ ie = myrand.end();
+ i != ie;
+ ++i) {
+ makeCounts(xcounts, *i, chunkSize);
+ }
+ LOG(info, "%s: word counts generated", logname.c_str());
+
+ EC pe;
+ EC spe;
+ EC sse;
+
+ sse._minChunkDocs = chunkSize;
+ sse._numWordIds = numWordIds;
+ spe.copyParams(sse);
+ pe.copyParams(sse);
+ Writer w(sse, spe, pe);
+ w.startPad(ssPad, spPad, pPad);
+ w.allocWriters();
+
+ PostingListCounts counts;
+ for (std::vector<WordCounts>::const_iterator
+ i = myrand.begin(),
+ ie = myrand.end();
+ i != ie;
+ ++i) {
+ makeCounts(counts, *i, chunkSize);
+ w.addCounts(i->_word, counts);
+ }
+ w.flush();
+
+ LOG(info,
+ "%s: Used %" PRIu64 "+%" PRIu64 "+%" PRIu64
+ " bits for %d words",
+ logname.c_str(),
+ w._pFileBitSize,
+ w._spFileBitSize,
+ w._ssFileBitSize,
+ (int) myrand.size());
+
+ StartOffset checkOffset;
+
+ {
+ DC ssd;
+ ssd._minChunkDocs = chunkSize;
+ ssd._numWordIds = numWordIds;
+ DC spd;
+ spd.copyParams(ssd);
+ DC pd;
+ pd.copyParams(ssd);
+
+ SeqReader r(ssd, spd, pd, w);
+
+ uint64_t wordNum = 1;
+ uint64_t checkWordNum = 0;
+ for (std::vector<WordCounts>::const_iterator
+ i = myrand.begin(),
+ ie = myrand.end();
+ i != ie;
+ ++i, ++wordNum) {
+ vespalib::string word;
+ counts.clear();
+ r.readCounts(word, checkWordNum, counts);
+ checkCounts(word, counts, checkOffset, *i, chunkSize);
+ assert(checkWordNum == wordNum);
+ checkOffset._fileOffset += counts._bitLength;
+ checkOffset._accNumDocs += counts._numDocs;
+ }
+ assert(pd.getReadOffset() == w._pFileBitSize);
+ LOG(info, "%s: words seqRead test OK", logname.c_str());
+ }
+
+ {
+ DC ssd;
+ ssd._minChunkDocs = chunkSize;
+ ssd._numWordIds = numWordIds;
+ DC spd;
+ spd.copyParams(ssd);
+ DC pd;
+ pd.copyParams(ssd);
+
+ RandReader rr(ssd, spd, pd, w);
+
+ uint64_t wordNum = 1;
+ uint64_t checkWordNum = 0;
+ for (std::vector<WordCounts>::const_iterator
+ i = myrand.begin(),
+ ie = myrand.end();
+ i != ie;
+ ++i, ++wordNum) {
+ checkWordNum = 0;
+ bool res = rr.lookup(i->_word,
+ checkWordNum,
+ counts,
+ checkOffset);
+ assert(res);
+ (void) res;
+ checkCounts(i->_word, counts, checkOffset,
+ *i, chunkSize);
+ assert(checkWordNum == wordNum);
+ }
+ LOG(info, "%s: word randRead test OK", logname.c_str());
+ }
+
+ Schema schema;
+ std::vector<uint32_t> indexes;
+ {
+ std::ostringstream fn;
+ fn << "f0";
+ schema.addIndexField(Schema::
+ IndexField(fn.str(),
+ Schema::STRING,
+ Schema::SINGLE));
+ indexes.push_back(0);
+ }
+ {
+ std::unique_ptr<DictionaryFileSeqWrite>
+ dw(new PageDict4FileSeqWrite);
+ std::vector<uint32_t> wIndexes;
+ std::vector<PostingListCounts> wCounts;
+ search::TuneFileSeqWrite tuneFileWrite;
+ DummyFileHeaderContext fileHeaderContext;
+ PostingListParams params;
+ params.set("numWordIds", numWordIds);
+ params.set("minChunkDocs", chunkSize);
+ dw->setParams(params);
+ bool openres = dw->open("fakedict",
+ tuneFileWrite,
+ fileHeaderContext);
+ assert(openres);
+
+ for (std::vector<WordCounts>::const_iterator
+ i = myrand.begin(),
+ ie = myrand.end();
+ i != ie;
+ ++i) {
+ makeCounts(counts, *i, chunkSize);
+ dw->writeWord(i->_word, counts);
+ }
+ bool closeres = dw->close();
+ assert(closeres);
+ (void) closeres;
+
+ LOG(info, "%s: pagedict4 written", logname.c_str());
+ }
+ {
+ std::unique_ptr<DictionaryFileSeqRead> dr(new PageDict4FileSeqRead);
+ search::TuneFileSeqRead tuneFileRead;
+
+ bool openres = dr->open("fakedict",
+ tuneFileRead);
+ assert(openres);
+ (void) openres;
+ vespalib::string checkWord;
+ PostingListCounts wCounts;
+ PostingListCounts rCounts;
+ uint64_t wordNum = 1;
+ uint64_t checkWordNum = 5;
+ for (std::vector<WordCounts>::const_iterator
+ i = myrand.begin(),
+ ie = myrand.end();
+ i != ie;
+ ++i, ++wordNum) {
+ makeCounts(counts, *i, chunkSize);
+ wCounts = counts;
+ checkWord.clear();
+ checkWordNum = 0;
+ dr->readWord(checkWord, checkWordNum, rCounts);
+ assert(rCounts == wCounts);
+ assert(wordNum == checkWordNum);
+ assert(checkWord == i->_word);
+ }
+
+ checkWord = "bad";
+ checkWordNum = 5;
+ dr->readWord(checkWord, checkWordNum, rCounts);
+ assert(checkWord.empty());
+ assert(checkWordNum == DictionaryFileSeqRead::noWordNumHigh());
+ bool closeres = dr->close();
+ assert(closeres);
+ (void) closeres;
+
+ LOG(info, "%s: pagedict4 seqverify OK", logname.c_str());
+ }
+ {
+ std::unique_ptr<DictionaryFileRandRead> drr(new PageDict4RandRead);
+ search::TuneFileRandRead tuneFileRead;
+ bool openres = drr->open("fakedict",
+ tuneFileRead);
+ assert(openres);
+ (void) openres;
+ std::string lastWord;
+ vespalib::string checkWord;
+ PostingListCounts wCounts;
+ PostingListCounts rCounts;
+ uint64_t wOffset;
+ uint64_t rOffset;
+ PostingListOffsetAndCounts rOffsetAndCounts;
+ uint64_t wordNum = 1;
+ uint64_t checkWordNum = 5;
+ std::string missWord;
+ wOffset = 0;
+ for (std::vector<WordCounts>::const_iterator
+ i = myrand.begin(),
+ ie = myrand.end();
+ i != ie;
+ ++i, ++wordNum) {
+ makeCounts(counts, *i, chunkSize);
+ wCounts = counts;
+
+ checkWordNum = 0;
+ rCounts.clear();
+ rOffset = 0;
+ bool lres = drr->lookup(i->_word, checkWordNum,
+ rOffsetAndCounts);
+ assert(lres);
+ (void) lres;
+ assert((rOffsetAndCounts._counts._bitLength == 0) ==
+ (rOffsetAndCounts._counts._numDocs == 0));
+ rOffset = rOffsetAndCounts._offset;
+ rCounts = rOffsetAndCounts._counts;
+ assert(rCounts == wCounts);
+ assert(wordNum == checkWordNum);
+ assert(rOffset == wOffset);
+
+ wOffset += wCounts._bitLength;
+ lastWord = i->_word;
+
+ missWord = i->_word;
+ missWord.append(1, '\1');
+ checkWordNum = 0;
+ lres = drr->lookup(missWord, checkWordNum,
+ rOffsetAndCounts);
+ assert(!lres);
+ assert(checkWordNum == wordNum + 1);
+ }
+
+ checkWordNum = 0;
+ std::string notfoundword = "Thiswordhasbetternotbeindictionary";
+ bool lres = drr->lookup(notfoundword, checkWordNum,
+ rOffsetAndCounts);
+ assert(!lres);
+ checkWordNum = 0;
+ notfoundword = lastWord + "somethingmore";
+ lres = drr->lookup(notfoundword, checkWordNum,
+ rOffsetAndCounts);
+ assert(!lres);
+ (void) lres;
+ LOG(info, "Lookup beyond dict EOF gave wordnum %d", (int) checkWordNum);
+
+ if (firstWordForcedCommon) {
+ if (!emptyWord) {
+ checkWordNum = 0;
+ notfoundword = "";
+ lres = drr->lookup(notfoundword, checkWordNum,
+ rOffsetAndCounts);
+ assert(!lres);
+ assert(checkWordNum == 1);
+ }
+ if (!myrand.empty()) {
+ checkWordNum = 0;
+ notfoundword = myrand.front()._word;
+ notfoundword.append(1, '\1');
+ lres = drr->lookup(notfoundword, checkWordNum,
+ rOffsetAndCounts);
+ assert(!lres);
+ assert(checkWordNum == 2);
+ }
+ }
+ if (lastWordForcedCommon && !myrand.empty()) {
+ if (myrand.size() > 1) {
+ checkWordNum = 0;
+ notfoundword = myrand[myrand.size() - 2]._word;
+ notfoundword.append(1, '\1');
+ lres = drr->lookup(notfoundword, checkWordNum,
+ rOffsetAndCounts);
+ assert(!lres);
+ assert(checkWordNum == myrand.size());
+ }
+ checkWordNum = 0;
+ notfoundword = myrand[myrand.size() - 1]._word;
+ notfoundword.append(1, '\1');
+ lres = drr->lookup(notfoundword, checkWordNum,
+ rOffsetAndCounts);
+ assert(!lres);
+ assert(checkWordNum == myrand.size() + 1);
+ }
+ bool closeres = drr->close();
+ assert(closeres);
+ (void) closeres;
+ LOG(info, "%s: pagedict4 randverify OK", logname.c_str());
+ }
+}
+
+
+void
+PageDict4TestApp::testWords(void)
+{
+ ::testWords("smallchunkwordsempty", _rnd,
+ 1000000, 0,
+ 64, 80, 72, 64,
+ false, false, false);
+ ::testWords("smallchunkwordsempty2", _rnd,
+ 0, 0,
+ 64, 80, 72, 64,
+ false, false, false);
+ ::testWords("smallchunkwords", _rnd,
+ 1000000, 100,
+ 64, 80, 72, 64,
+ false, false, false);
+ ::testWords("smallchunkwordswithemptyword", _rnd,
+ 1000000, 100,
+ 64, 80, 72, 64,
+ true, false, false);
+ ::testWords("smallchunkwordswithcommonfirstword", _rnd,
+ 1000000, 100,
+ 64, 80, 72, 64,
+ false, true, false);
+ ::testWords("smallchunkwordswithcommonemptyfirstword", _rnd,
+ 1000000, 100,
+ 64, 80, 72, 64,
+ true, true, false);
+ ::testWords("smallchunkwordswithcommonlastword", _rnd,
+ 1000000, 100,
+ 64, 80, 72, 64,
+ false, false, true);
+    ::testWords("smallchunkwords2", _rnd,
+                1000000, _stress ? 10000 : 100,
+                64, 80, 72, 64,
+                _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon);
+    ::testWords("stdwords", _rnd,
+                1000000, _stress ? 10000 : 100,
+                262144, 80, 72, 64,
+                _emptyWord, _firstWordForcedCommon, _lastWordForcedCommon);
+}
+
+FASTOS_MAIN(PageDict4TestApp);