summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@broadpark.no> 2019-05-10 14:17:16 +0200
committer Tor Egge <Tor.Egge@broadpark.no> 2019-05-10 15:31:06 +0200
commit b6f24cf5f949bd9dbb16c4e36b27eec84bcd217a (patch)
tree 0899ade2682a11f4c3adbb559fc2be14d5b23bb2
parent 1e98247ac92f391bf8af18627354f2374255f32b (diff)
Consolidate factory for posocc iterators.
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp 37
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h 5
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp 132
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h 21
-rw-r--r-- searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp 23
5 files changed, 68 insertions, 150 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp
index 7678b11ba41..bea92e5c009 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "zcposocciterators.h"
+#include "zc4_posting_params.h"
namespace search::diskindex {
@@ -64,6 +65,42 @@ ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool d
_decodeContext = &_decodeContextReal;
}
+template <bool bigEndian>
+std::unique_ptr<search::queryeval::SearchIterator>
+create_zc_posocc_iterator(const PostingListCounts &counts, bitcompression::Position start, uint64_t bit_length, const Zc4PostingParams &posting_params, const bitcompression::PosOccFieldsParams &fields_params, const fef::TermFieldMatchDataArray &match_data)
+{
+ using EC = bitcompression::EncodeContext64<bigEndian>;
+ bitcompression::DecodeContext64<bigEndian> d(start.getOccurences(), start.getBitOffset());
+ UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
+ uint32_t length;
+ uint64_t val64;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
+ uint32_t num_docs = static_cast<uint32_t>(val64) + 1;
+ assert((num_docs == counts._numDocs) || ((num_docs == posting_params._min_chunk_docs) && (num_docs < counts._numDocs)));
+ if (num_docs < posting_params._min_skip_docs) {
+ if (posting_params._dynamic_k) {
+ return std::make_unique<ZcRareWordPosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, &fields_params, match_data);
+ } else {
+ return std::make_unique<Zc4RareWordPosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, &fields_params, match_data);
+ }
+ } else {
+ if (posting_params._dynamic_k) {
+ return std::make_unique<ZcPosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, posting_params._min_chunk_docs, counts, &fields_params, match_data);
+ } else {
+ return std::make_unique<Zc4PosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, posting_params._min_chunk_docs, counts, &fields_params, match_data);
+ }
+ }
+}
+
+std::unique_ptr<search::queryeval::SearchIterator>
+create_zc_posocc_iterator(bool bigEndian, const PostingListCounts &counts, bitcompression::Position start, uint64_t bit_length, const Zc4PostingParams &posting_params, const bitcompression::PosOccFieldsParams &fields_params, const fef::TermFieldMatchDataArray &match_data)
+{
+ if (bigEndian) {
+ return create_zc_posocc_iterator<true>(counts, start, bit_length, posting_params, fields_params, match_data);
+ } else {
+ return create_zc_posocc_iterator<false>(counts, start, bit_length, posting_params, fields_params, match_data);
+ }
+}
template class Zc4RareWordPosOccIterator<true>;
template class Zc4RareWordPosOccIterator<false>;
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h
index 3b58203aab4..76e7b384c11 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h
@@ -7,6 +7,8 @@
namespace search::diskindex {
+struct Zc4PostingParams;
+
template <bool bigEndian>
class Zc4RareWordPosOccIterator : public Zc4RareWordPostingIterator<bigEndian>
{
@@ -73,6 +75,9 @@ public:
};
+std::unique_ptr<search::queryeval::SearchIterator>
+create_zc_posocc_iterator(bool bigEndian, const index::PostingListCounts &counts, bitcompression::Position start, uint64_t bit_length, const Zc4PostingParams &posting_params, const bitcompression::PosOccFieldsParams &fields_params, const fef::TermFieldMatchDataArray &match_data);
+
extern template class Zc4RareWordPosOccIterator<true>;
extern template class Zc4RareWordPosOccIterator<false>;
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
index 9d7df382325..edbd78b9b01 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
@@ -36,15 +36,11 @@ using vespalib::getLastErrorString;
ZcPosOccRandRead::ZcPosOccRandRead()
: _file(std::make_unique<FastOS_File>()),
_fileSize(0),
- _minChunkDocs(1 << 30),
- _minSkipDocs(64),
- _docIdLimit(10000000),
+ _posting_params(64, 1 << 30, 10000000, true, true, false),
_numWords(0),
_fileBitSize(0),
_headerBitSize(0),
- _fieldsParams(),
- _dynamicK(true),
- _decode_cheap_features(false)
+ _fieldsParams()
{ }
@@ -65,8 +61,6 @@ createIterator(const PostingListCounts &counts,
{
(void) usebitVector;
- typedef EGPosOccEncodeContext<true> EC;
-
assert((handle._bitLength != 0) == (counts._bitLength != 0));
assert((counts._numDocs != 0) == (counts._bitLength != 0));
assert(handle._bitOffsetMem <= handle._bitOffset);
@@ -85,22 +79,7 @@ createIterator(const PostingListCounts &counts,
handle._bitOffsetMem) & 63;
Position start(mem, bitOffset);
-
- EGPosOccDecodeContext<true> d(mem, bitOffset, &_fieldsParams);
-
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
- uint32_t length;
- uint64_t val64;
-
- UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
-
- uint32_t numDocs = static_cast<uint32_t>(val64) + 1;
-
- if (numDocs < _minSkipDocs) {
- return new ZcRareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData);
- } else {
- return new ZcPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData);
- }
+ return create_zc_posocc_iterator(true, counts, start, handle._bitLength, _posting_params, _fieldsParams, matchData).release();
}
@@ -200,10 +179,11 @@ ZcPosOccRandRead::close()
}
+template <typename DecodeContext>
void
-ZcPosOccRandRead::readHeader()
+ZcPosOccRandRead::readHeader(const vespalib::string &identifier)
{
- EGPosOccDecodeContext<true> d(&_fieldsParams);
+ DecodeContext d(&_fieldsParams);
ComprFileReadContext drc(d);
drc.setFile(_file.get());
@@ -227,14 +207,14 @@ ZcPosOccRandRead::readHeader()
assert(header.hasTag("minSkipDocs"));
assert(header.getTag("frozen").asInteger() != 0);
_fileBitSize = header.getTag("fileBitSize").asInteger();
- assert(header.getTag("format.0").asString() == myId5);
+ assert(header.getTag("format.0").asString() == identifier);
assert(header.getTag("format.1").asString() == d.getIdentifier());
_numWords = header.getTag("numWords").asInteger();
- _minChunkDocs = header.getTag("minChunkDocs").asInteger();
- _docIdLimit = header.getTag("docIdLimit").asInteger();
- _minSkipDocs = header.getTag("minSkipDocs").asInteger();
+ _posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger();
+ _posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger();
+ _posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger();
if (header.hasTag(cheap_features) && (header.getTag(cheap_features).asInteger() != 0)) {
- _decode_cheap_features = true;
+ _posting_params._encode_cheap_features = true;
}
// Read feature decoding specific subheader
d.readHeader(header, "features.");
@@ -245,6 +225,11 @@ ZcPosOccRandRead::readHeader()
_headerBitSize = d.getReadOffset();
}
+void
+ZcPosOccRandRead::readHeader()
+{
+ readHeader<EGPosOccDecodeContext<true>>(myId5);
+}
const vespalib::string &
ZcPosOccRandRead::getIdentifier()
@@ -266,95 +251,14 @@ Zc4PosOccRandRead::
Zc4PosOccRandRead()
: ZcPosOccRandRead()
{
- _dynamicK = false;
+ _posting_params._dynamic_k = false;
}
-search::queryeval::SearchIterator *
-Zc4PosOccRandRead::
-createIterator(const PostingListCounts &counts,
- const PostingListHandle &handle,
- const search::fef::TermFieldMatchDataArray &matchData,
- bool usebitVector) const
-{
- (void) usebitVector;
- typedef EGPosOccEncodeContext<true> EC;
-
- assert((handle._bitLength != 0) == (counts._bitLength != 0));
- assert((counts._numDocs != 0) == (counts._bitLength != 0));
- assert(handle._bitOffsetMem <= handle._bitOffset);
-
- if (handle._bitLength == 0) {
- return new search::queryeval::EmptySearch;
- }
-
- const char *cmem = static_cast<const char *>(handle._mem);
- uint64_t memOffset = reinterpret_cast<unsigned long>(cmem) & 7;
- const uint64_t *mem = reinterpret_cast<const uint64_t *>
- (cmem - memOffset) +
- (memOffset * 8 + handle._bitOffset -
- handle._bitOffsetMem) / 64;
- int bitOffset = (memOffset * 8 + handle._bitOffset -
- handle._bitOffsetMem) & 63;
-
- Position start(mem, bitOffset);
- EG2PosOccDecodeContext<true> d(mem, bitOffset, &_fieldsParams);
-
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
- uint32_t length;
- uint64_t val64;
-
- UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
-
- uint32_t numDocs = static_cast<uint32_t>(val64) + 1;
-
- if (numDocs < _minSkipDocs) {
- return new Zc4RareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData);
- } else {
- return new Zc4PosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData);
- }
-}
-
void
Zc4PosOccRandRead::readHeader()
{
- EG2PosOccDecodeContext<true> d(&_fieldsParams);
- ComprFileReadContext drc(d);
-
- drc.setFile(_file.get());
- drc.setFileSize(_file->GetSize());
- drc.allocComprBuf(512, 32768u);
- d.emptyBuffer(0);
- drc.readComprBuffer();
- d.setReadContext(&drc);
-
- vespalib::FileHeader header;
- d.readHeader(header, _file->getSize());
- uint32_t headerLen = header.getSize();
- assert(header.hasTag("frozen"));
- assert(header.hasTag("fileBitSize"));
- assert(header.hasTag("format.0"));
- assert(header.hasTag("format.1"));
- assert(!header.hasTag("format.2"));
- assert(header.hasTag("numWords"));
- assert(header.hasTag("minChunkDocs"));
- assert(header.hasTag("docIdLimit"));
- assert(header.hasTag("minSkipDocs"));
- assert(header.getTag("frozen").asInteger() != 0);
- _fileBitSize = header.getTag("fileBitSize").asInteger();
- assert(header.getTag("format.0").asString() == myId4);
- assert(header.getTag("format.1").asString() == d.getIdentifier());
- _numWords = header.getTag("numWords").asInteger();
- _minChunkDocs = header.getTag("minChunkDocs").asInteger();
- _docIdLimit = header.getTag("docIdLimit").asInteger();
- _minSkipDocs = header.getTag("minSkipDocs").asInteger();
- // Read feature decoding specific subheader
- d.readHeader(header, "features.");
- // Align on 64-bit unit
- d.smallAlign(64);
- headerLen += (-headerLen & 7);
- assert(d.getReadOffset() == headerLen * 8);
- _headerBitSize = d.getReadOffset();
+ readHeader<EG2PosOccDecodeContext<true> >(myId4);
}
const vespalib::string &
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
index a78ae6f14f3..dcaca5d1f81 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
@@ -6,6 +6,7 @@
#include <vespa/searchlib/bitcompression/compression.h>
#include <vespa/searchlib/bitcompression/posocccompression.h>
#include <vespa/searchlib/fef/termfieldmatchdataarray.h>
+#include "zc4_posting_params.h"
namespace search::diskindex {
@@ -14,18 +15,11 @@ class ZcPosOccRandRead : public index::PostingListFileRandRead
protected:
std::unique_ptr<FastOS_FileInterface> _file;
uint64_t _fileSize;
-
- uint32_t _minChunkDocs; // # of documents needed for chunking
- uint32_t _minSkipDocs; // # of documents needed for skipping
- uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit)
-
+ Zc4PostingParams _posting_params;
uint64_t _numWords; // Number of words in file
uint64_t _fileBitSize;
uint64_t _headerBitSize;
bitcompression::PosOccFieldsParams _fieldsParams;
- bool _dynamicK;
- bool _decode_cheap_features;
-
public:
ZcPosOccRandRead();
@@ -50,6 +44,8 @@ public:
bool open(const vespalib::string &name, const TuneFileRandRead &tuneFileRead) override;
bool close() override;
+ template <typename DecodeContext>
+ void readHeader(const vespalib::string &identifier);
virtual void readHeader();
static const vespalib::string &getIdentifier();
static const vespalib::string &getSubIdentifier();
@@ -57,17 +53,10 @@ public:
class Zc4PosOccRandRead : public ZcPosOccRandRead
{
+ using ZcPosOccRandRead::readHeader;
public:
Zc4PosOccRandRead();
- /**
- * Create iterator for single word. Semantic lifetime of counts and
- * handle must exceed lifetime of iterator.
- */
- queryeval::SearchIterator *
- createIterator(const PostingListCounts &counts, const PostingListHandle &handle,
- const fef::TermFieldMatchDataArray &matchData, bool usebitVector) const override;
-
void readHeader() override;
static const vespalib::string &getIdentifier();
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
index c79574a61ff..0f914e2e3b1 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
@@ -1223,6 +1223,7 @@ FakeZcSkipPosOcc<bigEndian>::FakeZcSkipPosOcc(const FakeWord &fw)
{
setup(fw);
_counts._bitLength = _compressedBits;
+ _counts._numDocs = _hitDocs;
}
@@ -1284,6 +1285,7 @@ FakeZc4SkipPosOcc<bigEndian>::FakeZc4SkipPosOcc(const FakeWord &fw, const Zc4Pos
{
setup(fw);
_counts._bitLength = _compressedBits;
+ _counts._numDocs = _hitDocs;
}
template <bool bigEndian>
@@ -1318,26 +1320,7 @@ SearchIterator *
FakeZc4SkipPosOcc<bigEndian>::
createIterator(const TermFieldMatchDataArray &matchData) const
{
- if (_hitDocs >= _posting_params._min_skip_docs) {
- if (_posting_params._dynamic_k) {
- return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features,
- static_cast<uint32_t>(-1),
- _counts,
- &_fieldsParams,
- matchData);
- } else {
- return new Zc4PosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features,
- static_cast<uint32_t>(-1), _counts, &_fieldsParams, matchData);
- }
- } else {
- if (_posting_params._dynamic_k) {
- return new ZcRareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0),
- _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData);
- } else {
- return new Zc4RareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0),
- _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData);
- }
- }
+ return create_zc_posocc_iterator(bigEndian, _counts, Position(_compressed.first, 0), _compressedBits, _posting_params, _fieldsParams, matchData).release();
}
template <bool bigEndian>