summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2019-05-08 21:04:19 +0200
committerGitHub <noreply@github.com>2019-05-08 21:04:19 +0200
commitf69f960d5ae48d246f56a60e6e46c90a58f836ba (patch)
tree4fbba4b13578dfd08f708a7b87784ffb713d38f2
parent26842a42a1521ad7c6ec333d723c358bb325850f (diff)
parentb8c3247c0813a91b5f064272f750f0e5c49e3352 (diff)
Merge pull request #9317 from vespa-engine/toregge/extend-zc4-postings-with-cheap-features
Extend Zc4/Zc5 postings with "cheap" features.
-rw-r--r--searchlib/src/vespa/searchlib/bitcompression/compression.h2
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h6
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp20
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h4
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp12
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h10
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp22
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h4
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp22
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h18
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp16
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h8
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp15
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h1
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposting.cpp5
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp68
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h21
-rw-r--r--searchlib/src/vespa/searchlib/index/docidandfeatures.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/index/docidandfeatures.h6
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp18
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakeword.h5
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp417
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h20
23 files changed, 423 insertions, 301 deletions
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h
index b9166f675aa..de206d33b8d 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/compression.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h
@@ -68,6 +68,8 @@ private:
#define K_VALUE_ZCPOSTING_L4SKIPSIZE 6
#define K_VALUE_ZCPOSTING_FEATURESSIZE 25
#define K_VALUE_ZCPOSTING_DELTA_DOCID 22
+#define K_VALUE_ZCPOSTING_FIELD_LENGTH 9
+#define K_VALUE_ZCPOSTING_NUM_OCCS 0
/**
* Lookup tables used for compression / decompression.
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h
index ea4cc6f58a6..8a5564d1cf7 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h
@@ -15,13 +15,15 @@ struct Zc4PostingParams {
uint32_t _doc_id_limit;
bool _dynamic_k;
bool _encode_features;
+ bool _encode_cheap_features;
- Zc4PostingParams(uint32_t min_skip_docs, uint32_t min_chunk_docs, uint32_t doc_id_limit, bool dynamic_k, bool encode_features)
+ Zc4PostingParams(uint32_t min_skip_docs, uint32_t min_chunk_docs, uint32_t doc_id_limit, bool dynamic_k, bool encode_features, bool encode_cheap_features)
: _min_skip_docs(min_skip_docs),
_min_chunk_docs(min_chunk_docs),
_doc_id_limit(doc_id_limit),
_dynamic_k(dynamic_k),
- _encode_features(encode_features)
+ _encode_features(encode_features),
+ _encode_cheap_features(encode_cheap_features)
{
}
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
index 961940d318f..ab2598211bb 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
@@ -50,6 +50,22 @@ Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features
UC64_DECODEEXPGOLOMB_SMALL_NS(o, _doc_id_k, EC);
_no_skip.set_doc_id(_no_skip.get_doc_id() + 1 + val64);
+ if (_posting_params._encode_cheap_features) {
+ if (__builtin_expect(oCompr >= d._valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC);
+ _no_skip.set_field_length(val64 + 1);
+ if (__builtin_expect(oCompr >= d._valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ UC64_DECODECONTEXT_LOAD(o, d._); // load (not store) after the refill, mirroring the field-length guard above
+ }
+ UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC);
+ _no_skip.set_num_occs(val64 + 1);
+ }
UC64_DECODECONTEXT_STORE(o, d._);
if (__builtin_expect(oCompr >= d._valE, false)) {
_readContext.readComprBuffer();
@@ -57,6 +73,10 @@ Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features
}
features.set_doc_id(_no_skip.get_doc_id());
if (_posting_params._encode_features) {
+ if (_posting_params._encode_cheap_features) {
+ features.set_field_length(_no_skip.get_field_length());
+ features.set_num_occs(_no_skip.get_num_occs());
+ }
_decodeContext->readFeatures(features);
}
--_residue;
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
index 59a660407b4..1be71450d09 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
@@ -10,7 +10,9 @@ namespace search::diskindex {
* Class used to read posting lists of type "Zc.4" and "Zc.5" (dynamic k).
*
* Common words have docid deltas and skip info separate from
- * features.
+ * features. If "cheap" features are enabled then they are interleaved
+ * with docid deltas for quick access during sequential scan while the
+ * full features still remain separate.
*
* Rare words do not have skip info, and docid deltas and features are
* interleaved.
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp
index 68cf9489475..411b1cae8b9 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp
@@ -14,6 +14,8 @@ using bitcompression::DecodeContext64Base;
Zc4PostingReaderBase::NoSkip::NoSkip()
: _zc_buf(),
_doc_id(0),
+ _field_length(1),
+ _num_occs(1),
_doc_id_pos(0),
_features_pos(0)
{
@@ -35,10 +37,14 @@ Zc4PostingReaderBase::NoSkip::setup(DecodeContext &decode_context, uint32_t size
}
void
-Zc4PostingReaderBase::NoSkip::read()
+Zc4PostingReaderBase::NoSkip::read(bool decode_cheap_features)
{
assert(_zc_buf._valI < _zc_buf._valE);
_doc_id += (_zc_buf.decode()+ 1);
+ if (decode_cheap_features) {
+ _field_length = _zc_buf.decode() + 1;
+ _num_occs = _zc_buf.decode() + 1;
+ }
_doc_id_pos = _zc_buf.pos();
}
@@ -167,7 +173,7 @@ Zc4PostingReaderBase::Zc4PostingReaderBase(bool dynamic_k)
_num_docs(0),
_readContext(sizeof(uint64_t)),
_has_more(false),
- _posting_params(64, 1 << 30, 10000000, dynamic_k, true),
+ _posting_params(64, 1 << 30, 10000000, dynamic_k, true, false),
_last_doc_id(0),
_no_skip(),
_l1_skip(),
@@ -206,7 +212,7 @@ Zc4PostingReaderBase::read_common_word_doc_id(DecodeContext64Base &decode_contex
}
_l1_skip.next_skip_entry();
}
- _no_skip.read();
+ _no_skip.read(_posting_params._encode_cheap_features);
if (_residue == 1) {
_no_skip.check_end(_last_doc_id);
_l1_skip.check_end(_last_doc_id);
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h
index bc72a61942b..7ca87445633 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h
@@ -24,6 +24,8 @@ protected:
protected:
ZcBuf _zc_buf;
uint32_t _doc_id;
+ uint32_t _field_length;
+ uint32_t _num_occs;
uint32_t _doc_id_pos;
uint64_t _features_pos;
public:
@@ -31,13 +33,17 @@ protected:
~NoSkip();
void setup(DecodeContext &decode_context, uint32_t size, uint32_t doc_id);
void set_features_pos(uint64_t features_pos) { _features_pos = features_pos; }
- void read();
+ void read(bool decode_cheap_features);
void check_end(uint32_t last_doc_id);
void check_not_end(uint32_t last_doc_id);
uint32_t get_doc_id() const { return _doc_id; }
+ uint32_t get_field_length() const { return _field_length; }
+ uint32_t get_num_occs() const { return _num_occs; }
uint32_t get_doc_id_pos() const { return _doc_id_pos; }
uint64_t get_features_pos() const { return _features_pos; }
- void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; }
+ void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; }
+ void set_field_length(uint32_t field_length) { _field_length = field_length; }
+ void set_num_occs(uint32_t num_occs) { _num_occs = num_occs; }
};
// Helper class for L1 skip info
class L1Skip : public NoSkip {
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
index 78d18cb5550..ad8f7440bdc 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
@@ -83,10 +83,10 @@ Zc4PostingWriter<bigEndian>::flush_word_with_skip(bool hasMore)
!hasMore) ?
numDocs : 1,
_docIdLimit);
- e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first,
+ e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back()._doc_id,
docIdK);
} else {
- e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first,
+ e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back()._doc_id,
K_VALUE_ZCPOSTING_LASTDOCID);
}
@@ -132,7 +132,7 @@ Zc4PostingWriter<bigEndian>::flush_word_with_skip(bool hasMore)
PostingListCounts::Segment seg;
seg._bitLength = writePos - (_writePos + _counts._bitLength);
seg._numDocs = numDocs;
- seg._lastDoc = _docIds.back().first;
+ seg._lastDoc = _docIds.back()._doc_id;
_counts._segments.push_back(seg);
_counts._bitLength += seg._bitLength;
}
@@ -153,11 +153,11 @@ Zc4PostingWriter<bigEndian>::write_docid_and_features(const DocIdAndFeatures &fe
uint64_t writeOffset = _encode_features->getWriteOffset();
uint64_t featureSize = writeOffset - _featureOffset;
assert(static_cast<uint32_t>(featureSize) == featureSize);
- _docIds.push_back(std::make_pair(features.doc_id(),
- static_cast<uint32_t>(featureSize)));
+ _docIds.emplace_back(features.doc_id(), features.field_length(), features.num_occs(),
+ static_cast<uint32_t>(featureSize));
_featureOffset = writeOffset;
} else {
- _docIds.push_back(std::make_pair(features.doc_id(), uint32_t(0)));
+ _docIds.emplace_back(features.doc_id(), features.field_length(), features.num_occs(), 0);
}
}
@@ -187,10 +187,16 @@ Zc4PostingWriter<bigEndian>::flush_word_no_skip()
std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end();
for (; dit != dite; ++dit) {
- uint32_t docId = dit->first;
- uint32_t featureSize = dit->second;
+ uint32_t docId = dit->_doc_id;
+ uint32_t featureSize = dit->_features_size;
e.encodeExpGolomb(docId - baseDocId, docIdK);
baseDocId = docId + 1;
+ if (_encode_cheap_features) {
+ assert(dit->_field_length > 0);
+ e.encodeExpGolomb(dit->_field_length - 1, K_VALUE_ZCPOSTING_FIELD_LENGTH);
+ assert(dit->_num_occs > 0);
+ e.encodeExpGolomb(dit->_num_occs - 1, K_VALUE_ZCPOSTING_NUM_OCCS);
+ }
if (featureSize != 0) {
e.writeBits(features + (featureOffset >> 6),
featureOffset & 63,
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h
index f2c96af8901..ce1a73db7a5 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h
@@ -12,7 +12,9 @@ namespace search::diskindex {
* Class used to write posting lists of type "Zc.4" and "Zc.5" (dynamic k).
*
* Common words have docid deltas and skip info separate from
- * features.
+ * features. If "cheap" features are enabled then they are interleaved
+ * with docid deltas for quick access during sequential scan while the
+ * full features still remain separate.
*
* Rare words do not have skip info, and docid deltas and features are
* interleaved.
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
index 5ab37cecc3d..6e0cf6ed881 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
@@ -15,7 +15,7 @@ protected:
uint32_t _doc_id;
uint32_t _doc_id_pos;
uint32_t _feature_pos;
- using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>;
+ using DocIdAndFeatureSize = Zc4PostingWriterBase::DocIdAndFeatureSize;
public:
DocIdEncoder()
@@ -25,7 +25,7 @@ public:
{
}
- void write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size);
+ void write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size, bool encode_cheap_features);
void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; }
uint32_t get_doc_id() const { return _doc_id; }
uint32_t get_doc_id_pos() const { return _doc_id_pos; }
@@ -100,11 +100,17 @@ public:
};
void
-DocIdEncoder::write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size)
+DocIdEncoder::write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size, bool encode_cheap_features)
{
- _feature_pos += doc_id_and_feature_size.second;
- zc_buf.encode(doc_id_and_feature_size.first - _doc_id - 1);
- _doc_id = doc_id_and_feature_size.first;
+ _feature_pos += doc_id_and_feature_size._features_size;
+ zc_buf.encode(doc_id_and_feature_size._doc_id - _doc_id - 1);
+ _doc_id = doc_id_and_feature_size._doc_id;
+ if (encode_cheap_features) {
+ assert(doc_id_and_feature_size._field_length > 0);
+ zc_buf.encode(doc_id_and_feature_size._field_length - 1);
+ assert(doc_id_and_feature_size._num_occs > 0);
+ zc_buf.encode(doc_id_and_feature_size._num_occs - 1);
+ }
_doc_id_pos = zc_buf.size();
}
@@ -199,6 +205,7 @@ Zc4PostingWriterBase::Zc4PostingWriterBase(PostingListCounts &counts)
_featureOffset(0),
_writePos(0),
_dynamicK(false),
+ _encode_cheap_features(false),
_zcDocIds(),
_l1Skip(),
_l2Skip(),
@@ -257,7 +264,7 @@ Zc4PostingWriterBase::calc_skip_info(bool encode_features)
}
}
}
- doc_id_encoder.write(_zcDocIds, doc_id_and_feature_size);
+ doc_id_encoder.write(_zcDocIds, doc_id_and_feature_size, _encode_cheap_features);
}
// Extra partial entries for skip tables to simplify iterator during search
l1_skip_encoder.write_partial_skip(_l1Skip, doc_id_encoder.get_doc_id());
@@ -282,6 +289,7 @@ Zc4PostingWriterBase::set_posting_list_params(const PostingListParams &params)
params.get("docIdLimit", _docIdLimit);
params.get("minChunkDocs", _minChunkDocs);
params.get("minSkipDocs", _minSkipDocs);
+ params.get("cheap_features", _encode_cheap_features);
}
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
index 6da59028803..bb94e379c38 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
@@ -18,18 +18,32 @@ namespace search::diskindex {
*/
class Zc4PostingWriterBase
{
+public:
+ struct DocIdAndFeatureSize {
+ uint32_t _doc_id;
+ uint32_t _field_length;
+ uint32_t _num_occs;
+ uint32_t _features_size;
+ DocIdAndFeatureSize(uint32_t doc_id, uint32_t field_length, uint32_t num_occs, uint32_t features_size)
+ : _doc_id(doc_id),
+ _field_length(field_length),
+ _num_occs(num_occs),
+ _features_size(features_size)
+ {
+ }
+ };
protected:
uint32_t _minChunkDocs; // # of documents needed for chunking
uint32_t _minSkipDocs; // # of documents needed for skipping
uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit)
// Unpacked document ids for word and feature sizes
- using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>;
std::vector<DocIdAndFeatureSize> _docIds;
uint64_t _featureOffset; // Bit offset of next feature
uint64_t _writePos; // Bit position for start of current word
bool _dynamicK; // Caclulate EG compression parameters ?
+ bool _encode_cheap_features;
ZcBuf _zcDocIds; // Document id deltas
ZcBuf _l1Skip; // L1 skip info
ZcBuf _l2Skip; // L2 skip info
@@ -58,7 +72,9 @@ public:
uint32_t get_docid_limit() const { return _docIdLimit; }
uint64_t get_num_words() const { return _numWords; }
bool get_dynamic_k() const { return _dynamicK; }
+ bool get_encode_cheap_features() const { return _encode_cheap_features; }
void set_dynamic_k(bool dynamicK) { _dynamicK = dynamicK; }
+ void set_encode_cheap_features(bool encode_cheap_features) { _encode_cheap_features = encode_cheap_features; }
void set_posting_list_params(const index::PostingListParams &params);
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp
index 64160c848fb..7678b11ba41 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp
@@ -13,10 +13,10 @@ using search::index::PostingListCounts;
template <bool bigEndian>
Zc4RareWordPosOccIterator<bigEndian>::
-Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit,
+Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features,
const PosOccFieldsParams *fieldsParams,
const TermFieldMatchDataArray &matchData)
- : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit),
+ : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit, decode_cheap_features),
_decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams)
{
assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size()));
@@ -26,11 +26,11 @@ Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimi
template <bool bigEndian>
Zc4PosOccIterator<bigEndian>::
-Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit,
+Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features,
uint32_t minChunkDocs, const PostingListCounts &counts,
const PosOccFieldsParams *fieldsParams,
const TermFieldMatchDataArray &matchData)
- : ZcPostingIterator<bigEndian>(minChunkDocs, false, counts, matchData, start, docIdLimit),
+ : ZcPostingIterator<bigEndian>(minChunkDocs, false, counts, matchData, start, docIdLimit, decode_cheap_features),
_decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams)
{
assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size()));
@@ -40,10 +40,10 @@ Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit,
template <bool bigEndian>
ZcRareWordPosOccIterator<bigEndian>::
-ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit,
+ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features,
const PosOccFieldsParams *fieldsParams,
const TermFieldMatchDataArray &matchData)
- : ZcRareWordPostingIterator<bigEndian>(matchData, start, docIdLimit),
+ : ZcRareWordPostingIterator<bigEndian>(matchData, start, docIdLimit, decode_cheap_features),
_decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams)
{
assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size()));
@@ -53,11 +53,11 @@ ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit
template <bool bigEndian>
ZcPosOccIterator<bigEndian>::
-ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit,
+ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features,
uint32_t minChunkDocs, const PostingListCounts &counts,
const PosOccFieldsParams *fieldsParams,
const TermFieldMatchDataArray &matchData)
- : ZcPostingIterator<bigEndian>(minChunkDocs, true, counts, matchData, start, docIdLimit),
+ : ZcPostingIterator<bigEndian>(minChunkDocs, true, counts, matchData, start, docIdLimit, decode_cheap_features),
_decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams)
{
assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size()));
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h
index 41f2b747916..3b58203aab4 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h
@@ -17,7 +17,7 @@ private:
typedef bitcompression::EG2PosOccDecodeContextCooked<bigEndian> DecodeContextReal;
DecodeContextReal _decodeContextReal;
public:
- Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit,
+ Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features,
const bitcompression::PosOccFieldsParams *fieldsParams,
const fef::TermFieldMatchDataArray &matchData);
};
@@ -33,7 +33,7 @@ private:
typedef bitcompression::EG2PosOccDecodeContextCooked<bigEndian> DecodeContext;
DecodeContext _decodeContextReal;
public:
- Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit,
+ Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features,
uint32_t minChunkDocs, const index::PostingListCounts &counts,
const bitcompression::PosOccFieldsParams *fieldsParams,
const fef::TermFieldMatchDataArray &matchData);
@@ -50,7 +50,7 @@ private:
typedef bitcompression::EGPosOccDecodeContextCooked<bigEndian> DecodeContextReal;
DecodeContextReal _decodeContextReal;
public:
- ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit,
+ ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit, bool decode_cheap_features,
const bitcompression::PosOccFieldsParams *fieldsParams,
const fef::TermFieldMatchDataArray &matchData);
};
@@ -66,7 +66,7 @@ private:
typedef bitcompression::EGPosOccDecodeContextCooked<bigEndian> DecodeContext;
DecodeContext _decodeContextReal;
public:
- ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit,
+ ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit, bool decode_cheap_features,
uint32_t minChunkDocs, const index::PostingListCounts &counts,
const bitcompression::PosOccFieldsParams *fieldsParams,
const fef::TermFieldMatchDataArray &matchData);
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
index 90680851859..9d7df382325 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
@@ -25,6 +25,7 @@ namespace {
vespalib::string myId4("Zc.4");
vespalib::string myId5("Zc.5");
+vespalib::string cheap_features("cheap_features");
}
@@ -42,7 +43,8 @@ ZcPosOccRandRead::ZcPosOccRandRead()
_fileBitSize(0),
_headerBitSize(0),
_fieldsParams(),
- _dynamicK(true)
+ _dynamicK(true),
+ _decode_cheap_features(false)
{ }
@@ -95,9 +97,9 @@ createIterator(const PostingListCounts &counts,
uint32_t numDocs = static_cast<uint32_t>(val64) + 1;
if (numDocs < _minSkipDocs) {
- return new ZcRareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, &_fieldsParams, matchData);
+ return new ZcRareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData);
} else {
- return new ZcPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _minChunkDocs, counts, &_fieldsParams, matchData);
+ return new ZcPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData);
}
}
@@ -231,6 +233,9 @@ ZcPosOccRandRead::readHeader()
_minChunkDocs = header.getTag("minChunkDocs").asInteger();
_docIdLimit = header.getTag("docIdLimit").asInteger();
_minSkipDocs = header.getTag("minSkipDocs").asInteger();
+ if (header.hasTag(cheap_features) && (header.getTag(cheap_features).asInteger() != 0)) {
+ _decode_cheap_features = true;
+ }
// Read feature decoding specific subheader
d.readHeader(header, "features.");
// Align on 64-bit unit
@@ -304,9 +309,9 @@ createIterator(const PostingListCounts &counts,
uint32_t numDocs = static_cast<uint32_t>(val64) + 1;
if (numDocs < _minSkipDocs) {
- return new Zc4RareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, &_fieldsParams, matchData);
+ return new Zc4RareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData);
} else {
- return new Zc4PosOccIterator<true>(start, handle._bitLength, _docIdLimit, _minChunkDocs, counts, &_fieldsParams, matchData);
+ return new Zc4PosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData);
}
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
index 3741f011c30..a78ae6f14f3 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
@@ -24,6 +24,7 @@ protected:
uint64_t _headerBitSize;
bitcompression::PosOccFieldsParams _fieldsParams;
bool _dynamicK;
+ bool _decode_cheap_features;
public:
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
index a0203b64197..b03085b0b55 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
@@ -16,6 +16,7 @@ namespace {
vespalib::string myId5("Zc.5");
vespalib::string myId4("Zc.4");
vespalib::string emptyId;
+vespalib::string cheap_features("cheap_features");
}
@@ -165,6 +166,9 @@ Zc4PostingSeqRead::readHeader()
posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger();
posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger();
posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger();
+ if (header.hasTag(cheap_features) && (header.getTag(cheap_features).asInteger() != 0)) {
+ posting_params._encode_cheap_features = true;
+ }
assert(header.getTag("endian").asString() == "big");
// Read feature decoding specific subheader
d.readHeader(header, "features.");
@@ -233,6 +237,7 @@ Zc4PostingSeqWrite::makeHeader(const FileHeaderContext &fileHeaderContext)
header.putTag(Tag("fileBitSize", 0));
header.putTag(Tag("format.0", myId));
header.putTag(Tag("format.1", f.getIdentifier()));
+ header.putTag(Tag("cheap_features", _writer.get_encode_cheap_features() ? 1 : 0));
header.putTag(Tag("numWords", 0));
header.putTag(Tag("minChunkDocs", _writer.get_min_chunk_docs()));
header.putTag(Tag("docIdLimit", _writer.get_docid_limit()));
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp
index 95679bb0af2..8ea576b06b0 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp
@@ -35,12 +35,15 @@ ZcIteratorBase::initRange(uint32_t beginid, uint32_t endid)
template <bool bigEndian>
Zc4RareWordPostingIterator<bigEndian>::
-Zc4RareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit)
+Zc4RareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features)
: ZcIteratorBase(matchData, start, docIdLimit),
_decodeContext(nullptr),
_residue(0),
_prevDocId(0),
- _numDocs(0)
+ _numDocs(0),
+ _decode_cheap_features(decode_cheap_features),
+ _field_length(0),
+ _num_occs(0)
{ }
@@ -66,6 +69,12 @@ Zc4RareWordPostingIterator<bigEndian>::doSeek(uint32_t docId)
printf("Decode docId=%d\n",
oDocId);
#endif
+ if (_decode_cheap_features) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC);
+ _field_length = static_cast<uint32_t>(val64) + 1;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC);
+ _num_occs = static_cast<uint32_t>(val64) + 1;
+ }
}
while (__builtin_expect(oDocId < docId, true)) {
UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
@@ -80,6 +89,12 @@ Zc4RareWordPostingIterator<bigEndian>::doSeek(uint32_t docId)
printf("Decode docId=%d\n",
oDocId);
#endif
+ if (_decode_cheap_features) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC);
+ _field_length = static_cast<uint32_t>(val64) + 1;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC);
+ _num_occs = static_cast<uint32_t>(val64) + 1;
+ }
}
UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
setDocId(oDocId);
@@ -123,6 +138,12 @@ Zc4RareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit)
_numDocs = static_cast<uint32_t>(val64) + 1;
UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DELTA_DOCID, EC);
uint32_t docId = static_cast<uint32_t>(val64) + 1;
+ if (_decode_cheap_features) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC);
+ _field_length = static_cast<uint32_t>(val64) + 1;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC);
+ _num_occs = static_cast<uint32_t>(val64) + 1;
+ }
UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
setDocId(docId);
@@ -133,8 +154,8 @@ Zc4RareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit)
template <bool bigEndian>
ZcRareWordPostingIterator<bigEndian>::
-ZcRareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit)
- : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit),
+ZcRareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features)
+ : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit, decode_cheap_features),
_docIdK(0)
{
}
@@ -162,6 +183,12 @@ ZcRareWordPostingIterator<bigEndian>::doSeek(uint32_t docId)
printf("Decode docId=%d\n",
oDocId);
#endif
+ if (_decode_cheap_features) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC);
+ _field_length = static_cast<uint32_t>(val64) + 1;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC);
+ _num_occs = static_cast<uint32_t>(val64) + 1;
+ }
}
while (__builtin_expect(oDocId < docId, true)) {
UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
@@ -176,6 +203,12 @@ ZcRareWordPostingIterator<bigEndian>::doSeek(uint32_t docId)
printf("Decode docId=%d\n",
oDocId);
#endif
+ if (_decode_cheap_features) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC);
+ _field_length = static_cast<uint32_t>(val64) + 1;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC);
+ _num_occs = static_cast<uint32_t>(val64) + 1;
+ }
}
UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
setDocId(oDocId);
@@ -200,6 +233,12 @@ ZcRareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit)
_docIdK = EC::calcDocIdK(_numDocs, docIdLimit);
UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC);
uint32_t docId = static_cast<uint32_t>(val64) + 1;
+ if (_decode_cheap_features) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC);
+ _field_length = static_cast<uint32_t>(val64) + 1;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC);
+ _num_occs = static_cast<uint32_t>(val64) + 1;
+ }
UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
setDocId(docId);
@@ -207,7 +246,7 @@ ZcRareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit)
clearUnpacked();
}
-ZcPostingIteratorBase::ZcPostingIteratorBase(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit)
+ZcPostingIteratorBase::ZcPostingIteratorBase(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features)
: ZcIteratorBase(matchData, start, docIdLimit),
_valI(nullptr),
_valIBase(nullptr),
@@ -219,7 +258,10 @@ ZcPostingIteratorBase::ZcPostingIteratorBase(const TermFieldMatchDataArray &matc
_chunk(),
_featuresSize(0),
_hasMore(false),
- _chunkNo(0)
+ _decode_cheap_features(decode_cheap_features),
+ _chunkNo(0),
+ _field_length(0),
+ _num_occs(0)
{
}
@@ -229,8 +271,8 @@ ZcPostingIterator(uint32_t minChunkDocs,
bool dynamicK,
const PostingListCounts &counts,
const search::fef::TermFieldMatchDataArray &matchData,
- Position start, uint32_t docIdLimit)
- : ZcPostingIteratorBase(matchData, start, docIdLimit),
+ Position start, uint32_t docIdLimit, bool decode_cheap_features)
+ : ZcPostingIteratorBase(matchData, start, docIdLimit, decode_cheap_features),
_decodeContext(nullptr),
_minChunkDocs(minChunkDocs),
_docIdK(0),
@@ -550,6 +592,8 @@ ZcPostingIteratorBase::doSeek(uint32_t docId)
assert(docId <= _l4._skipDocId);
#endif
const uint8_t *oCompr = _valI;
+ uint32_t field_length = _field_length;
+ uint32_t num_occs = _num_occs;
while (__builtin_expect(oDocId < docId, true)) {
#if DEBUG_ZCPOSTING_ASSERT
assert(oDocId <= _l1._skipDocId);
@@ -562,10 +606,18 @@ ZcPostingIteratorBase::doSeek(uint32_t docId)
printf("Decode docId=%d\n",
oDocId);
#endif
+ if (_decode_cheap_features) {
+ ZCDECODE(oCompr, field_length =);
+ ZCDECODE(oCompr, num_occs =);
+ }
incNeedUnpack();
}
_valI = oCompr;
setDocId(oDocId);
+ if (_decode_cheap_features) {
+ _field_length = field_length;
+ _num_occs = num_occs;
+ }
return;
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h
index 97b7e2dc0cc..222fb404a7d 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h
@@ -68,8 +68,11 @@ public:
unsigned int _residue;
uint32_t _prevDocId; // Previous document id
uint32_t _numDocs; // Documents in chunk or word
+ bool _decode_cheap_features;
+ uint32_t _field_length;
+ uint32_t _num_occs;
- Zc4RareWordPostingIterator(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit);
+ Zc4RareWordPostingIterator(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features);
void doUnpack(uint32_t docId) override;
void doSeek(uint32_t docId) override;
@@ -89,12 +92,15 @@ private:
using ParentClass::setDocId;
using ParentClass::setAtEnd;
using ParentClass::_numDocs;
+ using ParentClass::_decode_cheap_features;
+ using ParentClass::_field_length;
+ using ParentClass::_num_occs;
uint32_t _docIdK;
public:
using ParentClass::_decodeContext;
- ZcRareWordPostingIterator(const search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit);
+ ZcRareWordPostingIterator(const search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features);
void doSeek(uint32_t docId) override;
void readWordStart(uint32_t docIdLimit) override;
@@ -239,12 +245,19 @@ protected:
ChunkSkip _chunk;
uint64_t _featuresSize;
bool _hasMore;
+ bool _decode_cheap_features;
uint32_t _chunkNo;
+ uint32_t _field_length;
+ uint32_t _num_occs;
void nextDocId(uint32_t prevDocId) {
uint32_t docId = prevDocId + 1;
ZCDECODE(_valI, docId +=);
setDocId(docId);
+ if (_decode_cheap_features) {
+ ZCDECODE(_valI, _field_length =);
+ ZCDECODE(_valI, _num_occs =);
+ }
}
virtual void featureSeek(uint64_t offset) = 0;
VESPA_DLL_LOCAL void doChunkSkipSeek(uint32_t docId);
@@ -254,7 +267,7 @@ protected:
VESPA_DLL_LOCAL void doL1SkipSeek(uint32_t docId);
void doSeek(uint32_t docId) override;
public:
- ZcPostingIteratorBase(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit);
+ ZcPostingIteratorBase(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features);
};
template <bool bigEndian>
@@ -281,7 +294,7 @@ public:
const PostingListCounts &_counts;
ZcPostingIterator(uint32_t minChunkDocs, bool dynamicK, const PostingListCounts &counts,
- const search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit);
+ const search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features);
void doUnpack(uint32_t docId) override;
diff --git a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp
index 07b4da8a85f..ac3c4f4d3a5 100644
--- a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp
+++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp
@@ -8,6 +8,8 @@ namespace search::index {
DocIdAndFeatures::DocIdAndFeatures()
: _doc_id(0),
+ _field_length(1),
+ _num_occs(1),
_elements(),
_word_positions(),
_blob(),
@@ -19,6 +21,6 @@ DocIdAndFeatures::DocIdAndFeatures()
DocIdAndFeatures::DocIdAndFeatures(const DocIdAndFeatures &) = default;
DocIdAndFeatures & DocIdAndFeatures::operator = (const DocIdAndFeatures &) = default;
-DocIdAndFeatures::~DocIdAndFeatures() { }
+DocIdAndFeatures::~DocIdAndFeatures() = default;
}
diff --git a/searchlib/src/vespa/searchlib/index/docidandfeatures.h b/searchlib/src/vespa/searchlib/index/docidandfeatures.h
index a063712a79e..5372d5ef3aa 100644
--- a/searchlib/src/vespa/searchlib/index/docidandfeatures.h
+++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.h
@@ -95,6 +95,8 @@ public:
protected:
uint32_t _doc_id; // Current document id
+ uint32_t _field_length;
+ uint32_t _num_occs;
std::vector<WordDocElementFeatures> _elements;
std::vector<WordDocElementWordPosFeatures> _word_positions;
@@ -140,7 +142,11 @@ public:
}
uint32_t doc_id() const { return _doc_id; }
+ uint32_t field_length() const { return _field_length; }
+ uint32_t num_occs() const { return _num_occs; }
void set_doc_id(uint32_t val) { _doc_id = val; }
+ void set_field_length(uint32_t val) { _field_length = val; }
+ void set_num_occs(uint32_t val) { _num_occs = val; }
const std::vector<WordDocElementFeatures>& elements() const { return _elements; }
std::vector<WordDocElementFeatures>& elements() { return _elements; }
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp
index 8f6c16658c9..601451dc6c4 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp
@@ -107,14 +107,12 @@ FakeWord::DocWordPosFeature::~DocWordPosFeature()
FakeWord::DocWordCollapsedFeature::DocWordCollapsedFeature()
+ : _field_len(0),
+ _num_occs(0)
{
}
-
-FakeWord::DocWordCollapsedFeature::~DocWordCollapsedFeature()
-{
-}
-
+FakeWord::DocWordCollapsedFeature::~DocWordCollapsedFeature() = default;
FakeWord::DocWordFeature::DocWordFeature()
: _docId(0),
@@ -235,14 +233,16 @@ FakeWord::fakeup(search::BitVector &bitmap,
DocWordPosFeature dwpf;
dwpf._wordPos = rnd.lrand48() % 8192;
dwpf._elementId = 0;
- if (_fieldsParams.getFieldParams()[0]._hasElements)
+ if (_fieldsParams.getFieldParams()[0]._hasElements) {
dwpf._elementId = rnd.lrand48() % 4;
+ }
wpf.push_back(dwpf);
}
if (positions > 1) {
/* Sort wordpos list and "avoid" duplicate positions */
std::sort(wpf.begin(), wpf.end());
}
+ uint32_t field_len = 0;
do {
DocWordPosFeatureList::iterator ie(wpf.end());
DocWordPosFeatureList::iterator i(wpf.begin());
@@ -274,8 +274,14 @@ FakeWord::fakeup(search::BitVector &bitmap,
pi->_elementWeight = elementWeight;
++pi;
}
+ field_len += elementLen;
+ }
+ if (_fieldsParams.getFieldParams()[0]._hasElements) {
+ field_len += ((rnd.lrand48() % 10) + 10);
}
} while (0);
+ dwf._collapsedDocWordFeatures._field_len = field_len;
+ dwf._collapsedDocWordFeatures._num_occs = dwf._positions;
dwf._accPositions = wordPosFeatures.size();
assert(dwf._positions == wpf.size());
postings.push_back(dwf);
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h
index 345d69c29f6..106c0c0d9ab 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h
@@ -49,6 +49,9 @@ public:
class DocWordCollapsedFeature
{
public:
+ uint32_t _field_len;
+ uint32_t _num_occs;
+
DocWordCollapsedFeature();
~DocWordCollapsedFeature();
};
@@ -201,6 +204,8 @@ public:
p->_elementWeight, p->_elementLen);
++p;
}
+ features.set_field_length(d._collapsedDocWordFeatures._field_len);
+ features.set_num_occs(d._collapsedDocWordFeatures._num_occs);
}
public:
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
index f6c6e5a64f3..c79574a61ff 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
@@ -29,6 +29,13 @@ namespace search {
namespace fakedata {
+namespace {
+
+constexpr uint32_t disable_chunking = 1000000000;
+constexpr uint32_t disable_skip = 1000000000;
+constexpr uint32_t force_skip = 1;
+
+}
#define DEBUG_ZCFILTEROCC_PRINTF 0
#define DEBUG_ZCFILTEROCC_ASSERT 0
@@ -86,7 +93,6 @@ FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw)
_l2SkipSize(0),
_l3SkipSize(0),
_l4SkipSize(0),
- _docIdLimit(0),
_hitDocs(0),
_lastDocId(0u),
_compressedBits(0),
@@ -94,14 +100,16 @@ FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw)
_compressedMalloc(NULL),
_featuresSize(0),
_fieldsParams(fw.getFieldsParams()),
- _bigEndian(true)
+ _bigEndian(true),
+ _posting_params(force_skip, disable_chunking, fw._docIdLimit, true, false, false)
{
- setup(fw, false, true);
+ setup(fw);
}
FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw,
bool bigEndian,
+ const diskindex::Zc4PostingParams &posting_params,
const char *nameSuffix)
: FakePosting(fw.getName() + nameSuffix),
_docIdsSize(0),
@@ -109,36 +117,34 @@ FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw,
_l2SkipSize(0),
_l3SkipSize(0),
_l4SkipSize(0),
- _docIdLimit(0),
_hitDocs(0),
_lastDocId(0u),
_compressedBits(0),
_compressed(std::make_pair(static_cast<uint64_t *>(NULL), 0)),
_featuresSize(0),
_fieldsParams(fw.getFieldsParams()),
- _bigEndian(bigEndian)
+ _bigEndian(bigEndian),
+ _posting_params(posting_params)
{
- // subclass responsible for calling setup(fw, false/true);
+ // subclass responsible for calling setup(fw);
}
void
-FakeZcFilterOcc::setup(const FakeWord &fw, bool doFeatures,
- bool dynamicK)
+FakeZcFilterOcc::setup(const FakeWord &fw)
{
if (_bigEndian) {
- setupT<true>(fw, doFeatures, dynamicK);
+ setupT<true>(fw);
} else {
- setupT<false>(fw, doFeatures, dynamicK);
+ setupT<false>(fw);
}
- validate_read(fw, doFeatures, dynamicK);
+ validate_read(fw);
}
template <bool bigEndian>
void
-FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures,
- bool dynamicK)
+FakeZcFilterOcc::setupT(const FakeWord &fw)
{
PostingListCounts counts;
Zc4PostingWriter<bigEndian> writer(counts);
@@ -154,18 +160,19 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures,
DocIdAndPosOccFeatures features;
EGPosOccEncodeContext<bigEndian> f1(&_fieldsParams);
EG2PosOccEncodeContext<bigEndian> f0(&_fieldsParams);
- FeatureEncodeContext<bigEndian> &f = (dynamicK ?
+ FeatureEncodeContext<bigEndian> &f = (_posting_params._dynamic_k ?
static_cast<FeatureEncodeContext<bigEndian> &>(f1) :
static_cast<FeatureEncodeContext<bigEndian> &>(f0));
- writer.set_dynamic_k(dynamicK);
- if (doFeatures) {
+ writer.set_dynamic_k(_posting_params._dynamic_k);
+ if (_posting_params._encode_features) {
writer.set_encode_features(&f);
}
PostingListParams params;
params.set("docIdLimit", fw._docIdLimit);
- params.set("minChunkDocs", 1000000000); // Disable chunking
- params.set("minSkipDocs", 1u); // Force skip info
+ params.set("minChunkDocs", _posting_params._min_chunk_docs); // Control chunking
+ params.set("minSkipDocs", _posting_params._min_skip_docs); // Control skip info
+ params.set("cheap_features", _posting_params._encode_cheap_features);
writer.set_posting_list_params(params);
auto &writeContext = writer.get_write_context();
search::ComprBuffer &cb = writeContext;
@@ -174,7 +181,7 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures,
e.setupWrite(cb);
// Ensure that some space is initially available in encoding buffers
while (d != de) {
- if (doFeatures) {
+ if (_posting_params._encode_features) {
fw.setupFeatures(*d, &*p, features);
p += d->_positions;
} else {
@@ -183,13 +190,12 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures,
writer.write_docid_and_features(features);
++d;
}
- if (doFeatures) {
+ if (_posting_params._encode_features) {
assert(p == pe);
}
writer.flush_word();
_featuresSize = 0;
_hitDocs = fw._postings.size();
- _docIdLimit = fw._docIdLimit;
_compressedBits = e.getWriteOffset();
assert(_compressedBits == counts._bitLength);
assert(_hitDocs == counts._numDocs);
@@ -199,55 +205,56 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures,
std::pair<void *, size_t> ectxData = writeContext.grabComprBuffer(_compressedMalloc);
_compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first),
ectxData.second);
- read_header<bigEndian>(doFeatures, dynamicK, writer.get_min_skip_docs(), writer.get_min_chunk_docs());
+ read_header<bigEndian>();
}
template <bool bigEndian>
void
-FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_docs, uint32_t min_chunk_docs)
+FakeZcFilterOcc::read_header()
{
// read back word header to get skip sizes
DecodeContext64<bigEndian> decode_context;
decode_context.setPosition({ _compressed.first, 0 });
- Zc4PostingParams params(min_skip_docs, min_chunk_docs, _docIdLimit, dynamicK, doFeatures);
Zc4PostingHeader header;
- header.read(decode_context, params);
+ header.read(decode_context, _posting_params);
_docIdsSize = header._doc_ids_size;
_l1SkipSize = header._l1_skip_size;
_l2SkipSize = header._l2_skip_size;
_l3SkipSize = header._l3_skip_size;
_l4SkipSize = header._l4_skip_size;
_featuresSize = header._features_size;
- assert(_lastDocId == header._last_doc_id);
+ assert(header._num_docs == _hitDocs);
+ if (header._num_docs >= _posting_params._min_skip_docs) {
+ assert(_lastDocId == header._last_doc_id);
+ } else {
+ assert(header._last_doc_id == 0);
+ }
}
void
-FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const
+FakeZcFilterOcc::validate_read(const FakeWord &fw) const
{
if (_bigEndian) {
- validate_read<true>(fw, encode_features, dynamic_k);
+ validate_read<true>(fw);
} else {
- validate_read<false>(fw, encode_features, dynamic_k);
+ validate_read<false>(fw);
}
}
template <bool bigEndian>
void
-FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const
+FakeZcFilterOcc::validate_read(const FakeWord &fw) const
{
bitcompression::EGPosOccDecodeContextCooked<bigEndian> decode_context_dynamic_k(&_fieldsParams);
bitcompression::EG2PosOccDecodeContextCooked<bigEndian> decode_context_static_k(&_fieldsParams);
bitcompression::FeatureDecodeContext<bigEndian> &decode_context_dynamic_k_upcast = decode_context_dynamic_k;
bitcompression::FeatureDecodeContext<bigEndian> &decode_context_static_k_upcast = decode_context_static_k;
- bitcompression::FeatureDecodeContext<bigEndian> &decode_context = dynamic_k ? decode_context_dynamic_k_upcast : decode_context_static_k_upcast;
- Zc4PostingReader<bigEndian> reader(dynamic_k);
+ bitcompression::FeatureDecodeContext<bigEndian> &decode_context = _posting_params._dynamic_k ? decode_context_dynamic_k_upcast : decode_context_static_k_upcast;
+ Zc4PostingReader<bigEndian> reader(_posting_params._dynamic_k);
reader.set_decode_features(&decode_context);
auto &params = reader.get_posting_params();
- params._min_skip_docs = 1;
- params._min_chunk_docs = 1000000000;
- params._doc_id_limit = _docIdLimit;
- params._encode_features = encode_features;
+ params = _posting_params;
reader.get_read_context().reference_compressed_buffer(_compressed.first, _compressed.second);
assert(decode_context.getReadOffset() == 0u);
PostingListCounts counts;
@@ -260,7 +267,7 @@ FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dy
DocIdAndFeatures features;
uint32_t hits = 0;
for (const auto &doc : fw._postings) {
- if (encode_features) {
+ if (_posting_params._encode_features) {
fw.setupFeatures(doc, &*word_pos_iterator, check_features);
word_pos_iterator += doc._positions;
} else {
@@ -270,9 +277,13 @@ FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dy
assert(features.doc_id() == doc._docId);
assert(features.elements().size() == check_features.elements().size());
assert(features.word_positions().size() == check_features.word_positions().size());
+ if (_posting_params._encode_cheap_features) {
+ assert(features.field_length() == doc._collapsedDocWordFeatures._field_len);
+ assert(features.num_occs() == doc._collapsedDocWordFeatures._num_occs);
+ }
++hits;
}
- if (encode_features) {
+ if (_posting_params._encode_features) {
assert(word_pos_iterator == word_pos_iterator_end);
}
reader.read_doc_id_and_features(features);
@@ -398,7 +409,7 @@ public:
uint32_t docIdLimit,
const fef::TermFieldMatchDataArray &matchData);
- ~FakeFilterOccZCArrayIterator();
+ ~FakeFilterOccZCArrayIterator() override;
void doUnpack(uint32_t docId) override;
void doSeek(uint32_t docId) override;
@@ -427,7 +438,7 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end)
{
queryeval::RankedSearchIteratorBase::initRange(begin, end);
DecodeContext &d = _decodeContext;
- Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false);
+ Zc4PostingParams params(force_skip, disable_chunking, _docIdLimit, true, false, false);
Zc4PostingHeader header;
header.read(d, params);
assert((d.getBitOffset() & 7) == 0);
@@ -462,11 +473,13 @@ FakeFilterOccZCArrayIterator::doSeek(uint32_t docId)
const uint8_t *oCompr = _valI;
uint32_t oDocId = getDocId();
- if (getUnpacked())
+ if (getUnpacked()) {
clearUnpacked();
+ }
while (oDocId < docId) {
- if (--_residue == 0)
+ if (--_residue == 0) {
goto atbreak;
+ }
ZCDECODE(oCompr, oDocId += 1 +);
#if DEBUG_ZCFILTEROCC_PRINTF
printf("Decode docId=%d\n",
@@ -499,7 +512,7 @@ SearchIterator *
FakeZcFilterOcc::
createIterator(const TermFieldMatchDataArray &matchData) const
{
- return new FakeFilterOccZCArrayIterator(_compressed.first, 0, _docIdLimit, matchData);
+ return new FakeFilterOccZCArrayIterator(_compressed.first, 0, _posting_params._doc_id_limit, matchData);
}
template <bool doSkip>
@@ -508,7 +521,7 @@ class FakeZcSkipFilterOcc : public FakeZcFilterOcc
public:
FakeZcSkipFilterOcc(const FakeWord &fw);
- ~FakeZcSkipFilterOcc();
+ ~FakeZcSkipFilterOcc() override;
SearchIterator *createIterator(const TermFieldMatchDataArray &matchData) const override;
};
@@ -523,17 +536,17 @@ initSkip(std::make_pair("ZcSkipFilterOcc",
template<>
FakeZcSkipFilterOcc<false>::FakeZcSkipFilterOcc(const FakeWord &fw)
- : FakeZcFilterOcc(fw, true, ".zcnoskipfilterocc")
+ : FakeZcFilterOcc(fw, true, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, true, false, false), ".zc5noskipfilterocc")
{
- setup(fw, false, true);
+ setup(fw);
}
template<>
FakeZcSkipFilterOcc<true>::FakeZcSkipFilterOcc(const FakeWord &fw)
- : FakeZcFilterOcc(fw, true, ".zcskipfilterocc")
+ : FakeZcFilterOcc(fw, true, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, true, false, false), ".zc5skipfilterocc")
{
- setup(fw, false, true);
+ setup(fw);
}
@@ -591,7 +604,7 @@ public:
uint32_t docIdLimit,
const TermFieldMatchDataArray &matchData);
- ~FakeFilterOccZCSkipArrayIterator();
+ ~FakeFilterOccZCSkipArrayIterator() override;
void doL4SkipSeek(uint32_t docId);
void doL3SkipSeek(uint32_t docId);
@@ -648,7 +661,7 @@ initRange(uint32_t begin, uint32_t end)
{
queryeval::RankedSearchIteratorBase::initRange(begin, end);
DecodeContext &d = _decodeContext;
- Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false);
+ Zc4PostingParams params(force_skip, disable_chunking, _docIdLimit, true, false, false);
Zc4PostingHeader header;
header.read(d, params);
_lastDocId = header._last_doc_id;
@@ -800,8 +813,9 @@ FakeFilterOccZCSkipArrayIterator<true>::doL3SkipSeek(uint32_t docId)
if (__builtin_expect(docId > _l4SkipDocId, false)) {
doL4SkipSeek(docId);
- if (docId <= _l3SkipDocId)
+ if (docId <= _l3SkipDocId) {
return;
+ }
}
do {
lastL3SkipDocId = _l3SkipDocId;
@@ -847,8 +861,9 @@ FakeFilterOccZCSkipArrayIterator<true>::doL2SkipSeek(uint32_t docId)
if (__builtin_expect(docId > _l3SkipDocId, false)) {
doL3SkipSeek(docId);
- if (docId <= _l2SkipDocId)
+ if (docId <= _l2SkipDocId) {
return;
+ }
}
do {
lastL2SkipDocId = _l2SkipDocId;
@@ -894,8 +909,9 @@ FakeFilterOccZCSkipArrayIterator<true>::doL1SkipSeek(uint32_t docId)
uint32_t lastL1SkipDocId;
if (__builtin_expect(docId > _l2SkipDocId, false)) {
doL2SkipSeek(docId);
- if (docId <= _l1SkipDocId)
+ if (docId <= _l1SkipDocId) {
return;
+ }
}
do {
lastL1SkipDocId = _l1SkipDocId;
@@ -925,8 +941,9 @@ template <bool doSkip>
void
FakeFilterOccZCSkipArrayIterator<doSkip>::doSeek(uint32_t docId)
{
- if (getUnpacked())
+ if (getUnpacked()) {
clearUnpacked();
+ }
if (doSkip && docId > _l1SkipDocId) {
doL1SkipSeek(docId);
}
@@ -1075,7 +1092,7 @@ createIterator(const TermFieldMatchDataArray &matchData) const
{
return new FakeFilterOccZCSkipArrayIterator<doSkip>(_compressed.first,
0,
- _docIdLimit,
+ _posting_params._doc_id_limit,
matchData);
}
@@ -1085,8 +1102,7 @@ class FakeEGCompr64PosOcc : public FakeZcFilterOcc
{
public:
FakeEGCompr64PosOcc(const FakeWord &fw);
- ~FakeEGCompr64PosOcc();
- void setup(const FakeWord &fw);
+ ~FakeEGCompr64PosOcc() override;
size_t bitSize() const override;
bool hasWordPositions() const override;
SearchIterator *createIterator(const TermFieldMatchDataArray &matchData) const override;
@@ -1095,7 +1111,7 @@ public:
template <bool bigEndian>
FakeEGCompr64PosOcc<bigEndian>::FakeEGCompr64PosOcc(const FakeWord &fw)
- : FakeZcFilterOcc(fw, bigEndian,
+ : FakeZcFilterOcc(fw, bigEndian, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, true, true, false),
bigEndian ? ".zcposoccbe" : ".zcposoccle")
{
setup(fw);
@@ -1107,80 +1123,6 @@ FakeEGCompr64PosOcc<bigEndian>::~FakeEGCompr64PosOcc()
{
}
-
-template <bool bigEndian>
-void
-FakeEGCompr64PosOcc<bigEndian>::setup(const FakeWord &fw)
-{
- uint32_t lastDocId = 0u;
-
- typedef FakeWord FW;
- typedef FW::DocWordFeatureList DWFL;
- typedef FW::DocWordPosFeatureList DWPFL;
-
- DWFL::const_iterator d(fw._postings.begin());
- DWFL::const_iterator de(fw._postings.end());
- DWPFL::const_iterator p(fw._wordPosFeatures.begin());
- DWPFL::const_iterator pe(fw._wordPosFeatures.end());
- DocIdAndPosOccFeatures features;
- EGPosOccEncodeContext<bigEndian> e(&_fieldsParams);
- ComprFileWriteContext ectx(e);
- e.setWriteContext(&ectx);
- ectx.allocComprBuf(64, 1);
- e.afterWrite(ectx, 0, 0);
-
- _hitDocs = fw._postings.size();
- _docIdLimit = fw._docIdLimit;
- if (_hitDocs > 0)
- _lastDocId = fw._postings.back()._docId;
- else
- _lastDocId = 0u;
- e.encodeExpGolomb(_hitDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
-
- uint32_t docIdK = e.calcDocIdK(_hitDocs, _docIdLimit);
-
- while (d != de) {
- e.encodeExpGolomb(d->_docId - lastDocId - 1, docIdK);
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("Encode docId=%d (+%u + 1)\n",
- d->_docId, d->_docId - lastDocId - 1);
-#endif
- fw.setupFeatures(*d, &*p, features);
- p += d->_positions;
- e.writeFeatures(features);
- lastDocId = d->_docId;
- ++d;
- }
- assert(p == pe);
-
- _compressedBits = e.getWriteOffset();
-
- // First pad to 64 bits.
- uint32_t pad = (64 - e.getWriteOffset()) & 63;
- while (pad > 0) {
- uint32_t now = std::min(32u, pad);
- e.writeBits(0, now);
- e.writeComprBufferIfNeeded();
- pad -= now;
- }
-
- // Then write 128 more bits. This allows for 64-bit decoding
- // with a readbits that always leaves a nonzero preRead
- for (unsigned int i = 0; i < 4; i++) {
- e.writeBits(0, 32);
- e.writeComprBufferIfNeeded();
- }
- e.writeComprBufferIfNeeded();
- e.flush();
- e.writeComprBuffer();
-
- std::pair<void *, size_t> ectxData =
- ectx.grabComprBuffer(_compressedMalloc);
- _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first),
- ectxData.second);
-}
-
-
template <bool bigEndian>
size_t
FakeEGCompr64PosOcc<bigEndian>::bitSize() const
@@ -1203,7 +1145,7 @@ FakeEGCompr64PosOcc<bigEndian>::
createIterator(const TermFieldMatchDataArray &matchData) const
{
return new ZcRareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0),
- _compressedBits, _docIdLimit, &_fieldsParams, matchData);
+ _compressedBits, _posting_params._doc_id_limit, false, &_fieldsParams, matchData);
}
@@ -1212,8 +1154,7 @@ class FakeEG2Compr64PosOcc : public FakeZcFilterOcc
{
public:
FakeEG2Compr64PosOcc(const FakeWord &fw);
- ~FakeEG2Compr64PosOcc();
- void setup(const FakeWord &fw);
+ ~FakeEG2Compr64PosOcc() override;
size_t bitSize() const override;
bool hasWordPositions() const override;
SearchIterator *createIterator(const fef::TermFieldMatchDataArray &matchData) const override;
@@ -1222,8 +1163,8 @@ public:
template <bool bigEndian>
FakeEG2Compr64PosOcc<bigEndian>::FakeEG2Compr64PosOcc(const FakeWord &fw)
- : FakeZcFilterOcc(fw, bigEndian,
- bigEndian ? ".zc2posoccbe" : ".zc2posoccle")
+ : FakeZcFilterOcc(fw, bigEndian, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, false, true, false),
+ bigEndian ? ".zc4posoccbe" : ".zc4posoccle")
{
setup(fw);
}
@@ -1236,78 +1177,6 @@ FakeEG2Compr64PosOcc<bigEndian>::~FakeEG2Compr64PosOcc()
template <bool bigEndian>
-void
-FakeEG2Compr64PosOcc<bigEndian>::setup(const FakeWord &fw)
-{
- uint32_t lastDocId = 0u;
-
- typedef FakeWord FW;
- typedef FW::DocWordFeatureList DWFL;
- typedef FW::DocWordPosFeatureList DWPFL;
-
- DWFL::const_iterator d(fw._postings.begin());
- DWFL::const_iterator de(fw._postings.end());
- DWPFL::const_iterator p(fw._wordPosFeatures.begin());
- DWPFL::const_iterator pe(fw._wordPosFeatures.end());
- DocIdAndPosOccFeatures features;
- EG2PosOccEncodeContext<bigEndian> e(&_fieldsParams);
- ComprFileWriteContext ectx(e);
- e.setWriteContext(&ectx);
- ectx.allocComprBuf(64, 1);
- e.afterWrite(ectx, 0, 0);
-
- _hitDocs = fw._postings.size();
- _docIdLimit = fw._docIdLimit;
- if (_hitDocs > 0)
- _lastDocId = fw._postings.back()._docId;
- else
- _lastDocId = 0u;
- e.encodeExpGolomb(_hitDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
-
- while (d != de) {
- e.encodeExpGolomb(d->_docId - lastDocId - 1,
- K_VALUE_ZCPOSTING_DELTA_DOCID);
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("Encode docId=%d (+%u + 1)\n",
- d->_docId, d->_docId - lastDocId - 1);
-#endif
- fw.setupFeatures(*d, &*p, features);
- p += d->_positions;
- e.writeFeatures(features);
- lastDocId = d->_docId;
- ++d;
- }
- assert(p == pe);
-
- _compressedBits = e.getWriteOffset();
-
- // First pad to 64 bits.
- uint32_t pad = (64 - e.getWriteOffset()) & 63;
- while (pad > 0) {
- uint32_t now = std::min(32u, pad);
- e.writeBits(0, now);
- e.writeComprBufferIfNeeded();
- pad -= now;
- }
-
- // Then write 128 more bits. This allows for 64-bit decoding
- // with a readbits that always leaves a nonzero preRead
- for (unsigned int i = 0; i < 4; i++) {
- e.writeBits(0, 32);
- e.writeComprBufferIfNeeded();
- }
- e.writeComprBufferIfNeeded();
- e.flush();
- e.writeComprBuffer();
-
- std::pair<void *, size_t> ectxData =
- ectx.grabComprBuffer(_compressedMalloc);
- _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first),
- ectxData.second);
-}
-
-
-template <bool bigEndian>
size_t
FakeEG2Compr64PosOcc<bigEndian>::bitSize() const
{
@@ -1329,7 +1198,7 @@ FakeEG2Compr64PosOcc<bigEndian>::
createIterator(const TermFieldMatchDataArray &matchData) const
{
return new Zc4RareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0),
- _compressedBits, _docIdLimit, &_fieldsParams, matchData);
+ _compressedBits, _posting_params._doc_id_limit, false, &_fieldsParams, matchData);
}
@@ -1339,7 +1208,7 @@ class FakeZcSkipPosOcc : public FakeZcFilterOcc
search::index::PostingListCounts _counts;
public:
FakeZcSkipPosOcc(const FakeWord &fw);
- ~FakeZcSkipPosOcc();
+ ~FakeZcSkipPosOcc() override;
size_t bitSize() const override;
bool hasWordPositions() const override;
@@ -1349,10 +1218,10 @@ public:
template <bool bigEndian>
FakeZcSkipPosOcc<bigEndian>::FakeZcSkipPosOcc(const FakeWord &fw)
- : FakeZcFilterOcc(fw, bigEndian,
+ : FakeZcFilterOcc(fw, bigEndian, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, true, true, false),
bigEndian ? ".zcskipposoccbe" : ".zcskipposoccle")
{
- setup(fw, true, true);
+ setup(fw);
_counts._bitLength = _compressedBits;
}
@@ -1385,7 +1254,7 @@ SearchIterator *
FakeZcSkipPosOcc<bigEndian>::
createIterator(const TermFieldMatchDataArray &matchData) const
{
- return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _docIdLimit,
+ return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, false,
static_cast<uint32_t>(-1),
_counts,
&_fieldsParams,
@@ -1394,12 +1263,15 @@ createIterator(const TermFieldMatchDataArray &matchData) const
template <bool bigEndian>
-class FakeZc2SkipPosOcc : public FakeZcFilterOcc
+class FakeZc4SkipPosOcc : public FakeZcFilterOcc
{
search::index::PostingListCounts _counts;
+ bool _encode_cheap_features;
+protected:
+ FakeZc4SkipPosOcc(const FakeWord &fw, const Zc4PostingParams &posting_params, const char *name_suffix);
public:
- FakeZc2SkipPosOcc(const FakeWord &fw);
- ~FakeZc2SkipPosOcc();
+ FakeZc4SkipPosOcc(const FakeWord &fw);
+ ~FakeZc4SkipPosOcc() override;
size_t bitSize() const override;
bool hasWordPositions() const override;
SearchIterator *createIterator(const TermFieldMatchDataArray &matchData) const override;
@@ -1407,24 +1279,26 @@ public:
template <bool bigEndian>
-FakeZc2SkipPosOcc<bigEndian>::FakeZc2SkipPosOcc(const FakeWord &fw)
- : FakeZcFilterOcc(fw, bigEndian,
- bigEndian ? ".zc2skipposoccbe" : ".zc2skipposoccle")
+FakeZc4SkipPosOcc<bigEndian>::FakeZc4SkipPosOcc(const FakeWord &fw, const Zc4PostingParams &posting_params, const char *name_suffix)
+ : FakeZcFilterOcc(fw, bigEndian, posting_params, name_suffix)
{
- setup(fw, true, false);
+ setup(fw);
_counts._bitLength = _compressedBits;
}
-
template <bool bigEndian>
-FakeZc2SkipPosOcc<bigEndian>::~FakeZc2SkipPosOcc()
+FakeZc4SkipPosOcc<bigEndian>::FakeZc4SkipPosOcc(const FakeWord &fw)
+ : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, false, true, false),
+ (bigEndian ? ".zc4skipposoccbe" : ".zc4skipposoccle"))
{
}
+template <bool bigEndian>
+FakeZc4SkipPosOcc<bigEndian>::~FakeZc4SkipPosOcc() = default;
template <bool bigEndian>
size_t
-FakeZc2SkipPosOcc<bigEndian>::bitSize() const
+FakeZc4SkipPosOcc<bigEndian>::bitSize() const
{
return _compressedBits -
_l1SkipSize - _l2SkipSize - _l3SkipSize - _l4SkipSize;
@@ -1433,7 +1307,7 @@ FakeZc2SkipPosOcc<bigEndian>::bitSize() const
template <bool bigEndian>
bool
-FakeZc2SkipPosOcc<bigEndian>::hasWordPositions() const
+FakeZc4SkipPosOcc<bigEndian>::hasWordPositions() const
{
return true;
}
@@ -1441,13 +1315,63 @@ FakeZc2SkipPosOcc<bigEndian>::hasWordPositions() const
template <bool bigEndian>
SearchIterator *
-FakeZc2SkipPosOcc<bigEndian>::
+FakeZc4SkipPosOcc<bigEndian>::
createIterator(const TermFieldMatchDataArray &matchData) const
{
- return new Zc4PosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _docIdLimit,
- static_cast<uint32_t>(-1), _counts, &_fieldsParams, matchData);
+ if (_hitDocs >= _posting_params._min_skip_docs) {
+ if (_posting_params._dynamic_k) {
+ return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features,
+ static_cast<uint32_t>(-1),
+ _counts,
+ &_fieldsParams,
+ matchData);
+ } else {
+ return new Zc4PosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features,
+ static_cast<uint32_t>(-1), _counts, &_fieldsParams, matchData);
+ }
+ } else {
+ if (_posting_params._dynamic_k) {
+ return new ZcRareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0),
+ _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData);
+ } else {
+ return new Zc4RareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0),
+ _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData);
+ }
+ }
}
+template <bool bigEndian>
+class FakeZc4SkipPosOccCf : public FakeZc4SkipPosOcc<bigEndian>
+{
+public:
+ FakeZc4SkipPosOccCf(const FakeWord &fw)
+ : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, false, true, true),
+ (bigEndian ? ".zc4skipposoccbe.cf" : ".zc4skipposoccle.cf"))
+ {
+ }
+};
+
+template <bool bigEndian>
+class FakeZc4NoSkipPosOccCf : public FakeZc4SkipPosOcc<bigEndian>
+{
+public:
+ FakeZc4NoSkipPosOccCf(const FakeWord &fw)
+ : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, false, true, true),
+ (bigEndian ? ".zc4noskipposoccbe.cf" : ".zc4noskipposoccle.cf"))
+ {
+ }
+};
+
+template <bool bigEndian>
+class FakeZc5NoSkipPosOccCf : public FakeZc4SkipPosOcc<bigEndian>
+{
+public:
+ FakeZc5NoSkipPosOccCf(const FakeWord &fw)
+ : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, true, true, true),
+ (bigEndian ? ".zc5noskipposoccbe.cf" : ".zc5noskipposoccle.cf"))
+ {
+ }
+};
static FPFactoryInit
initPosbe(std::make_pair("EGCompr64PosOccBE",
@@ -1479,13 +1403,42 @@ initSkipPosle(std::make_pair("ZcSkipPosOccLE",
static FPFactoryInit
-initSkipPos0be(std::make_pair("Zc2SkipPosOccBE",
- makeFPFactory<FPFactoryT<FakeZc2SkipPosOcc<true> > >));
+initSkipPos0be(std::make_pair("Zc4SkipPosOccBE",
+ makeFPFactory<FPFactoryT<FakeZc4SkipPosOcc<true> > >));
+
+
+static FPFactoryInit
+initSkipPos0le(std::make_pair("Zc4SkipPosOccLE",
+ makeFPFactory<FPFactoryT<FakeZc4SkipPosOcc<false> > >));
+
+
+static FPFactoryInit
+initSkipPos0becf(std::make_pair("Zc4SkipPosOccBE.cf",
+ makeFPFactory<FPFactoryT<FakeZc4SkipPosOccCf<true> > >));
+
+
+static FPFactoryInit
+initSkipPos0lecf(std::make_pair("Zc4SkipPosOccLE.cf",
+ makeFPFactory<FPFactoryT<FakeZc4SkipPosOccCf<false> > >));
+
+static FPFactoryInit
+initNoSkipPos0becf(std::make_pair("Zc4NoSkipPosOccBE.cf",
+ makeFPFactory<FPFactoryT<FakeZc4NoSkipPosOccCf<true> > >));
+
+
+static FPFactoryInit
+initNoSkipPos0lecf(std::make_pair("Zc4NoSkipPosOccLE.cf",
+ makeFPFactory<FPFactoryT<FakeZc4NoSkipPosOccCf<false> > >));
+
+
+static FPFactoryInit
+initNoSkipPosbecf(std::make_pair("Zc5NoSkipPosOccBE.cf",
+ makeFPFactory<FPFactoryT<FakeZc5NoSkipPosOccCf<true> > >));
static FPFactoryInit
-initSkipPos0le(std::make_pair("Zc2SkipPosOccLE",
- makeFPFactory<FPFactoryT<FakeZc2SkipPosOcc<false> > >));
+initNoSkipPoslecf(std::make_pair("Zc5NoSkipPosOccLE.cf",
+ makeFPFactory<FPFactoryT<FakeZc5NoSkipPosOccCf<false> > >));
} // namespace fakedata
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
index 36738a0f5a8..3d1673edec7 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
@@ -5,6 +5,7 @@
#include "fakeposting.h"
#include <vespa/searchlib/bitcompression/compression.h>
#include <vespa/searchlib/bitcompression/posocccompression.h>
+#include <vespa/searchlib/diskindex/zc4_posting_params.h>
namespace search {
@@ -21,7 +22,6 @@ protected:
size_t _l2SkipSize;
size_t _l3SkipSize;
size_t _l4SkipSize;
- unsigned int _docIdLimit;
unsigned int _hitDocs;
uint32_t _lastDocId;
@@ -31,23 +31,27 @@ protected:
uint64_t _featuresSize;
const search::bitcompression::PosOccFieldsParams &_fieldsParams;
bool _bigEndian;
+ diskindex::Zc4PostingParams _posting_params;
protected:
- void setup(const FakeWord &fw, bool doFeatures, bool dynamicK);
+ void setup(const FakeWord &fw);
template <bool bigEndian>
- void setupT(const FakeWord &fw, bool doFeatures, bool dynamicK);
+ void setupT(const FakeWord &fw);
template <bool bigEndian>
- void read_header(bool do_features, bool dynamic_k, uint32_t min_skip_docs, uint32_t min_cunk_docs);
+ void read_header();
- void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const;
+ void validate_read(const FakeWord &fw) const;
template <bool bigEndian>
- void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const;
+ void validate_read(const FakeWord &fw) const;
public:
FakeZcFilterOcc(const FakeWord &fw);
- FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, const char *nameSuffix);
- ~FakeZcFilterOcc();
+ FakeZcFilterOcc(const FakeWord &fw,
+ bool bigEndian,
+ const diskindex::Zc4PostingParams &posting_params,
+ const char *nameSuffix);
+ ~FakeZcFilterOcc() override;
static void forceLink();