Factor out reading of zc4 posting header in unit tests.

author: Tor Egge <Tor.Egge@broadpark.no> 2019-04-12 13:59:19 +0200
committer: Tor Egge <Tor.Egge@broadpark.no> 2019-04-12 14:07:35 +0200
commit: 278ed8c522a77519b8fd942421b7f0958d306725 (patch)
tree: 61b564d9bd11fb2f24521e753e89c1715f21f3f8 /searchlib
parent: 28a9be2321136a976bdc3bc5b45cef084f81d815 (diff)
5 files changed, 208 insertions, 127 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
index b21b799e693..104994ad038 100644
--- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
@@ -18,6 +18,7 @@ vespa_add_library(searchlib_diskindex OBJECT
     pagedict4file.cpp
     pagedict4randread.cpp
     wordnummapper.cpp
+    zc4_posting_header.cpp
     zc4_posting_writer.cpp
     zc4_posting_writer_base.cpp
     zcbuf.cpp
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp
new file mode 100644
index 00000000000..3adb32f8681
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp
@@ -0,0 +1,146 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "zc4_posting_header.h"
+#include "zc4_posting_params.h"
+#include <vespa/searchlib/bitcompression/compression.h>
+
+namespace search::diskindex
+{
+
+/*
+ * Create an empty header: no continuation pending, the fixed
+ * K_VALUE_ZCPOSTING_LASTDOCID parameter selected and every count and
+ * size set to zero.
+ */
+Zc4PostingHeader::Zc4PostingHeader()
+    : _has_more(false),
+      _doc_id_k(K_VALUE_ZCPOSTING_LASTDOCID),
+      _num_docs(0u),
+      _doc_ids_size(0u),
+      _l1_skip_size(0u),
+      _l2_skip_size(0u),
+      _l3_skip_size(0u),
+      _l4_skip_size(0u),
+      _features_size(0u),
+      _last_doc_id(0)
+{
+}
+
+/*
+ * Decode one posting list header from decode_context into this object
+ * and write the advanced decode state back to decode_context.
+ *
+ * Fields are decoded in this order:
+ *  - number of documents, stored minus one,
+ *  - a single "has more" bit, present only when the number of documents
+ *    is at least params._min_chunk_docs,
+ *  - unless the number of documents is below params._min_skip_docs and
+ *    the previous call did not report "has more": doc ids size (stored
+ *    minus one), L1 skip size, L2/L3/L4 skip sizes (each present only
+ *    when the level below it is nonzero), features size (only when
+ *    params._encode_features is set), the last doc id (stored as
+ *    params._doc_id_limit - 1 - last doc id) and (oPreRead & 7) trailing
+ *    bits that are skipped, presumably padding up to a byte boundary.
+ *
+ * _has_more is assigned last, so during decoding it still holds the
+ * result of the previous call on this object.  Fields that are absent
+ * from the encoded header are set to zero.
+ */
+template <bool bigEndian>
+void
+Zc4PostingHeader::read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params)
+{
+    using EC = bitcompression::FeatureEncodeContext<bigEndian>;
+    UC64_DECODECONTEXT_CONSTRUCTOR(o, decode_context._);
+    uint32_t length;
+    uint64_t val64;
+
+    UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
+    _num_docs = static_cast<uint32_t>(val64) + 1;
+    bool has_more = false;
+    if (__builtin_expect(_num_docs >= params._min_chunk_docs, false)) {
+        // Take the "has more" flag from the next bit and consume that bit.
+        if (bigEndian) {
+            has_more = static_cast<int64_t>(oVal) < 0;
+            oVal <<= 1;
+            length = 1;
+        } else {
+            has_more = (oVal & 1) != 0;
+            oVal >>= 1;
+            length = 1;
+        }
+        UC64_READBITS_NS(o, EC);
+    }
+    if (params._dynamic_k) {
+        // If "has more" was set by the previous call or by this header, k is computed for 1 document.
+        _doc_id_k = EC::calcDocIdK((_has_more || has_more) ? 1 : _num_docs, params._doc_id_limit);
+    } else {
+        _doc_id_k = K_VALUE_ZCPOSTING_LASTDOCID;
+    }
+    if (_num_docs < params._min_skip_docs && !_has_more) {
+        // Nothing more is decoded for this header.
+        _doc_ids_size = 0;
+        _l1_skip_size = 0;
+        _l2_skip_size = 0;
+        _l3_skip_size = 0;
+        _l4_skip_size = 0;
+        _features_size = 0;
+        _last_doc_id = 0;
+    } else {
+        UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
+        _doc_ids_size = val64 + 1;
+        UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC);
+        _l1_skip_size = val64;
+        // Clear the higher skip levels before the conditional reads below: this
+        // object is reused between calls and a stale nonzero size would trigger
+        // decoding of a field that is not present in this header.
+        _l2_skip_size = 0;
+        _l3_skip_size = 0;
+        _l4_skip_size = 0;
+        if (_l1_skip_size != 0) {
+            UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
+            _l2_skip_size = val64;
+        }
+        if (_l2_skip_size != 0) {
+            UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
+            _l3_skip_size = val64;
+        }
+        if (_l3_skip_size != 0) {
+            UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
+            _l4_skip_size = val64;
+        }
+        if (params._encode_features) {
+            UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC);
+            _features_size = val64;
+        } else {
+            _features_size = 0;
+        }
+        UC64_DECODEEXPGOLOMB_NS(o, _doc_id_k, EC);
+        _last_doc_id = params._doc_id_limit - 1 - val64;
+        uint64_t bytePad = oPreRead & 7;
+        if (bytePad > 0) {
+            // Drop the skipped bits from oVal before refilling it, in the same way
+            // as for the "has more" bit above.
+            length = bytePad;
+            if (bigEndian) {
+                oVal <<= length;
+            } else {
+                oVal >>= length;
+            }
+            UC64_READBITS_NS(o, EC);
+        }
+    }
+    UC64_DECODECONTEXT_STORE(o, decode_context._);
+    _has_more = has_more;
+}
+
+template
+void
+Zc4PostingHeader::read<false>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params);
+
+template
+void
+Zc4PostingHeader::read<true>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params);
+
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h
new file mode 100644
index 00000000000..7382f59d176
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h
@@ -0,0 +1,35 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <cstdint>
+
+namespace search::bitcompression { class DecodeContext64Base; }
+
+namespace search::diskindex {
+
+struct Zc4PostingParams;
+
+/*
+ * Struct containing the decoded header for a word, as filled in by read().
+ */
+struct Zc4PostingHeader {
+    bool     _has_more;      // "has more" bit from the latest read(); false when that bit was not present
+    uint32_t _doc_id_k;      // parameter read() uses when decoding the last doc id
+    uint32_t _num_docs;      // decoded number of documents (stored value + 1)
+    uint32_t _doc_ids_size;  // decoded doc ids size (stored value + 1); 0 when the header carries no sizes
+    uint32_t _l1_skip_size;  // decoded L1 skip size; 0 when the header carries no sizes
+    uint32_t _l2_skip_size;  // decoded L2 skip size; only encoded when the L1 skip size is nonzero
+    uint32_t _l3_skip_size;  // decoded L3 skip size; only encoded when the L2 skip size is nonzero
+    uint32_t _l4_skip_size;  // decoded L4 skip size; only encoded when the L3 skip size is nonzero
+    uint64_t _features_size; // decoded features size; 0 when features are not encoded
+    uint32_t _last_doc_id;   // decoded last doc id; 0 when the header carries no sizes
+
+    Zc4PostingHeader();      // defined in zc4_posting_header.cpp
+
+    template <bool bigEndian> // bigEndian selects which of the two bit orders is decoded
+    void
+    read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params); // decode one header and advance decode_context
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h
new file mode 100644
index 00000000000..ea4cc6f58a6
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h
@@ -0,0 +1,29 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <cstdint>
+
+namespace search::diskindex {
+
+/*
+ * Struct containing parameters for posting list, as consumed by Zc4PostingHeader::read().
+ */
+struct Zc4PostingParams {
+    uint32_t _min_skip_docs;   // headers with fewer docs carry no size fields, unless the previous header had "has more" set
+    uint32_t _min_chunk_docs;  // headers with at least this many docs carry a "has more" bit
+    uint32_t _doc_id_limit;    // the last doc id is stored as _doc_id_limit - 1 - last doc id
+    bool     _dynamic_k;       // true: last doc id parameter comes from calcDocIdK(); false: K_VALUE_ZCPOSTING_LASTDOCID
+    bool     _encode_features; // true: the header contains a features size field
+
+    Zc4PostingParams(uint32_t min_skip_docs, uint32_t min_chunk_docs, uint32_t doc_id_limit, bool dynamic_k, bool encode_features) // stores the arguments unchanged
+        : _min_skip_docs(min_skip_docs),
+          _min_chunk_docs(min_chunk_docs),
+          _doc_id_limit(doc_id_limit),
+          _dynamic_k(dynamic_k),
+          _encode_features(encode_features)
+    {
+    }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
index 33819d4f7cb..3d4567ed2ab 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
@@ -4,6 +4,8 @@
 #include "fpfactory.h"
 #include <vespa/searchlib/diskindex/zcposocciterators.h>
 #include <vespa/searchlib/diskindex/zc4_posting_writer.h>
+#include <vespa/searchlib/diskindex/zc4_posting_header.h>
+#include <vespa/searchlib/diskindex/zc4_posting_params.h>
 
 using search::fef::TermFieldMatchData;
 using search::fef::TermFieldMatchDataArray;
@@ -13,6 +15,8 @@ using search::index::PostingListCounts;
 using search::index::PostingListParams;
 using search::index::DocIdAndFeatures;
 using search::index::DocIdAndPosOccFeatures;
+using search::bitcompression::DecodeContext64;
+using search::bitcompression::DecodeContext64Base;
 using search::bitcompression::PosOccFieldParams;
 using search::bitcompression::EGPosOccEncodeContext;
 using search::bitcompression::EG2PosOccEncodeContext;
@@ -200,38 +204,18 @@ void
 FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_docs, uint32_t min_chunk_docs)
 {
     // read back word header to get skip sizes
-    using EC = FeatureEncodeContext<bigEndian>;
-    UC64_DECODECONTEXT(o);
-    uint32_t length;
-    uint64_t val64;
-    UC64_SETUPBITS_NS(o, _compressed.first, 0, EC);
-    UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
-    assert(static_cast<uint32_t>(val64) + 1 == _hitDocs);
-    assert(_hitDocs >= min_skip_docs);
-    assert(_hitDocs < min_chunk_docs);
-    uint32_t docIdK = dynamicK ? EC::calcDocIdK(_hitDocs, _docIdLimit) : K_VALUE_ZCPOSTING_LASTDOCID;
-    UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
-    _docIdsSize = val64 + 1;
-    UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC);
-    _l1SkipSize = val64;
-    if (_l1SkipSize != 0) {
-        UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
-        _l2SkipSize = val64;
-    }
-    if (_l2SkipSize != 0) {
-        UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
-        _l3SkipSize = val64;
-    }
-    if (_l3SkipSize != 0) {
-        UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
-        _l4SkipSize = val64;
-    }
-    if (doFeatures) {
-        UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC);
-        _featuresSize = val64;
-    }
-    UC64_DECODEEXPGOLOMB_NS(o, docIdK, EC);
-    assert(_lastDocId == _docIdLimit - 1 - val64);
+    DecodeContext64<bigEndian> decode_context;
+    decode_context.setPosition({ _compressed.first, 0 }); // start at _compressed.first; the 0 is presumably the bit offset - TODO confirm
+    Zc4PostingParams params(min_skip_docs, min_chunk_docs, _docIdLimit, dynamicK, doFeatures); // order: min_skip_docs, min_chunk_docs, doc_id_limit, dynamic_k, encode_features
+    Zc4PostingHeader header;
+    header.read<bigEndian>(decode_context, params); // shared decoder replaces the hand-written header decoding removed above
+    _docIdsSize = header._doc_ids_size; // copy the decoded sizes into the members
+    _l1SkipSize = header._l1_skip_size;
+    _l2SkipSize = header._l2_skip_size;
+    _l3SkipSize = header._l3_skip_size;
+    _l4SkipSize = header._l4_skip_size;
+    _featuresSize = header._features_size;
+    assert(_lastDocId == header._last_doc_id); // check only: _lastDocId is compared, not assigned
 }
 
 
@@ -383,54 +367,17 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end)
 {
     queryeval::RankedSearchIteratorBase::initRange(begin, end);
     DecodeContext &d = _decodeContext;
-    typedef EncodeContext EC;
-    UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
-    uint32_t length;
-    uint64_t val64;
-
-    UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
-    uint32_t numDocs = static_cast<uint32_t>(val64) + 1;
-
-    uint32_t docIdK = EC::calcDocIdK(numDocs, _docIdLimit);
-
-    UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
-    uint32_t docIdsSize = val64 + 1;
-    UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC);
-    uint32_t l1SkipSize = val64;
-    uint32_t l2SkipSize = 0;
-    if (l1SkipSize != 0) {
-        UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
-        l2SkipSize = val64;
-    }
-    uint32_t l3SkipSize = 0;
-    if (l2SkipSize != 0) {
-        UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
-        l3SkipSize = val64;
-    }
-    uint32_t l4SkipSize = 0;
-    if (l3SkipSize != 0) {
-        UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
-        l4SkipSize = val64;
-    }
-    // Feature size would be here
-    UC64BE_DECODEEXPGOLOMB_NS(o, docIdK, EC);
-    _lastDocId = _docIdLimit - 1 - val64;
-    UC64_DECODECONTEXT_STORE(o, d._);
-    uint64_t bytePad = oPreRead & 7;
-    if (bytePad > 0) {
-        length = bytePad;
-        oVal <<= length;
-        UC64BE_READBITS_NS(o, EC);
-    }
-    UC64_DECODECONTEXT_STORE(o, d._);
+    Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); // min_skip_docs=1, min_chunk_docs=1000000000, dynamic_k=true, encode_features=false
+    Zc4PostingHeader header;
+    header.read<true>(d, params); // NOTE(review): the removed code also assigned _lastDocId here - confirm it is no longer needed
     assert((d.getBitOffset() & 7) == 0);
     const uint8_t *bcompr = d.getByteCompr();
     _valI = bcompr;
-    bcompr += docIdsSize;
-    bcompr += l1SkipSize;
-    bcompr += l2SkipSize;
-    bcompr += l3SkipSize;
-    bcompr += l4SkipSize;
+    bcompr += header._doc_ids_size; // step past the doc ids and every skip level
+    bcompr += header._l1_skip_size;
+    bcompr += header._l2_skip_size;
+    bcompr += header._l3_skip_size;
+    bcompr += header._l4_skip_size; // was terminated by ',' which fused this with the next statement
     d.setByteCompr(bcompr);
     uint32_t oDocId;
     ZCDECODE(_valI, oDocId = 1 +);
@@ -439,7 +386,7 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end)
            oDocId);
 #endif
     setDocId(oDocId);
-    _residue = numDocs;
+    _residue = header._num_docs;
 }
 
 
@@ -641,79 +588,43 @@ initRange(uint32_t begin, uint32_t end)
 {
     queryeval::RankedSearchIteratorBase::initRange(begin, end);
     DecodeContext &d = _decodeContext;
-    typedef EncodeContext EC;
-    UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
-    uint32_t length;
-    uint64_t val64;
-
-    UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
-    uint32_t numDocs = static_cast<uint32_t>(val64) + 1;
-
-    uint32_t docIdK = EC::calcDocIdK(numDocs, _docIdLimit);
-
-    UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
-    uint32_t docIdsSize = val64 + 1;
-    UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC);
-    uint32_t l1SkipSize = val64;
-    uint32_t l2SkipSize = 0;
-    if (l1SkipSize != 0) {
-        UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
-        l2SkipSize = val64;
-    }
-    uint32_t l3SkipSize = 0;
-    if (l2SkipSize != 0) {
-        UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
-        l3SkipSize = val64;
-    }
-    uint32_t l4SkipSize = 0;
-    if (l3SkipSize != 0) {
-        UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
-        l4SkipSize = val64;
-    }
-    // Feature size would be here
-    UC64BE_DECODEEXPGOLOMB_NS(o, docIdK, EC);
-    _lastDocId = _docIdLimit - 1 - val64;
-    UC64_DECODECONTEXT_STORE(o, d._);
-    uint64_t bytePad = oPreRead & 7;
-    if (bytePad > 0) {
-        length = bytePad;
-        oVal <<= length;
-        UC64BE_READBITS_NS(o, EC);
-    }
-    UC64_DECODECONTEXT_STORE(o, d._);
+    Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); // min_skip_docs=1, min_chunk_docs=1000000000, dynamic_k=true, encode_features=false
+    Zc4PostingHeader header;
+    header.read<true>(d, params); // <true> selects the big endian decoder, as the removed UC64BE_* macros did
+    _lastDocId = header._last_doc_id; // replaces the removed inline computation _docIdLimit - 1 - val64
     assert((d.getBitOffset() & 7) == 0);
     const uint8_t *bcompr = d.getByteCompr();
     _valIBase = _valI = bcompr;
     _l1SkipDocIdPos = _l2SkipDocIdPos = bcompr;
     _l3SkipDocIdPos = _l4SkipDocIdPos = bcompr;
-    bcompr += docIdsSize;
-    if (l1SkipSize != 0) {
+    bcompr += header._doc_ids_size;
+    if (header._l1_skip_size != 0) {
         _l1SkipValIBase = _l1SkipValI = bcompr;
         _l2SkipL1SkipPos = _l3SkipL1SkipPos = _l4SkipL1SkipPos = bcompr;
-        bcompr += l1SkipSize;
+        bcompr += header._l1_skip_size;
     } else {
         _l1SkipValIBase = _l1SkipValI = NULL;
         _l2SkipL1SkipPos = _l3SkipL1SkipPos = _l4SkipL1SkipPos = NULL;
     }
-    if (l2SkipSize != 0) {
+    if (header._l2_skip_size != 0) {
         _l2SkipValIBase = _l2SkipValI = bcompr;
         _l3SkipL2SkipPos = _l4SkipL2SkipPos = bcompr;
-        bcompr += l2SkipSize;
+        bcompr += header._l2_skip_size;
     } else {
         _l2SkipValIBase = _l2SkipValI = NULL;
         _l3SkipL2SkipPos = _l4SkipL2SkipPos = NULL;
     }
-    if (l3SkipSize != 0) {
+    if (header._l3_skip_size != 0) {
         _l3SkipValIBase = _l3SkipValI = bcompr;
         _l4SkipL3SkipPos = bcompr;
-        bcompr += l3SkipSize;
+        bcompr += header._l3_skip_size;
     } else {
         _l3SkipValIBase = _l3SkipValI = NULL;
         _l4SkipL3SkipPos = NULL;
     }
-    if (l4SkipSize != 0) {
+    if (header._l4_skip_size != 0) {
         _l4SkipValI = bcompr;
-        bcompr += l4SkipSize;
+        bcompr += header._l4_skip_size;
     } else {
         _l4SkipValI = NULL;
     }
author	Tor Egge <Tor.Egge@broadpark.no>	2019-04-12 13:59:19 +0200
committer	Tor Egge <Tor.Egge@broadpark.no>	2019-04-12 14:07:35 +0200
commit	278ed8c522a77519b8fd942421b7f0958d306725 (patch)
tree	61b564d9bd11fb2f24521e753e89c1715f21f3f8 /searchlib
parent	28a9be2321136a976bdc3bc5b45cef084f81d815 (diff)