summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp12
-rw-r--r--searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp35
-rw-r--r--searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp8
-rw-r--r--searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp13
-rw-r--r--searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp40
-rw-r--r--searchlib/src/tests/memoryindex/field_index/CMakeLists.txt10
-rw-r--r--searchlib/src/tests/memoryindex/field_index/field_index_iterator_test.cpp73
-rw-r--r--searchlib/src/tests/memoryindex/field_index/field_index_test.cpp1084
-rw-r--r--searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp49
-rw-r--r--searchlib/src/vespa/searchlib/bitcompression/compression.cpp7
-rw-r--r--searchlib/src/vespa/searchlib/bitcompression/compression.h140
-rw-r--r--searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp99
-rw-r--r--searchlib/src/vespa/searchlib/bitcompression/posocccompression.h4
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/diskindex.cpp26
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/diskindex.h61
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/extposocc.cpp8
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp18
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/fieldreader.h2
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp6
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/fieldwriter.h27
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp509
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/indexbuilder.h27
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp77
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h1
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp438
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h96
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp354
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h2
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp26
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposocc.h2
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposting.cpp561
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposting.h92
-rw-r--r--searchlib/src/vespa/searchlib/features/attributefeature.cpp11
-rw-r--r--searchlib/src/vespa/searchlib/features/constant_feature.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/features/queryfeature.cpp11
-rw-r--r--searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/index/docidandfeatures.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/index/docidandfeatures.h131
-rw-r--r--searchlib/src/vespa/searchlib/index/indexbuilder.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/index/indexbuilder.h48
-rw-r--r--searchlib/src/vespa/searchlib/index/postinglistfile.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/index/postinglistfile.h31
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/field_index.cpp34
-rw-r--r--searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp17
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp10
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp72
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h4
-rw-r--r--searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h4
-rw-r--r--searchlib/src/vespa/searchlib/test/memoryindex/wrap_inserter.h64
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.cpp19
-rw-r--r--searchlib/src/vespa/searchlib/util/comprfile.h1
55 files changed, 1895 insertions, 2504 deletions
diff --git a/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp b/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp
index 4e4d90e6871..90953f78c40 100644
--- a/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp
+++ b/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp
@@ -94,15 +94,13 @@ unpackFeatures(std::vector<PosEntry> &entries,
uint64_t wordNum,
const DocIdAndFeatures &features)
{
- std::vector<search::index::WordDocElementFeatures>::const_iterator
- element = features._elements.begin();
- std::vector<search::index::WordDocElementWordPosFeatures>::
- const_iterator position = features._wordPositions.begin();
- uint32_t numElements = features._elements.size();
+ auto element = features.elements().begin();
+ auto position = features.word_positions().begin();
+ uint32_t numElements = features.elements().size();
while (numElements--) {
uint32_t numOccs = element->getNumOccs();
while (numOccs--) {
- entries.push_back(PosEntry(features._docId,
+ entries.push_back(PosEntry(features.doc_id(),
fieldId,
element->getElementId(),
position->getWordPos(),
@@ -447,7 +445,7 @@ ShowPostingListSubApp::readPostings(const SchemaUtil::IndexIterator &index,
if (r.isValid())
r.read();
while (r.isValid()) {
- uint32_t docId = r._docIdAndFeatures._docId;
+ uint32_t docId = r._docIdAndFeatures.doc_id();
if (docId >= _minDocId && docId < _docIdLimit) {
unpackFeatures(entries, index.getIndex(),
r._wordNum, r._docIdAndFeatures);
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index bd814b0ad32..cbbaa518b16 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -40,9 +40,6 @@ static bool operator==(const Tensor &lhs, const Tensor &rhs)
vespalib::string sparseSpec("tensor(x{},y{})");
vespalib::string denseSpec("tensor(x[2],y[3])");
-vespalib::string denseAbstractSpec_xy("tensor(x[],y[])");
-vespalib::string denseAbstractSpec_x("tensor(x[2],y[])");
-vespalib::string denseAbstractSpec_y("tensor(x[],y[3])");
struct Fixture
{
@@ -307,7 +304,7 @@ Fixture::testSaveLoad()
void
Fixture::testCompaction()
{
- if (_useDenseTensorAttribute && _denseTensors && !_cfg.tensorType().is_abstract()) {
+ if (_useDenseTensorAttribute && _denseTensors) {
LOG(info, "Skipping compaction test for tensor '%s' which is using free-lists", _cfg.tensorType().to_spec().c_str());
return;
}
@@ -411,34 +408,4 @@ TEST("Test dense tensors with dense tensor attribute")
testAll([]() { return std::make_shared<Fixture>(denseSpec, true); });
}
-TEST("Test dense tensors with generic tensor attribute with unbound x and y dims")
-{
- testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_xy); });
-}
-
-TEST("Test dense tensors with dense tensor attribute with unbound x and y dims")
-{
- testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_xy, true); });
-}
-
-TEST("Test dense tensors with generic tensor attribute with unbound x dim")
-{
- testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_x); });
-}
-
-TEST("Test dense tensors with dense tensor attribute with unbound x dim")
-{
- testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_x, true); });
-}
-
-TEST("Test dense tensors with generic tensor attribute with unbound y dim")
-{
- testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_y); });
-}
-
-TEST("Test dense tensors with dense tensor attribute with unbound y dim")
-{
- testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_y, true); });
-}
-
TEST_MAIN() { TEST_RUN_ALL(); vespalib::unlink("test.dat"); }
diff --git a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp
index e33158e559f..fab2ed734cd 100644
--- a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp
+++ b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp
@@ -62,10 +62,10 @@ FieldWriterWrapper &
FieldWriterWrapper::add(uint32_t docId)
{
DocIdAndFeatures daf;
- daf._docId = docId;
- daf._elements.push_back(WordDocElementFeatures(0));
- daf._elements.back().setNumOccs(1);
- daf._wordPositions.push_back(WordDocElementWordPosFeatures(0));
+ daf.set_doc_id(docId);
+ daf.elements().emplace_back(0);
+ daf.elements().back().setNumOccs(1);
+ daf.word_positions().emplace_back(0);
//LOG(info, "add(%" PRIu64 ", %u)", wordNum, docId);
_writer.add(daf);
return *this;
diff --git a/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp b/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp
index 2419f450950..c7c3447a4cc 100644
--- a/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp
+++ b/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp
@@ -54,7 +54,7 @@ ExpressionReplacer::SP make_replacer() {
auto replacer = std::make_shared<ListExpressionReplacer>();
replacer->add(std::make_unique<NullExpressionReplacer>());
replacer->add(std::make_unique<DummyReplacer>("foo", FeatureType::number()));
- replacer->add(std::make_unique<DummyReplacer>("bar", FeatureType::object(ValueType::from_spec("tensor(x[])"))));
+ replacer->add(std::make_unique<DummyReplacer>("bar", FeatureType::object(ValueType::from_spec("tensor(x[5])"))));
return replacer;
}
@@ -124,15 +124,6 @@ TEST("require that ranking expression can resolve to concrete complex type") {
FeatureType::object(ValueType::from_spec("tensor(x{},y{},z{})"))));
}
-TEST("require that ranking expression can resolve to abstract complex type") {
- TEST_DO(verify_output_type({{"a", "tensor"}}, "a*b", FeatureType::object(ValueType::from_spec("tensor"))));
-}
-
-TEST("require that ranking expression can resolve to 'any' type") {
- TEST_DO(verify_output_type({{"a", "tensor(x{},y{})"}, {"b", "tensor"}}, "a*b",
- FeatureType::object(ValueType::from_spec("any"))));
-}
-
TEST("require that setup fails for incompatible types") {
TEST_DO(verify_setup_fail({{"a", "tensor(x{},y{})"}, {"b", "tensor(y[10],z{})"}}, "a*b"));
}
@@ -150,7 +141,7 @@ TEST("require that replaced expressions override result type") {
TEST_DO(verify_output_type({{"b", "tensor(z{})"}}, "foo*b*c",
FeatureType::number()));
TEST_DO(verify_output_type({{"b", "tensor(z{})"}}, "a*b*bar",
- FeatureType::object(ValueType::from_spec("tensor(x[])"))));
+ FeatureType::object(ValueType::from_spec("tensor(x[5])"))));
TEST_DO(verify_output_type({{"b", "tensor(z{})"}}, "foo*b*bar",
FeatureType::number()));
}
diff --git a/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp b/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp
index 49e9d613861..aca83d67a8a 100644
--- a/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp
+++ b/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp
@@ -41,27 +41,27 @@ Test::assertFeatures(const DocIdAndFeatures &exp,
const DocIdAndFeatures &act)
{
// docid is not encoded as part of features
- if (!EXPECT_EQUAL(exp._elements.size(),
- act._elements.size()))
+ if (!EXPECT_EQUAL(exp.elements().size(),
+ act.elements().size()))
return false;
- for (size_t i = 0; i < exp._elements.size(); ++i) {
- if (!EXPECT_EQUAL(exp._elements[i]._elementId,
- act._elements[i]._elementId))
+ for (size_t i = 0; i < exp.elements().size(); ++i) {
+ if (!EXPECT_EQUAL(exp.elements()[i].getElementId(),
+ act.elements()[i].getElementId()))
return false;
- if (!EXPECT_EQUAL(exp._elements[i]._numOccs,
- act._elements[i]._numOccs))
+ if (!EXPECT_EQUAL(exp.elements()[i].getNumOccs(),
+ act.elements()[i].getNumOccs()))
return false;
- if (!EXPECT_EQUAL(exp._elements[i]._weight, act._elements[i]._weight))
+ if (!EXPECT_EQUAL(exp.elements()[i].getWeight(), act.elements()[i].getWeight()))
return false;
- if (!EXPECT_EQUAL(exp._elements[i]._elementLen,
- act._elements[i]._elementLen))
+ if (!EXPECT_EQUAL(exp.elements()[i].getElementLen(),
+ act.elements()[i].getElementLen()))
return false;
}
- if (!EXPECT_EQUAL(exp._wordPositions.size(), act._wordPositions.size()))
+ if (!EXPECT_EQUAL(exp.word_positions().size(), act.word_positions().size()))
return false;
- for (size_t i = 0; i < exp._wordPositions.size(); ++i) {
- if (!EXPECT_EQUAL(exp._wordPositions[i]._wordPos,
- act._wordPositions[i]._wordPos)) return false;
+ for (size_t i = 0; i < exp.word_positions().size(); ++i) {
+ if (!EXPECT_EQUAL(exp.word_positions()[i].getWordPos(),
+ act.word_positions()[i].getWordPos())) return false;
}
return true;
}
@@ -73,13 +73,13 @@ getFeatures(uint32_t numOccs,
uint32_t elemLen)
{
DocIdAndFeatures f;
- f._docId = 0;
- f._elements.push_back(WordDocElementFeatures(0));
- f._elements.back().setNumOccs(numOccs);
- f._elements.back().setWeight(weight);
- f._elements.back().setElementLen(elemLen);
+ f.set_doc_id(0);
+ f.elements().push_back(WordDocElementFeatures(0));
+ f.elements().back().setNumOccs(numOccs);
+ f.elements().back().setWeight(weight);
+ f.elements().back().setElementLen(elemLen);
for (uint32_t i = 0; i < numOccs; ++i) {
- f._wordPositions.push_back(WordDocElementWordPosFeatures(i));
+ f.word_positions().push_back(WordDocElementWordPosFeatures(i));
}
return f;
}
diff --git a/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt b/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt
index 767097b99db..a09d6baf1a5 100644
--- a/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt
+++ b/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt
@@ -5,5 +5,15 @@ vespa_add_executable(searchlib_field_index_test_app TEST
DEPENDS
searchlib
searchlib_test
+ gtest
)
vespa_add_test(NAME searchlib_field_index_test_app COMMAND searchlib_field_index_test_app)
+
+vespa_add_executable(searchlib_field_index_iterator_test_app TEST
+ SOURCES
+ field_index_iterator_test.cpp
+ DEPENDS
+ searchlib
+ searchlib_test
+)
+vespa_add_test(NAME searchlib_field_index_iterator_test_app COMMAND searchlib_field_index_iterator_test_app)
diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_iterator_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_iterator_test.cpp
new file mode 100644
index 00000000000..df7f80e8601
--- /dev/null
+++ b/searchlib/src/tests/memoryindex/field_index/field_index_iterator_test.cpp
@@ -0,0 +1,73 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchcommon/common/schema.h>
+#include <vespa/searchlib/memoryindex/field_index.h>
+#include <vespa/searchlib/memoryindex/posting_iterator.h>
+#include <vespa/searchlib/test/memoryindex/wrap_inserter.h>
+#include <vespa/searchlib/test/searchiteratorverifier.h>
+#include <vespa/vespalib/testkit/testapp.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP("field_index_iterator_test");
+
+using namespace search::fef;
+using namespace search::index;
+using namespace search::memoryindex::test;
+using namespace search::memoryindex;
+
+using search::index::schema::DataType;
+using search::test::SearchIteratorVerifier;
+
+class Verifier : public SearchIteratorVerifier {
+private:
+ mutable TermFieldMatchData _tfmd;
+ FieldIndex _field_index;
+
+public:
+ Verifier(const Schema& schema)
+ : _tfmd(),
+ _field_index(schema, 0)
+ {
+ WrapInserter inserter(_field_index);
+ inserter.word("a");
+ for (uint32_t docId : getExpectedDocIds()) {
+ inserter.add(docId);
+ }
+ inserter.flush();
+ }
+ ~Verifier() {}
+
+ SearchIterator::UP create(bool strict) const override {
+ (void) strict;
+ TermFieldMatchDataArray match_data;
+ match_data.add(&_tfmd);
+ return std::make_unique<PostingIterator>(_field_index.find("a"),
+ _field_index.getFeatureStore(), 0, match_data);
+ }
+};
+
+Schema
+get_schema()
+{
+ Schema result;
+ result.addIndexField(Schema::IndexField("f0", DataType::STRING));
+ return result;
+}
+
+struct Fixture {
+ Schema schema;
+ Verifier verifier;
+ Fixture()
+ : schema(get_schema()),
+ verifier(schema)
+ {
+ }
+};
+
+TEST_F("require that posting iterator conforms", Fixture)
+{
+ f.verifier.verify();
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); }
+
diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
index 3a635756ec7..2b9b77d32a3 100644
--- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
+++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
@@ -9,17 +9,18 @@
#include <vespa/searchlib/fef/fieldpositionsiterator.h>
#include <vespa/searchlib/fef/termfieldmatchdata.h>
#include <vespa/searchlib/index/docbuilder.h>
+#include <vespa/searchlib/index/docidandfeatures.h>
#include <vespa/searchlib/index/dummyfileheadercontext.h>
#include <vespa/searchlib/memoryindex/document_inverter.h>
#include <vespa/searchlib/memoryindex/field_index_collection.h>
#include <vespa/searchlib/memoryindex/field_inverter.h>
#include <vespa/searchlib/memoryindex/ordered_field_index_inserter.h>
#include <vespa/searchlib/memoryindex/posting_iterator.h>
-#include <vespa/searchlib/test/searchiteratorverifier.h>
-#include <vespa/vespalib/testkit/testapp.h>
+#include <vespa/searchlib/test/memoryindex/wrap_inserter.h>
+#include <vespa/vespalib/gtest/gtest.h>
#include <vespa/log/log.h>
-LOG_SETUP("dictionary_test");
+LOG_SETUP("field_index_test");
namespace search {
@@ -32,45 +33,35 @@ using document::Document;
using queryeval::SearchIterator;
using search::index::schema::CollectionType;
using search::index::schema::DataType;
-using test::SearchIteratorVerifier;
using vespalib::GenerationHandler;
namespace memoryindex {
-typedef FieldIndex::PostingList PostingList;
-typedef PostingList::ConstIterator PostingConstItr;
+using test::WrapInserter;
+using PostingList = FieldIndex::PostingList;
+using PostingConstItr = PostingList::ConstIterator;
class MyBuilder : public IndexBuilder {
private:
std::stringstream _ss;
bool _insideWord;
bool _insideField;
- bool _insideDoc;
- bool _insideElem;
bool _firstWord;
bool _firstField;
bool _firstDoc;
- bool _firstElem;
- bool _firstPos;
-public:
+public:
MyBuilder(const Schema &schema)
: IndexBuilder(schema),
_ss(),
_insideWord(false),
_insideField(false),
- _insideDoc(false),
- _insideElem(false),
_firstWord(true),
_firstField(true),
- _firstDoc(true),
- _firstElem(true),
- _firstPos(true)
+ _firstDoc(true)
{}
- virtual void
- startWord(vespalib::stringref word) override
- {
+ virtual void startWord(vespalib::stringref word) override {
assert(_insideField);
assert(!_insideWord);
if (!_firstWord)
@@ -80,19 +71,14 @@ public:
_insideWord = true;
}
- virtual void
- endWord() override
- {
+ virtual void endWord() override {
assert(_insideWord);
- assert(!_insideDoc);
_ss << "]";
_firstWord = false;
_insideWord = false;
}
- virtual void
- startField(uint32_t fieldId) override
- {
+ virtual void startField(uint32_t fieldId) override {
assert(!_insideField);
if (!_firstField) _ss << ",";
_ss << "f=" << fieldId << "[";
@@ -100,9 +86,7 @@ public:
_insideField = true;
}
- virtual void
- endField() override
- {
+ virtual void endField() override {
assert(_insideField);
assert(!_insideWord);
_ss << "]";
@@ -110,63 +94,36 @@ public:
_insideField = false;
}
- virtual void
- startDocument(uint32_t docId) override
- {
+ virtual void add_document(const DocIdAndFeatures &features) override {
assert(_insideWord);
- assert(!_insideDoc);
- if (!_firstDoc) _ss << ",";
- _ss << "d=" << docId << "[";
- _firstElem = true;
- _insideDoc = true;
- }
-
- virtual void
- endDocument() override
- {
- assert(_insideDoc);
- assert(!_insideElem);
- _ss << "]";
- _firstDoc = false;
- _insideDoc = false;
- }
-
- virtual void
- startElement(uint32_t elementId,
- int32_t weight,
- uint32_t elementLen) override
- {
- assert(_insideDoc);
- assert(!_insideElem);
- if (!_firstElem)
+ if (!_firstDoc) {
_ss << ",";
- _ss << "e=" << elementId <<
- ",w=" << weight << ",l=" << elementLen << "[";
- _firstPos = true;
- _insideElem = true;
- }
-
- virtual void
- endElement() override
- {
- assert(_insideElem);
+ }
+ _ss << "d=" << features.doc_id() << "[";
+ bool first_elem = true;
+ size_t word_pos_offset = 0;
+ for (const auto& elem : features.elements()) {
+ if (!first_elem) {
+ _ss << ",";
+ }
+ _ss << "e=" << elem.getElementId() << ",w=" << elem.getWeight() << ",l=" << elem.getElementLen() << "[";
+ bool first_pos = true;
+ for (size_t i = 0; i < elem.getNumOccs(); ++i) {
+ if (!first_pos) {
+ _ss << ",";
+ }
+ _ss << features.word_positions()[i + word_pos_offset].getWordPos();
+ first_pos = false;
+ }
+ word_pos_offset += elem.getNumOccs();
+ _ss << "]";
+ first_elem = false;
+ }
_ss << "]";
- _firstElem = false;
- _insideElem = false;
- }
-
- virtual void
- addOcc(const WordDocElementWordPosFeatures &features) override
- {
- assert(_insideElem);
- if (!_firstPos) _ss << ",";
- _ss << features.getWordPos();
- _firstPos = false;
+ _firstDoc = false;
}
- std::string
- toStr() const
- {
+ std::string toStr() const {
return _ss.str();
}
};
@@ -186,8 +143,9 @@ toString(FieldPositionsIterator posItr,
first = false;
if (hasElements) {
ss << "[e=" << posItr.getElementId();
- if (hasWeights)
+ if (hasWeights) {
ss << ",w=" << posItr.getElementWeight();
+ }
ss << ",l=" << posItr.getElementLen() << "]";
}
}
@@ -198,10 +156,10 @@ toString(FieldPositionsIterator posItr,
bool
assertPostingList(const std::string &exp,
PostingConstItr itr,
- const FeatureStore *store = NULL)
+ const FeatureStore *store = nullptr)
{
std::stringstream ss;
- FeatureStore::DecodeContextCooked decoder(NULL);
+ FeatureStore::DecodeContextCooked decoder(nullptr);
TermFieldMatchData tfmd;
TermFieldMatchDataArray matchData;
matchData.add(&tfmd);
@@ -210,7 +168,7 @@ assertPostingList(const std::string &exp,
if (i > 0) ss << ",";
uint32_t docId = itr.getKey();
ss << docId;
- if (store != NULL) { // consider features as well
+ if (store != nullptr) { // consider features as well
EntryRef ref(itr.getData());
store->setupForField(0, decoder);
store->setupForUnpackFeatures(ref, decoder);
@@ -219,7 +177,9 @@ assertPostingList(const std::string &exp,
}
}
ss << "]";
- return EXPECT_EQUAL(exp, ss.str());
+ bool result = (exp == ss.str());
+ EXPECT_EQ(exp, ss.str());
+ return result;
}
bool
@@ -236,15 +196,13 @@ assertPostingList(std::vector<uint32_t> &exp, PostingConstItr itr)
}
-namespace
-{
+namespace {
/**
* A simple mockup of a memory field index, used to verify
* that we get correct posting lists from real memory field index.
*/
-class MockFieldIndex
-{
+class MockFieldIndex {
std::map<std::pair<vespalib::string, uint32_t>, std::set<uint32_t>> _dict;
vespalib::string _word;
uint32_t _fieldId;
@@ -252,32 +210,23 @@ class MockFieldIndex
public:
~MockFieldIndex();
void
- setNextWord(const vespalib::string &word)
- {
+ setNextWord(const vespalib::string &word) {
_word = word;
}
- void
- setNextField(uint32_t fieldId)
- {
+ void setNextField(uint32_t fieldId) {
_fieldId = fieldId;
}
- void
- add(uint32_t docId)
- {
+ void add(uint32_t docId) {
_dict[std::make_pair(_word, _fieldId)].insert(docId);
}
- void
- remove(uint32_t docId)
- {
+ void remove(uint32_t docId) {
_dict[std::make_pair(_word, _fieldId)].erase(docId);
}
- std::vector<uint32_t>
- find(const vespalib::string &word, uint32_t fieldId)
- {
+ std::vector<uint32_t> find(const vespalib::string &word, uint32_t fieldId) {
std::vector<uint32_t> res;
for (auto docId : _dict[std::make_pair(word, fieldId)] ) {
res.push_back(docId);
@@ -285,13 +234,11 @@ public:
return res;
}
- auto begin()
- {
+ auto begin() {
return _dict.begin();
}
- auto end()
- {
+ auto end() {
return _dict.end();
}
};
@@ -303,8 +250,7 @@ MockFieldIndex::~MockFieldIndex() = default;
* still stored safely in memory, to satisfy OrderedFieldIndexInserter
* needs.
*/
-class MockWordStoreScan
-{
+class MockWordStoreScan {
vespalib::string _word0;
vespalib::string _word1;
vespalib::string *_prevWord;
@@ -319,15 +265,11 @@ public:
{ }
~MockWordStoreScan();
- const vespalib::string &
- getWord() const
- {
+ const vespalib::string &getWord() const {
return *_word;
}
- const vespalib::string &
- setWord(const vespalib::string &word)
- {
+ const vespalib::string &setWord(const vespalib::string &word) {
std::swap(_prevWord, _word);
*_word = word;
return *_word;
@@ -341,8 +283,7 @@ MockWordStoreScan::~MockWordStoreScan() = default;
* and a real memory index. Mockup version is used to calculate expected
* answers.
*/
-class MyInserter
-{
+class MyInserter {
MockWordStoreScan _wordStoreScan;
MockFieldIndex _mock;
FieldIndexCollection _fieldIndexes;
@@ -361,17 +302,13 @@ public:
}
~MyInserter();
- void
- setNextWord(const vespalib::string &word)
- {
+ void setNextWord(const vespalib::string &word) {
const vespalib::string &w = _wordStoreScan.setWord(word);
_inserter->setNextWord(w);
_mock.setNextWord(w);
}
- void
- setNextField(uint32_t fieldId)
- {
+ void setNextField(uint32_t fieldId) {
if (_inserter != nullptr) {
_inserter->flush();
}
@@ -380,32 +317,26 @@ public:
_mock.setNextField(fieldId);
}
- void
- add(uint32_t docId)
- {
+ void add(uint32_t docId) {
_inserter->add(docId, _features);
_mock.add(docId);
}
- void
- remove(uint32_t docId)
- {
+ void remove(uint32_t docId) {
_inserter->remove(docId);
_mock.remove(docId);
}
- bool
- assertPosting(const vespalib::string &word,
- uint32_t fieldId)
- {
+ bool assertPosting(const vespalib::string &word,
+ uint32_t fieldId) {
std::vector<uint32_t> exp = _mock.find(word, fieldId);
PostingConstItr itr = _fieldIndexes.find(word, fieldId);
- return EXPECT_TRUE(assertPostingList(exp, itr));
+ bool result = assertPostingList(exp, itr);
+ EXPECT_TRUE(result);
+ return result;
}
- bool
- assertPostings()
- {
+ bool assertPostings() {
if (_inserter != nullptr) {
_inserter->flush();
}
@@ -413,25 +344,23 @@ public:
auto &wf = wfp.first;
auto &word = wf.first;
auto fieldId = wf.second;
- if (!EXPECT_TRUE(assertPosting(word, fieldId))) {
+ bool result = assertPosting(word, fieldId);
+ EXPECT_TRUE(result);
+ if (!result) {
return false;
}
}
return true;
}
- void
- rewind()
- {
+ void rewind() {
if (_inserter != nullptr) {
_inserter->flush();
_inserter = nullptr;
}
}
- uint32_t
- getNumUniqueWords()
- {
+ uint32_t getNumUniqueWords() {
return _fieldIndexes.getNumUniqueWords();
}
@@ -439,6 +368,7 @@ public:
};
MyInserter::~MyInserter() = default;
+
void
myremove(uint32_t docId, DocumentInverter &inv, FieldIndexCollection &fieldIndexes,
ISequencedTaskExecutor &invertThreads)
@@ -448,63 +378,7 @@ myremove(uint32_t docId, DocumentInverter &inv, FieldIndexCollection &fieldIndex
inv.pushDocuments(fieldIndexes, std::shared_ptr<IDestructorCallback>());
}
-
-class WrapInserter
-{
- OrderedFieldIndexInserter &_inserter;
-public:
- WrapInserter(FieldIndexCollection &fieldIndexes, uint32_t fieldId)
- : _inserter(fieldIndexes.getFieldIndex(fieldId)->getInserter())
- {
- }
-
- WrapInserter &word(vespalib::stringref word_)
- {
- _inserter.setNextWord(word_);
- return *this;
- }
-
- WrapInserter &add(uint32_t docId, const index::DocIdAndFeatures &features)
- {
- _inserter.add(docId, features);
- return *this;
- }
-
- WrapInserter &add(uint32_t docId)
- {
- DocIdAndPosOccFeatures features;
- features.addNextOcc(0, 0, 1, 1);
- return add(docId, features);
- }
-
- WrapInserter &remove(uint32_t docId)
- {
- _inserter.remove(docId);
- return *this;
- }
-
- WrapInserter &flush()
- {
- _inserter.flush();
- return *this;
- }
-
- WrapInserter &rewind()
- {
- _inserter.rewind();
- return *this;
- }
-
- datastore::EntryRef
- getWordRef()
- {
- return _inserter.getWordRef();
- }
-};
-
-
-class MyDrainRemoves : IFieldIndexRemoveListener
-{
+class MyDrainRemoves : IFieldIndexRemoveListener {
FieldIndexRemover &_remover;
public:
virtual void remove(const vespalib::stringref, uint32_t) override { }
@@ -514,8 +388,12 @@ public:
{
}
- void drain(uint32_t docId)
+ MyDrainRemoves(FieldIndex& field_index)
+ : _remover(field_index.getDocumentRemover())
{
+ }
+
+ void drain(uint32_t docId) {
_remover.remove(docId, *this);
}
};
@@ -526,7 +404,6 @@ myPushDocument(DocumentInverter &inv, FieldIndexCollection &fieldIndexes)
inv.pushDocuments(fieldIndexes, std::shared_ptr<IDestructorCallback>());
}
-
const FeatureStore *
featureStorePtr(const FieldIndexCollection &fieldIndexes, uint32_t fieldId)
{
@@ -539,7 +416,6 @@ featureStoreRef(const FieldIndexCollection &fieldIndexes, uint32_t fieldId)
return fieldIndexes.getFieldIndex(fieldId)->getFeatureStore();
}
-
DataStoreBase::MemStats
getFeatureStoreMemStats(const FieldIndexCollection &fieldIndexes)
{
@@ -553,8 +429,8 @@ getFeatureStoreMemStats(const FieldIndexCollection &fieldIndexes)
return res;
}
-
-void myCommit(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushThreads)
+void
+myCommit(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushThreads)
{
uint32_t fieldId = 0;
for (auto &fieldIndex : fieldIndexes.getFieldIndexes()) {
@@ -566,7 +442,6 @@ void myCommit(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushTh
pushThreads.sync();
}
-
void
myCompactFeatures(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushThreads)
{
@@ -581,57 +456,77 @@ myCompactFeatures(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pu
}
-
-struct Fixture
+Schema
+make_single_field_schema()
{
- Schema _schema;
- Fixture() : _schema() {
- _schema.addIndexField(Schema::IndexField("f0", DataType::STRING));
- _schema.addIndexField(Schema::IndexField("f1", DataType::STRING));
- _schema.addIndexField(Schema::IndexField("f2", DataType::STRING, CollectionType::ARRAY));
- _schema.addIndexField(Schema::IndexField("f3", DataType::STRING, CollectionType::WEIGHTEDSET));
+ Schema result;
+ result.addIndexField(Schema::IndexField("f0", DataType::STRING));
+ return result;
+}
+
+struct FieldIndexTest : public ::testing::Test {
+ Schema schema;
+ FieldIndex idx;
+ FieldIndexTest()
+ : schema(make_single_field_schema()),
+ idx(schema, 0)
+ {
}
- const Schema & getSchema() const { return _schema; }
};
-// TODO: Rewrite most tests to use FieldIndex directly instead of going via FieldIndexCollection.
+Schema
+make_multi_field_schema()
+{
+ Schema result;
+ result.addIndexField(Schema::IndexField("f0", DataType::STRING));
+ result.addIndexField(Schema::IndexField("f1", DataType::STRING));
+ result.addIndexField(Schema::IndexField("f2", DataType::STRING, CollectionType::ARRAY));
+ result.addIndexField(Schema::IndexField("f3", DataType::STRING, CollectionType::WEIGHTEDSET));
+ return result;
+}
+
+struct FieldIndexCollectionTest : public ::testing::Test {
+ Schema schema;
+ FieldIndexCollection fic;
+ FieldIndexCollectionTest()
+ : schema(make_multi_field_schema()),
+ fic(schema)
+ {
+ }
+ ~FieldIndexCollectionTest() {}
+};
-TEST_F("requireThatFreshInsertWorks", Fixture)
+TEST_F(FieldIndexTest, require_that_fresh_insert_works)
{
- FieldIndexCollection fic(f.getSchema());
- SequencedTaskExecutor pushThreads(2);
- EXPECT_TRUE(assertPostingList("[]", fic.find("a", 0)));
- EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0)));
- EXPECT_EQUAL(0u, fic.getNumUniqueWords());
- WrapInserter(fic, 0).word("a").add(10).flush();
- EXPECT_TRUE(assertPostingList("[10]", fic.find("a", 0)));
- EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0)));
- myCommit(fic, pushThreads);
- EXPECT_TRUE(assertPostingList("[10]", fic.findFrozen("a", 0)));
- EXPECT_EQUAL(1u, fic.getNumUniqueWords());
+ EXPECT_TRUE(assertPostingList("[]", idx.find("a")));
+ EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a")));
+ EXPECT_EQ(0u, idx.getNumUniqueWords());
+ WrapInserter(idx).word("a").add(10).flush();
+ EXPECT_TRUE(assertPostingList("[10]", idx.find("a")));
+ EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a")));
+ idx.commit();
+ EXPECT_TRUE(assertPostingList("[10]", idx.findFrozen("a")));
+ EXPECT_EQ(1u, idx.getNumUniqueWords());
}
-TEST_F("requireThatAppendInsertWorks", Fixture)
+TEST_F(FieldIndexTest, require_that_append_insert_works)
{
- FieldIndexCollection fic(f.getSchema());
- SequencedTaskExecutor pushThreads(2);
- WrapInserter(fic, 0).word("a").add(10).flush().rewind().
- word("a").add(5).flush();
- EXPECT_TRUE(assertPostingList("[5,10]", fic.find("a", 0)));
- EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0)));
- WrapInserter(fic, 0).rewind().word("a").add(20).flush();
- EXPECT_TRUE(assertPostingList("[5,10,20]", fic.find("a", 0)));
- EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0)));
- myCommit(fic, pushThreads);
- EXPECT_TRUE(assertPostingList("[5,10,20]", fic.findFrozen("a", 0)));
+ WrapInserter(idx).word("a").add(10).flush().rewind().
+ word("a").add(5).flush();
+ EXPECT_TRUE(assertPostingList("[5,10]", idx.find("a")));
+ EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a")));
+ WrapInserter(idx).rewind().word("a").add(20).flush();
+ EXPECT_TRUE(assertPostingList("[5,10,20]", idx.find("a")));
+ EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a")));
+ idx.commit();
+ EXPECT_TRUE(assertPostingList("[5,10,20]", idx.findFrozen("a")));
}
-TEST_F("requireThatMultiplePostingListsCanExist", Fixture)
+TEST_F(FieldIndexCollectionTest, require_that_multiple_posting_lists_across_multiple_fields_can_exist)
{
- FieldIndexCollection fic(f.getSchema());
WrapInserter(fic, 0).word("a").add(10).word("b").add(11).add(15).flush();
WrapInserter(fic, 1).word("a").add(5).word("b").add(12).flush();
- EXPECT_EQUAL(4u, fic.getNumUniqueWords());
+ EXPECT_EQ(4u, fic.getNumUniqueWords());
EXPECT_TRUE(assertPostingList("[10]", fic.find("a", 0)));
EXPECT_TRUE(assertPostingList("[5]", fic.find("a", 1)));
EXPECT_TRUE(assertPostingList("[11,15]", fic.find("b", 0)));
@@ -640,28 +535,27 @@ TEST_F("requireThatMultiplePostingListsCanExist", Fixture)
EXPECT_TRUE(assertPostingList("[]", fic.find("c", 0)));
}
-TEST_F("requireThatRemoveWorks", Fixture)
+TEST_F(FieldIndexTest, require_that_remove_works)
{
- FieldIndexCollection fic(f.getSchema());
- WrapInserter(fic, 0).word("a").remove(10).flush();
- EXPECT_TRUE(assertPostingList("[]", fic.find("a", 0)));
- WrapInserter(fic, 0).add(10).add(20).add(30).flush();
- EXPECT_TRUE(assertPostingList("[10,20,30]", fic.find("a", 0)));
- WrapInserter(fic, 0).rewind().word("a").remove(10).flush();
- EXPECT_TRUE(assertPostingList("[20,30]", fic.find("a", 0)));
- WrapInserter(fic, 0).remove(20).flush();
- EXPECT_TRUE(assertPostingList("[30]", fic.find("a", 0)));
- WrapInserter(fic, 0).remove(30).flush();
- EXPECT_TRUE(assertPostingList("[]", fic.find("a", 0)));
- EXPECT_EQUAL(1u, fic.getNumUniqueWords());
- MyDrainRemoves(fic, 0).drain(10);
- WrapInserter(fic, 0).rewind().word("a").add(10).flush();
- EXPECT_TRUE(assertPostingList("[10]", fic.find("a", 0)));
+ WrapInserter(idx).word("a").remove(10).flush();
+ EXPECT_TRUE(assertPostingList("[]", idx.find("a")));
+ WrapInserter(idx).add(10).add(20).add(30).flush();
+ EXPECT_TRUE(assertPostingList("[10,20,30]", idx.find("a")));
+ WrapInserter(idx).rewind().word("a").remove(10).flush();
+ EXPECT_TRUE(assertPostingList("[20,30]", idx.find("a")));
+ WrapInserter(idx).remove(20).flush();
+ EXPECT_TRUE(assertPostingList("[30]", idx.find("a")));
+ WrapInserter(idx).remove(30).flush();
+ EXPECT_TRUE(assertPostingList("[]", idx.find("a")));
+ EXPECT_EQ(1u, idx.getNumUniqueWords());
+ MyDrainRemoves(idx).drain(10);
+ WrapInserter(idx).rewind().word("a").add(10).flush();
+ EXPECT_TRUE(assertPostingList("[10]", idx.find("a")));
}
-TEST_F("requireThatMultipleInsertAndRemoveWorks", Fixture)
+TEST_F(FieldIndexCollectionTest, require_that_multiple_insert_and_remove_works)
{
- MyInserter inserter(f.getSchema());
+ MyInserter inserter(schema);
uint32_t numFields = 4;
for (uint32_t fi = 0; fi < numFields; ++fi) {
inserter.setNextField(fi);
@@ -671,8 +565,8 @@ TEST_F("requireThatMultipleInsertAndRemoveWorks", Fixture)
for (uint32_t di = 0; di < (uint32_t) w; ++di) { // insert
inserter.add(di * 3);
}
- EXPECT_EQUAL((w - 'a' + 1u) + ('z' - 'a' +1u) * fi,
- inserter.getNumUniqueWords());
+ EXPECT_EQ((w - 'a' + 1u) + ('z' - 'a' +1u) * fi,
+ inserter.getNumUniqueWords());
}
}
EXPECT_TRUE(inserter.assertPostings());
@@ -707,12 +601,10 @@ addElement(DocIdAndFeatures &f,
uint32_t numOccs,
int32_t weight = 1)
{
- f._elements.push_back(WordDocElementFeatures(f._elements.size()));
- f._elements.back().setElementLen(elemLen);
- f._elements.back().setWeight(weight);
- f._elements.back().setNumOccs(numOccs);
+ f.elements().emplace_back(f.elements().size(), weight, elemLen);
+ f.elements().back().setNumOccs(numOccs);
for (uint32_t i = 0; i < numOccs; ++i) {
- f._wordPositions.push_back(WordDocElementWordPosFeatures(i));
+ f.word_positions().emplace_back(i);
}
}
@@ -724,9 +616,8 @@ getFeatures(uint32_t elemLen, uint32_t numOccs, int32_t weight = 1)
return f;
}
-TEST_F("requireThatFeaturesAreInPostingLists", Fixture)
+TEST_F(FieldIndexCollectionTest, require_that_features_are_in_posting_lists)
{
- FieldIndexCollection fic(f.getSchema());
WrapInserter(fic, 0).word("a").add(1, getFeatures(4, 2)).flush();
EXPECT_TRUE(assertPostingList("[1{4:0,1}]",
fic.find("a", 0),
@@ -742,47 +633,9 @@ TEST_F("requireThatFeaturesAreInPostingLists", Fixture)
featureStorePtr(fic, 1)));
}
-class Verifier : public SearchIteratorVerifier {
-public:
- Verifier(const Schema & schema);
- ~Verifier();
-
- SearchIterator::UP create(bool strict) const override {
- (void) strict;
- TermFieldMatchDataArray matchData;
- matchData.add(&_tfmd);
- return std::make_unique<PostingIterator>(_fieldIndexes.find("a", 0), featureStoreRef(_fieldIndexes, 0), 0, matchData);
- }
-
-private:
- mutable TermFieldMatchData _tfmd;
- FieldIndexCollection _fieldIndexes;
-};
-
-
-Verifier::Verifier(const Schema & schema)
- : _tfmd(),
- _fieldIndexes(schema)
-{
- WrapInserter inserter(_fieldIndexes, 0);
- inserter.word("a");
- for (uint32_t docId : getExpectedDocIds()) {
- inserter.add(docId);
- }
- inserter.flush();
-}
-Verifier::~Verifier() {}
-
-TEST_F("require that postingiterator conforms", Fixture) {
- Verifier verifier(f.getSchema());
- verifier.verify();
-
-}
-
-TEST_F("requireThatPostingIteratorIsWorking", Fixture)
+TEST_F(FieldIndexTest, require_that_posting_iterator_is_working)
{
- FieldIndexCollection fic(f.getSchema());
- WrapInserter(fic, 0).word("a").add(10, getFeatures(4, 1)).
+ WrapInserter(idx).word("a").add(10, getFeatures(4, 1)).
add(20, getFeatures(5, 2)).
add(30, getFeatures(6, 1)).
add(40, getFeatures(7, 2)).flush();
@@ -790,166 +643,166 @@ TEST_F("requireThatPostingIteratorIsWorking", Fixture)
TermFieldMatchDataArray matchData;
matchData.add(&tfmd);
{
- PostingIterator itr(fic.find("not", 0),
- featureStoreRef(fic, 0),
+ PostingIterator itr(idx.find("not"),
+ idx.getFeatureStore(),
0, matchData);
itr.initFullRange();
EXPECT_TRUE(itr.isAtEnd());
}
{
- PostingIterator itr(fic.find("a", 0),
- featureStoreRef(fic, 0),
+ PostingIterator itr(idx.find("a"),
+ idx.getFeatureStore(),
0, matchData);
itr.initFullRange();
- EXPECT_EQUAL(10u, itr.getDocId());
+ EXPECT_EQ(10u, itr.getDocId());
itr.unpack(10);
- EXPECT_EQUAL("{4:0}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{4:0}", toString(tfmd.getIterator()));
EXPECT_TRUE(!itr.seek(25));
- EXPECT_EQUAL(30u, itr.getDocId());
+ EXPECT_EQ(30u, itr.getDocId());
itr.unpack(30);
- EXPECT_EQUAL("{6:0}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{6:0}", toString(tfmd.getIterator()));
EXPECT_TRUE(itr.seek(40));
- EXPECT_EQUAL(40u, itr.getDocId());
+ EXPECT_EQ(40u, itr.getDocId());
itr.unpack(40);
- EXPECT_EQUAL("{7:0,1}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{7:0,1}", toString(tfmd.getIterator()));
EXPECT_TRUE(!itr.seek(41));
EXPECT_TRUE(itr.isAtEnd());
}
}
-TEST_F("requireThatDumpingToIndexBuilderIsWorking", Fixture)
+TEST_F(FieldIndexCollectionTest, require_that_basic_dumping_to_index_builder_is_working)
{
- {
- MyBuilder b(f.getSchema());
- WordDocElementWordPosFeatures wpf;
- b.startField(4);
- b.startWord("a");
- b.startDocument(2);
- b.startElement(0, 10, 20);
- wpf.setWordPos(1);
- b.addOcc(wpf);
- wpf.setWordPos(3);
- b.addOcc(wpf);
- b.endElement();
- b.endDocument();
- b.endWord();
- b.endField();
- EXPECT_EQUAL("f=4[w=a[d=2[e=0,w=10,l=20[1,3]]]]", b.toStr());
- }
- {
- FieldIndexCollection fic(f.getSchema());
- MyBuilder b(f.getSchema());
- DocIdAndFeatures df;
- WrapInserter(fic, 1).word("a").add(5, getFeatures(2, 1)).
+ MyBuilder b(schema);
+ WordDocElementWordPosFeatures wpf;
+ b.startField(4);
+ b.startWord("a");
+ DocIdAndFeatures features;
+ features.set_doc_id(2);
+ features.elements().emplace_back(0, 10, 20);
+ features.elements().back().setNumOccs(2);
+ features.word_positions().emplace_back(1);
+ features.word_positions().emplace_back(3);
+ b.add_document(features);
+ b.endWord();
+ b.endField();
+ EXPECT_EQ("f=4[w=a[d=2[e=0,w=10,l=20[1,3]]]]", b.toStr());
+}
+
+TEST_F(FieldIndexCollectionTest, require_that_dumping_of_multiple_fields_to_index_builder_is_working)
+{
+ MyBuilder b(schema);
+ DocIdAndFeatures df;
+ WrapInserter(fic, 1).word("a").add(5, getFeatures(2, 1)).
add(7, getFeatures(3, 2)).
word("b").add(5, getFeatures(12, 2)).flush();
- df = getFeatures(4, 1);
- addElement(df, 5, 2);
- WrapInserter(fic, 2).word("a").add(5, df);
- df = getFeatures(6, 1);
- addElement(df, 7, 2);
- WrapInserter(fic, 2).add(7, df).flush();
-
- df = getFeatures(8, 1, 12);
- addElement(df, 9, 2, 13);
- WrapInserter(fic, 3).word("a").add(5, df);
- df = getFeatures(10, 1, 14);
- addElement(df, 11, 2, 15);
- WrapInserter(fic, 3).add(7, df).flush();
-
- fic.dump(b);
+ df = getFeatures(4, 1);
+ addElement(df, 5, 2);
+ WrapInserter(fic, 2).word("a").add(5, df);
+ df = getFeatures(6, 1);
+ addElement(df, 7, 2);
+ WrapInserter(fic, 2).add(7, df).flush();
+
+ df = getFeatures(8, 1, 12);
+ addElement(df, 9, 2, 13);
+ WrapInserter(fic, 3).word("a").add(5, df);
+ df = getFeatures(10, 1, 14);
+ addElement(df, 11, 2, 15);
+ WrapInserter(fic, 3).add(7, df).flush();
+
+ fic.dump(b);
+
+ EXPECT_EQ("f=0[],"
+ "f=1[w=a[d=5[e=0,w=1,l=2[0]],d=7[e=0,w=1,l=3[0,1]]],"
+ "w=b[d=5[e=0,w=1,l=12[0,1]]]],"
+ "f=2[w=a[d=5[e=0,w=1,l=4[0],e=1,w=1,l=5[0,1]],"
+ "d=7[e=0,w=1,l=6[0],e=1,w=1,l=7[0,1]]]],"
+ "f=3[w=a[d=5[e=0,w=12,l=8[0],e=1,w=13,l=9[0,1]],"
+ "d=7[e=0,w=14,l=10[0],e=1,w=15,l=11[0,1]]]]",
+ b.toStr());
+}
- EXPECT_EQUAL("f=0[],"
- "f=1[w=a[d=5[e=0,w=1,l=2[0]],d=7[e=0,w=1,l=3[0,1]]],"
- "w=b[d=5[e=0,w=1,l=12[0,1]]]],"
- "f=2[w=a[d=5[e=0,w=1,l=4[0],e=1,w=1,l=5[0,1]],"
- "d=7[e=0,w=1,l=6[0],e=1,w=1,l=7[0,1]]]],"
- "f=3[w=a[d=5[e=0,w=12,l=8[0],e=1,w=13,l=9[0,1]],"
- "d=7[e=0,w=14,l=10[0],e=1,w=15,l=11[0,1]]]]",
- b.toStr());
- }
- { // test word with no docs
- FieldIndexCollection fic(f.getSchema());
- WrapInserter(fic, 0).word("a").add(2, getFeatures(2, 1)).
+TEST_F(FieldIndexCollectionTest, require_that_dumping_words_with_no_docs_to_index_builder_is_working)
+{
+ WrapInserter(fic, 0).word("a").add(2, getFeatures(2, 1)).
word("b").add(4, getFeatures(4, 1)).flush().rewind().
word("a").remove(2).flush();
- {
- MyBuilder b(f.getSchema());
- fic.dump(b);
- EXPECT_EQUAL("f=0[w=b[d=4[e=0,w=1,l=4[0]]]],f=1[],f=2[],f=3[]",
- b.toStr());
- }
- {
- search::diskindex::IndexBuilder b(f.getSchema());
- b.setPrefix("dump");
- TuneFileIndexing tuneFileIndexing;
- DummyFileHeaderContext fileHeaderContext;
- b.open(5, 2, tuneFileIndexing, fileHeaderContext);
- fic.dump(b);
- b.close();
- }
+ {
+ MyBuilder b(schema);
+ fic.dump(b);
+ EXPECT_EQ("f=0[w=b[d=4[e=0,w=1,l=4[0]]]],f=1[],f=2[],f=3[]",
+ b.toStr());
+ }
+ {
+ search::diskindex::IndexBuilder b(schema);
+ b.setPrefix("dump");
+ TuneFileIndexing tuneFileIndexing;
+ DummyFileHeaderContext fileHeaderContext;
+ b.open(5, 2, tuneFileIndexing, fileHeaderContext);
+ fic.dump(b);
+ b.close();
}
}
-
-template <typename FixtureBase>
-class FieldIndexFixture : public FixtureBase
-{
+class InverterTest : public ::testing::Test {
public:
- using FixtureBase::getSchema;
+ Schema _schema;
FieldIndexCollection _fic;
DocBuilder _b;
SequencedTaskExecutor _invertThreads;
SequencedTaskExecutor _pushThreads;
DocumentInverter _inv;
- FieldIndexFixture()
- : FixtureBase(),
- _fic(getSchema()),
- _b(getSchema()),
+ InverterTest(const Schema& schema)
+ : _schema(schema),
+ _fic(_schema),
+ _b(_schema),
_invertThreads(2),
_pushThreads(2),
- _inv(getSchema(), _invertThreads, _pushThreads)
+ _inv(_schema, _invertThreads, _pushThreads)
{
}
};
+class BasicInverterTest : public InverterTest {
+public:
+ BasicInverterTest() : InverterTest(make_multi_field_schema()) {}
+};
-TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>)
+TEST_F(BasicInverterTest, require_that_inversion_is_working)
{
Document::UP doc;
- f._b.startDocument("doc::10");
- f._b.startIndexField("f0").
+ _b.startDocument("doc::10");
+ _b.startIndexField("f0").
addStr("a").addStr("b").addStr("c").addStr("d").
endField();
- doc = f._b.endDocument();
- f._inv.invertDocument(10, *doc);
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
- f._pushThreads.sync();
-
- f._b.startDocument("doc::20");
- f._b.startIndexField("f0").
+ doc = _b.endDocument();
+ _inv.invertDocument(10, *doc);
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
+ _pushThreads.sync();
+
+ _b.startDocument("doc::20");
+ _b.startIndexField("f0").
addStr("a").addStr("a").addStr("b").addStr("c").addStr("d").
endField();
- doc = f._b.endDocument();
- f._inv.invertDocument(20, *doc);
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
- f._pushThreads.sync();
-
- f._b.startDocument("doc::30");
- f._b.startIndexField("f0").
+ doc = _b.endDocument();
+ _inv.invertDocument(20, *doc);
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
+ _pushThreads.sync();
+
+ _b.startDocument("doc::30");
+ _b.startIndexField("f0").
addStr("a").addStr("b").addStr("c").addStr("d").
addStr("e").addStr("f").
endField();
- f._b.startIndexField("f1").
+ _b.startIndexField("f1").
addStr("\nw2").addStr("w").addStr("x").
addStr("\nw3").addStr("y").addStr("z").
endField();
- f._b.startIndexField("f2").
+ _b.startIndexField("f2").
startElement(4).
addStr("w").addStr("x").
endElement().
@@ -957,7 +810,7 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>)
addStr("y").addStr("z").
endElement().
endField();
- f._b.startIndexField("f3").
+ _b.startIndexField("f3").
startElement(6).
addStr("w").addStr("x").
endElement().
@@ -965,56 +818,56 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>)
addStr("y").addStr("z").
endElement().
endField();
- doc = f._b.endDocument();
- f._inv.invertDocument(30, *doc);
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
- f._pushThreads.sync();
-
- f._b.startDocument("doc::40");
- f._b.startIndexField("f0").
+ doc = _b.endDocument();
+ _inv.invertDocument(30, *doc);
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
+ _pushThreads.sync();
+
+ _b.startDocument("doc::40");
+ _b.startIndexField("f0").
addStr("a").addStr("a").addStr("b").addStr("c").addStr("a").
addStr("e").addStr("f").
endField();
- doc = f._b.endDocument();
- f._inv.invertDocument(40, *doc);
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
- f._pushThreads.sync();
-
- f._b.startDocument("doc::999");
- f._b.startIndexField("f0").
+ doc = _b.endDocument();
+ _inv.invertDocument(40, *doc);
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
+ _pushThreads.sync();
+
+ _b.startDocument("doc::999");
+ _b.startIndexField("f0").
addStr("this").addStr("is").addStr("_a_").addStr("test").
addStr("for").addStr("insertion").addStr("speed").addStr("with").
addStr("more").addStr("than").addStr("just").addStr("__a__").
addStr("few").addStr("words").addStr("present").addStr("in").
addStr("some").addStr("of").addStr("the").addStr("fields").
endField();
- f._b.startIndexField("f1").
+ _b.startIndexField("f1").
addStr("the").addStr("other").addStr("field").addStr("also").
addStr("has").addStr("some").addStr("content").
endField();
- f._b.startIndexField("f2").
+ _b.startIndexField("f2").
startElement(1).
addStr("strange").addStr("things").addStr("here").
addStr("has").addStr("some").addStr("content").
endElement().
endField();
- f._b.startIndexField("f3").
+ _b.startIndexField("f3").
startElement(3).
addStr("not").addStr("a").addStr("weighty").addStr("argument").
endElement().
endField();
- doc = f._b.endDocument();
+ doc = _b.endDocument();
for (uint32_t docId = 10000; docId < 20000; ++docId) {
- f._inv.invertDocument(docId, *doc);
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
- f._pushThreads.sync();
+ _inv.invertDocument(docId, *doc);
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
+ _pushThreads.sync();
}
- f._pushThreads.sync();
- DataStoreBase::MemStats beforeStats = getFeatureStoreMemStats(f._fic);
+ _pushThreads.sync();
+ DataStoreBase::MemStats beforeStats = getFeatureStoreMemStats(_fic);
LOG(info,
"Before feature compaction: allocElems=%zu, usedElems=%zu"
", deadElems=%zu, holdElems=%zu"
@@ -1027,14 +880,14 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>)
beforeStats._freeBuffers,
beforeStats._activeBuffers,
beforeStats._holdBuffers);
- myCompactFeatures(f._fic, f._pushThreads);
+ myCompactFeatures(_fic, _pushThreads);
std::vector<std::unique_ptr<GenerationHandler::Guard>> guards;
- for (auto &fieldIndex : f._fic.getFieldIndexes()) {
+ for (auto &fieldIndex : _fic.getFieldIndexes()) {
guards.push_back(std::make_unique<GenerationHandler::Guard>
(fieldIndex->takeGenerationGuard()));
}
- myCommit(f._fic, f._pushThreads);
- DataStoreBase::MemStats duringStats = getFeatureStoreMemStats(f._fic);
+ myCommit(_fic, _pushThreads);
+ DataStoreBase::MemStats duringStats = getFeatureStoreMemStats(_fic);
LOG(info,
"During feature compaction: allocElems=%zu, usedElems=%zu"
", deadElems=%zu, holdElems=%zu"
@@ -1048,8 +901,8 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>)
duringStats._activeBuffers,
duringStats._holdBuffers);
guards.clear();
- myCommit(f._fic, f._pushThreads);
- DataStoreBase::MemStats afterStats = getFeatureStoreMemStats(f._fic);
+ myCommit(_fic, _pushThreads);
+ DataStoreBase::MemStats afterStats = getFeatureStoreMemStats(_fic);
LOG(info,
"After feature compaction: allocElems=%zu, usedElems=%zu"
", deadElems=%zu, holdElems=%zu"
@@ -1067,116 +920,115 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>)
TermFieldMatchDataArray matchData;
matchData.add(&tfmd);
{
- PostingIterator itr(f._fic.findFrozen("not", 0), featureStoreRef(f._fic, 0), 0, matchData);
+ PostingIterator itr(_fic.findFrozen("not", 0), featureStoreRef(_fic, 0), 0, matchData);
itr.initFullRange();
EXPECT_TRUE(itr.isAtEnd());
}
{
- PostingIterator itr(f._fic.findFrozen("a", 0), featureStoreRef(f._fic, 0), 0, matchData);
+ PostingIterator itr(_fic.findFrozen("a", 0), featureStoreRef(_fic, 0), 0, matchData);
itr.initFullRange();
- EXPECT_EQUAL(10u, itr.getDocId());
+ EXPECT_EQ(10u, itr.getDocId());
itr.unpack(10);
- EXPECT_EQUAL("{4:0}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{4:0}", toString(tfmd.getIterator()));
EXPECT_TRUE(!itr.seek(25));
- EXPECT_EQUAL(30u, itr.getDocId());
+ EXPECT_EQ(30u, itr.getDocId());
itr.unpack(30);
- EXPECT_EQUAL("{6:0}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{6:0}", toString(tfmd.getIterator()));
EXPECT_TRUE(itr.seek(40));
- EXPECT_EQUAL(40u, itr.getDocId());
+ EXPECT_EQ(40u, itr.getDocId());
itr.unpack(40);
- EXPECT_EQUAL("{7:0,1,4}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{7:0,1,4}", toString(tfmd.getIterator()));
EXPECT_TRUE(!itr.seek(41));
EXPECT_TRUE(itr.isAtEnd());
}
{
- PostingIterator itr(f._fic.findFrozen("x", 0), featureStoreRef(f._fic, 0), 0, matchData);
+ PostingIterator itr(_fic.findFrozen("x", 0), featureStoreRef(_fic, 0), 0, matchData);
itr.initFullRange();
EXPECT_TRUE(itr.isAtEnd());
}
{
- PostingIterator itr(f._fic.findFrozen("x", 1), featureStoreRef(f._fic, 1), 1, matchData);
+ PostingIterator itr(_fic.findFrozen("x", 1), featureStoreRef(_fic, 1), 1, matchData);
itr.initFullRange();
- EXPECT_EQUAL(30u, itr.getDocId());
+ EXPECT_EQ(30u, itr.getDocId());
itr.unpack(30);
- EXPECT_EQUAL("{6:2[e=0,w=1,l=6]}", toString(tfmd.getIterator(), true, true));
+ EXPECT_EQ("{6:2[e=0,w=1,l=6]}", toString(tfmd.getIterator(), true, true));
}
{
- PostingIterator itr(f._fic.findFrozen("x", 2), featureStoreRef(f._fic, 2), 2, matchData);
+ PostingIterator itr(_fic.findFrozen("x", 2), featureStoreRef(_fic, 2), 2, matchData);
itr.initFullRange();
- EXPECT_EQUAL(30u, itr.getDocId());
+ EXPECT_EQ(30u, itr.getDocId());
itr.unpack(30);
// weight is hardcoded to 1 for new style il doc array field
- EXPECT_EQUAL("{2:1[e=0,w=1,l=2]}", toString(tfmd.getIterator(), true, true));
+ EXPECT_EQ("{2:1[e=0,w=1,l=2]}", toString(tfmd.getIterator(), true, true));
}
{
- PostingIterator itr(f._fic.findFrozen("x", 3), featureStoreRef(f._fic, 3), 3, matchData);
+ PostingIterator itr(_fic.findFrozen("x", 3), featureStoreRef(_fic, 3), 3, matchData);
itr.initFullRange();
- EXPECT_EQUAL(30u, itr.getDocId());
+ EXPECT_EQ(30u, itr.getDocId());
itr.unpack(30);
- EXPECT_EQUAL("{2:1[e=0,w=6,l=2]}",
- toString(tfmd.getIterator(), true, true));
+ EXPECT_EQ("{2:1[e=0,w=6,l=2]}",
+ toString(tfmd.getIterator(), true, true));
}
}
-TEST_F("requireThatInverterHandlesRemoveViaDocumentRemover",
- FieldIndexFixture<Fixture>)
+TEST_F(BasicInverterTest, require_that_inverter_handles_remove_via_document_remover)
{
Document::UP doc;
- f._b.startDocument("doc::1");
- f._b.startIndexField("f0").addStr("a").addStr("b").endField();
- f._b.startIndexField("f1").addStr("a").addStr("c").endField();
- Document::UP doc1 = f._b.endDocument();
- f._inv.invertDocument(1, *doc1.get());
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
- f._pushThreads.sync();
-
- f._b.startDocument("doc::2");
- f._b.startIndexField("f0").addStr("b").addStr("c").endField();
- Document::UP doc2 = f._b.endDocument();
- f._inv.invertDocument(2, *doc2.get());
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
- f._pushThreads.sync();
-
- EXPECT_TRUE(assertPostingList("[1]", f._fic.find("a", 0)));
- EXPECT_TRUE(assertPostingList("[1,2]", f._fic.find("b", 0)));
- EXPECT_TRUE(assertPostingList("[2]", f._fic.find("c", 0)));
- EXPECT_TRUE(assertPostingList("[1]", f._fic.find("a", 1)));
- EXPECT_TRUE(assertPostingList("[1]", f._fic.find("c", 1)));
-
- myremove(1, f._inv, f._fic, f._invertThreads);
- f._pushThreads.sync();
-
- EXPECT_TRUE(assertPostingList("[]", f._fic.find("a", 0)));
- EXPECT_TRUE(assertPostingList("[2]", f._fic.find("b", 0)));
- EXPECT_TRUE(assertPostingList("[2]", f._fic.find("c", 0)));
- EXPECT_TRUE(assertPostingList("[]", f._fic.find("a", 1)));
- EXPECT_TRUE(assertPostingList("[]", f._fic.find("c", 1)));
+ _b.startDocument("doc::1");
+ _b.startIndexField("f0").addStr("a").addStr("b").endField();
+ _b.startIndexField("f1").addStr("a").addStr("c").endField();
+ Document::UP doc1 = _b.endDocument();
+ _inv.invertDocument(1, *doc1.get());
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
+ _pushThreads.sync();
+
+ _b.startDocument("doc::2");
+ _b.startIndexField("f0").addStr("b").addStr("c").endField();
+ Document::UP doc2 = _b.endDocument();
+ _inv.invertDocument(2, *doc2.get());
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
+ _pushThreads.sync();
+
+ EXPECT_TRUE(assertPostingList("[1]", _fic.find("a", 0)));
+ EXPECT_TRUE(assertPostingList("[1,2]", _fic.find("b", 0)));
+ EXPECT_TRUE(assertPostingList("[2]", _fic.find("c", 0)));
+ EXPECT_TRUE(assertPostingList("[1]", _fic.find("a", 1)));
+ EXPECT_TRUE(assertPostingList("[1]", _fic.find("c", 1)));
+
+ myremove(1, _inv, _fic, _invertThreads);
+ _pushThreads.sync();
+
+ EXPECT_TRUE(assertPostingList("[]", _fic.find("a", 0)));
+ EXPECT_TRUE(assertPostingList("[2]", _fic.find("b", 0)));
+ EXPECT_TRUE(assertPostingList("[2]", _fic.find("c", 0)));
+ EXPECT_TRUE(assertPostingList("[]", _fic.find("a", 1)));
+ EXPECT_TRUE(assertPostingList("[]", _fic.find("c", 1)));
}
-class UriFixture
+Schema
+make_uri_schema()
{
+ Schema result;
+ result.addUriIndexFields(Schema::IndexField("iu", DataType::STRING));
+ result.addUriIndexFields(Schema::IndexField("iau", DataType::STRING, CollectionType::ARRAY));
+ result.addUriIndexFields(Schema::IndexField("iwu", DataType::STRING, CollectionType::WEIGHTEDSET));
+ return result;
+}
+
+class UriInverterTest : public InverterTest {
public:
- Schema _schema;
- UriFixture()
- : _schema()
- {
- _schema.addUriIndexFields(Schema::IndexField("iu", DataType::STRING));
- _schema.addUriIndexFields(Schema::IndexField("iau", DataType::STRING, CollectionType::ARRAY));
- _schema.addUriIndexFields(Schema::IndexField("iwu", DataType::STRING, CollectionType::WEIGHTEDSET));
- }
- const Schema & getSchema() const { return _schema; }
+ UriInverterTest() : InverterTest(make_uri_schema()) {}
};
-
-TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>)
+TEST_F(UriInverterTest, require_that_uri_indexing_is_working)
{
Document::UP doc;
- f._b.startDocument("doc::10");
- f._b.startIndexField("iu").
+ _b.startDocument("doc::10");
+ _b.startIndexField("iu").
startSubField("all").
addUrlTokenizedString("http://www.example.com:81/fluke?ab=2#4").
endSubField().
@@ -1199,7 +1051,7 @@ TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>)
addUrlTokenizedString("4").
endSubField().
endField();
- f._b.startIndexField("iau").
+ _b.startIndexField("iau").
startElement(1).
startSubField("all").
addUrlTokenizedString("http://www.example.com:82/fluke?ab=2#8").
@@ -1247,7 +1099,7 @@ TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>)
endSubField().
endElement().
endField();
- f._b.startIndexField("iwu").
+ _b.startIndexField("iwu").
startElement(4).
startSubField("all").
addUrlTokenizedString("http://www.example.com:83/fluke?ab=2#12").
@@ -1295,141 +1147,131 @@ TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>)
endSubField().
endElement().
endField();
- doc = f._b.endDocument();
- f._inv.invertDocument(10, *doc);
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
+ doc = _b.endDocument();
+ _inv.invertDocument(10, *doc);
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
- f._pushThreads.sync();
+ _pushThreads.sync();
TermFieldMatchData tfmd;
TermFieldMatchDataArray matchData;
matchData.add(&tfmd);
{
- uint32_t fieldId = f.getSchema().getIndexFieldId("iu");
- PostingIterator itr(f._fic.findFrozen("not", fieldId),
- featureStoreRef(f._fic, fieldId),
+ uint32_t fieldId = _schema.getIndexFieldId("iu");
+ PostingIterator itr(_fic.findFrozen("not", fieldId),
+ featureStoreRef(_fic, fieldId),
fieldId, matchData);
itr.initFullRange();
EXPECT_TRUE(itr.isAtEnd());
}
{
- uint32_t fieldId = f.getSchema().getIndexFieldId("iu");
- PostingIterator itr(f._fic.findFrozen("example", fieldId),
- featureStoreRef(f._fic, fieldId),
+ uint32_t fieldId = _schema.getIndexFieldId("iu");
+ PostingIterator itr(_fic.findFrozen("example", fieldId),
+ featureStoreRef(_fic, fieldId),
fieldId, matchData);
itr.initFullRange();
- EXPECT_EQUAL(10u, itr.getDocId());
+ EXPECT_EQ(10u, itr.getDocId());
itr.unpack(10);
- EXPECT_EQUAL("{9:2}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{9:2}", toString(tfmd.getIterator()));
EXPECT_TRUE(!itr.seek(25));
EXPECT_TRUE(itr.isAtEnd());
}
{
- uint32_t fieldId = f.getSchema().getIndexFieldId("iau");
- PostingIterator itr(f._fic.findFrozen("example", fieldId),
- featureStoreRef(f._fic, fieldId),
+ uint32_t fieldId = _schema.getIndexFieldId("iau");
+ PostingIterator itr(_fic.findFrozen("example", fieldId),
+ featureStoreRef(_fic, fieldId),
fieldId, matchData);
itr.initFullRange();
- EXPECT_EQUAL(10u, itr.getDocId());
+ EXPECT_EQ(10u, itr.getDocId());
itr.unpack(10);
- EXPECT_EQUAL("{9:2[e=0,l=9]}",
- toString(tfmd.getIterator(), true, false));
+ EXPECT_EQ("{9:2[e=0,l=9]}",
+ toString(tfmd.getIterator(), true, false));
EXPECT_TRUE(!itr.seek(25));
EXPECT_TRUE(itr.isAtEnd());
}
{
- uint32_t fieldId = f.getSchema().getIndexFieldId("iwu");
- PostingIterator itr(f._fic.findFrozen("example", fieldId),
- featureStoreRef(f._fic, fieldId),
+ uint32_t fieldId = _schema.getIndexFieldId("iwu");
+ PostingIterator itr(_fic.findFrozen("example", fieldId),
+ featureStoreRef(_fic, fieldId),
fieldId, matchData);
itr.initFullRange();
- EXPECT_EQUAL(10u, itr.getDocId());
+ EXPECT_EQ(10u, itr.getDocId());
itr.unpack(10);
- EXPECT_EQUAL("{9:2[e=0,w=4,l=9]}",
- toString(tfmd.getIterator(), true, true));
+ EXPECT_EQ("{9:2[e=0,w=4,l=9]}",
+ toString(tfmd.getIterator(), true, true));
EXPECT_TRUE(!itr.seek(25));
EXPECT_TRUE(itr.isAtEnd());
}
{
- search::diskindex::IndexBuilder dib(f.getSchema());
+ search::diskindex::IndexBuilder dib(_schema);
dib.setPrefix("urldump");
TuneFileIndexing tuneFileIndexing;
DummyFileHeaderContext fileHeaderContext;
- dib.open(11, f._fic.getNumUniqueWords(), tuneFileIndexing,
+ dib.open(11, _fic.getNumUniqueWords(), tuneFileIndexing,
fileHeaderContext);
- f._fic.dump(dib);
+ _fic.dump(dib);
dib.close();
}
}
-
-class SingleFieldFixture
-{
+class CjkInverterTest : public InverterTest {
public:
- Schema _schema;
- SingleFieldFixture()
- : _schema()
- {
- _schema.addIndexField(Schema::IndexField("i", DataType::STRING));
- }
- const Schema & getSchema() const { return _schema; }
+ CjkInverterTest() : InverterTest(make_single_field_schema()) {}
};
-TEST_F("requireThatCjkIndexingIsWorking", FieldIndexFixture<SingleFieldFixture>)
+TEST_F(CjkInverterTest, require_that_cjk_indexing_is_working)
{
Document::UP doc;
- f._b.startDocument("doc::10");
- f._b.startIndexField("i").
+ _b.startDocument("doc::10");
+ _b.startIndexField("f0").
addStr("我就是那个").
setAutoSpace(false).
addStr("大灰狼").
setAutoSpace(true).
endField();
- doc = f._b.endDocument();
- f._inv.invertDocument(10, *doc);
- f._invertThreads.sync();
- myPushDocument(f._inv, f._fic);
+ doc = _b.endDocument();
+ _inv.invertDocument(10, *doc);
+ _invertThreads.sync();
+ myPushDocument(_inv, _fic);
- f._pushThreads.sync();
+ _pushThreads.sync();
TermFieldMatchData tfmd;
TermFieldMatchDataArray matchData;
matchData.add(&tfmd);
+ uint32_t fieldId = _schema.getIndexFieldId("f0");
{
- uint32_t fieldId = f.getSchema().getIndexFieldId("i");
- PostingIterator itr(f._fic.findFrozen("not", fieldId),
- featureStoreRef(f._fic, fieldId),
+ PostingIterator itr(_fic.findFrozen("not", fieldId),
+ featureStoreRef(_fic, fieldId),
fieldId, matchData);
itr.initFullRange();
EXPECT_TRUE(itr.isAtEnd());
}
{
- uint32_t fieldId = f.getSchema().getIndexFieldId("i");
- PostingIterator itr(f._fic.findFrozen("我就"
+ PostingIterator itr(_fic.findFrozen("我就"
"是那个",
fieldId),
- featureStoreRef(f._fic, fieldId),
+ featureStoreRef(_fic, fieldId),
fieldId, matchData);
itr.initFullRange();
- EXPECT_EQUAL(10u, itr.getDocId());
+ EXPECT_EQ(10u, itr.getDocId());
itr.unpack(10);
- EXPECT_EQUAL("{2:0}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{2:0}", toString(tfmd.getIterator()));
EXPECT_TRUE(!itr.seek(25));
EXPECT_TRUE(itr.isAtEnd());
}
{
- uint32_t fieldId = f.getSchema().getIndexFieldId("i");
- PostingIterator itr(f._fic.findFrozen("大灰"
+ PostingIterator itr(_fic.findFrozen("大灰"
"狼",
fieldId),
- featureStoreRef(f._fic, fieldId),
+ featureStoreRef(_fic, fieldId),
fieldId, matchData);
itr.initFullRange();
- EXPECT_EQUAL(10u, itr.getDocId());
+ EXPECT_EQ(10u, itr.getDocId());
itr.unpack(10);
- EXPECT_EQUAL("{2:1}", toString(tfmd.getIterator()));
+ EXPECT_EQ("{2:1}", toString(tfmd.getIterator()));
EXPECT_TRUE(!itr.seek(25));
EXPECT_TRUE(itr.isAtEnd());
}
@@ -1441,80 +1283,74 @@ insertAndAssertTuple(const vespalib::string &word, uint32_t fieldId, uint32_t do
{
EntryRef wordRef = WrapInserter(dict, fieldId).rewind().word(word).
add(docId).flush().getWordRef();
- EXPECT_EQUAL(word,
- dict.getFieldIndex(fieldId)->getWordStore().getWord(wordRef));
+ EXPECT_EQ(word, dict.getFieldIndex(fieldId)->getWordStore().getWord(wordRef));
MyDrainRemoves(dict, fieldId).drain(docId);
}
-TEST_F("require that insert tells which word ref that was inserted", Fixture)
+TEST_F(FieldIndexCollectionTest, require_that_insert_tells_which_word_ref_that_was_inserted)
{
- FieldIndexCollection d(f.getSchema());
- insertAndAssertTuple("a", 1, 11, d);
- insertAndAssertTuple("b", 1, 11, d);
- insertAndAssertTuple("a", 2, 11, d);
-
- insertAndAssertTuple("a", 1, 22, d);
- insertAndAssertTuple("b", 2, 22, d);
- insertAndAssertTuple("c", 2, 22, d);
+ insertAndAssertTuple("a", 1, 11, fic);
+ insertAndAssertTuple("b", 1, 11, fic);
+ insertAndAssertTuple("a", 2, 11, fic);
+
+ insertAndAssertTuple("a", 1, 22, fic);
+ insertAndAssertTuple("b", 2, 22, fic);
+ insertAndAssertTuple("c", 2, 22, fic);
}
-struct RemoverFixture : public Fixture
-{
- FieldIndexCollection _fic;
+struct RemoverTest : public FieldIndexCollectionTest {
SequencedTaskExecutor _invertThreads;
SequencedTaskExecutor _pushThreads;
- RemoverFixture()
- :
- Fixture(),
- _fic(getSchema()),
- _invertThreads(2),
- _pushThreads(2)
+ RemoverTest()
+ : FieldIndexCollectionTest(),
+ _invertThreads(2),
+ _pushThreads(2)
{
}
void assertPostingLists(const vespalib::string &e1,
const vespalib::string &e2,
const vespalib::string &e3) {
- EXPECT_TRUE(assertPostingList(e1, _fic.find("a", 1)));
- EXPECT_TRUE(assertPostingList(e2, _fic.find("a", 2)));
- EXPECT_TRUE(assertPostingList(e3, _fic.find("b", 1)));
+ EXPECT_TRUE(assertPostingList(e1, fic.find("a", 1)));
+ EXPECT_TRUE(assertPostingList(e2, fic.find("a", 2)));
+ EXPECT_TRUE(assertPostingList(e3, fic.find("b", 1)));
}
void remove(uint32_t docId) {
- DocumentInverter inv(getSchema(), _invertThreads, _pushThreads);
- myremove(docId, inv, _fic, _invertThreads);
+ DocumentInverter inv(schema, _invertThreads, _pushThreads);
+ myremove(docId, inv, fic, _invertThreads);
_pushThreads.sync();
- EXPECT_FALSE(_fic.getFieldIndex(0u)->getDocumentRemover().
+ EXPECT_FALSE(fic.getFieldIndex(0u)->getDocumentRemover().
getStore().get(docId).valid());
}
};
-TEST_F("require that document remover can remove several documents", RemoverFixture)
+TEST_F(RemoverTest, require_that_document_remover_can_remove_several_documents)
{
- WrapInserter(f._fic, 1).word("a").add(11).add(13).add(15).
- word("b").add(11).add(15).flush();
- WrapInserter(f._fic, 2).word("a").add(11).add(13).flush();
- f.assertPostingLists("[11,13,15]", "[11,13]", "[11,15]");
+ WrapInserter(fic, 1).word("a").add(11).add(13).add(15).
+ word("b").add(11).add(15).flush();
+ WrapInserter(fic, 2).word("a").add(11).add(13).flush();
+ assertPostingLists("[11,13,15]", "[11,13]", "[11,15]");
- f.remove(13);
- f.assertPostingLists("[11,15]", "[11]", "[11,15]");
+ remove(13);
+ assertPostingLists("[11,15]", "[11]", "[11,15]");
- f.remove(11);
- f.assertPostingLists("[15]", "[]", "[15]");
+ remove(11);
+ assertPostingLists("[15]", "[]", "[15]");
- f.remove(15);
- f.assertPostingLists("[]", "[]", "[]");
+ remove(15);
+ assertPostingLists("[]", "[]", "[]");
}
-TEST_F("require that removal of non-existing document does not do anything", RemoverFixture)
+TEST_F(RemoverTest, require_that_removal_of_non_existing_document_does_not_do_anything)
{
- WrapInserter(f._fic, 1).word("a").add(11).word("b").add(11).flush();
- WrapInserter(f._fic, 2).word("a").add(11).flush();
- f.assertPostingLists("[11]", "[11]", "[11]");
- f.remove(13);
- f.assertPostingLists("[11]", "[11]", "[11]");
+ WrapInserter(fic, 1).word("a").add(11).word("b").add(11).flush();
+ WrapInserter(fic, 2).word("a").add(11).flush();
+ assertPostingLists("[11]", "[11]", "[11]");
+ remove(13);
+ assertPostingLists("[11]", "[11]", "[11]");
}
-} // namespace memoryindex
-} // namespace search
+}
+}
-TEST_MAIN() { TEST_RUN_ALL(); }
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp b/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp
index 9f47dede46a..1d80fec720a 100644
--- a/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp
+++ b/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp
@@ -60,34 +60,6 @@ TEST_F("require that we can store 1d bound tensor", Fixture("tensor(x[3])"))
add({{"x", 2}}, 5));
}
-TEST_F("require that we can store 1d un-bound tensor", Fixture("tensor(x[])"))
-{
- f.assertSetAndGetTensor(TensorSpec("tensor(x[3])").
- add({{"x", 0}}, 2).
- add({{"x", 1}}, 3).
- add({{"x", 2}}, 5));
-}
-
-TEST_F("require that un-bound dimension is concrete in returned 2d tensor", Fixture("tensor(x[3],y[])"))
-{
- f.assertSetAndGetTensor(TensorSpec("tensor(x[3],y[2])").
- add({{"x", 0}, {"y", 0}}, 2).
- add({{"x", 0}, {"y", 1}}, 3).
- add({{"x", 1}, {"y", 0}}, 5).
- add({{"x", 1}, {"y", 1}}, 7).
- add({{"x", 2}, {"y", 0}}, 11).
- add({{"x", 2}, {"y", 1}}, 13));
-}
-
-TEST_F("require that un-bound dimensions are concrete in returned 3d tensor", Fixture("tensor(x[],y[2],z[])"))
-{
- f.assertSetAndGetTensor(TensorSpec("tensor(x[1],y[2],z[2])").
- add({{"x", 0}, {"y", 0}, {"z", 0}}, 2).
- add({{"x", 0}, {"y", 0}, {"z", 1}}, 3).
- add({{"x", 0}, {"y", 1}, {"z", 0}}, 5).
- add({{"x", 0}, {"y", 1}, {"z", 1}}, 7));
-}
-
TEST_F("require that correct empty tensor is returned for 1d bound tensor", Fixture("tensor(x[3])"))
{
f.assertEmptyTensor(TensorSpec("tensor(x[3])").
@@ -96,21 +68,6 @@ TEST_F("require that correct empty tensor is returned for 1d bound tensor", Fixt
add({{"x", 2}}, 0));
}
-TEST_F("require that empty 2d tensor has size 1 in un-bound dimension", Fixture("tensor(x[3],y[])"))
-{
- f.assertEmptyTensor(TensorSpec("tensor(x[3],y[1])").
- add({{"x", 0}, {"y", 0}}, 0).
- add({{"x", 1}, {"y", 0}}, 0).
- add({{"x", 2}, {"y", 0}}, 0));
-}
-
-TEST_F("require that empty 3d tensor has size 1 in un-bound dimensions", Fixture("tensor(x[],y[2],z[])"))
-{
- f.assertEmptyTensor(TensorSpec("tensor(x[1],y[2],z[1])").
- add({{"x", 0}, {"y", 0}, {"z", 0}}, 0).
- add({{"x", 0}, {"y", 1}, {"z", 0}}, 0));
-}
-
void
assertArraySize(const vespalib::string &tensorType, uint32_t expArraySize) {
Fixture f(tensorType);
@@ -122,13 +79,7 @@ TEST("require that array size is calculated correctly")
TEST_DO(assertArraySize("tensor(x[1])", 32));
TEST_DO(assertArraySize("tensor(x[10])", 96));
TEST_DO(assertArraySize("tensor(x[3])", 32));
- TEST_DO(assertArraySize("tensor(x[3],y[])", 32));
- TEST_DO(assertArraySize("tensor(x[3],y[],z[])", 32));
- TEST_DO(assertArraySize("tensor(x[3],y[],z[],z2[])", 64));
TEST_DO(assertArraySize("tensor(x[10],y[10])", 800));
- TEST_DO(assertArraySize("tensor(x[])", 32));
- TEST_DO(assertArraySize("tensor(x[],x2[],x3[],x4[],x5[],x6[])", 32));
- TEST_DO(assertArraySize("tensor(x[],x2[],x3[],x4[],x5[],x6[],x7[])", 64));
}
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp
index dfcdd991b22..83776d22fee 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp
+++ b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp
@@ -133,10 +133,8 @@ vespalib::string noFeatures = "NoFeatures";
}
-template <bool bigEndian>
void
-FeatureDecodeContext<bigEndian>::
-readBytes(uint8_t *buf, size_t len)
+DecodeContext64Base::readBytes(uint8_t *buf, size_t len)
{
while (len > 0) {
// Ensure that buffer to read from isn't empty
@@ -167,9 +165,8 @@ readBytes(uint8_t *buf, size_t len)
}
-template <bool bigEndian>
uint32_t
-FeatureDecodeContext<bigEndian>::
+DecodeContext64Base::
readHeader(vespalib::GenericHeader &header, int64_t fileSize)
{
size_t hhSize = vespalib::GenericHeader::getMinSize();
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h
index 67e23aabc1e..b9166f675aa 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/compression.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h
@@ -1136,16 +1136,18 @@ public:
// File position for end of buffer minus byte address of end of buffer
// minus sizeof uint64_t. Then shifted left by 3 to represent bits.
uint64_t _fileReadBias;
+ search::ComprFileReadContext *_readContext;
DecodeContext64Base()
: search::ComprFileDecodeContext(),
_valI(nullptr),
- _valE(nullptr),
+ _valE(static_cast<const uint64_t *>(nullptr) - 1),
_realValE(nullptr),
_val(0),
_cacheInt(0),
_preRead(0),
- _fileReadBias(0)
+ _fileReadBias(0),
+ _readContext(nullptr)
{
}
@@ -1163,7 +1165,8 @@ public:
_val(val),
_cacheInt(cacheInt),
_preRead(preRead),
- _fileReadBias(0)
+ _fileReadBias(0),
+ _readContext(nullptr)
{
}
@@ -1183,6 +1186,7 @@ public:
_cacheInt = rhs._cacheInt;
_preRead = rhs._preRead;
_fileReadBias = rhs._fileReadBias;
+ _readContext = rhs._readContext;
return *this;
}
@@ -1278,6 +1282,26 @@ public:
return (val >> 1);
}
}
+
+ void setReadContext(search::ComprFileReadContext *readContext) {
+ _readContext = readContext;
+ }
+ search::ComprFileReadContext *getReadContext() const {
+ return _readContext;
+ }
+ void readComprBuffer() {
+ _readContext->readComprBuffer();
+ }
+ void readComprBufferIfNeeded() {
+ if (__builtin_expect(_valI >= _valE, false)) {
+ readComprBuffer();
+ }
+ }
+ virtual uint64_t readBits(uint32_t length) = 0;
+ virtual void align(uint32_t alignment) = 0;
+ virtual uint64_t decode_exp_golomb(int k) = 0;
+ void readBytes(uint8_t *buf, size_t len);
+ uint32_t readHeader(vespalib::GenericHeader &header, int64_t fileSize);
};
@@ -1299,7 +1323,7 @@ public:
DecodeContext64(const uint64_t *compr,
int bitOffset)
: DecodeContext64Base(compr + 1,
- nullptr,
+ static_cast<const uint64_t *>(nullptr) - 1,
nullptr,
0,
EC::bswap(*compr),
@@ -1385,10 +1409,12 @@ public:
};
void skipBits(int bits) override {
+ readComprBufferIfNeeded();
while (bits >= 64) {
_val = 0;
ReadBits(64, _val, _cacheInt, _preRead, _valI);
bits -= 64;
+ readComprBufferIfNeeded();
}
if (bits > 0) {
if (bigEndian) {
@@ -1397,6 +1423,7 @@ public:
_val >>= bits;
}
ReadBits(bits, _val, _cacheInt, _preRead, _valI);
+ readComprBufferIfNeeded();
}
}
@@ -1436,7 +1463,7 @@ public:
}
uint64_t
- readBits(uint32_t length)
+ readBits(uint32_t length) override
{
uint64_t res;
if (length < 64) {
@@ -1452,20 +1479,32 @@ public:
_val = 0;
}
UC64_READBITS(_val, _valI, _preRead, _cacheInt, EC);
+ readComprBufferIfNeeded();
return res;
}
+ uint64_t decode_exp_golomb(int k) override {
+ uint32_t length;
+ uint64_t val64;
+ UC64_DECODEEXPGOLOMB(_val, _valI, _preRead, _cacheInt, k, EC);
+ readComprBufferIfNeeded();
+ return val64;
+ }
+
void
- align(uint32_t alignment)
+ align(uint32_t alignment) override
{
+ readComprBufferIfNeeded();
uint64_t pad = (- getReadOffset()) & (alignment - 1);
while (pad > 64) {
(void) readBits(64);
pad -= 64;
+ readComprBufferIfNeeded();
}
if (pad > 0) {
(void) readBits(pad);
}
+ readComprBufferIfNeeded();
}
/*
@@ -1489,7 +1528,6 @@ template <bool bigEndian>
class FeatureDecodeContext : public DecodeContext64<bigEndian>
{
public:
- search::ComprFileReadContext *_readContext;
typedef DecodeContext64<bigEndian> ParentClass;
typedef index::DocIdAndFeatures DocIdAndFeatures;
typedef index::PostingListParams PostingListParams;
@@ -1504,68 +1542,29 @@ public:
using ParentClass::getBitOffset;
using ParentClass::readBits;
using ParentClass::ReadBits;
+ using ParentClass::readComprBuffer;
+ using ParentClass::readComprBufferIfNeeded;
+ using ParentClass::readHeader;
+ using ParentClass::readBytes;
FeatureDecodeContext()
- : ParentClass(),
- _readContext(nullptr)
+ : ParentClass()
{
}
FeatureDecodeContext(const uint64_t *compr,
int bitOffset)
- : ParentClass(compr, bitOffset),
- _readContext(nullptr)
+ : ParentClass(compr, bitOffset)
{
}
FeatureDecodeContext(const uint64_t *compr,
int bitOffset,
uint64_t bitLength)
- : ParentClass(compr, bitOffset, bitLength),
- _readContext(nullptr)
- {
- }
-
- FeatureDecodeContext &
- operator=(const FeatureDecodeContext &rhs)
- {
- ParentClass::operator=(rhs);
- _readContext = rhs._readContext;
- return *this;
- }
-
- void
- setReadContext(search::ComprFileReadContext *readContext)
- {
- _readContext = readContext;
- }
-
- search::ComprFileReadContext *
- getReadContext() const
- {
- return _readContext;
- }
-
- void
- readComprBuffer()
- {
- _readContext->readComprBuffer();
- }
-
- void
- readComprBufferIfNeeded()
+ : ParentClass(compr, bitOffset, bitLength)
{
- if (__builtin_expect(_valI >= _valE, false)) {
- readComprBuffer();
- }
}
- void
- readBytes(uint8_t *buf, size_t len);
-
- virtual uint32_t
- readHeader(vespalib::GenericHeader &header, int64_t fileSize);
-
virtual void
readHeader(const vespalib::GenericHeader &header,
const vespalib::string &prefix);
@@ -1594,41 +1593,6 @@ public:
*/
virtual void
getParams(PostingListParams &params) const;
-
- void skipBits(int bits) override {
- readComprBufferIfNeeded();
- while (bits >= 64) {
- _val = 0;
- ReadBits(64, _val, _cacheInt, _preRead, _valI);
- bits -= 64;
- readComprBufferIfNeeded();
- }
- if (bits > 0) {
- if (bigEndian) {
- _val <<= bits;
- } else {
- _val >>= bits;
- }
- ReadBits(bits, _val, _cacheInt, _preRead, _valI);
- readComprBufferIfNeeded();
- }
- }
-
- void
- align(uint32_t alignment)
- {
- readComprBufferIfNeeded();
- uint64_t pad = (- getReadOffset()) & (alignment - 1);
- while (pad > 64) {
- (void) readBits(64);
- pad -= 64;
- readComprBufferIfNeeded();
- }
- if (pad > 0) {
- (void) readBits(pad);
- }
- readComprBufferIfNeeded();
- }
};
typedef FeatureDecodeContext<true> FeatureDecodeContextBE;
diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp
index d4f663f32cc..9f5d3cf751f 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp
+++ b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp
@@ -12,8 +12,6 @@
LOG_SETUP(".posocccompression");
using search::index::DocIdAndFeatures;
-using search::index::WordDocElementFeatures;
-using search::index::WordDocElementWordPosFeatures;
using search::index::PostingListParams;
using search::index::SchemaUtil;
using search::index::Schema;
@@ -343,8 +341,8 @@ readFeatures(search::index::DocIdAndFeatures &features)
uint64_t val64;
const uint64_t *valE = _valE;
- features.clearFeatures((oPreRead == 0) ? 0 : 64 - oPreRead);
- features.setRaw(true);
+ features.clear_features((oPreRead == 0) ? 0 : 64 - oPreRead);
+ features.set_has_raw_data(true);
const uint64_t *rawFeatures =
(oPreRead == 0) ? (oCompr - 1) : (oCompr - 2);
uint64_t rawFeaturesStartBitPos =
@@ -373,7 +371,7 @@ readFeatures(search::index::DocIdAndFeatures &features)
}
if (__builtin_expect(oCompr >= valE, false)) {
while (rawFeatures < oCompr) {
- features._blob.push_back(*rawFeatures);
+ features.blob().push_back(*rawFeatures);
++rawFeatures;
}
UC64_DECODECONTEXT_STORE(o, _);
@@ -394,7 +392,7 @@ readFeatures(search::index::DocIdAndFeatures &features)
do {
if (__builtin_expect(oCompr >= valE, false)) {
while (rawFeatures < oCompr) {
- features._blob.push_back(*rawFeatures);
+ features.blob().push_back(*rawFeatures);
++rawFeatures;
}
UC64_DECODECONTEXT_STORE(o, _);
@@ -410,7 +408,7 @@ readFeatures(search::index::DocIdAndFeatures &features)
for (uint32_t pos = 1; pos < numPositions; ++pos) {
if (__builtin_expect(oCompr >= valE, false)) {
while (rawFeatures < oCompr) {
- features._blob.push_back(*rawFeatures);
+ features.blob().push_back(*rawFeatures);
++rawFeatures;
}
UC64_DECODECONTEXT_STORE(o, _);
@@ -429,9 +427,9 @@ readFeatures(search::index::DocIdAndFeatures &features)
_fileReadBias +
(reinterpret_cast<unsigned long>(oCompr) << 3) -
oPreRead;
- features._bitLength = rawFeaturesEndBitPos - rawFeaturesStartBitPos;
+ features.set_bit_length(rawFeaturesEndBitPos - rawFeaturesStartBitPos);
while (rawFeatures < oCompr) {
- features._blob.push_back(*rawFeatures);
+ features.blob().push_back(*rawFeatures);
++rawFeatures;
}
if (__builtin_expect(oCompr >= valE, false)) {
@@ -450,8 +448,8 @@ readFeatures(search::index::DocIdAndFeatures &features)
uint64_t val64;
const uint64_t *valE = _valE;
- features.clearFeatures();
- features.setRaw(false);
+ features.clear_features();
+ features.set_has_raw_data(false);
const PosOccFieldParams &fieldParams = _fieldsParams->getFieldParams()[0];
uint32_t numElements = 1;
@@ -470,14 +468,13 @@ readFeatures(search::index::DocIdAndFeatures &features)
EC);
elementId += static_cast<uint32_t>(val64);
}
- features._elements.
- push_back(WordDocElementFeatures(elementId));
+ features.elements().emplace_back(elementId);
if (fieldParams._hasElementWeights) {
UC64_DECODEEXPGOLOMB_SMALL_NS(o,
K_VALUE_POSOCC_ELEMENTWEIGHT,
EC);
int32_t elementWeight = this->convertToSigned(val64);
- features._elements.back().setWeight(elementWeight);
+ features.elements().back().setWeight(elementWeight);
}
if (__builtin_expect(oCompr >= valE, false)) {
UC64_DECODECONTEXT_STORE(o, _);
@@ -489,7 +486,7 @@ readFeatures(search::index::DocIdAndFeatures &features)
K_VALUE_POSOCC_ELEMENTLEN,
EC);
uint32_t elementLen = static_cast<uint32_t>(val64) + 1;
- features._elements.back().setElementLen(elementLen);
+ features.elements().back().setElementLen(elementLen);
UC64_DECODEEXPGOLOMB_SMALL_NS(o,
K_VALUE_POSOCC_NUMPOSITIONS,
EC);
@@ -507,9 +504,8 @@ readFeatures(search::index::DocIdAndFeatures &features)
K_VALUE_POSOCC_FIRST_WORDPOS,
EC);
wordPos = static_cast<uint32_t>(val64);
- features._elements.back().incNumOccs();
- features._wordPositions.push_back(
- WordDocElementWordPosFeatures(wordPos));
+ features.elements().back().incNumOccs();
+ features.word_positions().emplace_back(wordPos);
} while (0);
for (uint32_t pos = 1; pos < numPositions; ++pos) {
if (__builtin_expect(oCompr >= valE, false)) {
@@ -522,9 +518,8 @@ readFeatures(search::index::DocIdAndFeatures &features)
K_VALUE_POSOCC_DELTA_WORDPOS,
EC);
wordPos += 1 + static_cast<uint32_t>(val64);
- features._elements.back().incNumOccs();
- features._wordPositions.push_back(
- WordDocElementWordPosFeatures(wordPos));
+ features.elements().back().incNumOccs();
+ features.word_positions().emplace_back(wordPos);
}
}
UC64_DECODECONTEXT_STORE(o, _);
@@ -732,23 +727,19 @@ void
EG2PosOccEncodeContext<bigEndian>::
writeFeatures(const search::index::DocIdAndFeatures &features)
{
- if (features.getRaw()) {
- writeBits(&features._blob[0],
- features._bitOffset, features._bitLength);
+ if (features.has_raw_data()) {
+ writeBits(features.blob().data(),
+ features.bit_offset(), features.bit_length());
return;
}
- typedef WordDocElementFeatures Elements;
- typedef WordDocElementWordPosFeatures Positions;
- std::vector<Elements>::const_iterator element = features._elements.begin();
-
- std::vector<Positions>::const_iterator position =
- features._wordPositions.begin();
+ auto element = features.elements().begin();
+ auto position = features.word_positions().begin();
const PosOccFieldParams &fieldParams =
_fieldsParams->getFieldParams()[0];
- uint32_t numElements = features._elements.size();
+ uint32_t numElements = features.elements().size();
if (fieldParams._hasElements) {
assert(numElements > 0u);
encodeExpGolomb(numElements - 1,
@@ -854,8 +845,8 @@ readFeatures(search::index::DocIdAndFeatures &features)
uint64_t val64;
const uint64_t *valE = _valE;
- features.clearFeatures((oPreRead == 0) ? 0 : 64 - oPreRead);
- features.setRaw(true);
+ features.clear_features((oPreRead == 0) ? 0 : 64 - oPreRead);
+ features.set_has_raw_data(true);
const uint64_t *rawFeatures =
(oPreRead == 0) ? (oCompr - 1) : (oCompr - 2);
uint64_t rawFeaturesStartBitPos =
@@ -885,7 +876,7 @@ readFeatures(search::index::DocIdAndFeatures &features)
}
if (__builtin_expect(oCompr >= valE, false)) {
while (rawFeatures < oCompr) {
- features._blob.push_back(*rawFeatures);
+ features.blob().push_back(*rawFeatures);
++rawFeatures;
}
UC64_DECODECONTEXT_STORE(o, _);
@@ -910,7 +901,7 @@ readFeatures(search::index::DocIdAndFeatures &features)
for (uint32_t pos = 0; pos < numPositions; ++pos) {
if (__builtin_expect(oCompr >= valE, false)) {
while (rawFeatures < oCompr) {
- features._blob.push_back(*rawFeatures);
+ features.blob().push_back(*rawFeatures);
++rawFeatures;
}
UC64_DECODECONTEXT_STORE(o, _);
@@ -929,9 +920,9 @@ readFeatures(search::index::DocIdAndFeatures &features)
_fileReadBias +
(reinterpret_cast<unsigned long>(oCompr) << 3) -
oPreRead;
- features._bitLength = rawFeaturesEndBitPos - rawFeaturesStartBitPos;
+ features.set_bit_length(rawFeaturesEndBitPos - rawFeaturesStartBitPos);
while (rawFeatures < oCompr) {
- features._blob.push_back(*rawFeatures);
+ features.blob().push_back(*rawFeatures);
++rawFeatures;
}
if (__builtin_expect(oCompr >= valE, false)) {
@@ -950,8 +941,8 @@ readFeatures(search::index::DocIdAndFeatures &features)
uint64_t val64;
const uint64_t *valE = _valE;
- features.clearFeatures();
- features.setRaw(false);
+ features.clear_features();
+ features.set_has_raw_data(false);
const PosOccFieldParams &fieldParams =
_fieldsParams->getFieldParams()[0];
@@ -972,14 +963,13 @@ readFeatures(search::index::DocIdAndFeatures &features)
EC);
elementId += static_cast<uint32_t>(val64);
}
- features._elements.
- push_back(WordDocElementFeatures(elementId));
+ features.elements().emplace_back(elementId);
if (fieldParams._hasElementWeights) {
UC64_DECODEEXPGOLOMB_SMALL_NS(o,
K_VALUE_POSOCC_ELEMENTWEIGHT,
EC);
int32_t elementWeight = this->convertToSigned(val64);
- features._elements.back().setWeight(elementWeight);
+ features.elements().back().setWeight(elementWeight);
}
if (__builtin_expect(oCompr >= valE, false)) {
UC64_DECODECONTEXT_STORE(o, _);
@@ -991,13 +981,13 @@ readFeatures(search::index::DocIdAndFeatures &features)
elementLenK,
EC);
uint32_t elementLen = static_cast<uint32_t>(val64) + 1;
- features._elements.back().setElementLen(elementLen);
+ features.elements().back().setElementLen(elementLen);
UC64_DECODEEXPGOLOMB_SMALL_NS(o,
K_VALUE_POSOCC_NUMPOSITIONS,
EC);
uint32_t numPositions = static_cast<uint32_t>(val64) + 1;
- features._bitLength = numPositions * 64;
+ features.set_bit_length(numPositions * 64);
uint32_t wordPosK = EGPosOccEncodeContext<bigEndian>::
calcWordPosK(numPositions, elementLen);
@@ -1014,9 +1004,8 @@ readFeatures(search::index::DocIdAndFeatures &features)
wordPosK,
EC);
wordPos += 1 + static_cast<uint32_t>(val64);
- features._elements.back().incNumOccs();
- features._wordPositions.push_back(
- WordDocElementWordPosFeatures(wordPos));
+ features.elements().back().incNumOccs();
+ features.word_positions().emplace_back(wordPos);
}
}
UC64_DECODECONTEXT_STORE(o, _);
@@ -1227,23 +1216,19 @@ void
EGPosOccEncodeContext<bigEndian>::
writeFeatures(const search::index::DocIdAndFeatures &features)
{
- if (features.getRaw()) {
- writeBits(&features._blob[0],
- features._bitOffset, features._bitLength);
+ if (features.has_raw_data()) {
+ writeBits(features.blob().data(),
+ features.bit_offset(), features.bit_length());
return;
}
- typedef WordDocElementFeatures Elements;
- typedef WordDocElementWordPosFeatures Positions;
-
- std::vector<Elements>::const_iterator element = features._elements.begin();
- std::vector<Positions>::const_iterator position =
- features._wordPositions.begin();
+ auto element = features.elements().begin();
+ auto position = features.word_positions().begin();
const PosOccFieldParams &fieldParams =
_fieldsParams->getFieldParams()[0];
uint32_t elementLenK = calcElementLenK(fieldParams._avgElemLen);
- uint32_t numElements = features._elements.size();
+ uint32_t numElements = features.elements().size();
if (fieldParams._hasElements) {
assert(numElements > 0u);
encodeExpGolomb(numElements - 1,
diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h
index a5d46045ec5..d500dacd7d4 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h
@@ -48,9 +48,9 @@ public:
assert(elementLen == _elements.back().getElementLen());
}
assert(_elements.back().getNumOccs() == 0 ||
- wordPos > _wordPositions.back().getWordPos());
+ wordPos > _word_positions.back().getWordPos());
_elements.back().incNumOccs();
- _wordPositions.emplace_back(wordPos);
+ _word_positions.emplace_back(wordPos);
}
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
index 104994ad038..2fea4f2bab7 100644
--- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
@@ -19,6 +19,7 @@ vespa_add_library(searchlib_diskindex OBJECT
pagedict4randread.cpp
wordnummapper.cpp
zc4_posting_header.cpp
+ zc4_posting_reader.cpp
zc4_posting_writer.cpp
zc4_posting_writer_base.cpp
zcbuf.cpp
diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
index d71ddc2c2d6..64a54187254 100644
--- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
@@ -39,7 +39,9 @@ DiskIndex::Key::Key() = default;
DiskIndex::Key::Key(const IndexList & indexes, vespalib::stringref word) :
_word(word),
_indexes(indexes)
-{ }
+{
+}
+
DiskIndex::Key::~Key() = default;
DiskIndex::DiskIndex(const vespalib::string &indexDir, size_t cacheSize)
@@ -73,7 +75,6 @@ DiskIndex::loadSchema()
return true;
}
-
bool
DiskIndex::openDictionaries(const TuneFileSearch &tuneFileSearch)
{
@@ -91,7 +92,6 @@ DiskIndex::openDictionaries(const TuneFileSearch &tuneFileSearch)
return true;
}
-
bool
DiskIndex::openField(const vespalib::string &fieldDir,
const TuneFileSearch &tuneFileSearch)
@@ -147,7 +147,6 @@ DiskIndex::openField(const vespalib::string &fieldDir,
return true;
}
-
bool
DiskIndex::setup(const TuneFileSearch &tuneFileSearch)
{
@@ -165,7 +164,6 @@ DiskIndex::setup(const TuneFileSearch &tuneFileSearch)
return true;
}
-
bool
DiskIndex::setup(const TuneFileSearch &tuneFileSearch,
const DiskIndex &old)
@@ -315,7 +313,6 @@ DiskIndex::readPostingList(const LookupResult &lookupRes) const
return handle;
}
-
BitVector::UP
DiskIndex::readBitVector(const LookupResult &lookupRes) const
{
@@ -327,7 +324,6 @@ DiskIndex::readBitVector(const LookupResult &lookupRes) const
return dict->lookup(lookupRes.wordNum);
}
-
void
DiskIndex::calculateSize()
{
@@ -335,19 +331,18 @@ DiskIndex::calculateSize()
_size = dirt.GetTreeSize();
}
-
namespace {
DiskIndex::LookupResult _G_nothing;
-class LookupCache
-{
+class LookupCache {
public:
LookupCache(DiskIndex & diskIndex, const std::vector<uint32_t> & fieldIds) :
_diskIndex(diskIndex),
_fieldIds(fieldIds),
_cache()
- { }
+ {
+ }
const DiskIndex::LookupResult &
lookup(const vespalib::string & word, uint32_t fieldId) {
Cache::const_iterator it = _cache.find(word);
@@ -363,14 +358,14 @@ public:
return _G_nothing;
}
private:
+
typedef vespalib::hash_map<vespalib::string, DiskIndex::LookupResultVector> Cache;
DiskIndex & _diskIndex;
const std::vector<uint32_t> & _fieldIds;
Cache _cache;
};
-class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper
-{
+class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper {
private:
LookupCache &_cache;
DiskIndex &_diskIndex;
@@ -391,8 +386,7 @@ public:
}
template <class TermNode>
- void visitTerm(TermNode &n)
- {
+ void visitTerm(TermNode &n) {
const vespalib::string termStr = termAsString(n);
const DiskIndex::LookupResult & lookupRes = _cache.lookup(termStr, _fieldId);
if (lookupRes.valid()) {
@@ -418,7 +412,6 @@ public:
void visit(PredicateQuery &) override { }
};
-
Blueprint::UP
createBlueprintHelper(LookupCache & cache, DiskIndex & diskIndex, const IRequestContext & requestContext,
const FieldSpec &field, uint32_t fieldId, const Node &term)
@@ -442,7 +435,6 @@ DiskIndex::createBlueprint(const IRequestContext & requestContext, const FieldSp
return createBlueprintHelper(cache, *this, requestContext, field, fieldIds[0], term);
}
-
Blueprint::UP
DiskIndex::createBlueprint(const IRequestContext & requestContext, const FieldSpecList &fields, const Node &term)
{
diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.h b/searchlib/src/vespa/searchlib/diskindex/diskindex.h
index 4bef53a3030..d83b2f56d7c 100644
--- a/searchlib/src/vespa/searchlib/diskindex/diskindex.h
+++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.h
@@ -12,14 +12,13 @@
namespace search::diskindex {
/**
- * This class represents a disk index with a common dictionary, and
- * posting list files and bit vector files for each field.
- * Parts of the disk dictionary and all bit vector
- * dictionaries are loaded into memory during setup. All other files
- * are just opened, ready for later access.
- **/
-class DiskIndex : public queryeval::Searchable
-{
+ * This class represents a disk index that contains a set of field indexes that are independent of each other.
+ *
+ * Each field index has a dictionary, posting list files and bit vector files.
+ * Parts of the disk dictionary and all bit vector dictionaries are loaded into memory during setup.
+ * All other files are just opened, ready for later access.
+ */
+class DiskIndex : public queryeval::Searchable {
public:
/**
* The result after performing a disk dictionary lookup.
@@ -60,11 +59,12 @@ public:
vespalib::string _word;
IndexList _indexes;
};
+
private:
- typedef index::PostingListFileRandRead DiskPostingFile;
- typedef Zc4PosOccRandRead DiskPostingFileReal;
- typedef ZcPosOccRandRead DiskPostingFileDynamicKReal;
- typedef vespalib::cache<vespalib::CacheParam<vespalib::LruParam<Key, LookupResultVector>, DiskIndex>> Cache;
+ using DiskPostingFile = index::PostingListFileRandRead;
+ using DiskPostingFileReal = Zc4PosOccRandRead;
+ using DiskPostingFileDynamicKReal = ZcPosOccRandRead;
+ using Cache = vespalib::cache<vespalib::CacheParam<vespalib::LruParam<Key, LookupResultVector>, DiskIndex>>;
vespalib::string _indexDir;
size_t _cacheSize;
@@ -83,11 +83,11 @@ private:
public:
/**
- * Create a view of the disk index located in the given directory
- * described by the given schema.
+ * Create a view of the disk index located in the given directory.
*
* @param indexDir the directory where the disk index is located.
- **/
+ * @param cacheSize optional size (in bytes) of the disk dictionary lookup cache.
+ */
DiskIndex(const vespalib::string &indexDir, size_t cacheSize=0);
~DiskIndex();
@@ -95,29 +95,27 @@ public:
* Setup this instance by opening and loading relevant index files.
*
* @return true if this instance was successfully setup.
- **/
+ */
bool setup(const TuneFileSearch &tuneFileSearch);
bool setup(const TuneFileSearch &tuneFileSearch, const DiskIndex &old);
/**
- * Perform a dictionary lookup for the given word in the given
- * field.
+ * Perform a dictionary lookup for the given word in the given field.
*
- * @param indexId the id of the field to
- * perform lookup for.
+ * @param indexId the id of the field to perform lookup for.
* @param word the word to lookup.
* @return the lookup result or nullptr if the word is not found.
- **/
+ */
LookupResult::UP lookup(uint32_t indexId, vespalib::stringref word);
- LookupResultVector lookup(const std::vector<uint32_t> & indexes, vespalib::stringref word);
+ LookupResultVector lookup(const std::vector<uint32_t> & indexes, vespalib::stringref word);
/**
* Read the posting list corresponding to the given lookup result.
*
* @param lookupRes the result of the previous dictionary lookup.
* @return a handle for the posting list in memory.
- **/
+ */
index::PostingListHandle::UP readPostingList(const LookupResult &lookupRes) const;
/**
@@ -126,22 +124,19 @@ public:
* @param lookupRes the result of the previous dictionary lookup.
* @return the bit vector or nullptr if no bit vector exists for the
* word in the lookup result.
- **/
+ */
BitVector::UP readBitVector(const LookupResult &lookupRes) const;
- queryeval::Blueprint::UP
- createBlueprint(const queryeval::IRequestContext & requestContext,
- const queryeval::FieldSpec &field,
- const query::Node &term) override;
+ queryeval::Blueprint::UP createBlueprint(const queryeval::IRequestContext & requestContext,
+ const queryeval::FieldSpec &field,
+ const query::Node &term) override;
- queryeval::Blueprint::UP
- createBlueprint(const queryeval::IRequestContext & requestContext,
- const queryeval::FieldSpecList &fields,
- const query::Node &term) override;
+ queryeval::Blueprint::UP createBlueprint(const queryeval::IRequestContext & requestContext,
+ const queryeval::FieldSpecList &fields,
+ const query::Node &term) override;
/**
* Get the size on disk of this index.
- * @return the size of the index.
*/
uint64_t getSize() const { return _size; }
diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
index f6e4da945e0..34e64a9b558 100644
--- a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
@@ -69,7 +69,7 @@ makePosOccWrite(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- ZcPosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(true) &&
fileHeader.getFormats()[1] ==
ZcPosOccSeqRead::getSubIdentifier()) {
dynamicK = true;
@@ -77,7 +77,7 @@ makePosOccWrite(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- Zc4PosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(false) &&
fileHeader.getFormats()[1] ==
Zc4PosOccSeqRead::getSubIdentifier()) {
dynamicK = false;
@@ -115,7 +115,7 @@ makePosOccRead(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- ZcPosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(true) &&
fileHeader.getFormats()[1] ==
ZcPosOccSeqRead::getSubIdentifier()) {
dynamicK = true;
@@ -123,7 +123,7 @@ makePosOccRead(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- Zc4PosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(false) &&
fileHeader.getFormats()[1] ==
Zc4PosOccSeqRead::getSubIdentifier()) {
dynamicK = false;
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
index 96b106a15da..a41f0412294 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
@@ -63,7 +63,7 @@ void
FieldReader::readDocIdAndFeatures()
{
_oldposoccfile->readDocIdAndFeatures(_docIdAndFeatures);
- _docIdAndFeatures._docId = _docIdMapper.mapDocId(_docIdAndFeatures._docId);
+ _docIdAndFeatures.set_doc_id(_docIdMapper.mapDocId(_docIdAndFeatures.doc_id()));
}
@@ -75,13 +75,13 @@ FieldReader::read()
readCounts();
if (_wordNum == noWordNumHigh()) {
assert(_residue == 0);
- _docIdAndFeatures._docId = NO_DOC;
+ _docIdAndFeatures.set_doc_id(NO_DOC);
return;
}
}
--_residue;
readDocIdAndFeatures();
- if (_docIdAndFeatures._docId != NO_DOC) {
+ if (_docIdAndFeatures.doc_id() != NO_DOC) {
return;
}
}
@@ -267,26 +267,26 @@ FieldReaderStripInfo::read()
if (_wordNum == noWordNumHigh()) {
return;
}
- assert(!features.getRaw());
- uint32_t numElements = features._elements.size();
+ assert(!features.has_raw_data());
+ uint32_t numElements = features.elements().size();
assert(numElements > 0);
std::vector<Element>::iterator element =
- features._elements.begin();
+ features.elements().begin();
if (_hasElements) {
if (!_hasElementWeights) {
for (uint32_t elementDone = 0; elementDone < numElements; ++elementDone, ++element) {
element->setWeight(1);
}
- assert(element == features._elements.end());
+ assert(element == features.elements().end());
}
} else {
if (element->getElementId() != 0) {
continue; // Drop this entry, try to read new entry
}
element->setWeight(1);
- features._wordPositions.resize(element->getNumOccs());
+ features.word_positions().resize(element->getNumOccs());
if (numElements > 1) {
- features._elements.resize(1);
+ features.elements().resize(1);
}
}
break;
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
index a73ffa149a9..50748d037c0 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
@@ -85,7 +85,7 @@ public:
bool operator<(const FieldReader &rhs) const {
return _wordNum < rhs._wordNum ||
(_wordNum == rhs._wordNum &&
- _docIdAndFeatures._docId < rhs._docIdAndFeatures._docId);
+ _docIdAndFeatures.doc_id() < rhs._docIdAndFeatures.doc_id());
}
virtual void setup(const WordNumMapping &wordNumMapping, const DocIdMapping &docIdMapping);
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp
index 6454c0851a7..8c2b33a933e 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp
@@ -98,7 +98,6 @@ FieldWriter::open(const vespalib::string &prefix,
return true;
}
-
void
FieldWriter::flush()
{
@@ -120,7 +119,6 @@ FieldWriter::flush()
}
}
-
void
FieldWriter::newWord(uint64_t wordNum, vespalib::stringref word)
{
@@ -134,14 +132,12 @@ FieldWriter::newWord(uint64_t wordNum, vespalib::stringref word)
_prevDocId = 0;
}
-
void
FieldWriter::newWord(vespalib::stringref word)
{
newWord(_wordNum + 1, word);
}
-
bool
FieldWriter::close()
{
@@ -183,7 +179,6 @@ FieldWriter::getFeatureParams(PostingListParams &params)
_posoccfile->getFeatureParams(params);
}
-
static const char *termOccNames[] =
{
"boolocc.bdat",
@@ -199,7 +194,6 @@ static const char *termOccNames[] =
nullptr,
};
-
void
FieldWriter::remove(const vespalib::string &prefix)
{
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h
index 9a6edf90243..e5aa9788071 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h
@@ -10,15 +10,13 @@
namespace search::diskindex {
-/*
- * FieldWriter is used to write a dictionary and posting list file
- * together.
+/**
+ * FieldWriter is used to write a dictionary and posting list file together.
*
* It is used by the fusion code to write the merged output for a field,
* and by the memory index dump code to write a field to disk.
*/
-class FieldWriter
-{
+class FieldWriter {
private:
uint64_t _wordNum;
uint32_t _prevDocId;
@@ -28,14 +26,15 @@ public:
using DictionaryFileSeqWrite = index::DictionaryFileSeqWrite;
- typedef index::PostingListFileSeqWrite PostingListFileSeqWrite;
- typedef index::DocIdAndFeatures DocIdAndFeatures;
- typedef index::Schema Schema;
- typedef index::PostingListCounts PostingListCounts;
- typedef index::PostingListParams PostingListParams;
+ using PostingListFileSeqWrite = index::PostingListFileSeqWrite;
+ using DocIdAndFeatures = index::DocIdAndFeatures;
+ using Schema = index::Schema;
+ using PostingListCounts = index::PostingListCounts;
+ using PostingListParams = index::PostingListParams;
std::unique_ptr<DictionaryFileSeqWrite> _dictFile;
std::unique_ptr<PostingListFileSeqWrite> _posoccfile;
+
private:
BitVectorCandidate _bvc;
BitVectorFileWrite _bmapfile;
@@ -59,11 +58,11 @@ public:
void newWord(vespalib::stringref word);
void add(const DocIdAndFeatures &features) {
- assert(features._docId < _docIdLimit);
- assert(features._docId > _prevDocId);
+ assert(features.doc_id() < _docIdLimit);
+ assert(features.doc_id() > _prevDocId);
_posoccfile->writeDocIdAndFeatures(features);
- _bvc.add(features._docId);
- _prevDocId = features._docId;
+ _bvc.add(features.doc_id());
+ _prevDocId = features.doc_id();
}
uint64_t getSparseWordNum() const { return _wordNum; }
diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp
index a3c37cb91f6..42f6971e53f 100644
--- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp
@@ -26,252 +26,65 @@ using index::WordDocElementFeatures;
using index::schema::DataType;
using vespalib::getLastErrorString;
-uint32_t
-noWordPos()
-{
- return std::numeric_limits<uint32_t>::max();
-}
-
+class FileHandle {
+private:
+ std::shared_ptr<FieldWriter> _fieldWriter;
-class FileHandle
-{
public:
- FieldWriter *_fieldWriter;
- DocIdAndFeatures _docIdAndFeatures;
-
FileHandle();
-
~FileHandle();
- void
- open(vespalib::stringref dir,
- const SchemaUtil::IndexIterator &index,
- uint32_t docIdLimit, uint64_t numWordIds,
- const TuneFileSeqWrite &tuneFileWrite,
- const FileHeaderContext &fileHeaderContext);
+ void open(vespalib::stringref dir,
+ const SchemaUtil::IndexIterator &index,
+ uint32_t docIdLimit, uint64_t numWordIds,
+ const TuneFileSeqWrite &tuneFileWrite,
+ const FileHeaderContext &fileHeaderContext);
- void
- close();
-};
+ void close();
+ FieldWriter* writer() { return _fieldWriter.get(); }
+};
}
-class IndexBuilder::FieldHandle
-{
+class IndexBuilder::FieldHandle {
+private:
+ bool _valid;
+ const Schema *_schema; // Ptr to allow being std::vector member
+ uint32_t _fieldId;
+ IndexBuilder *_builder; // Ptr to allow being std::vector member
+ FileHandle _file;
+
public:
FieldHandle(const Schema &schema,
uint32_t fieldId,
- IndexBuilder *ib);
+ IndexBuilder *builder);
~FieldHandle();
- static uint32_t
- noDocRef()
- {
- return std::numeric_limits<uint32_t>::max();
- }
-
- static uint32_t
- noElRef()
- {
- return std::numeric_limits<uint32_t>::max();
- }
-
- class FHWordDocFieldFeatures
- {
- public:
- uint32_t _docId;
- uint32_t _numElements;
-
- FHWordDocFieldFeatures(uint32_t docId)
- : _docId(docId),
- _numElements(0u)
- {
- }
-
- uint32_t
- getDocId() const
- {
- return _docId;
- }
-
- uint32_t
- getNumElements() const
- {
- return _numElements;
- }
+ void new_word(vespalib::stringref word);
+ void add_document(const index::DocIdAndFeatures &features);
- void
- incNumElements()
- {
- ++_numElements;
- }
- };
-
- class FHWordDocElementFeatures
- : public WordDocElementFeatures
- {
- public:
- uint32_t _docRef;
-
- FHWordDocElementFeatures(uint32_t elementId,
- int32_t weight,
- uint32_t elementLen,
- uint32_t docRef)
- : WordDocElementFeatures(elementId),
- _docRef(docRef)
- {
- setWeight(weight);
- setElementLen(elementLen);
- }
- };
-
- class FHWordDocElementWordPosFeatures
- : public WordDocElementWordPosFeatures
- {
- public:
- uint32_t _elementRef;
-
- FHWordDocElementWordPosFeatures(
- const WordDocElementWordPosFeatures &features,
- uint32_t elementRef)
- : WordDocElementWordPosFeatures(features),
- _elementRef(elementRef)
- {
- }
- };
-
- typedef vespalib::Array<FHWordDocFieldFeatures> FHWordDocFieldFeaturesVector;
- typedef vespalib::Array<FHWordDocElementFeatures> FHWordDocElementFeaturesVector;
- typedef vespalib::Array<FHWordDocElementWordPosFeatures> FHWordDocElementWordPosFeaturesVector;
-
- FHWordDocFieldFeaturesVector _wdff;
- FHWordDocElementFeaturesVector _wdfef;
- FHWordDocElementWordPosFeaturesVector _wdfepf;
-
- uint32_t _docRef;
- uint32_t _elRef;
- bool _valid;
- const Schema *_schema; // Ptr to allow being std::vector member
- uint32_t _fieldId;
- IndexBuilder *_ib; // Ptr to allow being std::vector member
-
- uint32_t _lowestOKElementId;
- uint32_t _lowestOKWordPos;
-
- FileHandle _files;
-
- void
- startWord(vespalib::stringref word);
-
- void
- endWord();
-
- void
- startDocument(uint32_t docId);
-
- void
- endDocument();
-
- void
- startElement(uint32_t elementId,
- int32_t weight,
- uint32_t elementLen);
-
- void
- endElement();
-
- void
- addOcc(const WordDocElementWordPosFeatures &features);
-
- void
- setValid()
- {
- _valid = true;
- }
-
- bool
- getValid() const
- {
- return _valid;
- }
+ const Schema::IndexField &getSchemaField();
+ const vespalib::string &getName();
+ vespalib::string getDir();
+ void open(uint32_t docIdLimit, uint64_t numWordIds,
+ const TuneFileSeqWrite &tuneFileWrite,
+ const FileHeaderContext &fileHeaderContext);
+ void close();
- const Schema::IndexField &
- getSchemaField();
-
- const vespalib::string &
- getName();
-
- vespalib::string
- getDir();
-
- void
- open(uint32_t docIdLimit, uint64_t numWordIds,
- const TuneFileSeqWrite &tuneFileWrite,
- const FileHeaderContext &fileHeaderContext);
-
- void
- close();
-
- uint32_t
- getIndexId() const
- {
- return _fieldId;
- }
+ void setValid() { _valid = true; }
+ bool getValid() const { return _valid; }
+ uint32_t getIndexId() const { return _fieldId; }
};
-namespace {
-
-class SingleIterator
-{
-public:
- typedef IndexBuilder::FieldHandle FH;
- FH::FHWordDocFieldFeaturesVector::const_iterator _dFeatures;
- FH::FHWordDocFieldFeaturesVector::const_iterator _dFeaturesE;
- FH::FHWordDocElementFeaturesVector::const_iterator _elFeatures;
- FH::FHWordDocElementWordPosFeaturesVector::const_iterator _pFeatures;
- uint32_t _docId;
- uint32_t _localFieldId;
-
- SingleIterator(FH &fieldHandle, uint32_t localFieldId);
-
- void
- appendFeatures(DocIdAndFeatures &features);
-
- bool
- isValid() const
- {
- return _dFeatures != _dFeaturesE;
- }
-
- bool
- operator<(const SingleIterator &rhs) const
- {
- if (_docId != rhs._docId) {
- return _docId < rhs._docId;
- }
- return _localFieldId < rhs._localFieldId;
- }
-};
-
-
-}
-
-
FileHandle::FileHandle()
- : _fieldWriter(nullptr),
- _docIdAndFeatures()
-{
-}
-
-
-FileHandle::~FileHandle()
+ : _fieldWriter()
{
- delete _fieldWriter;
}
+FileHandle::~FileHandle() = default;
void
FileHandle::open(vespalib::stringref dir,
@@ -280,9 +93,9 @@ FileHandle::open(vespalib::stringref dir,
const TuneFileSeqWrite &tuneFileWrite,
const FileHeaderContext &fileHeaderContext)
{
- assert(_fieldWriter == nullptr);
+ assert(_fieldWriter.get() == nullptr);
- _fieldWriter = new FieldWriter(docIdLimit, numWordIds);
+ _fieldWriter = std::make_shared<FieldWriter>(docIdLimit, numWordIds);
if (!_fieldWriter->open(dir + "/", 64, 262144u, false,
index.getSchema(), index.getIndex(),
@@ -293,18 +106,16 @@ FileHandle::open(vespalib::stringref dir,
}
}
-
void
FileHandle::close()
{
bool ret = true;
if (_fieldWriter != nullptr) {
bool closeRes = _fieldWriter->close();
- delete _fieldWriter;
- _fieldWriter = nullptr;
+ _fieldWriter.reset();
if (!closeRes) {
LOG(error,
- "Could not close term writer");
+ "Could not close field writer");
ret = false;
}
}
@@ -312,206 +123,66 @@ FileHandle::close()
(void) ret;
}
-
IndexBuilder::FieldHandle::FieldHandle(const Schema &schema,
uint32_t fieldId,
- IndexBuilder *ib)
- : _wdff(),
- _wdfef(),
- _wdfepf(),
- _docRef(noDocRef()),
- _elRef(noElRef()),
- _valid(false),
+ IndexBuilder *builder)
+ : _valid(false),
_schema(&schema),
_fieldId(fieldId),
- _ib(ib),
- _lowestOKElementId(0u),
- _lowestOKWordPos(0u),
- _files()
+ _builder(builder),
+ _file()
{
}
-
IndexBuilder::FieldHandle::~FieldHandle() = default;
-
void
-IndexBuilder::FieldHandle::startWord(vespalib::stringref word)
+IndexBuilder::FieldHandle::new_word(vespalib::stringref word)
{
assert(_valid);
- _files._fieldWriter->newWord(word);
+ _file.writer()->newWord(word);
}
-
void
-IndexBuilder::FieldHandle::endWord()
+IndexBuilder::FieldHandle::add_document(const index::DocIdAndFeatures &features)
{
- DocIdAndFeatures &features = _files._docIdAndFeatures;
- SingleIterator si(*this, 0u);
- for (; si.isValid();) {
- features.clear(si._docId);
- si.appendFeatures(features);
- _files._fieldWriter->add(features);
- }
- assert(si._elFeatures == _wdfef.end());
- assert(si._pFeatures == _wdfepf.end());
- _wdff.clear();
- _wdfef.clear();
- _wdfepf.clear();
- _docRef = noDocRef();
- _elRef = noElRef();
+ _file.writer()->add(features);
}
-
-void
-IndexBuilder::FieldHandle::startDocument(uint32_t docId)
-{
- assert(_docRef == noDocRef());
- assert(_wdff.empty() || _wdff.back().getDocId() < docId);
- _wdff.push_back(FHWordDocFieldFeatures(docId));
- _docRef = _wdff.size() - 1;
- _lowestOKElementId = 0u;
-}
-
-
-void
-IndexBuilder::FieldHandle::endDocument()
-{
- assert(_docRef != noDocRef());
- assert(_elRef == noElRef());
- FHWordDocFieldFeatures &ff = _wdff[_docRef];
- assert(ff.getNumElements() > 0);
- (void) ff;
- _docRef = noDocRef();
-}
-
-
-void
-IndexBuilder::FieldHandle::
-startElement(uint32_t elementId,
- int32_t weight,
- uint32_t elementLen)
-{
- assert(_docRef != noDocRef());
- assert(_elRef == noElRef());
- assert(elementId >= _lowestOKElementId);
-
- FHWordDocFieldFeatures &ff = _wdff[_docRef];
- _wdfef.push_back(
- FHWordDocElementFeatures(elementId,
- weight,
- elementLen,
- _docRef));
- ff.incNumElements();
- _elRef = _wdfef.size() - 1;
- _lowestOKWordPos = 0u;
-}
-
-
-void
-IndexBuilder::FieldHandle::endElement()
-{
- assert(_elRef != noElRef());
- FHWordDocElementFeatures &ef = _wdfef[_elRef];
- assert(ef.getNumOccs() > 0);
- _elRef = noElRef();
- _lowestOKElementId = ef.getElementId() + 1;
-}
-
-
-void
-IndexBuilder::FieldHandle::
-addOcc(const WordDocElementWordPosFeatures &features)
-{
- assert(_elRef != noElRef());
- FHWordDocElementFeatures &ef = _wdfef[_elRef];
- uint32_t wordPos = features.getWordPos();
- assert(wordPos < ef.getElementLen());
- assert(wordPos >= _lowestOKWordPos);
- _lowestOKWordPos = wordPos;
- _wdfepf.push_back(
- FHWordDocElementWordPosFeatures(features,
- _elRef));
- ef.incNumOccs();
-}
-
-
const Schema::IndexField &
IndexBuilder::FieldHandle::getSchemaField()
{
return _schema->getIndexField(_fieldId);
}
-
const vespalib::string &
IndexBuilder::FieldHandle::getName()
{
return getSchemaField().getName();
-
}
-
vespalib::string
IndexBuilder::FieldHandle::getDir()
{
- return _ib->appendToPrefix(getName());
+ return _builder->appendToPrefix(getName());
}
-
void
IndexBuilder::FieldHandle::open(uint32_t docIdLimit, uint64_t numWordIds,
const TuneFileSeqWrite &tuneFileWrite,
const FileHeaderContext &fileHeaderContext)
{
- _files.open(getDir(),
- SchemaUtil::IndexIterator(*_schema, getIndexId()),
- docIdLimit, numWordIds, tuneFileWrite, fileHeaderContext);
+ _file.open(getDir(),
+ SchemaUtil::IndexIterator(*_schema, getIndexId()),
+ docIdLimit, numWordIds, tuneFileWrite, fileHeaderContext);
}
-
void
IndexBuilder::FieldHandle::close()
{
- _files.close();
-}
-
-
-SingleIterator::SingleIterator(FH &fieldHandle, uint32_t localFieldId)
- : _dFeatures(fieldHandle._wdff.begin()),
- _dFeaturesE(fieldHandle._wdff.end()),
- _elFeatures(fieldHandle._wdfef.begin()),
- _pFeatures(fieldHandle._wdfepf.begin()),
- _docId(_dFeatures->getDocId()),
- _localFieldId(localFieldId)
-{
+ _file.close();
}
-
-void
-SingleIterator::appendFeatures(DocIdAndFeatures &features)
-{
- uint32_t elCount = _dFeatures->getNumElements();
- for (uint32_t elId = 0; elId < elCount; ++elId, ++_elFeatures) {
- features._elements.push_back(*_elFeatures);
- features._elements.back().setNumOccs(0);
- uint32_t posCount = _elFeatures->getNumOccs();
- uint32_t lastWordPos = noWordPos();
- for (uint32_t posId = 0; posId < posCount; ++posId, ++_pFeatures) {
- uint32_t wordPos = _pFeatures->getWordPos();
- if (wordPos != lastWordPos) {
- lastWordPos = wordPos;
- features._elements.back().incNumOccs();
- features._wordPositions.push_back(*_pFeatures);
- }
- }
- }
- ++_dFeatures;
- if (_dFeatures != _dFeaturesE) {
- _docId = _dFeatures->getDocId();
- }
-}
-
-
IndexBuilder::IndexBuilder(const Schema &schema)
: index::IndexBuilder(schema),
_currentField(nullptr),
@@ -541,53 +212,6 @@ IndexBuilder::IndexBuilder(const Schema &schema)
IndexBuilder::~IndexBuilder() = default;
void
-IndexBuilder::startWord(vespalib::stringref word)
-{
- assert(_currentField != nullptr);
- assert(!_inWord);
- // TODO: Check sort order
- _curWord = word;
- _inWord = true;
- _currentField->startWord(word);
-}
-
-
-void
-IndexBuilder::endWord()
-{
- assert(_inWord);
- assert(_currentField != nullptr);
- _currentField->endWord();
- _inWord = false;
- _lowestOKDocId = 1u;
-}
-
-
-void
-IndexBuilder::startDocument(uint32_t docId)
-{
- assert(_curDocId == noDocId());
- assert(docId >= _lowestOKDocId);
- assert(docId < _docIdLimit);
- assert(_currentField != nullptr);
- _curDocId = docId;
- assert(_curDocId != noDocId());
- _currentField->startDocument(docId);
-}
-
-
-void
-IndexBuilder::endDocument()
-{
- assert(_curDocId != noDocId());
- assert(_currentField != nullptr);
- _currentField->endDocument();
- _lowestOKDocId = _curDocId + 1;
- _curDocId = noDocId();
-}
-
-
-void
IndexBuilder::startField(uint32_t fieldId)
{
assert(_curDocId == noDocId());
@@ -598,51 +222,50 @@ IndexBuilder::startField(uint32_t fieldId)
assert(_currentField != nullptr);
}
-
void
IndexBuilder::endField()
{
assert(_curDocId == noDocId());
assert(!_inWord);
assert(_currentField != nullptr);
- _lowestOKFieldId = _currentField->_fieldId + 1;
+ _lowestOKFieldId = _currentField->getIndexId() + 1;
_currentField = nullptr;
}
-
void
-IndexBuilder::startElement(uint32_t elementId,
- int32_t weight,
- uint32_t elementLen)
+IndexBuilder::startWord(vespalib::stringref word)
{
assert(_currentField != nullptr);
- _currentField->startElement(elementId, weight, elementLen);
+ assert(!_inWord);
+ // TODO: Check sort order
+ _curWord = word;
+ _inWord = true;
+ _currentField->new_word(word);
}
-
void
-IndexBuilder::endElement()
+IndexBuilder::endWord()
{
+ assert(_inWord);
assert(_currentField != nullptr);
- _currentField->endElement();
+ _inWord = false;
+ _lowestOKDocId = 1u;
}
-
void
-IndexBuilder::addOcc(const WordDocElementWordPosFeatures &features)
+IndexBuilder::add_document(const index::DocIdAndFeatures &features)
{
+ assert(_inWord);
assert(_currentField != nullptr);
- _currentField->addOcc(features);
+ _currentField->add_document(features);
}
-
void
IndexBuilder::setPrefix(vespalib::stringref prefix)
{
_prefix = prefix;
}
-
vespalib::string
IndexBuilder::appendToPrefix(vespalib::stringref name)
{
@@ -652,7 +275,6 @@ IndexBuilder::appendToPrefix(vespalib::stringref name)
return _prefix + "/" + name;
}
-
void
IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds,
const TuneFileIndexing &tuneFileIndexing,
@@ -682,7 +304,6 @@ IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds,
}
}
-
void
IndexBuilder::close()
{
diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h
index fa818bf08e6..a1a77d608cd 100644
--- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h
+++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h
@@ -13,12 +13,16 @@ namespace search::diskindex {
class BitVectorCandidate;
-class IndexBuilder : public index::IndexBuilder
-{
+/**
+ * Class used to build a disk index for the set of index fields specified in a schema.
+ *
+ * The resulting disk index consists of field indexes that are independent of each other.
+ */
+class IndexBuilder : public index::IndexBuilder {
public:
class FieldHandle;
- typedef index::Schema Schema;
+ using Schema = index::Schema;
private:
// Text fields
FieldHandle *_currentField;
@@ -32,7 +36,7 @@ private:
uint32_t _docIdLimit;
uint64_t _numWordIds;
- const Schema &_schema; // Ptr to allow being std::vector member
+ const Schema &_schema;
static uint32_t noDocId() {
return std::numeric_limits<uint32_t>::max();
@@ -45,23 +49,16 @@ private:
public:
typedef index::WordDocElementWordPosFeatures WordDocElementWordPosFeatures;
- // schema argument must live until indexbuilder has been deleted.
+ // Schema argument must live until IndexBuilder has been deleted.
IndexBuilder(const Schema &schema);
~IndexBuilder() override;
- void startWord(vespalib::stringref word) override;
- void endWord() override;
- void startDocument(uint32_t docId) override;
- void endDocument() override;
void startField(uint32_t fieldId) override;
void endField() override;
- void startElement(uint32_t elementId, int32_t weight, uint32_t elementLen) override;
- void endElement() override;
- void addOcc(const WordDocElementWordPosFeatures &features) override;
-
- // TODO: methods for attribute vectors.
+ void startWord(vespalib::stringref word) override;
+ void endWord() override;
+ void add_document(const index::DocIdAndFeatures &features) override;
- // TODO: methods for document summary.
void setPrefix(vespalib::stringref prefix);
vespalib::string appendToPrefix(vespalib::stringref name);
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp
index 5288d054ef0..2149a44f5ce 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp
@@ -20,35 +20,13 @@ Zc4PostingHeader::Zc4PostingHeader()
{
}
-template <bool bigEndian>
void
Zc4PostingHeader::read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params)
{
- using EC = bitcompression::FeatureEncodeContext<bigEndian>;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, decode_context._);
- uint32_t length;
- uint64_t val64;
-
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
- _num_docs = static_cast<uint32_t>(val64) + 1;
- bool has_more = false;
- if (__builtin_expect(_num_docs >= params._min_chunk_docs, false)) {
- if (bigEndian) {
- has_more = static_cast<int64_t>(oVal) < 0;
- oVal <<= 1;
- length = 1;
- } else {
- has_more = (oVal & 1) != 0;
- oVal >>= 1;
- length = 1;
- }
- UC64_READBITS_NS(o, EC);
- }
- if (params._dynamic_k) {
- _doc_id_k = EC::calcDocIdK((_has_more || has_more) ? 1 : _num_docs, params._doc_id_limit);
- } else {
- _doc_id_k = K_VALUE_ZCPOSTING_LASTDOCID;
- }
+ using EC = bitcompression::FeatureEncodeContext<true>;
+ _num_docs = decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_NUMDOCS) + 1;
+ bool has_more = (_num_docs >= params._min_chunk_docs) ? (decode_context.readBits(1) != 0) : false;
+ _doc_id_k = params._dynamic_k ? EC::calcDocIdK((_has_more || has_more) ? 1 : _num_docs, params._doc_id_limit) : K_VALUE_ZCPOSTING_LASTDOCID;
if (_num_docs < params._min_skip_docs && !_has_more) {
_doc_ids_size = 0;
_l1_skip_size = 0;
@@ -58,47 +36,16 @@ Zc4PostingHeader::read(bitcompression::DecodeContext64Base &decode_context, cons
_features_size = 0;
_last_doc_id = 0;
} else {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
- _doc_ids_size = val64 + 1;
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC);
- _l1_skip_size = val64;
- if (_l1_skip_size != 0) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
- _l2_skip_size = val64;
- }
- if (_l2_skip_size != 0) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
- _l3_skip_size = val64;
- }
- if (_l3_skip_size != 0) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
- _l4_skip_size = val64;
- }
- if (params._encode_features) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC);
- _features_size = val64;
- } else {
- _features_size = 0;
- }
- UC64_DECODEEXPGOLOMB_NS(o, _doc_id_k, EC);
- _last_doc_id = params._doc_id_limit - 1 - val64;
- uint64_t bytePad = oPreRead & 7;
- if (bytePad > 0) {
- length = bytePad;
- UC64_READBITS_NS(o, EC);
- }
+ _doc_ids_size = decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_DOCIDSSIZE) + 1;
+ _l1_skip_size = decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L1SKIPSIZE);
+ _l2_skip_size = (_l1_skip_size != 0) ? decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L2SKIPSIZE) : 0;
+ _l3_skip_size = (_l2_skip_size != 0) ? decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L3SKIPSIZE) : 0;
+ _l4_skip_size = (_l3_skip_size != 0) ? decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L4SKIPSIZE) : 0;
+ _features_size = params._encode_features ? decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_FEATURESSIZE) : 0;
+ _last_doc_id = params._doc_id_limit - 1 - decode_context.decode_exp_golomb(_doc_id_k);
+ decode_context.align(8);
}
- UC64_DECODECONTEXT_STORE(o, decode_context._);
_has_more = has_more;
}
-template
-void
-Zc4PostingHeader::read<false>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params);
-
-template
-void
-Zc4PostingHeader::read<true>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params);
-
-
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h
index 7382f59d176..d4032864e16 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h
@@ -27,7 +27,6 @@ struct Zc4PostingHeader {
Zc4PostingHeader();
- template <bool bigEndian>
void
read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params);
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
new file mode 100644
index 00000000000..c0e1115521c
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
@@ -0,0 +1,438 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "zc4_posting_reader.h"
+#include <vespa/searchlib/index/docidandfeatures.h>
+
+namespace search::diskindex {
+
+using index::PostingListCounts;
+using index::DocIdAndFeatures;
+using bitcompression::FeatureEncodeContext;
+
+
+template <bool bigEndian>
+Zc4PostingReader<bigEndian>::Zc4PostingReader(bool dynamic_k)
+ : _decodeContext(nullptr),
+ _docIdK(K_VALUE_ZCPOSTING_DELTA_DOCID),
+ _prevDocId(0),
+ _numDocs(0),
+ _readContext(sizeof(uint64_t)),
+ _has_more(false),
+ _posting_params(64, 1 << 30, 10000000, dynamic_k, true),
+ _lastDocId(0),
+ _zcDocIds(),
+ _l1Skip(),
+ _l2Skip(),
+ _l3Skip(),
+ _l4Skip(),
+ _chunkNo(0),
+ _l1SkipDocId(0),
+ _l1SkipDocIdPos(0),
+ _l1SkipFeaturesPos(0),
+ _l2SkipDocId(0),
+ _l2SkipDocIdPos(0),
+ _l2SkipL1SkipPos(0),
+ _l2SkipFeaturesPos(0),
+ _l3SkipDocId(0),
+ _l3SkipDocIdPos(0),
+ _l3SkipL1SkipPos(0),
+ _l3SkipL2SkipPos(0),
+ _l3SkipFeaturesPos(0),
+ _l4SkipDocId(0),
+ _l4SkipDocIdPos(0),
+ _l4SkipL1SkipPos(0),
+ _l4SkipL2SkipPos(0),
+ _l4SkipL3SkipPos(0),
+ _l4SkipFeaturesPos(0),
+ _featuresSize(0),
+ _counts(),
+ _residue(0)
+{
+}
+
+template <bool bigEndian>
+Zc4PostingReader<bigEndian>::~Zc4PostingReader()
+{
+}
+
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_common_word_doc_id_and_features(DocIdAndFeatures &features)
+{
+ if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) {
+ read_word_start(); // Read start of next chunk
+ }
+ // Split docid & features.
+ assert(_zcDocIds._valI < _zcDocIds._valE);
+ uint32_t docIdPos = _zcDocIds.pos();
+ uint32_t docId = _prevDocId + 1 + _zcDocIds.decode();
+ features.set_doc_id(docId);
+ _prevDocId = docId;
+ assert(docId <= _lastDocId);
+ if (docId > _l1SkipDocId) {
+ _l1SkipDocIdPos += _l1Skip.decode() + 1;
+ assert(docIdPos == _l1SkipDocIdPos);
+ uint64_t featuresPos = _decodeContext->getReadOffset();
+ if (_posting_params._encode_features) {
+ _l1SkipFeaturesPos += _l1Skip.decode() + 1;
+ assert(featuresPos == _l1SkipFeaturesPos);
+ }
+ (void) featuresPos;
+ if (docId > _l2SkipDocId) {
+ _l2SkipDocIdPos += _l2Skip.decode() + 1;
+ assert(docIdPos == _l2SkipDocIdPos);
+ if (_posting_params._encode_features) {
+ _l2SkipFeaturesPos += _l2Skip.decode() + 1;
+ assert(featuresPos == _l2SkipFeaturesPos);
+ }
+ _l2SkipL1SkipPos += _l2Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l2SkipL1SkipPos);
+ if (docId > _l3SkipDocId) {
+ _l3SkipDocIdPos += _l3Skip.decode() + 1;
+ assert(docIdPos == _l3SkipDocIdPos);
+ if (_posting_params._encode_features) {
+ _l3SkipFeaturesPos += _l3Skip.decode() + 1;
+ assert(featuresPos == _l3SkipFeaturesPos);
+ }
+ _l3SkipL1SkipPos += _l3Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l3SkipL1SkipPos);
+ _l3SkipL2SkipPos += _l3Skip.decode() + 1;
+ assert(_l2Skip.pos() == _l3SkipL2SkipPos);
+ if (docId > _l4SkipDocId) {
+ _l4SkipDocIdPos += _l4Skip.decode() + 1;
+ assert(docIdPos == _l4SkipDocIdPos);
+ (void) docIdPos;
+ if (_posting_params._encode_features) {
+ _l4SkipFeaturesPos += _l4Skip.decode() + 1;
+ assert(featuresPos == _l4SkipFeaturesPos);
+ }
+ _l4SkipL1SkipPos += _l4Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l4SkipL1SkipPos);
+ _l4SkipL2SkipPos += _l4Skip.decode() + 1;
+ assert(_l2Skip.pos() == _l4SkipL2SkipPos);
+ _l4SkipL3SkipPos += _l4Skip.decode() + 1;
+ assert(_l3Skip.pos() == _l4SkipL3SkipPos);
+ _l4SkipDocId += _l4Skip.decode() + 1;
+ assert(_l4SkipDocId <= _lastDocId);
+ assert(_l4SkipDocId >= docId);
+ }
+ _l3SkipDocId += _l3Skip.decode() + 1;
+ assert(_l3SkipDocId <= _lastDocId);
+ assert(_l3SkipDocId <= _l4SkipDocId);
+ assert(_l3SkipDocId >= docId);
+ }
+ _l2SkipDocId += _l2Skip.decode() + 1;
+ assert(_l2SkipDocId <= _lastDocId);
+ assert(_l2SkipDocId <= _l4SkipDocId);
+ assert(_l2SkipDocId <= _l3SkipDocId);
+ assert(_l2SkipDocId >= docId);
+ }
+ _l1SkipDocId += _l1Skip.decode() + 1;
+ assert(_l1SkipDocId <= _lastDocId);
+ assert(_l1SkipDocId <= _l4SkipDocId);
+ assert(_l1SkipDocId <= _l3SkipDocId);
+ assert(_l1SkipDocId <= _l2SkipDocId);
+ assert(_l1SkipDocId >= docId);
+ }
+ if (docId < _lastDocId) {
+ // Assert more space available when not yet at last docid
+ assert(_zcDocIds._valI < _zcDocIds._valE);
+ } else {
+ // Assert that space has been used when at last docid
+ assert(_zcDocIds._valI == _zcDocIds._valE);
+ // Assert that we've read to end of skip info
+ assert(_l1SkipDocId == _lastDocId);
+ assert(_l2SkipDocId == _lastDocId);
+ assert(_l3SkipDocId == _lastDocId);
+ assert(_l4SkipDocId == _lastDocId);
+ if (!_has_more) {
+ _chunkNo = 0;
+ }
+ }
+ if (_posting_params._encode_features) {
+ _decodeContext->readFeatures(features);
+ }
+ --_residue;
+}
+
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features)
+{
+ if (_residue == 0 && !_has_more) {
+ if (_residue == 0) {
+ // Don't read past end of posting list.
+ features.clear(static_cast<uint32_t>(-1));
+ return;
+ }
+ }
+ if (_lastDocId > 0) {
+ read_common_word_doc_id_and_features(features);
+ return;
+ }
+ // Interleaves docid & features
+ using EC = FeatureEncodeContext<bigEndian>;
+ DecodeContext &d = *_decodeContext;
+ uint32_t length;
+ uint64_t val64;
+ UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
+
+ UC64_DECODEEXPGOLOMB_SMALL_NS(o, _docIdK, EC);
+ uint32_t docId = _prevDocId + 1 + val64;
+ features.set_doc_id(docId);
+ _prevDocId = docId;
+ UC64_DECODECONTEXT_STORE(o, d._);
+ if (__builtin_expect(oCompr >= d._valE, false)) {
+ _readContext.readComprBuffer();
+ }
+ if (_posting_params._encode_features) {
+ _decodeContext->readFeatures(features);
+ }
+ --_residue;
+}
+
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_word_start_with_skip()
+{
+ using EC = FeatureEncodeContext<bigEndian>;
+ DecodeContext &d = *_decodeContext;
+ UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
+ uint32_t length;
+ uint64_t val64;
+ const uint64_t *valE = d._valE;
+
+ if (_has_more) {
+ ++_chunkNo;
+ } else {
+ _chunkNo = 0;
+ }
+ assert(_numDocs >= _posting_params._min_skip_docs || _has_more);
+ bool has_more = false;
+ if (__builtin_expect(_numDocs >= _posting_params._min_chunk_docs, false)) {
+ if (bigEndian) {
+ has_more = static_cast<int64_t>(oVal) < 0;
+ oVal <<= 1;
+ } else {
+ has_more = (oVal & 1) != 0;
+ oVal >>= 1;
+ }
+ length = 1;
+ UC64_READBITS_NS(o, EC);
+ }
+ if (_posting_params._dynamic_k) {
+ _docIdK = EC::calcDocIdK((_has_more || has_more) ? 1 : _numDocs,
+ _posting_params._doc_id_limit);
+ }
+ if (_has_more || has_more) {
+ assert(has_more == (_chunkNo + 1 < _counts._segments.size()));
+ assert(_numDocs == _counts._segments[_chunkNo]._numDocs);
+ if (has_more) {
+ assert(_numDocs >= _posting_params._min_skip_docs);
+ assert(_numDocs >= _posting_params._min_chunk_docs);
+ }
+ } else {
+ assert(_numDocs >= _posting_params._min_skip_docs);
+ assert(_numDocs == _counts._numDocs);
+ }
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
+ uint32_t docIdsSize = val64 + 1;
+ UC64_DECODEEXPGOLOMB_NS(o,
+ K_VALUE_ZCPOSTING_L1SKIPSIZE,
+ EC);
+ uint32_t l1SkipSize = val64;
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ uint32_t l2SkipSize = 0;
+ if (l1SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
+ l2SkipSize = val64;
+ }
+ uint32_t l3SkipSize = 0;
+ if (l2SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
+ l3SkipSize = val64;
+ }
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ uint32_t l4SkipSize = 0;
+ if (l3SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
+ l4SkipSize = val64;
+ }
+ if (_posting_params._encode_features) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC);
+ _featuresSize = val64;
+ }
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ if (_posting_params._dynamic_k) {
+ UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC);
+ } else {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_LASTDOCID, EC);
+ }
+ _lastDocId = _posting_params._doc_id_limit - 1 - val64;
+ if (_has_more || has_more) {
+ assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc);
+ }
+
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ uint64_t bytePad = oPreRead & 7;
+ if (bytePad > 0) {
+ length = bytePad;
+ if (bigEndian) {
+ oVal <<= length;
+ } else {
+ oVal >>= length;
+ }
+ UC64_READBITS_NS(o, EC);
+ }
+ UC64_DECODECONTEXT_STORE(o, d._);
+ if (__builtin_expect(oCompr >= valE, false)) {
+ _readContext.readComprBuffer();
+ }
+ _zcDocIds.clearReserve(docIdsSize);
+ _l1Skip.clearReserve(l1SkipSize);
+ _l2Skip.clearReserve(l2SkipSize);
+ _l3Skip.clearReserve(l3SkipSize);
+ _l4Skip.clearReserve(l4SkipSize);
+ _decodeContext->readBytes(_zcDocIds._valI, docIdsSize);
+ _zcDocIds._valE = _zcDocIds._valI + docIdsSize;
+ if (l1SkipSize > 0) {
+ _decodeContext->readBytes(_l1Skip._valI, l1SkipSize);
+ }
+ _l1Skip._valE = _l1Skip._valI + l1SkipSize;
+ if (l2SkipSize > 0) {
+ _decodeContext->readBytes(_l2Skip._valI, l2SkipSize);
+ }
+ _l2Skip._valE = _l2Skip._valI + l2SkipSize;
+ if (l3SkipSize > 0) {
+ _decodeContext->readBytes(_l3Skip._valI, l3SkipSize);
+ }
+ _l3Skip._valE = _l3Skip._valI + l3SkipSize;
+ if (l4SkipSize > 0) {
+ _decodeContext->readBytes(_l4Skip._valI, l4SkipSize);
+ }
+ _l4Skip._valE = _l4Skip._valI + l4SkipSize;
+
+ if (l1SkipSize > 0) {
+ _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l1SkipDocId = _lastDocId;
+ }
+ if (l2SkipSize > 0) {
+ _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l2SkipDocId = _lastDocId;
+ }
+ if (l3SkipSize > 0) {
+ _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l3SkipDocId = _lastDocId;
+ }
+ if (l4SkipSize > 0) {
+ _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l4SkipDocId = _lastDocId;
+ }
+ _l1SkipDocIdPos = 0;
+ _l1SkipFeaturesPos = _decodeContext->getReadOffset();
+ _l2SkipDocIdPos = 0;
+ _l2SkipL1SkipPos = 0;
+ _l2SkipFeaturesPos = _decodeContext->getReadOffset();
+ _l3SkipDocIdPos = 0;
+ _l3SkipL1SkipPos = 0;
+ _l3SkipL2SkipPos = 0;
+ _l3SkipFeaturesPos = _decodeContext->getReadOffset();
+ _l4SkipDocIdPos = 0;
+ _l4SkipL1SkipPos = 0;
+ _l4SkipL2SkipPos = 0;
+ _l4SkipL3SkipPos = 0;
+ _l4SkipFeaturesPos = _decodeContext->getReadOffset();
+ _has_more = has_more;
+ // Decode context is now positioned at start of features
+}
+
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_word_start()
+{
+ using EC = FeatureEncodeContext<bigEndian>;
+ UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_);
+ uint32_t length;
+ uint64_t val64;
+ const uint64_t *valE = _decodeContext->_valE;
+
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
+ UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
+ if (oCompr >= valE) {
+ _readContext.readComprBuffer();
+ }
+ _numDocs = static_cast<uint32_t>(val64) + 1;
+ _residue = _numDocs;
+ _prevDocId = _has_more ? _lastDocId : 0u;
+ assert(_numDocs <= _counts._numDocs);
+ assert(_numDocs == _counts._numDocs ||
+ _numDocs >= _posting_params._min_chunk_docs ||
+ _has_more);
+
+ if (_numDocs >= _posting_params._min_skip_docs || _has_more) {
+ read_word_start_with_skip();
+ // Decode context is now positioned at start of features
+ } else {
+ if (_posting_params._dynamic_k) {
+ _docIdK = EC::calcDocIdK(_numDocs, _posting_params._doc_id_limit);
+ }
+ _lastDocId = 0u;
+ // Decode context is now positioned at start of docids & features
+ }
+}
+
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::set_counts(const PostingListCounts &counts)
+{
+ assert(!_has_more && _residue == 0); // Previous words must have been read.
+ _counts = counts;
+ assert((_counts._numDocs == 0) == (_counts._bitLength == 0));
+ if (_counts._numDocs > 0) {
+ read_word_start();
+ }
+}
+
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::set_decode_features(DecodeContext *decode_features)
+{
+ _decodeContext = decode_features;
+ _decodeContext->setReadContext(&_readContext);
+ _readContext.setDecodeContext(_decodeContext);
+}
+
+template class Zc4PostingReader<false>;
+template class Zc4PostingReader<true>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
new file mode 100644
index 00000000000..d8161da15d5
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
@@ -0,0 +1,96 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "zc4_posting_writer.h"
+#include <vespa/searchlib/index/postinglistfile.h>
+#include <vespa/fastos/file.h>
+#include "zc4_posting_params.h"
+
+namespace search::index {
+ class PostingListCountFileSeqRead;
+}
+
+namespace search::diskindex {
+
+/*
+ * Class used to read posting lists of type "Zc.4" and "Zc.5" (dynamic k).
+ *
+ * Common words have docid deltas and skip info separate from
+ * features.
+ *
+ * Rare words do not have skip info, and docid deltas and features are
+ * interleaved.
+ */
+template <bool bigEndian>
+class Zc4PostingReader
+{
+
+protected:
+ using DecodeContext = bitcompression::FeatureDecodeContext<bigEndian>;
+
+ DecodeContext *_decodeContext;
+ uint32_t _docIdK;
+ uint32_t _prevDocId; // Previous document id
+ uint32_t _numDocs; // Documents in chunk or word
+ search::ComprFileReadContext _readContext;
+ bool _has_more;
+ Zc4PostingParams _posting_params;
+ uint32_t _lastDocId; // last document in chunk or word
+
+ ZcBuf _zcDocIds; // Document id deltas
+ ZcBuf _l1Skip; // L1 skip info
+ ZcBuf _l2Skip; // L2 skip info
+ ZcBuf _l3Skip; // L3 skip info
+ ZcBuf _l4Skip; // L4 skip info
+
+ uint64_t _numWords; // Number of words in file
+ uint32_t _chunkNo; // Chunk number
+
+ // Variables for validating skip information while reading
+ uint32_t _l1SkipDocId;
+ uint32_t _l1SkipDocIdPos;
+ uint64_t _l1SkipFeaturesPos;
+ uint32_t _l2SkipDocId;
+ uint32_t _l2SkipDocIdPos;
+ uint32_t _l2SkipL1SkipPos;
+ uint64_t _l2SkipFeaturesPos;
+ uint32_t _l3SkipDocId;
+ uint32_t _l3SkipDocIdPos;
+ uint32_t _l3SkipL1SkipPos;
+ uint32_t _l3SkipL2SkipPos;
+ uint64_t _l3SkipFeaturesPos;
+ uint32_t _l4SkipDocId;
+ uint32_t _l4SkipDocIdPos;
+ uint32_t _l4SkipL1SkipPos;
+ uint32_t _l4SkipL2SkipPos;
+ uint32_t _l4SkipL3SkipPos;
+ uint64_t _l4SkipFeaturesPos;
+
+ // Variable for validating chunk information while reading
+ uint64_t _featuresSize;
+ index::PostingListCounts _counts;
+
+ uint32_t _residue; // Number of unread documents after word header
+ void read_common_word_doc_id_and_features(index::DocIdAndFeatures &features);
+ void read_word_start_with_skip();
+ void read_word_start();
+public:
+ Zc4PostingReader(bool dynamic_k);
+ Zc4PostingReader(const Zc4PostingReader &) = delete;
+ Zc4PostingReader(Zc4PostingReader &&) = delete;
+ Zc4PostingReader &operator=(const Zc4PostingReader &) = delete;
+ Zc4PostingReader &operator=(Zc4PostingReader &&) = delete;
+ ~Zc4PostingReader();
+ void read_doc_id_and_features(index::DocIdAndFeatures &features);
+ void set_counts(const index::PostingListCounts &counts);
+ void set_decode_features(DecodeContext *decode_features);
+ DecodeContext &get_decode_features() const { return *_decodeContext; }
+ ComprFileReadContext &get_read_context() { return _readContext; }
+ Zc4PostingParams &get_posting_params() { return _posting_params; }
+};
+
+extern template class Zc4PostingReader<false>;
+extern template class Zc4PostingReader<true>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
index 477db7095ed..78d18cb5550 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
@@ -153,11 +153,11 @@ Zc4PostingWriter<bigEndian>::write_docid_and_features(const DocIdAndFeatures &fe
uint64_t writeOffset = _encode_features->getWriteOffset();
uint64_t featureSize = writeOffset - _featureOffset;
assert(static_cast<uint32_t>(featureSize) == featureSize);
- _docIds.push_back(std::make_pair(features._docId,
+ _docIds.push_back(std::make_pair(features.doc_id(),
static_cast<uint32_t>(featureSize)));
_featureOffset = writeOffset;
} else {
- _docIds.push_back(std::make_pair(features._docId, uint32_t(0)));
+ _docIds.push_back(std::make_pair(features.doc_id(), uint32_t(0)));
}
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
index 51f7a2ea151..5ab37cecc3d 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
@@ -8,6 +8,189 @@ using search::index::PostingListParams;
namespace search::diskindex {
+namespace {
+
+class DocIdEncoder {
+protected:
+ uint32_t _doc_id;
+ uint32_t _doc_id_pos;
+ uint32_t _feature_pos;
+ using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>;
+
+public:
+ DocIdEncoder()
+ : _doc_id(0u),
+ _doc_id_pos(0u),
+ _feature_pos(0u)
+ {
+ }
+
+ void write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size);
+ void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; }
+ uint32_t get_doc_id() const { return _doc_id; }
+ uint32_t get_doc_id_pos() const { return _doc_id_pos; }
+ uint32_t get_feature_pos() const { return _feature_pos; }
+};
+
+class L1SkipEncoder : public DocIdEncoder {
+protected:
+ uint32_t _stride_check;
+ uint32_t _l1_skip_pos;
+ const bool _encode_features;
+
+public:
+ L1SkipEncoder(bool encode_features)
+ : DocIdEncoder(),
+ _stride_check(0u),
+ _l1_skip_pos(0u),
+ _encode_features(encode_features)
+ {
+ }
+
+ void encode_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder);
+ void write_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder);
+ bool should_write_skip(uint32_t stride) { return ++_stride_check >= stride; }
+ void dec_stride_check() { --_stride_check; }
+ void write_partial_skip(ZcBuf &zc_buf, uint32_t doc_id);
+ uint32_t get_l1_skip_pos() const { return _l1_skip_pos; }
+};
+
+struct L2SkipEncoder : public L1SkipEncoder {
+protected:
+ uint32_t _l2_skip_pos;
+
+public:
+ L2SkipEncoder(bool encode_features)
+ : L1SkipEncoder(encode_features),
+ _l2_skip_pos(0u)
+ {
+ }
+
+ void encode_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip);
+ void write_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip);
+ uint32_t get_l2_skip_pos() const { return _l2_skip_pos; }
+};
+
+class L3SkipEncoder : public L2SkipEncoder {
+protected:
+ uint32_t _l3_skip_pos;
+
+public:
+ L3SkipEncoder(bool encode_features)
+ : L2SkipEncoder(encode_features),
+ _l3_skip_pos(0u)
+ {
+ }
+
+ void encode_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip);
+ void write_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip);
+ uint32_t get_l3_skip_pos() const { return _l3_skip_pos; }
+};
+
+class L4SkipEncoder : public L3SkipEncoder {
+
+public:
+ L4SkipEncoder(bool encode_features)
+ : L3SkipEncoder(encode_features)
+ {
+ }
+
+ void encode_skip(ZcBuf &zc_buf, const L3SkipEncoder &l3_skip);
+ void write_skip(ZcBuf &zc_buf, const L3SkipEncoder &l3_skip);
+};
+
+void
+DocIdEncoder::write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size)
+{
+ _feature_pos += doc_id_and_feature_size.second;
+ zc_buf.encode(doc_id_and_feature_size.first - _doc_id - 1);
+ _doc_id = doc_id_and_feature_size.first;
+ _doc_id_pos = zc_buf.size();
+}
+
+void
+L1SkipEncoder::encode_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder)
+{
+ _stride_check = 0;
+ // doc id
+ uint32_t doc_id_delta = doc_id_encoder.get_doc_id() - _doc_id;
+ assert(static_cast<int32_t>(doc_id_delta) > 0);
+ zc_buf.encode(doc_id_delta - 1);
+ _doc_id = doc_id_encoder.get_doc_id();
+ // doc id pos
+ zc_buf.encode(doc_id_encoder.get_doc_id_pos() - _doc_id_pos - 1);
+ _doc_id_pos = doc_id_encoder.get_doc_id_pos();
+ if (_encode_features) {
+ // features pos
+ zc_buf.encode(doc_id_encoder.get_feature_pos() - _feature_pos - 1);
+ _feature_pos = doc_id_encoder.get_feature_pos();
+ }
+}
+
+void
+L1SkipEncoder::write_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder)
+{
+ encode_skip(zc_buf, doc_id_encoder);
+ _l1_skip_pos = zc_buf.size();
+}
+
+void
+L1SkipEncoder::write_partial_skip(ZcBuf &zc_buf, uint32_t doc_id)
+{
+ if (zc_buf.size() > 0) {
+ zc_buf.encode(doc_id - _doc_id - 1);
+ }
+}
+
+void
+L2SkipEncoder::encode_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip)
+{
+ L1SkipEncoder::encode_skip(zc_buf, l1_skip);
+ // L1 skip pos
+ zc_buf.encode(l1_skip.get_l1_skip_pos() - _l1_skip_pos - 1);
+ _l1_skip_pos = l1_skip.get_l1_skip_pos();
+}
+
+void
+L2SkipEncoder::write_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip)
+{
+ encode_skip(zc_buf, l1_skip);
+ _l2_skip_pos = zc_buf.size();
+}
+
+void
+L3SkipEncoder::encode_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip)
+{
+ L2SkipEncoder::encode_skip(zc_buf, l2_skip);
+ // L2 skip pos
+ zc_buf.encode(l2_skip.get_l2_skip_pos() - _l2_skip_pos - 1);
+ _l2_skip_pos = l2_skip.get_l2_skip_pos();
+}
+
+void
+L3SkipEncoder::write_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip)
+{
+ encode_skip(zc_buf, l2_skip);
+ _l3_skip_pos = zc_buf.size();
+}
+
+void
+L4SkipEncoder::encode_skip(ZcBuf &zc_buf, const L3SkipEncoder &l3_skip)
+{
+ L3SkipEncoder::encode_skip(zc_buf, l3_skip);
+ // L3 skip pos
+ zc_buf.encode(l3_skip.get_l3_skip_pos() - _l3_skip_pos - 1);
+ _l3_skip_pos = l3_skip.get_l3_skip_pos();
+}
+
+void
+L4SkipEncoder::write_skip(ZcBuf &zc_buf, const L3SkipEncoder &l3_skip)
+{
+ encode_skip(zc_buf, l3_skip);
+}
+
+}
+
Zc4PostingWriterBase::Zc4PostingWriterBase(PostingListCounts &counts)
: _minChunkDocs(1 << 30),
_minSkipDocs(64),
@@ -45,159 +228,42 @@ Zc4PostingWriterBase::~Zc4PostingWriterBase()
#define L4SKIPSTRIDE 8
void
-Zc4PostingWriterBase::calc_skip_info(bool encodeFeatures)
+Zc4PostingWriterBase::calc_skip_info(bool encode_features)
{
- uint32_t lastDocId = 0u;
- uint32_t lastL1SkipDocId = 0u;
- uint32_t lastL1SkipDocIdPos = 0;
- uint32_t lastL1SkipFeaturePos = 0;
- uint32_t lastL2SkipDocId = 0u;
- uint32_t lastL2SkipDocIdPos = 0;
- uint32_t lastL2SkipFeaturePos = 0;
- uint32_t lastL2SkipL1SkipPos = 0;
- uint32_t lastL3SkipDocId = 0u;
- uint32_t lastL3SkipDocIdPos = 0;
- uint32_t lastL3SkipFeaturePos = 0;
- uint32_t lastL3SkipL1SkipPos = 0;
- uint32_t lastL3SkipL2SkipPos = 0;
- uint32_t lastL4SkipDocId = 0u;
- uint32_t lastL4SkipDocIdPos = 0;
- uint32_t lastL4SkipFeaturePos = 0;
- uint32_t lastL4SkipL1SkipPos = 0;
- uint32_t lastL4SkipL2SkipPos = 0;
- uint32_t lastL4SkipL3SkipPos = 0;
- unsigned int l1SkipCnt = 0;
- unsigned int l2SkipCnt = 0;
- unsigned int l3SkipCnt = 0;
- unsigned int l4SkipCnt = 0;
- uint64_t featurePos = 0;
-
- std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin();
- std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end();
-
+ DocIdEncoder doc_id_encoder;
+ L1SkipEncoder l1_skip_encoder(encode_features);
+ L2SkipEncoder l2_skip_encoder(encode_features);
+ L3SkipEncoder l3_skip_encoder(encode_features);
+ L4SkipEncoder l4_skip_encoder(encode_features);
+ l1_skip_encoder.dec_stride_check();
if (!_counts._segments.empty()) {
- lastDocId = _counts._segments.back()._lastDoc;
- lastL1SkipDocId = lastDocId;
- lastL2SkipDocId = lastDocId;
- lastL3SkipDocId = lastDocId;
- lastL4SkipDocId = lastDocId;
+ uint32_t doc_id = _counts._segments.back()._lastDoc;
+ doc_id_encoder.set_doc_id(doc_id);
+ l1_skip_encoder.set_doc_id(doc_id);
+ l2_skip_encoder.set_doc_id(doc_id);
+ l3_skip_encoder.set_doc_id(doc_id);
+ l4_skip_encoder.set_doc_id(doc_id);
}
-
- for (; dit != dite; ++dit) {
- if (l1SkipCnt >= L1SKIPSTRIDE) {
- // L1 docid delta
- uint32_t docIdDelta = lastDocId - lastL1SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l1Skip.encode(docIdDelta - 1);
- lastL1SkipDocId = lastDocId;
- // L1 docid pos
- uint64_t docIdPos = _zcDocIds.size();
- _l1Skip.encode(docIdPos - lastL1SkipDocIdPos - 1);
- lastL1SkipDocIdPos = docIdPos;
- if (encodeFeatures) {
- // L1 features pos
- _l1Skip.encode(featurePos - lastL1SkipFeaturePos - 1);
- lastL1SkipFeaturePos = featurePos;
- }
- l1SkipCnt = 0;
- ++l2SkipCnt;
- if (l2SkipCnt >= L2SKIPSTRIDE) {
- // L2 docid delta
- docIdDelta = lastDocId - lastL2SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l2Skip.encode(docIdDelta - 1);
- lastL2SkipDocId = lastDocId;
- // L2 docid pos
- docIdPos = _zcDocIds.size();
- _l2Skip.encode(docIdPos - lastL2SkipDocIdPos - 1);
- lastL2SkipDocIdPos = docIdPos;
- if (encodeFeatures) {
- // L2 features pos
- _l2Skip.encode(featurePos - lastL2SkipFeaturePos - 1);
- lastL2SkipFeaturePos = featurePos;
- }
- // L2 L1Skip pos
- uint64_t l1SkipPos = _l1Skip.size();
- _l2Skip.encode(l1SkipPos - lastL2SkipL1SkipPos - 1);
- lastL2SkipL1SkipPos = l1SkipPos;
- l2SkipCnt = 0;
- ++l3SkipCnt;
- if (l3SkipCnt >= L3SKIPSTRIDE) {
- // L3 docid delta
- docIdDelta = lastDocId - lastL3SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l3Skip.encode(docIdDelta - 1);
- lastL3SkipDocId = lastDocId;
- // L3 docid pos
- docIdPos = _zcDocIds.size();
- _l3Skip.encode(docIdPos - lastL3SkipDocIdPos - 1);
- lastL3SkipDocIdPos = docIdPos;
- if (encodeFeatures) {
- // L3 features pos
- _l3Skip.encode(featurePos - lastL3SkipFeaturePos - 1);
- lastL3SkipFeaturePos = featurePos;
- }
- // L3 L1Skip pos
- l1SkipPos = _l1Skip.size();
- _l3Skip.encode(l1SkipPos - lastL3SkipL1SkipPos - 1);
- lastL3SkipL1SkipPos = l1SkipPos;
- // L3 L2Skip pos
- uint64_t l2SkipPos = _l2Skip.size();
- _l3Skip.encode(l2SkipPos - lastL3SkipL2SkipPos - 1);
- lastL3SkipL2SkipPos = l2SkipPos;
- l3SkipCnt = 0;
- ++l4SkipCnt;
- if (l4SkipCnt >= L4SKIPSTRIDE) {
- // L4 docid delta
- docIdDelta = lastDocId - lastL4SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l4Skip.encode(docIdDelta - 1);
- lastL4SkipDocId = lastDocId;
- // L4 docid pos
- docIdPos = _zcDocIds.size();
- _l4Skip.encode(docIdPos - lastL4SkipDocIdPos - 1);
- lastL4SkipDocIdPos = docIdPos;
- if (encodeFeatures) {
- // L4 features pos
- _l4Skip.encode(featurePos - lastL4SkipFeaturePos - 1);
- lastL4SkipFeaturePos = featurePos;
- }
- // L4 L1Skip pos
- l1SkipPos = _l1Skip.size();
- _l4Skip.encode(l1SkipPos - lastL4SkipL1SkipPos - 1);
- lastL4SkipL1SkipPos = l1SkipPos;
- // L4 L2Skip pos
- l2SkipPos = _l2Skip.size();
- _l4Skip.encode(l2SkipPos - lastL4SkipL2SkipPos - 1);
- lastL4SkipL2SkipPos = l2SkipPos;
- // L4 L3Skip pos
- uint64_t l3SkipPos = _l3Skip.size();
- _l4Skip.encode(l3SkipPos - lastL4SkipL3SkipPos - 1);
- lastL4SkipL3SkipPos = l3SkipPos;
- l4SkipCnt = 0;
+ for (const auto &doc_id_and_feature_size : _docIds) {
+ if (l1_skip_encoder.should_write_skip(L1SKIPSTRIDE)) {
+ l1_skip_encoder.write_skip(_l1Skip, doc_id_encoder);
+ if (l2_skip_encoder.should_write_skip(L2SKIPSTRIDE)) {
+ l2_skip_encoder.write_skip(_l2Skip, l1_skip_encoder);
+ if (l3_skip_encoder.should_write_skip(L3SKIPSTRIDE)) {
+ l3_skip_encoder.write_skip(_l3Skip, l2_skip_encoder);
+ if (l4_skip_encoder.should_write_skip(L4SKIPSTRIDE)) {
+ l4_skip_encoder.write_skip(_l4Skip, l3_skip_encoder);
}
}
}
}
- uint32_t docId = dit->first;
- featurePos += dit->second;
- _zcDocIds.encode(docId - lastDocId - 1);
- lastDocId = docId;
- ++l1SkipCnt;
+ doc_id_encoder.write(_zcDocIds, doc_id_and_feature_size);
}
// Extra partial entries for skip tables to simplify iterator during search
- if (_l1Skip.size() > 0) {
- _l1Skip.encode(lastDocId - lastL1SkipDocId - 1);
- }
- if (_l2Skip.size() > 0) {
- _l2Skip.encode(lastDocId - lastL2SkipDocId - 1);
- }
- if (_l3Skip.size() > 0) {
- _l3Skip.encode(lastDocId - lastL3SkipDocId - 1);
- }
- if (_l4Skip.size() > 0) {
- _l4Skip.encode(lastDocId - lastL4SkipDocId - 1);
- }
+ l1_skip_encoder.write_partial_skip(_l1Skip, doc_id_encoder.get_doc_id());
+ l2_skip_encoder.write_partial_skip(_l2Skip, doc_id_encoder.get_doc_id());
+ l3_skip_encoder.write_partial_skip(_l3Skip, doc_id_encoder.get_doc_id());
+ l4_skip_encoder.write_partial_skip(_l4Skip, doc_id_encoder.get_doc_id());
}
void
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
index e803fc692c3..6da59028803 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
@@ -47,7 +47,7 @@ protected:
Zc4PostingWriterBase &operator=(Zc4PostingWriterBase &&) = delete;
Zc4PostingWriterBase(index::PostingListCounts &counts);
~Zc4PostingWriterBase();
- void calc_skip_info(bool encodeFeatures);
+ void calc_skip_info(bool encode_features);
void clear_skip_info();
public:
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
index 10c08af92cb..3ae2a631cb1 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
@@ -16,14 +16,12 @@ using search::index::PostingListCountFileSeqRead;
using search::index::PostingListCountFileSeqWrite;
Zc4PosOccSeqRead::Zc4PosOccSeqRead(PostingListCountFileSeqRead *countFile)
- : Zc4PostingSeqRead(countFile),
+ : Zc4PostingSeqRead(countFile, false),
_fieldsParams(),
_cookedDecodeContext(&_fieldsParams),
_rawDecodeContext(&_fieldsParams)
{
- _decodeContext = &_cookedDecodeContext;
- _decodeContext->setReadContext(&_readContext);
- _readContext.setDecodeContext(_decodeContext);
+ _reader.set_decode_features(&_cookedDecodeContext);
}
@@ -31,18 +29,17 @@ void
Zc4PosOccSeqRead::
setFeatureParams(const PostingListParams &params)
{
- bool oldCooked = _decodeContext == &_cookedDecodeContext;
+ bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext;
bool newCooked = oldCooked;
params.get("cooked", newCooked);
if (oldCooked != newCooked) {
if (newCooked) {
_cookedDecodeContext = _rawDecodeContext;
- _decodeContext = &_cookedDecodeContext;
+ _reader.set_decode_features(&_cookedDecodeContext);
} else {
_rawDecodeContext = _cookedDecodeContext;
- _decodeContext = &_rawDecodeContext;
+ _reader.set_decode_features(&_rawDecodeContext);
}
- _readContext.setDecodeContext(_decodeContext);
}
}
@@ -69,14 +66,12 @@ Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema,
ZcPosOccSeqRead::ZcPosOccSeqRead(PostingListCountFileSeqRead *countFile)
- : ZcPostingSeqRead(countFile),
+ : Zc4PostingSeqRead(countFile, true),
_fieldsParams(),
_cookedDecodeContext(&_fieldsParams),
_rawDecodeContext(&_fieldsParams)
{
- _decodeContext = &_cookedDecodeContext;
- _decodeContext->setReadContext(&_readContext);
- _readContext.setDecodeContext(_decodeContext);
+ _reader.set_decode_features(&_cookedDecodeContext);
}
@@ -84,18 +79,17 @@ void
ZcPosOccSeqRead::
setFeatureParams(const PostingListParams &params)
{
- bool oldCooked = _decodeContext == &_cookedDecodeContext;
+ bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext;
bool newCooked = oldCooked;
params.get("cooked", newCooked);
if (oldCooked != newCooked) {
if (newCooked) {
_cookedDecodeContext = _rawDecodeContext;
- _decodeContext = &_cookedDecodeContext;
+ _reader.set_decode_features(&_cookedDecodeContext);
} else {
_rawDecodeContext = _cookedDecodeContext;
- _decodeContext = &_rawDecodeContext;
+ _reader.set_decode_features(&_rawDecodeContext);
}
- _readContext.setDecodeContext(_decodeContext);
}
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
index cd21fb02f33..1e0555116ce 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
@@ -34,7 +34,7 @@ public:
};
-class ZcPosOccSeqRead : public ZcPostingSeqRead
+class ZcPosOccSeqRead : public Zc4PostingSeqRead
{
private:
bitcompression::PosOccFieldsParams _fieldsParams;
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
index e40842737c9..a0203b64197 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
@@ -29,60 +29,19 @@ using bitcompression::FeatureEncodeContextBE;
using vespalib::getLastErrorString;
-Zc4PostingSeqRead::
-Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile)
+Zc4PostingSeqRead::Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile, bool dynamic_k)
: PostingListFileSeqRead(),
- _decodeContext(),
- _docIdK(0),
- _prevDocId(0),
- _numDocs(0),
- _readContext(sizeof(uint64_t)),
+ _reader(dynamic_k),
_file(),
- _hasMore(false),
- _dynamicK(false),
- _lastDocId(0),
- _minChunkDocs(1 << 30),
- _minSkipDocs(64),
- _docIdLimit(10000000),
- _zcDocIds(),
- _l1Skip(),
- _l2Skip(),
- _l3Skip(),
- _l4Skip(),
_numWords(0),
_fileBitSize(0),
- _chunkNo(0),
- _l1SkipDocId(0),
- _l1SkipDocIdPos(0),
- _l1SkipFeaturesPos(0),
- _l2SkipDocId(0),
- _l2SkipDocIdPos(0),
- _l2SkipL1SkipPos(0),
- _l2SkipFeaturesPos(0),
- _l3SkipDocId(0),
- _l3SkipDocIdPos(0),
- _l3SkipL1SkipPos(0),
- _l3SkipL2SkipPos(0),
- _l3SkipFeaturesPos(0),
- _l4SkipDocId(0),
- _l4SkipDocIdPos(0),
- _l4SkipL1SkipPos(0),
- _l4SkipL2SkipPos(0),
- _l4SkipL3SkipPos(0),
- _l4SkipFeaturesPos(0),
- _featuresSize(0),
- _countFile(countFile),
- _headerBitLen(0),
- _rangeEndOffset(0),
- _readAheadEndOffset(0),
- _wordStart(0),
- _residue(0)
+ _countFile(countFile)
{
if (_countFile != nullptr) {
PostingListParams params;
_countFile->getParams(params);
- params.get("docIdLimit", _docIdLimit);
- params.get("minChunkDocs", _minChunkDocs);
+ params.get("docIdLimit", _reader.get_posting_params()._doc_id_limit);
+ params.get("minChunkDocs", _reader.get_posting_params()._min_chunk_docs);
}
}
@@ -91,387 +50,16 @@ Zc4PostingSeqRead::~Zc4PostingSeqRead()
{
}
-
-void
-Zc4PostingSeqRead::
-readCommonWordDocIdAndFeatures(DocIdAndFeatures &features)
-{
- if ((_zcDocIds._valI >= _zcDocIds._valE) && _hasMore) {
- readWordStart(); // Read start of next chunk
- }
- // Split docid & features.
- assert(_zcDocIds._valI < _zcDocIds._valE);
- uint32_t docIdPos = _zcDocIds.pos();
- uint32_t docId = _prevDocId + 1 + _zcDocIds.decode();
- features._docId = docId;
- _prevDocId = docId;
- assert(docId <= _lastDocId);
- if (docId > _l1SkipDocId) {
- _l1SkipDocIdPos += _l1Skip.decode() + 1;
- assert(docIdPos == _l1SkipDocIdPos);
- _l1SkipFeaturesPos += _l1Skip.decode() + 1;
- uint64_t featuresPos = _decodeContext->getReadOffset();
- assert(featuresPos == _l1SkipFeaturesPos);
- (void) featuresPos;
- if (docId > _l2SkipDocId) {
- _l2SkipDocIdPos += _l2Skip.decode() + 1;
- assert(docIdPos == _l2SkipDocIdPos);
- _l2SkipFeaturesPos += _l2Skip.decode() + 1;
- assert(featuresPos == _l2SkipFeaturesPos);
- _l2SkipL1SkipPos += _l2Skip.decode() + 1;
- assert(_l1Skip.pos() == _l2SkipL1SkipPos);
- if (docId > _l3SkipDocId) {
- _l3SkipDocIdPos += _l3Skip.decode() + 1;
- assert(docIdPos == _l3SkipDocIdPos);
- _l3SkipFeaturesPos += _l3Skip.decode() + 1;
- assert(featuresPos == _l3SkipFeaturesPos);
- _l3SkipL1SkipPos += _l3Skip.decode() + 1;
- assert(_l1Skip.pos() == _l3SkipL1SkipPos);
- _l3SkipL2SkipPos += _l3Skip.decode() + 1;
- assert(_l2Skip.pos() == _l3SkipL2SkipPos);
- if (docId > _l4SkipDocId) {
- _l4SkipDocIdPos += _l4Skip.decode() + 1;
- assert(docIdPos == _l4SkipDocIdPos);
- (void) docIdPos;
- _l4SkipFeaturesPos += _l4Skip.decode() + 1;
- assert(featuresPos == _l4SkipFeaturesPos);
- _l4SkipL1SkipPos += _l4Skip.decode() + 1;
- assert(_l1Skip.pos() == _l4SkipL1SkipPos);
- _l4SkipL2SkipPos += _l4Skip.decode() + 1;
- assert(_l2Skip.pos() == _l4SkipL2SkipPos);
- _l4SkipL3SkipPos += _l4Skip.decode() + 1;
- assert(_l3Skip.pos() == _l4SkipL3SkipPos);
- _l4SkipDocId += _l4Skip.decode() + 1;
- assert(_l4SkipDocId <= _lastDocId);
- assert(_l4SkipDocId >= docId);
- }
- _l3SkipDocId += _l3Skip.decode() + 1;
- assert(_l3SkipDocId <= _lastDocId);
- assert(_l3SkipDocId <= _l4SkipDocId);
- assert(_l3SkipDocId >= docId);
- }
- _l2SkipDocId += _l2Skip.decode() + 1;
- assert(_l2SkipDocId <= _lastDocId);
- assert(_l2SkipDocId <= _l4SkipDocId);
- assert(_l2SkipDocId <= _l3SkipDocId);
- assert(_l2SkipDocId >= docId);
- }
- _l1SkipDocId += _l1Skip.decode() + 1;
- assert(_l1SkipDocId <= _lastDocId);
- assert(_l1SkipDocId <= _l4SkipDocId);
- assert(_l1SkipDocId <= _l3SkipDocId);
- assert(_l1SkipDocId <= _l2SkipDocId);
- assert(_l1SkipDocId >= docId);
- }
- if (docId < _lastDocId) {
- // Assert more space available when not yet at last docid
- assert(_zcDocIds._valI < _zcDocIds._valE);
- } else {
- // Assert that space has been used when at last docid
- assert(_zcDocIds._valI == _zcDocIds._valE);
- // Assert that we've read to end of skip info
- assert(_l1SkipDocId == _lastDocId);
- assert(_l2SkipDocId == _lastDocId);
- assert(_l3SkipDocId == _lastDocId);
- assert(_l4SkipDocId == _lastDocId);
- if (!_hasMore) {
- _chunkNo = 0;
- }
- }
- _decodeContext->readFeatures(features);
- --_residue;
-}
-
-
-void
-Zc4PostingSeqRead::
-readDocIdAndFeatures(DocIdAndFeatures &features)
-{
- if (_residue == 0 && !_hasMore) {
- if (_rangeEndOffset != 0) {
- DecodeContext &d = *_decodeContext;
- uint64_t curOffset = d.getReadOffset();
- assert(curOffset <= _rangeEndOffset);
- if (curOffset < _rangeEndOffset) {
- readWordStart();
- }
- }
- if (_residue == 0) {
- // Don't read past end of posting list.
- features.clear(static_cast<uint32_t>(-1));
- return;
- }
- }
- if (_lastDocId > 0) {
- return readCommonWordDocIdAndFeatures(features);
- }
- // Interleaves docid & features
- typedef FeatureEncodeContextBE EC;
- DecodeContext &d = *_decodeContext;
- uint32_t length;
- uint64_t val64;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
-
- UC64BE_DECODEEXPGOLOMB_SMALL_NS(o,
- K_VALUE_ZCPOSTING_DELTA_DOCID,
- EC);
- uint32_t docId = _prevDocId + 1 + val64;
- features._docId = docId;
- _prevDocId = docId;
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= d._valE, false)) {
- _readContext.readComprBuffer();
- }
- _decodeContext->readFeatures(features);
- --_residue;
-}
-
-
-void
-Zc4PostingSeqRead::readWordStartWithSkip()
-{
- typedef FeatureEncodeContextBE EC;
- DecodeContext &d = *_decodeContext;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
- uint32_t length;
- uint64_t val64;
- const uint64_t *valE = d._valE;
-
- if (_hasMore) {
- ++_chunkNo;
- } else {
- _chunkNo = 0;
- }
- assert(_numDocs >= _minSkipDocs || _hasMore);
- bool hasMore = false;
- if (__builtin_expect(_numDocs >= _minChunkDocs, false)) {
- hasMore = static_cast<int64_t>(oVal) < 0;
- oVal <<= 1;
- length = 1;
- UC64BE_READBITS_NS(o, EC);
- }
- if (_dynamicK) {
- _docIdK = EC::calcDocIdK((_hasMore || hasMore) ? 1 : _numDocs,
- _docIdLimit);
- }
- if (_hasMore || hasMore) {
- if (_rangeEndOffset == 0) {
- assert(hasMore == (_chunkNo + 1 < _counts._segments.size()));
- assert(_numDocs == _counts._segments[_chunkNo]._numDocs);
- }
- if (hasMore) {
- assert(_numDocs >= _minSkipDocs);
- assert(_numDocs >= _minChunkDocs);
- }
- } else {
- assert(_numDocs >= _minSkipDocs);
- if (_rangeEndOffset == 0) {
- assert(_numDocs == _counts._numDocs);
- }
- }
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_DOCIDSSIZE,
- EC);
- uint32_t docIdsSize = val64 + 1;
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L1SKIPSIZE,
- EC);
- uint32_t l1SkipSize = val64;
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint32_t l2SkipSize = 0;
- if (l1SkipSize != 0) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L2SKIPSIZE,
- EC);
- l2SkipSize = val64;
- }
- uint32_t l3SkipSize = 0;
- if (l2SkipSize != 0) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L3SKIPSIZE,
- EC);
- l3SkipSize = val64;
- }
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint32_t l4SkipSize = 0;
- if (l3SkipSize != 0) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L4SKIPSIZE,
- EC);
- l4SkipSize = val64;
- }
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_FEATURESSIZE,
- EC);
- _featuresSize = val64;
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- if (_dynamicK) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- _docIdK,
- EC);
- } else {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_LASTDOCID,
- EC);
- }
- _lastDocId = _docIdLimit - 1 - val64;
- if (_hasMore || hasMore) {
- if (_rangeEndOffset == 0) {
- assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc);
- }
- }
-
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint64_t bytePad = oPreRead & 7;
- if (bytePad > 0) {
- length = bytePad;
- oVal <<= length;
- UC64BE_READBITS_NS(o, EC);
- }
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= valE, false)) {
- _readContext.readComprBuffer();
- }
- _zcDocIds.clearReserve(docIdsSize);
- _l1Skip.clearReserve(l1SkipSize);
- _l2Skip.clearReserve(l2SkipSize);
- _l3Skip.clearReserve(l3SkipSize);
- _l4Skip.clearReserve(l4SkipSize);
- _decodeContext->readBytes(_zcDocIds._valI, docIdsSize);
- _zcDocIds._valE = _zcDocIds._valI + docIdsSize;
- if (l1SkipSize > 0) {
- _decodeContext->readBytes(_l1Skip._valI, l1SkipSize);
- }
- _l1Skip._valE = _l1Skip._valI + l1SkipSize;
- if (l2SkipSize > 0) {
- _decodeContext->readBytes(_l2Skip._valI, l2SkipSize);
- }
- _l2Skip._valE = _l2Skip._valI + l2SkipSize;
- if (l3SkipSize > 0) {
- _decodeContext->readBytes(_l3Skip._valI, l3SkipSize);
- }
- _l3Skip._valE = _l3Skip._valI + l3SkipSize;
- if (l4SkipSize > 0) {
- _decodeContext->readBytes(_l4Skip._valI, l4SkipSize);
- }
- _l4Skip._valE = _l4Skip._valI + l4SkipSize;
-
- if (l1SkipSize > 0) {
- _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId;
- } else {
- _l1SkipDocId = _lastDocId;
- }
- if (l2SkipSize > 0) {
- _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId;
- } else {
- _l2SkipDocId = _lastDocId;
- }
- if (l3SkipSize > 0) {
- _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId;
- } else {
- _l3SkipDocId = _lastDocId;
- }
- if (l4SkipSize > 0) {
- _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId;
- } else {
- _l4SkipDocId = _lastDocId;
- }
- _l1SkipDocIdPos = 0;
- _l1SkipFeaturesPos = _decodeContext->getReadOffset();
- _l2SkipDocIdPos = 0;
- _l2SkipL1SkipPos = 0;
- _l2SkipFeaturesPos = _decodeContext->getReadOffset();
- _l3SkipDocIdPos = 0;
- _l3SkipL1SkipPos = 0;
- _l3SkipL2SkipPos = 0;
- _l3SkipFeaturesPos = _decodeContext->getReadOffset();
- _l4SkipDocIdPos = 0;
- _l4SkipL1SkipPos = 0;
- _l4SkipL2SkipPos = 0;
- _l4SkipL3SkipPos = 0;
- _l4SkipFeaturesPos = _decodeContext->getReadOffset();
- _hasMore = hasMore;
- // Decode context is now positioned at start of features
-}
-
-
void
-Zc4PostingSeqRead::readWordStart()
+Zc4PostingSeqRead::readDocIdAndFeatures(DocIdAndFeatures &features)
{
- typedef FeatureEncodeContextBE EC;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_);
- uint32_t length;
- uint64_t val64;
- const uint64_t *valE = _decodeContext->_valE;
-
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_NUMDOCS,
- EC);
- UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
- if (oCompr >= valE) {
- _readContext.readComprBuffer();
- }
- _numDocs = static_cast<uint32_t>(val64) + 1;
- _residue = _numDocs;
- _prevDocId = _hasMore ? _lastDocId : 0u;
- if (_rangeEndOffset == 0) {
- assert(_numDocs <= _counts._numDocs);
- assert(_numDocs == _counts._numDocs ||
- _numDocs >= _minChunkDocs ||
- _hasMore);
- }
-
- if (_numDocs >= _minSkipDocs || _hasMore) {
- readWordStartWithSkip();
- // Decode context is not positioned at start of features
- } else {
- if (_dynamicK) {
- _docIdK = EC::calcDocIdK(_numDocs, _docIdLimit);
- }
- _lastDocId = 0u;
- // Decode context is not positioned at start of docids & features
- }
+ _reader.read_doc_id_and_features(features);
}
-
void
Zc4PostingSeqRead::readCounts(const PostingListCounts &counts)
{
- assert(!_hasMore); // Previous words must have been read.
-
- _counts = counts;
-
- assert((_counts._numDocs == 0) == (_counts._bitLength == 0));
- if (_counts._numDocs > 0) {
- _wordStart = _decodeContext->getReadOffset();
- readWordStart();
- }
+ _reader.set_counts(counts);
}
@@ -484,16 +72,17 @@ Zc4PostingSeqRead::open(const vespalib::string &name,
}
bool res = _file.OpenReadOnly(name.c_str());
if (res) {
- _readContext.setFile(&_file);
- _readContext.setFileSize(_file.GetSize());
- DecodeContext &d = *_decodeContext;
- _readContext.allocComprBuf(65536u, 32768u);
+ auto &readContext = _reader.get_read_context();
+ readContext.setFile(&_file);
+ readContext.setFileSize(_file.GetSize());
+ auto &d = _reader.get_decode_features();
+ readContext.allocComprBuf(65536u, 32768u);
d.emptyBuffer(0);
- _readContext.readComprBuffer();
+ readContext.readComprBuffer();
readHeader();
if (d._valI >= d._valE) {
- _readContext.readComprBuffer();
+ readContext.readComprBuffer();
}
} else {
LOG(error, "could not open %s: %s",
@@ -506,9 +95,10 @@ Zc4PostingSeqRead::open(const vespalib::string &name,
bool
Zc4PostingSeqRead::close()
{
- _readContext.dropComprBuf();
+ auto &readContext = _reader.get_read_context();
+ readContext.dropComprBuf();
_file.Close();
- _readContext.setFile(nullptr);
+ readContext.setFile(nullptr);
return true;
}
@@ -524,29 +114,30 @@ Zc4PostingSeqRead::getParams(PostingListParams &params)
uint32_t countMinChunkDocs = 0;
countParams.get("docIdLimit", countDocIdLimit);
countParams.get("minChunkDocs", countMinChunkDocs);
- assert(_docIdLimit == countDocIdLimit);
- assert(_minChunkDocs == countMinChunkDocs);
+ assert(_reader.get_posting_params()._doc_id_limit == countDocIdLimit);
+ assert(_reader.get_posting_params()._min_chunk_docs == countMinChunkDocs);
} else {
params.clear();
- params.set("docIdLimit", _docIdLimit);
- params.set("minChunkDocs", _minChunkDocs);
+ params.set("docIdLimit", _reader.get_posting_params()._doc_id_limit);
+ params.set("minChunkDocs", _reader.get_posting_params()._min_chunk_docs);
}
- params.set("minSkipDocs", _minSkipDocs);
+ params.set("minSkipDocs", _reader.get_posting_params()._min_skip_docs);
}
void
Zc4PostingSeqRead::getFeatureParams(PostingListParams &params)
{
- _decodeContext->getParams(params);
+ _reader.get_decode_features().getParams(params);
}
void
Zc4PostingSeqRead::readHeader()
{
- FeatureDecodeContextBE &d = *_decodeContext;
- const vespalib::string &myId = _dynamicK ? myId5 : myId4;
+ FeatureDecodeContextBE &d = _reader.get_decode_features();
+ auto &posting_params = _reader.get_posting_params();
+ const vespalib::string &myId = posting_params._dynamic_k ? myId5 : myId4;
vespalib::FileHeader header;
d.readHeader(header, _file.getSize());
@@ -571,9 +162,9 @@ Zc4PostingSeqRead::readHeader()
(void) myId;
assert(header.getTag("format.1").asString() == d.getIdentifier());
_numWords = header.getTag("numWords").asInteger();
- _minChunkDocs = header.getTag("minChunkDocs").asInteger();
- _docIdLimit = header.getTag("docIdLimit").asInteger();
- _minSkipDocs = header.getTag("minSkipDocs").asInteger();
+ posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger();
+ posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger();
+ posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger();
assert(header.getTag("endian").asString() == "big");
// Read feature decoding specific subheader
d.readHeader(header, "features.");
@@ -585,38 +176,9 @@ Zc4PostingSeqRead::readHeader()
const vespalib::string &
-Zc4PostingSeqRead::getIdentifier()
-{
- return myId4;
-}
-
-
-uint64_t
-Zc4PostingSeqRead::getCurrentPostingOffset() const
+Zc4PostingSeqRead::getIdentifier(bool dynamic_k)
{
- FeatureDecodeContextBE &d = *_decodeContext;
- return d.getReadOffset() - _headerBitLen;
-}
-
-
-void
-Zc4PostingSeqRead::setPostingOffset(uint64_t offset,
- uint64_t endOffset,
- uint64_t readAheadOffset)
-{
- assert(_residue == 0); // Only to be called between posting lists
-
- FeatureDecodeContextBE &d = *_decodeContext;
-
- _rangeEndOffset = endOffset + _headerBitLen;
- _readAheadEndOffset = readAheadOffset + _headerBitLen;
- _readContext.setStopOffset(_readAheadEndOffset, false);
- uint64_t newOffset = offset + _headerBitLen;
- if (newOffset != d.getReadOffset()) {
- _readContext.setPosition(newOffset);
- assert(newOffset == d.getReadOffset());
- _readContext.readComprBuffer();
- }
+ return (dynamic_k ? myId5 : myId4);
}
@@ -809,65 +371,6 @@ getFeatureParams(PostingListParams &params)
}
-ZcPostingSeqRead::ZcPostingSeqRead(PostingListCountFileSeqRead *countFile)
- : Zc4PostingSeqRead(countFile)
-{
- _dynamicK = true;
-}
-
-
-void
-ZcPostingSeqRead::
-readDocIdAndFeatures(DocIdAndFeatures &features)
-{
- if (_residue == 0 && !_hasMore) {
- if (_rangeEndOffset != 0) {
- DecodeContext &d = *_decodeContext;
- uint64_t curOffset = d.getReadOffset();
- assert(curOffset <= _rangeEndOffset);
- if (curOffset < _rangeEndOffset) {
- readWordStart();
- }
- }
- if (_residue == 0) {
- // Don't read past end of posting list.
- features.clear(static_cast<uint32_t>(-1));
- return;
- }
- }
- if (_lastDocId > 0) {
- readCommonWordDocIdAndFeatures(features);
- return;
- }
- // Interleaves docid & features
- typedef FeatureEncodeContextBE EC;
- DecodeContext &d = *_decodeContext;
- uint32_t length;
- uint64_t val64;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
-
- UC64BE_DECODEEXPGOLOMB_SMALL_NS(o,
- _docIdK,
- EC);
- uint32_t docId = _prevDocId + 1 + val64;
- features._docId = docId;
- _prevDocId = docId;
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= d._valE, false)) {
- _readContext.readComprBuffer();
- }
- _decodeContext->readFeatures(features);
- --_residue;
-}
-
-
-const vespalib::string &
-ZcPostingSeqRead::getIdentifier()
-{
- return myId5;
-}
-
-
ZcPostingSeqWrite::ZcPostingSeqWrite(PostingListCountFileSeqWrite *countFile)
: Zc4PostingSeqWrite(countFile)
{
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
index 96cc306cea8..01049e720a9 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
@@ -3,8 +3,10 @@
#pragma once
#include "zc4_posting_writer.h"
+#include "zc4_posting_reader.h"
#include <vespa/searchlib/index/postinglistfile.h>
#include <vespa/fastos/file.h>
+#include "zc4_posting_params.h"
namespace search::index {
class PostingListCountFileSeqRead;
@@ -19,63 +21,14 @@ class Zc4PostingSeqRead : public index::PostingListFileSeqRead
Zc4PostingSeqRead &operator=(const Zc4PostingSeqRead &);
protected:
- typedef bitcompression::FeatureDecodeContextBE DecodeContext;
- typedef bitcompression::FeatureEncodeContextBE EncodeContext;
-
- DecodeContext *_decodeContext;
- uint32_t _docIdK;
- uint32_t _prevDocId; // Previous document id
- uint32_t _numDocs; // Documents in chunk or word
- search::ComprFileReadContext _readContext;
+ Zc4PostingReader<true> _reader;
FastOS_File _file;
- bool _hasMore;
- bool _dynamicK; // Caclulate EG compression parameters ?
- uint32_t _lastDocId; // last document in chunk or word
- uint32_t _minChunkDocs; // # of documents needed for chunking
- uint32_t _minSkipDocs; // # of documents needed for skipping
- uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit)
-
- ZcBuf _zcDocIds; // Document id deltas
- ZcBuf _l1Skip; // L1 skip info
- ZcBuf _l2Skip; // L2 skip info
- ZcBuf _l3Skip; // L3 skip info
- ZcBuf _l4Skip; // L4 skip info
-
uint64_t _numWords; // Number of words in file
uint64_t _fileBitSize;
- uint32_t _chunkNo; // Chunk number
-
- // Variables for validating skip information while reading
- uint32_t _l1SkipDocId;
- uint32_t _l1SkipDocIdPos;
- uint64_t _l1SkipFeaturesPos;
- uint32_t _l2SkipDocId;
- uint32_t _l2SkipDocIdPos;
- uint32_t _l2SkipL1SkipPos;
- uint64_t _l2SkipFeaturesPos;
- uint32_t _l3SkipDocId;
- uint32_t _l3SkipDocIdPos;
- uint32_t _l3SkipL1SkipPos;
- uint32_t _l3SkipL2SkipPos;
- uint64_t _l3SkipFeaturesPos;
- uint32_t _l4SkipDocId;
- uint32_t _l4SkipDocIdPos;
- uint32_t _l4SkipL1SkipPos;
- uint32_t _l4SkipL2SkipPos;
- uint32_t _l4SkipL3SkipPos;
- uint64_t _l4SkipFeaturesPos;
-
- // Variable for validating chunk information while reading
- uint64_t _featuresSize;
index::PostingListCountFileSeqRead *const _countFile;
-
uint64_t _headerBitLen; // Size of file header in bits
- uint64_t _rangeEndOffset; // End offset for word pair
- uint64_t _readAheadEndOffset; // Readahead end offset for word pair
- uint64_t _wordStart; // last word header position
- uint32_t _residue; // Number of unread documents after word header
public:
- Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile);
+ Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile, bool dynamic_k);
~Zc4PostingSeqRead();
@@ -83,11 +36,6 @@ public:
typedef index::PostingListCounts PostingListCounts;
typedef index::PostingListParams PostingListParams;
- /**
- * Read document id and features for common word.
- */
- virtual void readCommonWordDocIdAndFeatures(DocIdAndFeatures &features);
-
void readDocIdAndFeatures(DocIdAndFeatures &features) override;
void readCounts(const PostingListCounts &counts) override; // Fill in for next word
bool open(const vespalib::string &name, const TuneFileSeqRead &tuneFileRead) override;
@@ -97,28 +45,7 @@ public:
void readWordStartWithSkip();
void readWordStart();
void readHeader();
- static const vespalib::string &getIdentifier();
-
- // Methods used when generating posting list for common word pairs.
-
- /*
- * Get current posting offset, measured in bits. First posting list
- * starts at 0, i.e. file header is not accounted for here.
- *
- * @return current posting offset, measured in bits.
- */
- uint64_t getCurrentPostingOffset() const override;
-
- /**
- * Set current posting offset, measured in bits. First posting
- * list starts at 0, i.e. file header is not accounted for here.
- *
- * @param Offset start of posting lists for word pair.
- * @param endOffset end of posting lists for word pair.
- * @param readAheadOffset end of posting list for either this or a
- * later word pair, depending on disk seek cost.
- */
- void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) override;
+ static const vespalib::string &getIdentifier(bool dynamic_k);
};
@@ -161,15 +88,6 @@ public:
void updateHeader();
};
-
-class ZcPostingSeqRead : public Zc4PostingSeqRead
-{
-public:
- ZcPostingSeqRead(index::PostingListCountFileSeqRead *countFile);
- void readDocIdAndFeatures(DocIdAndFeatures &features) override;
- static const vespalib::string &getIdentifier();
-};
-
class ZcPostingSeqWrite : public Zc4PostingSeqWrite
{
public:
diff --git a/searchlib/src/vespa/searchlib/features/attributefeature.cpp b/searchlib/src/vespa/searchlib/features/attributefeature.cpp
index b3ebd0f3822..56d02ce6d4e 100644
--- a/searchlib/src/vespa/searchlib/features/attributefeature.cpp
+++ b/searchlib/src/vespa/searchlib/features/attributefeature.cpp
@@ -295,10 +295,13 @@ AttributeBlueprint::setup(const search::fef::IIndexEnvironment & env,
vespalib::string attrType = type::Attribute::lookup(env.getProperties(), _attrName);
if (!attrType.empty()) {
_tensorType = ValueType::from_spec(attrType);
+ if (_tensorType.is_error()) {
+ LOG(error, "%s: invalid type: '%s'", getName().c_str(), attrType.c_str());
+ }
}
- FeatureType output_type = _tensorType.is_tensor()
- ? FeatureType::object(_tensorType)
- : FeatureType::number();
+ FeatureType output_type = _tensorType.is_double()
+ ? FeatureType::number()
+ : FeatureType::object(_tensorType);
describeOutput("value", "The value of a single value attribute, "
"the value at the given index of an array attribute, "
"the given key of a weighted set attribute, or"
@@ -309,7 +312,7 @@ AttributeBlueprint::setup(const search::fef::IIndexEnvironment & env,
describeOutput("count", "Returns the number of elements in this array or weighted set attribute.");
}
env.hintAttributeAccess(_attrName);
- return true;
+ return !_tensorType.is_error();
}
search::fef::Blueprint::UP
diff --git a/searchlib/src/vespa/searchlib/features/constant_feature.cpp b/searchlib/src/vespa/searchlib/features/constant_feature.cpp
index 4d76512ab00..ced9d95fb33 100644
--- a/searchlib/src/vespa/searchlib/features/constant_feature.cpp
+++ b/searchlib/src/vespa/searchlib/features/constant_feature.cpp
@@ -63,8 +63,10 @@ ConstantBlueprint::setup(const IIndexEnvironment &env,
{
_key = params[0].getValue();
_value = env.getConstantValue(_key);
- if (!_value || _value->type().is_error()) {
+ if (!_value) {
LOG(error, "Constant '%s' not found", _key.c_str());
+ } else if (_value->type().is_error()) {
+ LOG(error, "Constant '%s' has invalid type", _key.c_str());
}
FeatureType output_type = _value ?
FeatureType::object(_value->type()) :
diff --git a/searchlib/src/vespa/searchlib/features/queryfeature.cpp b/searchlib/src/vespa/searchlib/features/queryfeature.cpp
index eb7eb427283..b9041901ced 100644
--- a/searchlib/src/vespa/searchlib/features/queryfeature.cpp
+++ b/searchlib/src/vespa/searchlib/features/queryfeature.cpp
@@ -98,12 +98,15 @@ QueryBlueprint::setup(const IIndexEnvironment &env, const ParameterList &params)
vespalib::string queryFeatureType = type::QueryFeature::lookup(env.getProperties(), _key);
if (!queryFeatureType.empty()) {
_valueType = ValueType::from_spec(queryFeatureType);
+ if (_valueType.is_error()) {
+ LOG(error, "%s: invalid type: '%s'", getName().c_str(), queryFeatureType.c_str());
+ }
}
- FeatureType output_type = _valueType.is_tensor()
- ? FeatureType::object(_valueType)
- : FeatureType::number();
+ FeatureType output_type = _valueType.is_double()
+ ? FeatureType::number()
+ : FeatureType::object(_valueType);
describeOutput("out", "The value looked up in query properties using the given key.", output_type);
- return true;
+ return !_valueType.is_error();
}
namespace {
diff --git a/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp b/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp
index 72865d042e7..b2c8c64d55a 100644
--- a/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp
+++ b/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp
@@ -239,9 +239,6 @@ RankingExpressionBlueprint::setup(const fef::IIndexEnvironment &env,
LOG(error, "rank expression contains type errors: %s\n", script.c_str());
return false;
}
- if (root_type.is_any()) {
- LOG(warning, "rank expression could produce run-time type errors: %s\n", script.c_str());
- }
auto compile_issues = CompiledFunction::detect_issues(rank_function);
auto interpret_issues = InterpretedFunction::detect_issues(rank_function);
if (do_compile && compile_issues && !interpret_issues) {
diff --git a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp
index 513c542637d..07b4da8a85f 100644
--- a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp
+++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp
@@ -7,15 +7,15 @@ LOG_SETUP(".index.docidandfeatures");
namespace search::index {
DocIdAndFeatures::DocIdAndFeatures()
- : _docId(0),
- _wordDocFeatures(),
+ : _doc_id(0),
_elements(),
- _wordPositions(),
+ _word_positions(),
_blob(),
- _bitOffset(0u),
- _bitLength(0u),
- _raw(false)
-{ }
+ _bit_offset(0u),
+ _bit_length(0u),
+ _has_raw_data(false)
+{
+}
DocIdAndFeatures::DocIdAndFeatures(const DocIdAndFeatures &) = default;
DocIdAndFeatures & DocIdAndFeatures::operator = (const DocIdAndFeatures &) = default;
diff --git a/searchlib/src/vespa/searchlib/index/docidandfeatures.h b/searchlib/src/vespa/searchlib/index/docidandfeatures.h
index d1d44f78aa6..a063712a79e 100644
--- a/searchlib/src/vespa/searchlib/index/docidandfeatures.h
+++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.h
@@ -7,57 +7,25 @@
namespace search::index {
-/*
+/**
* The following feature classes are not self contained. To reduce
* memory allocator pressure, the DocIdAndFeatures class contains a
* flattened representation of the features at different levels.
*/
-/*
- * (word, doc) features.
- *
- * Present as member in DocIdAndFeatures.
- */
-class WordDocFeatures {
-public:
- // TODO: add support for user features
-
- WordDocFeatures() { }
- void clear() { }
-};
-
-/*
- * (word, doc, field) features.
- *
- * Present as vector element in DocIdAndFeatures.
- */
-class WordDocFieldFeatures {
-public:
- uint32_t _numElements; // Number of array indexes
- // TODO: add support for user features
-
- WordDocFieldFeatures()
- : _numElements(0u)
- {}
-
- uint32_t getNumElements() const { return _numElements; }
- void setNumElements(uint32_t numElements) { _numElements = numElements; }
- void incNumElements() { ++_numElements; }
-};
-
-/*
- * (word, doc, field, element) features.
+/**
+ * (word, doc, element) features.
*
* Present as vector element in DocIdAndFeatures.
*/
class WordDocElementFeatures {
-public:
+private:
uint32_t _elementId; // Array index
uint32_t _numOccs;
int32_t _weight;
uint32_t _elementLen;
- // TODO: add support for user features
+public:
WordDocElementFeatures()
: _elementId(0u),
_numOccs(0u),
@@ -93,16 +61,16 @@ public:
void incNumOccs() { ++_numOccs; }
};
-/*
- * (word, doc, field, element, wordpos) features.
+/**
+ * (word, doc, element, wordpos) features.
*
* Present as vector element in DocIdAndFeatures.
*/
class WordDocElementWordPosFeatures {
-public:
+private:
uint32_t _wordPos;
- // TODO: add support for user features
+public:
WordDocElementWordPosFeatures()
: _wordPos(0u)
{}
@@ -116,30 +84,27 @@ public:
};
/**
- * Class for minimal common representation of features available for a
- * (word, doc) pair, used by index fusion to shuffle information from
+ * Class for minimal common representation of features available for a (word, doc) pair.
+ *
+ * Used in memory index and disk index posting lists and by index fusion to shuffle information from
* input files to the output file without having to know all the details.
*/
class DocIdAndFeatures {
public:
- uint32_t _docId; // Current Docid
- // generic feature data, flattened to avoid excessive allocator usage
- WordDocFeatures _wordDocFeatures;
+ using RawData = std::vector<uint64_t>;
+
+protected:
+ uint32_t _doc_id; // Current document id
std::vector<WordDocElementFeatures> _elements;
- std::vector<WordDocElementWordPosFeatures> _wordPositions;
-#ifdef notyet
- // user blobs (packed)
- UserFeatures _userFeatures;
- // TODO: Determine how to handle big endian versus little endian user
- // features, and whether set of user features is contiguous in file or
- // interleaved with predefined features (word position, word weight)
-#endif
- // raw data (file format specific, packed)
- std::vector<uint64_t> _blob; // Feature data for (word, docid) pair
- uint32_t _bitOffset; // Offset of feature start ([0..63])
- uint32_t _bitLength; // Length of features
- bool _raw; //
+ std::vector<WordDocElementWordPosFeatures> _word_positions;
+ // Raw data (file format specific, packed)
+ RawData _blob; // Feature data for (word, docid) pair
+ uint32_t _bit_offset; // Offset of feature start ([0..63])
+ uint32_t _bit_length; // Length of features
+ bool _has_raw_data;
+
+public:
DocIdAndFeatures();
DocIdAndFeatures(const DocIdAndFeatures &);
DocIdAndFeatures & operator = (const DocIdAndFeatures &);
@@ -147,37 +112,49 @@ public:
DocIdAndFeatures & operator = (DocIdAndFeatures &&) = default;
~DocIdAndFeatures();
- void clearFeatures() {
- _wordDocFeatures.clear();
+ void clear_features() {
_elements.clear();
- _wordPositions.clear();
- _bitOffset = 0u;
- _bitLength = 0u;
+ _word_positions.clear();
+ _bit_offset = 0u;
+ _bit_length = 0u;
_blob.clear();
}
- void clearFeatures(uint32_t bitOffset) {
- _wordDocFeatures.clear();
+ void clear_features(uint32_t bit_offset) {
_elements.clear();
- _wordPositions.clear();
- _bitOffset = bitOffset;
- _bitLength = 0u;
+ _word_positions.clear();
+ _bit_offset = bit_offset;
+ _bit_length = 0u;
_blob.clear();
}
- void clear(uint32_t docId) {
- _docId = docId;
- clearFeatures();
+ void clear(uint32_t doc_id) {
+ _doc_id = doc_id;
+ clear_features();
}
- void clear(uint32_t docId, uint32_t bitOffset) {
- _docId = docId;
- clearFeatures(bitOffset);
+ void clear(uint32_t doc_id, uint32_t bit_offset) {
+ _doc_id = doc_id;
+ clear_features(bit_offset);
}
- void setRaw(bool raw) { _raw = raw; }
- bool getRaw() const { return _raw; }
+ uint32_t doc_id() const { return _doc_id; }
+ void set_doc_id(uint32_t val) { _doc_id = val; }
+
+ const std::vector<WordDocElementFeatures>& elements() const { return _elements; }
+ std::vector<WordDocElementFeatures>& elements() { return _elements; }
+
+ const std::vector<WordDocElementWordPosFeatures>& word_positions() const { return _word_positions; }
+ std::vector<WordDocElementWordPosFeatures>& word_positions() { return _word_positions; }
+
+ const RawData& blob() const { return _blob; }
+ RawData& blob() { return _blob; }
+ uint32_t bit_offset() const { return _bit_offset; }
+ uint32_t bit_length() const { return _bit_length; }
+ void set_bit_length(uint32_t val) { _bit_length = val; }
+ bool has_raw_data() const { return _has_raw_data; }
+ void set_has_raw_data(bool val) { _has_raw_data = val; }
};
}
diff --git a/searchlib/src/vespa/searchlib/index/indexbuilder.cpp b/searchlib/src/vespa/searchlib/index/indexbuilder.cpp
index 6b88c51e6cc..d585238107a 100644
--- a/searchlib/src/vespa/searchlib/index/indexbuilder.cpp
+++ b/searchlib/src/vespa/searchlib/index/indexbuilder.cpp
@@ -6,7 +6,8 @@ namespace search::index {
IndexBuilder::IndexBuilder(const Schema &schema)
: _schema(schema)
-{ }
+{
+}
IndexBuilder::~IndexBuilder() = default;
diff --git a/searchlib/src/vespa/searchlib/index/indexbuilder.h b/searchlib/src/vespa/searchlib/index/indexbuilder.h
index 66ca740a20c..cf9df4bd154 100644
--- a/searchlib/src/vespa/searchlib/index/indexbuilder.h
+++ b/searchlib/src/vespa/searchlib/index/indexbuilder.h
@@ -5,9 +5,18 @@
namespace search::index {
+class DocIdAndFeatures;
class Schema;
class WordDocElementWordPosFeatures;
+/**
+ * Interface used to build an index for the set of index fields specified in a schema.
+ *
+ * The index should be built as follows:
+ * For each field add the set of unique words in sorted order.
+ * For each word add the set of document ids in sorted order.
+ * For each document id add the position information for that document.
+ */
class IndexBuilder {
protected:
const Schema &_schema;
@@ -15,39 +24,12 @@ protected:
public:
IndexBuilder(const Schema &schema);
- virtual
- ~IndexBuilder();
-
- virtual void
- startWord(vespalib::stringref word) = 0;
-
- virtual void
- endWord() = 0;
-
- virtual void
- startDocument(uint32_t docId) = 0;
-
- virtual void
- endDocument() = 0;
-
- virtual void
- startField(uint32_t fieldId) = 0;
-
- virtual void
- endField() = 0;
-
- virtual void
- startElement(uint32_t elementId, int32_t weight, uint32_t elementLen) = 0;
-
- virtual void
- endElement() = 0;
-
- virtual void
- addOcc(const WordDocElementWordPosFeatures &features) = 0;
-
- // TODO: methods for attribute vectors.
-
- // TODO: methods for document summary.
+ virtual ~IndexBuilder();
+ virtual void startField(uint32_t fieldId) = 0;
+ virtual void endField() = 0;
+ virtual void startWord(vespalib::stringref word) = 0;
+ virtual void endWord() = 0;
+ virtual void add_document(const DocIdAndFeatures &features) = 0;
};
}
diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp
index 0f0860f9145..52c6b85a0b8 100644
--- a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp
+++ b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp
@@ -6,8 +6,6 @@
namespace search::index {
PostingListFileSeqRead::PostingListFileSeqRead()
- : _counts(),
- _residueDocs(0)
{
}
diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.h b/searchlib/src/vespa/searchlib/index/postinglistfile.h
index 194ac519a19..1e7dde7f139 100644
--- a/searchlib/src/vespa/searchlib/index/postinglistfile.h
+++ b/searchlib/src/vespa/searchlib/index/postinglistfile.h
@@ -19,9 +19,6 @@ class DocIdAndFeatures;
* for words.
*/
class PostingListFileSeqRead {
-protected:
- PostingListCounts _counts;
- unsigned int _residueDocs; // Docids left to read for word
public:
PostingListFileSeqRead();
@@ -63,34 +60,6 @@ public:
* Get current (word, docid) feature parameters.
*/
virtual void getFeatureParams(PostingListParams &params);
-
- // Methods used when generating posting list for common word pairs.
-
- /*
- * Get current posting offset, measured in bits. First posting list
- * starts at 0, i.e. file header is not accounted for here.
- *
- * @return current posting offset, measured in bits.
- */
- virtual uint64_t getCurrentPostingOffset() const = 0;
-
- /**
- * Set current posting offset, measured in bits. First posting
- * list starts at 0, i.e. file header is not accounted for here.
- *
- * @param Offset start of posting lists for word pair.
- * @param endOffset end of posting lists for word pair.
- * @param readAheadOffset end of posting list for either this or a
- * later word pair, depending on disk seek cost.
- */
- virtual void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) = 0;
-
- /**
- * Get counts read by last readCounts().
- */
- const PostingListCounts &getCounts() const { return _counts; }
-
- PostingListCounts &getCounts() { return _counts; }
};
/**
diff --git a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp
index 974fcc01c36..1d55ed76a09 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp
@@ -21,7 +21,7 @@ FeatureStore::writeFeatures(uint32_t packedIndex, const DocIdAndFeatures &featur
oldOffset = 0;
assert(_f.getWriteOffset() == oldOffset);
}
- assert(!features.getRaw());
+ assert(!features.has_raw_data());
_f.writeFeatures(features);
return oldOffset;
}
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
index 7d10895c32f..e79cab28dec 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
@@ -169,23 +169,10 @@ FieldIndex::dump(search::index::IndexBuilder & indexBuilder)
for (; pitr.valid(); ++pitr) {
uint32_t docId = pitr.getKey();
EntryRef featureRef(pitr.getData());
- indexBuilder.startDocument(docId);
_featureStore.setupForReadFeatures(featureRef, decoder);
decoder.readFeatures(features);
- size_t poff = 0;
- uint32_t wpIdx = 0u;
- size_t numElements = features._elements.size();
- for (size_t i = 0; i < numElements; ++i) {
- const WordDocElementFeatures & fef = features._elements[i];
- indexBuilder.startElement(fef.getElementId(), fef.getWeight(), fef.getElementLen());
- for (size_t j = 0; j < fef.getNumOccs(); ++j, ++wpIdx) {
- assert(wpIdx == poff + j);
- indexBuilder.addOcc(features._wordPositions[poff + j]);
- }
- poff += fef.getNumOccs();
- indexBuilder.endElement();
- }
- indexBuilder.endDocument();
+ features.set_doc_id(docId);
+ indexBuilder.add_document(features);
}
} else {
const PostingListKeyDataType *kd =
@@ -194,23 +181,10 @@ FieldIndex::dump(search::index::IndexBuilder & indexBuilder)
for (; kd != kde; ++kd) {
uint32_t docId = kd->_key;
EntryRef featureRef(kd->getData());
- indexBuilder.startDocument(docId);
_featureStore.setupForReadFeatures(featureRef, decoder);
decoder.readFeatures(features);
- size_t poff = 0;
- uint32_t wpIdx = 0u;
- size_t numElements = features._elements.size();
- for (size_t i = 0; i < numElements; ++i) {
- const WordDocElementFeatures & fef = features._elements[i];
- indexBuilder.startElement(fef.getElementId(), fef.getWeight(), fef.getElementLen());
- for (size_t j = 0; j < fef.getNumOccs(); ++j, ++wpIdx) {
- assert(wpIdx == poff + j);
- indexBuilder.addOcc(features._wordPositions[poff + j]);
- }
- poff += fef.getNumOccs();
- indexBuilder.endElement();
- }
- indexBuilder.endDocument();
+ features.set_doc_id(docId);
+ indexBuilder.add_document(features);
}
}
indexBuilder.endWord();
diff --git a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp
index f0bb1eb6519..1e25878a33e 100644
--- a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp
+++ b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp
@@ -7,6 +7,7 @@
namespace search::diskindex {
+using index::DocIdAndFeatures;
using index::DummyFileHeaderContext;
using index::Schema;
using index::WordDocElementWordPosFeatures;
@@ -17,13 +18,17 @@ struct Builder
search::diskindex::IndexBuilder _ib;
TuneFileIndexing _tuneFileIndexing;
DummyFileHeaderContext _fileHeaderContext;
+ DocIdAndFeatures _features;
Builder(const std::string &dir,
const Schema &s,
uint32_t docIdLimit,
uint64_t numWordIds,
bool directio)
- : _ib(s)
+ : _ib(s),
+ _tuneFileIndexing(),
+ _fileHeaderContext(),
+ _features()
{
if (directio) {
_tuneFileIndexing._read.setWantDirectIO();
@@ -37,11 +42,11 @@ struct Builder
void
addDoc(uint32_t docId)
{
- _ib.startDocument(docId);
- _ib.startElement(0, 1, 1);
- _ib.addOcc(WordDocElementWordPosFeatures(0));
- _ib.endElement();
- _ib.endDocument();
+ _features.clear(docId);
+ _features.elements().emplace_back(0, 1, 1);
+ _features.elements().back().setNumOccs(1);
+ _features.word_positions().emplace_back(0);
+ _ib.add_document(_features);
}
void
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp
index 9cbbd136148..d59417a1e78 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp
@@ -206,7 +206,7 @@ FakeMemTreeOccMgr::add(uint32_t wordIdx, index::DocIdAndFeatures &features)
_featureSizes[wordIdx] += RefType::align((r.second + 7) / 8) * 8;
- _unflushed.push_back(PendingOp(wordIdx, features._docId, r.first));
+ _unflushed.push_back(PendingOp(wordIdx, features.doc_id(), r.first));
if (_unflushed.size() >= 10000)
flush();
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp
index 1fa518af28f..8f6c16658c9 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp
@@ -584,7 +584,7 @@ FakeWord::validate(FieldReader &fieldReader,
for (residue = numDocs; residue > 0; --residue) {
assert(fieldReader._wordNum == wordNum);
DocIdAndFeatures &features(fieldReader._docIdAndFeatures);
- docId = features._docId;
+ docId = features.doc_id();
assert(d != de);
assert(d->_docId == docId);
if (matchData.valid()) {
@@ -598,15 +598,15 @@ FakeWord::validate(FieldReader &fieldReader,
typedef WordDocElementWordPosFeatures Positions;
std::vector<Elements>::const_iterator element =
- features._elements.begin();
+ features.elements().begin();
std::vector<Positions>::const_iterator position =
- features._wordPositions.begin();
+ features.word_positions().begin();
TermFieldMatchData *tfmd = matchData[0];
assert(tfmd != 0);
- tfmd->reset(features._docId);
+ tfmd->reset(features.doc_id());
- uint32_t elementResidue = features._elements.size();
+ uint32_t elementResidue = features.elements().size();
while (elementResidue != 0) {
uint32_t positionResidue = element->getNumOccs();
while (positionResidue != 0) {
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
index 3d4567ed2ab..f6c6e5a64f3 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
@@ -3,9 +3,10 @@
#include "fakezcfilterocc.h"
#include "fpfactory.h"
#include <vespa/searchlib/diskindex/zcposocciterators.h>
-#include <vespa/searchlib/diskindex/zc4_posting_writer.h>
#include <vespa/searchlib/diskindex/zc4_posting_header.h>
#include <vespa/searchlib/diskindex/zc4_posting_params.h>
+#include <vespa/searchlib/diskindex/zc4_posting_reader.h>
+#include <vespa/searchlib/diskindex/zc4_posting_writer.h>
using search::fef::TermFieldMatchData;
using search::fef::TermFieldMatchDataArray;
@@ -125,10 +126,12 @@ void
FakeZcFilterOcc::setup(const FakeWord &fw, bool doFeatures,
bool dynamicK)
{
- if (_bigEndian)
+ if (_bigEndian) {
setupT<true>(fw, doFeatures, dynamicK);
- else
+ } else {
setupT<false>(fw, doFeatures, dynamicK);
+ }
+ validate_read(fw, doFeatures, dynamicK);
}
@@ -208,7 +211,7 @@ FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_d
decode_context.setPosition({ _compressed.first, 0 });
Zc4PostingParams params(min_skip_docs, min_chunk_docs, _docIdLimit, dynamicK, doFeatures);
Zc4PostingHeader header;
- header.read<bigEndian>(decode_context, params);
+ header.read(decode_context, params);
_docIdsSize = header._doc_ids_size;
_l1SkipSize = header._l1_skip_size;
_l2SkipSize = header._l2_skip_size;
@@ -219,6 +222,63 @@ FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_d
}
+void
+FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const
+{
+ if (_bigEndian) {
+ validate_read<true>(fw, encode_features, dynamic_k);
+ } else {
+ validate_read<false>(fw, encode_features, dynamic_k);
+ }
+}
+
+template <bool bigEndian>
+void
+FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const
+{
+ bitcompression::EGPosOccDecodeContextCooked<bigEndian> decode_context_dynamic_k(&_fieldsParams);
+ bitcompression::EG2PosOccDecodeContextCooked<bigEndian> decode_context_static_k(&_fieldsParams);
+ bitcompression::FeatureDecodeContext<bigEndian> &decode_context_dynamic_k_upcast = decode_context_dynamic_k;
+ bitcompression::FeatureDecodeContext<bigEndian> &decode_context_static_k_upcast = decode_context_static_k;
+ bitcompression::FeatureDecodeContext<bigEndian> &decode_context = dynamic_k ? decode_context_dynamic_k_upcast : decode_context_static_k_upcast;
+ Zc4PostingReader<bigEndian> reader(dynamic_k);
+ reader.set_decode_features(&decode_context);
+ auto &params = reader.get_posting_params();
+ params._min_skip_docs = 1;
+ params._min_chunk_docs = 1000000000;
+ params._doc_id_limit = _docIdLimit;
+ params._encode_features = encode_features;
+ reader.get_read_context().reference_compressed_buffer(_compressed.first, _compressed.second);
+ assert(decode_context.getReadOffset() == 0u);
+ PostingListCounts counts;
+ counts._bitLength = _compressedBits;
+ counts._numDocs = _hitDocs;
+ reader.set_counts(counts);
+ auto word_pos_iterator(fw._wordPosFeatures.begin());
+ auto word_pos_iterator_end(fw._wordPosFeatures.end());
+ DocIdAndPosOccFeatures check_features;
+ DocIdAndFeatures features;
+ uint32_t hits = 0;
+ for (const auto &doc : fw._postings) {
+ if (encode_features) {
+ fw.setupFeatures(doc, &*word_pos_iterator, check_features);
+ word_pos_iterator += doc._positions;
+ } else {
+ check_features.clear(doc._docId);
+ }
+ reader.read_doc_id_and_features(features);
+ assert(features.doc_id() == doc._docId);
+ assert(features.elements().size() == check_features.elements().size());
+ assert(features.word_positions().size() == check_features.word_positions().size());
+ ++hits;
+ }
+ if (encode_features) {
+ assert(word_pos_iterator == word_pos_iterator_end);
+ }
+ reader.read_doc_id_and_features(features);
+ assert(static_cast<int32_t>(features.doc_id()) == -1);
+}
+
FakeZcFilterOcc::~FakeZcFilterOcc()
{
free(_compressedMalloc);
@@ -369,7 +429,7 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end)
DecodeContext &d = _decodeContext;
Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false);
Zc4PostingHeader header;
- header.read<true>(d, params);
+ header.read(d, params);
assert((d.getBitOffset() & 7) == 0);
const uint8_t *bcompr = d.getByteCompr();
_valI = bcompr;
@@ -590,7 +650,7 @@ initRange(uint32_t begin, uint32_t end)
DecodeContext &d = _decodeContext;
Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false);
Zc4PostingHeader header;
- header.read<true>(d, params);
+ header.read(d, params);
_lastDocId = header._last_doc_id;
assert((d.getBitOffset() & 7) == 0);
const uint8_t *bcompr = d.getByteCompr();
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
index b68e3866461..36738a0f5a8 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
@@ -40,6 +40,10 @@ protected:
template <bool bigEndian>
void read_header(bool do_features, bool dynamic_k, uint32_t min_skip_docs, uint32_t min_chunk_docs);
+ void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const;
+ template <bool bigEndian>
+ void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const;
+
public:
FakeZcFilterOcc(const FakeWord &fw);
FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, const char *nameSuffix);
diff --git a/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h b/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h
index 08473f9fc6c..a341e36045e 100644
--- a/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h
+++ b/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h
@@ -53,9 +53,9 @@ public:
_ss << "a=" << docId;
if (_verbose) {
_ss << "(";
- auto wpi = features._wordPositions.begin();
+ auto wpi = features.word_positions().begin();
bool firstElement = true;
- for (auto &el : features._elements) {
+ for (auto &el : features.elements()) {
if (!firstElement) {
_ss << ",";
}
diff --git a/searchlib/src/vespa/searchlib/test/memoryindex/wrap_inserter.h b/searchlib/src/vespa/searchlib/test/memoryindex/wrap_inserter.h
new file mode 100644
index 00000000000..eeb09898aa2
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/test/memoryindex/wrap_inserter.h
@@ -0,0 +1,64 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchlib/memoryindex/field_index_collection.h>
+#include <vespa/searchlib/memoryindex/ordered_field_index_inserter.h>
+
+namespace search::memoryindex::test {
+
+/**
+ * Test class used to populate a FieldIndex.
+ */
+class WrapInserter {
+private:
+ OrderedFieldIndexInserter& _inserter;
+
+public:
+ WrapInserter(FieldIndexCollection& field_indexes, uint32_t field_id)
+ : _inserter(field_indexes.getFieldIndex(field_id)->getInserter())
+ {
+ }
+
+ WrapInserter(FieldIndex& field_index)
+ : _inserter(field_index.getInserter())
+ {
+ }
+
+ WrapInserter& word(vespalib::stringref word_) {
+ _inserter.setNextWord(word_);
+ return *this;
+ }
+
+ WrapInserter& add(uint32_t doc_id, const index::DocIdAndFeatures& features) {
+ _inserter.add(doc_id, features);
+ return *this;
+ }
+
+ WrapInserter& add(uint32_t doc_id) {
+ index::DocIdAndPosOccFeatures features;
+ features.addNextOcc(0, 0, 1, 1);
+ return add(doc_id, features);
+ }
+
+ WrapInserter& remove(uint32_t doc_id) {
+ _inserter.remove(doc_id);
+ return *this;
+ }
+
+ WrapInserter& flush() {
+ _inserter.flush();
+ return *this;
+ }
+
+ WrapInserter& rewind() {
+ _inserter.rewind();
+ return *this;
+ }
+
+ datastore::EntryRef getWordRef() {
+ return _inserter.getWordRef();
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp
index 155bb194f97..400a93acd26 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.cpp
+++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp
@@ -408,6 +408,25 @@ ComprFileReadContext::referenceWriteContext(const ComprFileWriteContext &rhs)
}
}
+void
+ComprFileReadContext::reference_compressed_buffer(void *buffer, size_t usedUnits)
+{
+ ComprFileDecodeContext *d = getDecodeContext();
+
+ _comprBuf = buffer;
+ _comprBufSize = usedUnits;
+ setBufferEndFilePos(static_cast<uint64_t>(usedUnits) * _unitSize);
+ setFileSize(static_cast<uint64_t>(usedUnits) * _unitSize);
+ if (d != NULL) {
+ d->afterRead(_comprBuf,
+ usedUnits,
+ static_cast<uint64_t>(usedUnits) * _unitSize,
+ false);
+ d->setupBits(0);
+ setBitOffset(-1);
+ assert(d->getBitPosV() == 0);
+ }
+}
ComprFileWriteContext::
ComprFileWriteContext(ComprFileEncodeContext &encodeContext)
diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h
index d4de1d305fa..431126dee47 100644
--- a/searchlib/src/vespa/searchlib/util/comprfile.h
+++ b/searchlib/src/vespa/searchlib/util/comprfile.h
@@ -137,6 +137,7 @@ public:
* long as rhs is live and unchanged.
*/
void referenceWriteContext(const ComprFileWriteContext &rhs);
+ void reference_compressed_buffer(void *buffer, size_t usedUnits);
};