diff options
Diffstat (limited to 'searchlib/src/tests/predicate/predicate_index_test.cpp')
-rw-r--r-- | searchlib/src/tests/predicate/predicate_index_test.cpp | 363 |
1 files changed, 363 insertions, 0 deletions
diff --git a/searchlib/src/tests/predicate/predicate_index_test.cpp b/searchlib/src/tests/predicate/predicate_index_test.cpp new file mode 100644 index 00000000000..b22c80294d0 --- /dev/null +++ b/searchlib/src/tests/predicate/predicate_index_test.cpp @@ -0,0 +1,363 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Unit tests for predicate_index. + +#include <vespa/log/log.h> +LOG_SETUP("predicate_index_test"); +#include <vespa/fastos/fastos.h> + +#include <vespa/searchlib/predicate/predicate_index.h> +#include <vespa/searchlib/predicate/predicate_tree_annotator.h> +#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/searchlib/attribute/predicate_attribute.h> + +using namespace search; +using namespace search::predicate; +using std::make_pair; +using std::pair; +using std::vector; + +namespace { + +struct DummyDocIdLimitProvider : public DocIdLimitProvider { + virtual uint32_t getDocIdLimit() const { return 10000; } + virtual uint32_t getCommittedDocIdLimit() const { return 10000; } +}; + +vespalib::GenerationHandler generation_handler; +vespalib::GenerationHolder generation_holder; +DummyDocIdLimitProvider dummy_provider; +SimpleIndexConfig simple_index_config; + +TEST("require that PredicateIndex can index empty documents") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_EQUAL(0u, index.getZeroConstraintDocs().size()); + index.indexEmptyDocument(2); + index.commit(); + EXPECT_EQUAL(1u, index.getZeroConstraintDocs().size()); +} + +TEST("require that indexDocument don't index empty documents") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_EQUAL(0u, index.getZeroConstraintDocs().size()); + PredicateTreeAnnotations annotations; + index.indexDocument(3, annotations); + index.commit(); + EXPECT_EQUAL(0u, index.getZeroConstraintDocs().size()); +} + 
+TEST("require that PredicateIndex can remove empty documents") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_EQUAL(0u, index.getZeroConstraintDocs().size()); + index.indexEmptyDocument(2); + index.commit(); + EXPECT_EQUAL(1u, index.getZeroConstraintDocs().size()); + index.removeDocument(2); + index.commit(); + EXPECT_EQUAL(0u, index.getZeroConstraintDocs().size()); +} + +TEST("require that indexing the same empty document multiple times is ok") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_EQUAL(0u, index.getZeroConstraintDocs().size()); + index.indexEmptyDocument(2); + index.commit(); + EXPECT_EQUAL(1u, index.getZeroConstraintDocs().size()); + index.indexEmptyDocument(2); + index.commit(); + EXPECT_EQUAL(1u, index.getZeroConstraintDocs().size()); +} + +void indexFeature(PredicateIndex &attr, uint32_t doc_id, int min_feature, + const vector<pair<uint64_t, Interval>> &intervals, + const vector<pair<uint64_t, IntervalWithBounds>> &bounds) { + PredicateTreeAnnotations annotations(min_feature); + for (auto &p : intervals) { + annotations.interval_map[p.first] = std::vector<Interval>{{p.second}}; + annotations.features.push_back(p.first); + } + for (auto &p : bounds) { + annotations.bounds_map[p.first] = + std::vector<IntervalWithBounds>{{p.second}}; + annotations.features.push_back(p.first); + } + attr.indexDocument(doc_id, annotations); +} + +PredicateIndex::BTreeIterator +lookupPosting(const PredicateIndex &index, uint64_t hash) { + const auto &interval_index = index.getIntervalIndex(); + auto it = interval_index.lookup(hash); + ASSERT_TRUE(it.valid()); + auto entry = it.getData(); + EXPECT_TRUE(entry.valid()); + + auto posting_it = interval_index.getBTreePostingList(entry); + ASSERT_TRUE(posting_it.valid()); + return posting_it; +} + +const int min_feature = 3; +const int k = min_feature - 1; +const uint32_t doc_id = 2; +const 
uint64_t hash = 0x12345; +const uint64_t hash2 = 0x3456; +const Interval interval = {0x0001ffff}; +const IntervalWithBounds bounds = {0x0001ffff, 0x03}; +Interval single_buf; + +TEST("require that PredicateIndex can index document") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid()); + indexFeature(index, doc_id, min_feature, {{hash, interval}}, {}); + index.commit(); + + auto posting_it = lookupPosting(index, hash); + EXPECT_EQUAL(doc_id, posting_it.getKey()); + uint32_t size; + const auto &interval_list = + index.getIntervalStore().get(posting_it.getData(), size, &single_buf); + ASSERT_EQUAL(1u, size); + EXPECT_EQUAL(interval, interval_list[0]); +} + +TEST("require that PredicateIndex can index document with bounds") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid()); + indexFeature(index, doc_id, min_feature, {}, {{hash, bounds}}); + index.commit(); + + const auto &bounds_index = index.getBoundsIndex(); + auto it = bounds_index.lookup(hash); + ASSERT_TRUE(it.valid()); + auto entry = it.getData(); + EXPECT_TRUE(entry.valid()); + + auto posting_it = bounds_index.getBTreePostingList(entry); + ASSERT_TRUE(posting_it.valid()); + EXPECT_EQUAL(doc_id, posting_it.getKey()); + + uint32_t size; + IntervalWithBounds single; + const auto &interval_list = + index.getIntervalStore().get(posting_it.getData(), size, &single); + ASSERT_EQUAL(1u, size); + EXPECT_EQUAL(bounds, interval_list[0]); +} + +TEST("require that PredicateIndex can index multiple documents " + "with the same feature") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid()); + for (uint32_t id = 1; id < 100; ++id) { + indexFeature(index, id, min_feature, {{hash, 
interval}}, {}); + } + index.commit(); + + auto posting_it = lookupPosting(index, hash); + for (uint32_t id = 1; id < 100; ++id) { + ASSERT_TRUE(posting_it.valid()); + EXPECT_EQUAL(id, posting_it.getKey()); + uint32_t size; + const auto &interval_list = index.getIntervalStore().get( + posting_it.getData(), size, &single_buf); + ASSERT_EQUAL(1u, size); + EXPECT_EQUAL(interval, interval_list[0]); + ++posting_it; + } + ASSERT_FALSE(posting_it.valid()); +} + +TEST("require that PredicateIndex can remove indexed documents") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid()); + indexFeature(index, doc_id, min_feature, + {{hash, interval}}, {{hash2, bounds}}); + index.removeDocument(doc_id); + index.commit(); + auto it = index.getIntervalIndex().lookup(hash); + ASSERT_FALSE(it.valid()); + auto it2 = index.getBoundsIndex().lookup(hash2); + ASSERT_FALSE(it2.valid()); + + // Remove again. Nothing should happen. 
+ index.removeDocument(doc_id); +} + +TEST("require that PredicateIndex can remove multiple documents") { + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + const auto &interval_index = index.getIntervalIndex(); + EXPECT_FALSE(interval_index.lookup(hash).valid()); + for (uint32_t id = 1; id < 100; ++id) { + indexFeature(index, id, min_feature, {{hash, interval}}, {}); + } + index.commit(); + for (uint32_t id = 1; id < 110; ++id) { + index.removeDocument(id); + index.commit(); + auto it = interval_index.lookup(hash); + if (id < 99) { + ASSERT_TRUE(it.valid()); + } else { + ASSERT_FALSE(it.valid()); + } + } +} + +TEST("require that PredicateIndex can remove multiple documents with " + "multiple features") { + vector<pair<uint64_t, Interval>> intervals; + vector<pair<uint64_t, IntervalWithBounds>> bounds_intervals; + for (int i = 0; i < 100; ++i) { + intervals.push_back(make_pair(hash + i, interval)); + bounds_intervals.push_back(make_pair(hash2 + i, bounds)); + } + PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10); + const auto &interval_index = index.getIntervalIndex(); + EXPECT_FALSE(interval_index.lookup(hash).valid()); + for (uint32_t id = 1; id < 100; ++id) { + indexFeature(index, id, id, intervals, bounds_intervals); + } + index.commit(); + for (uint32_t id = 1; id < 100; ++id) { + index.removeDocument((id + 50) % 99 + 1); + index.commit(); + auto it = interval_index.lookup(hash); + if (id < 99) { + ASSERT_TRUE(it.valid()); + } else { + ASSERT_FALSE(it.valid()); + } + } +} + +// Helper function for next test. 
// Walks a posting list and verifies that doc ids 1..99 are present in order,
// each mapping to a single stored interval equal to expected_interval.
// Works for both Interval and IntervalWithBounds posting lists.
template <typename Iterator, typename IntervalT>
void checkAllIntervals(Iterator posting_it, IntervalT expected_interval,
                       const PredicateIntervalStore &interval_store) {
    for (uint32_t id = 1; id < 100u; ++id) {
        ASSERT_TRUE(posting_it.valid());
        EXPECT_EQUAL(id, posting_it.getKey());
        btree::EntryRef ref = posting_it.getData();
        ASSERT_TRUE(ref.valid());
        uint32_t size;
        IntervalT single;  // scratch buffer for single-interval reads
        const IntervalT *read_interval =
            interval_store.get(ref, size, &single);
        EXPECT_EQUAL(1u, size);
        EXPECT_EQUAL(expected_interval, read_interval[0]);
        ++posting_it;
    }
}

namespace {
// Deserialize observer that records the highest doc id seen during
// deserialization into the referenced variable (reset to 0 on construction).
struct DocIdLimitFinder : SimpleIndexDeserializeObserver<> {
    uint32_t &_doc_id_limit;
    DocIdLimitFinder(uint32_t &doc_id_limit) : _doc_id_limit(doc_id_limit)
    {
        doc_id_limit = 0u;
    }
    void notifyInsert(uint64_t, uint32_t doc_id, uint32_t) {
        _doc_id_limit = std::max(_doc_id_limit, doc_id);
    }
};
}

TEST("require that PredicateIndex can be (de)serialized") {
    // Index docs 1..99, each with 100 interval features and 100 bounds
    // features, plus empty documents 101..199.
    vector<pair<uint64_t, Interval>> intervals;
    vector<pair<uint64_t, IntervalWithBounds>> bounds_intervals;
    for (int i = 0; i < 100; ++i) {
        intervals.push_back(make_pair(hash + i, interval));
        bounds_intervals.push_back(make_pair(hash2 + i, bounds));
    }
    PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 8);
    EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid());
    for (uint32_t id = 1; id < 100; ++id) {
        indexFeature(index, id, id, intervals, bounds_intervals);
        index.indexEmptyDocument(id + 100);
    }
    index.commit();

    // Serialize, then deserialize into a second index and verify the
    // round-trip preserved everything.
    vespalib::MMapDataBuffer buffer;
    index.serialize(buffer);
    uint32_t doc_id_limit;
    DocIdLimitFinder finder(doc_id_limit);
    PredicateIndex index2(generation_handler, generation_holder, dummy_provider, simple_index_config,
                          buffer, finder, PredicateAttribute::PREDICATE_ATTRIBUTE_VERSION);
    const PredicateIntervalStore &interval_store = index2.getIntervalStore();
    // Highest doc id seen: empty document 99 + 100 = 199.
    EXPECT_EQUAL(199u, doc_id_limit);

    EXPECT_EQUAL(index.getArity(), index2.getArity());
    EXPECT_EQUAL(index.getZeroConstraintDocs().size(),
                 index2.getZeroConstraintDocs().size());
    {
        // Zero-constraint (empty) docs 101..199 must all survive the round-trip.
        auto it = index2.getZeroConstraintDocs().begin();
        for (uint32_t i = 1; i < 100u; ++i) {
            TEST_STATE(vespalib::make_string("%d", i).c_str());
            ASSERT_TRUE(it.valid());
            EXPECT_EQUAL(i + 100, it.getKey());
            ++it;
        }
        EXPECT_FALSE(it.valid());
    }

    // Every interval and bounds feature must still resolve to full posting
    // lists with the original stored intervals.
    const auto &interval_index = index2.getIntervalIndex();
    const auto &bounds_index = index2.getBoundsIndex();
    for (int i = 0; i < 100; ++i) {
        {
            auto it = interval_index.lookup(hash + i);
            ASSERT_TRUE(it.valid());
            auto posting_it = interval_index.getBTreePostingList(it.getData());
            checkAllIntervals(posting_it, interval, interval_store);
        }
        {
            auto it = bounds_index.lookup(hash2 + i);
            ASSERT_TRUE(it.valid());
            auto posting_it = bounds_index.getBTreePostingList(it.getData());
            checkAllIntervals(posting_it, bounds, interval_store);
        }
    }
}

TEST("require that DocumentFeaturesStore is restored on deserialization") {
    PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10);
    EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid());
    indexFeature(index, doc_id, min_feature,
                 {{hash, interval}}, {{hash2, bounds}});
    vespalib::MMapDataBuffer buffer;
    index.serialize(buffer);
    uint32_t doc_id_limit;
    DocIdLimitFinder finder(doc_id_limit);
    PredicateIndex index2(generation_handler, generation_holder, dummy_provider, simple_index_config,
                          buffer, finder, PredicateAttribute::PREDICATE_ATTRIBUTE_VERSION);
    const auto &interval_index = index2.getIntervalIndex();
    const auto &bounds_index = index2.getBoundsIndex();
    EXPECT_EQUAL(doc_id, doc_id_limit);

    auto it = interval_index.lookup(hash);
    EXPECT_TRUE(it.valid());
    auto it2 = bounds_index.lookup(hash2);
    EXPECT_TRUE(it2.valid());

    // Removal by doc id after deserialization only works if the
    // DocumentFeaturesStore (doc id -> features mapping) was restored too:
    // both feature entries must disappear.
    index2.removeDocument(doc_id);
    index2.commit();

    it = interval_index.lookup(hash);
    EXPECT_FALSE(it.valid());
    it2 = bounds_index.lookup(hash2);
    EXPECT_FALSE(it2.valid());
}

TEST("require that hold lists are attempted emptied on destruction") {
    PredicateIndex index(generation_handler, generation_holder, dummy_provider, simple_index_config, 10);
    indexFeature(index, doc_id, min_feature,
                 {{hash, interval}}, {{hash2, bounds}});
    {
        // Remove while a generation guard is held, so the removed data sits
        // on the hold lists when the index is destroyed.
        auto guard = generation_handler.takeGuard();
        index.removeDocument(doc_id);
        index.commit();
    }
    // No assert on index destruction.
}
}  // namespace

TEST_MAIN() { TEST_RUN_ALL(); }