diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-12-14 17:08:26 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-12-14 17:08:26 +0100 |
commit | 6f0cc5cac45ba97c6365aafde252ff1d1a80f088 (patch) | |
tree | 844e820e81bbfa2a9cd37b00cd74335b8c77d14d /searchlib | |
parent | bad76b99df21498acc44b395ba7b4c72fc8fd335 (diff) | |
parent | 621d7274cbade4a4e4f44d16fb5ffff0a76f0248 (diff) |
Merge pull request #29657 from vespa-engine/geirst/direct-posting-store-single-string-attribute
Support direct posting store API for single string attribute.
Diffstat (limited to 'searchlib')
4 files changed, 103 insertions, 49 deletions
diff --git a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp index f2341d0968e..0245af828dd 100644 --- a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp +++ b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp @@ -4,6 +4,7 @@ #include <vespa/searchlib/attribute/i_docid_posting_store.h> #include <vespa/searchlib/attribute/i_docid_with_weight_posting_store.h> #include <vespa/searchlib/attribute/integerbase.h> +#include <vespa/searchlib/attribute/stringbase.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> #include <vespa/searchlib/queryeval/orsearch.h> #include <vespa/searchlib/queryeval/searchiterator.h> @@ -19,13 +20,22 @@ using namespace search::queryeval; using namespace search; using testing::StartsWith; -struct IntegerKey : public IDirectPostingStore::LookupKey { +using LookupKey = IDirectPostingStore::LookupKey; + +struct IntegerKey : public LookupKey { int64_t _value; IntegerKey(int64_t value_in) : _value(value_in) {} vespalib::stringref asString() const override { abort(); } bool asInteger(int64_t& value) const override { value = _value; return true; } }; +struct StringKey : public LookupKey { + vespalib::string _value; + StringKey(int64_t value_in) : _value(std::to_string(value_in)) {} + vespalib::stringref asString() const override { return _value; } + bool asInteger(int64_t&) const override { abort(); } +}; + const vespalib::string field_name = "test"; constexpr uint32_t field_id = 3; uint32_t doc_id_limit = 500; @@ -50,73 +60,86 @@ concat(const Docids& a, const Docids& b) return res; } +template <typename AttributeType, typename DataType> +void +populate_attribute(AttributeType& attr, const std::vector<DataType>& values) +{ + // Values 0 and 1 have btree (short) posting lists. + attr.update(10, values[0]); + attr.update(30, values[1]); + attr.update(31, values[1]); + + // Values 2 and 3 have bitvector posting lists. + // We need at least 128 documents to get bitvector posting list (see PostingStoreBase2::resizeBitVectors()) + for (auto docid : range(100, 128)) { + attr.update(docid, values[2]); + } + for (auto docid : range(300, 128)) { + attr.update(docid, values[3]); + } + attr.commit(true); +} + std::shared_ptr<AttributeVector> -make_attribute(bool field_is_filter, CollectionType col_type) +make_attribute(CollectionType col_type, BasicType type, bool field_is_filter) { - Config cfg(BasicType::INT64, col_type); + Config cfg(type, col_type); cfg.setFastSearch(true); if (field_is_filter) { cfg.setIsFilter(field_is_filter); } uint32_t num_docs = doc_id_limit - 1; auto attr = test::AttributeBuilder(field_name, cfg).docs(num_docs).get(); - IntegerAttribute& real = dynamic_cast<IntegerAttribute&>(*attr); - - // Values 1 and 3 have btree (short) posting lists with weights. - real.update(10, 1); - real.update(30, 3); - real.update(31, 3); - - // Values 100 and 300 have bitvector posting lists. - // We need at least 128 documents to get bitvector posting list (see PostingStoreBase2::resizeBitVectors()) - for (auto docid : range(100, 128)) { - real.update(docid, 100); - } - for (auto docid : range(300, 128)) { - real.update(docid, 300); + if (type == BasicType::STRING) { + populate_attribute<StringAttribute, vespalib::string>(dynamic_cast<StringAttribute&>(*attr), + {"1", "3", "100", "300"}); + } else { + populate_attribute<IntegerAttribute, int64_t>(dynamic_cast<IntegerAttribute&>(*attr), + {1, 3, 100, 300}); } - attr->commit(true); return attr; } void -expect_has_btree_iterator(const IDirectPostingStore& store, int64_t term_value) +expect_has_btree_iterator(const IDirectPostingStore& store, const LookupKey& key) { auto snapshot = store.get_dictionary_snapshot(); - auto res = store.lookup(IntegerKey(term_value), snapshot); + auto res = store.lookup(key, snapshot); EXPECT_TRUE(store.has_btree_iterator(res.posting_idx)); } void -expect_has_bitvector_iterator(const IDirectPostingStore& store, int64_t term_value) +expect_has_bitvector_iterator(const IDirectPostingStore& store, const LookupKey& key) { auto snapshot = store.get_dictionary_snapshot(); - auto res = store.lookup(IntegerKey(term_value), snapshot); + auto res = store.lookup(key, snapshot); EXPECT_TRUE(store.has_bitvector(res.posting_idx)); } +template <typename LookupKeyType> void validate_posting_lists(const IDirectPostingStore& store) { - expect_has_btree_iterator(store, 1); - expect_has_btree_iterator(store, 3); + expect_has_btree_iterator(store, LookupKeyType(1)); + expect_has_btree_iterator(store, LookupKeyType(3)); if (store.has_always_btree_iterator()) { - expect_has_btree_iterator(store, 100); - expect_has_btree_iterator(store, 300); + expect_has_btree_iterator(store, LookupKeyType(100)); + expect_has_btree_iterator(store, LookupKeyType(300)); } - expect_has_bitvector_iterator(store, 100); - expect_has_bitvector_iterator(store, 300); + expect_has_bitvector_iterator(store, LookupKeyType(100)); + expect_has_bitvector_iterator(store, LookupKeyType(300)); } struct TestParam { CollectionType col_type; - TestParam(CollectionType col_type_in) : col_type(col_type_in) {} + BasicType type; + TestParam(CollectionType col_type_in, BasicType type_in) : col_type(col_type_in), type(type_in) {} ~TestParam() = default; }; std::ostream& operator<<(std::ostream& os, const TestParam& param) { - os << param.col_type.asString(); + os << param.col_type.asString() << "_" << param.type.asString(); return os; } @@ -125,6 +148,7 @@ public: using SingleValueBlueprintType = DirectMultiTermBlueprint<IDocidPostingStore, WeightedSetTermSearch>; using MultiValueBlueprintType = DirectMultiTermBlueprint<IDocidWithWeightPostingStore, WeightedSetTermSearch>; std::shared_ptr<AttributeVector> attr; + bool integer_type; std::shared_ptr<SingleValueBlueprintType> single_blueprint; std::shared_ptr<MultiValueBlueprintType> multi_blueprint; queryeval::ComplexLeafBlueprint* blueprint; @@ -133,6 +157,7 @@ public: fef::TermFieldMatchDataArray tfmda; DirectMultiTermBlueprintTest() : attr(), + integer_type(true), single_blueprint(), multi_blueprint(), blueprint(), @@ -142,20 +167,27 @@ public: tfmda.add(&tfmd); } void setup(bool field_is_filter, bool need_term_field_match_data) { - attr = make_attribute(field_is_filter, GetParam().col_type); + attr = make_attribute(GetParam().col_type, GetParam().type, field_is_filter); + integer_type = GetParam().type != BasicType::STRING; FieldSpec spec(field_name, field_id, fef::TermFieldHandle(), field_is_filter); + const IDirectPostingStore* store; if (GetParam().col_type == CollectionType::SINGLE) { - const auto* store = attr->as_docid_posting_store(); - ASSERT_TRUE(store); - validate_posting_lists(*store); - single_blueprint = std::make_shared<SingleValueBlueprintType>(spec, *attr, *store, 2); + auto real_store = attr->as_docid_posting_store(); + ASSERT_TRUE(real_store); + single_blueprint = std::make_shared<SingleValueBlueprintType>(spec, *attr, *real_store, 2); blueprint = single_blueprint.get(); + store = real_store; } else { - const auto* store = attr->as_docid_with_weight_posting_store(); - ASSERT_TRUE(store); - validate_posting_lists(*store); - multi_blueprint = std::make_shared<MultiValueBlueprintType>(spec, *attr, *store, 2); + auto real_store = attr->as_docid_with_weight_posting_store(); + ASSERT_TRUE(real_store); + multi_blueprint = std::make_shared<MultiValueBlueprintType>(spec, *attr, *real_store, 2); blueprint = multi_blueprint.get(); + store = real_store; + } + if (integer_type) { + validate_posting_lists<IntegerKey>(*store); + } else { + validate_posting_lists<StringKey>(*store); } blueprint->setDocIdLimit(doc_id_limit); if (need_term_field_match_data) { @@ -164,11 +196,19 @@ public: tfmd.tagAsNotNeeded(); } } + template <typename BlueprintType> + void add_term_helper(BlueprintType& b, int64_t term_value) { + if (integer_type) { + b.addTerm(IntegerKey(term_value), 1, estimate); + } else { + b.addTerm(StringKey(term_value), 1, estimate); + } + } void add_term(int64_t term_value) { if (single_blueprint) { - single_blueprint->addTerm(IntegerKey(term_value), 1, estimate); + add_term_helper(*single_blueprint, term_value); } else { - multi_blueprint->addTerm(IntegerKey(term_value), 1, estimate); + add_term_helper(*multi_blueprint, term_value); } } std::unique_ptr<SearchIterator> create_leaf_search() const { @@ -201,7 +241,10 @@ expect_or_child(SearchIterator& itr, size_t child, const vespalib::string& exp_c INSTANTIATE_TEST_SUITE_P(DefaultInstantiation, DirectMultiTermBlueprintTest, - testing::Values(CollectionType::SINGLE, CollectionType::WSET), + testing::Values(TestParam(CollectionType::SINGLE, BasicType::INT64), + TestParam(CollectionType::SINGLE, BasicType::STRING), + TestParam(CollectionType::WSET, BasicType::INT64), + TestParam(CollectionType::WSET, BasicType::STRING)), testing::PrintToStringParamName()); TEST_P(DirectMultiTermBlueprintTest, weight_iterators_used_for_none_filter_field) diff --git a/searchlib/src/vespa/searchlib/attribute/singlenumericpostattribute.h b/searchlib/src/vespa/searchlib/attribute/singlenumericpostattribute.h index 482dc90f6cd..749fa48565b 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlenumericpostattribute.h +++ b/searchlib/src/vespa/searchlib/attribute/singlenumericpostattribute.h @@ -52,16 +52,16 @@ private: using ValueModifier = typename B::BaseClass::ValueModifier; using generation_t = typename SingleValueNumericEnumAttribute<B>::generation_t; - using DirectPostingStoreAdapterType = attribute::NumericDirectPostingStoreAdapter<IDocidPostingStore, - PostingStore, EnumStore>; - DirectPostingStoreAdapterType _posting_store_adapter; - using PostingParent::_posting_store; using PostingParent::clearAllPostings; using PostingParent::handle_load_posting_lists; using PostingParent::handle_load_posting_lists_and_update_enum_store; using PostingParent::forwardedOnAddDoc; + using DirectPostingStoreAdapterType = attribute::NumericDirectPostingStoreAdapter<IDocidPostingStore, + PostingStore, EnumStore>; + DirectPostingStoreAdapterType _posting_store_adapter; + void freezeEnumDictionary() override; void mergeMemoryStats(vespalib::MemoryUsage & total) override; void applyUpdateValueChange(const Change & c, EnumStore & enumStore, diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.h b/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.h index 543cfdd90ec..5a5b599244f 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.h +++ b/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.h @@ -2,8 +2,10 @@ #pragma once -#include "singlestringattribute.h" +#include "i_docid_posting_store.h" #include "postinglistattribute.h" +#include "singlestringattribute.h" +#include "string_direct_posting_store_adapter.h" namespace search { @@ -48,12 +50,17 @@ private: using PostingParent::handle_load_posting_lists; using PostingParent::handle_load_posting_lists_and_update_enum_store; using PostingParent::forwardedOnAddDoc; + public: using PostingStore = typename PostingParent::PostingStore; using Dictionary = EnumPostingTree; using PostingParent::get_posting_store; private: + using DirectPostingStoreAdapterType = attribute::StringDirectPostingStoreAdapter<IDocidPostingStore, + PostingStore, EnumStore>; + DirectPostingStoreAdapterType _posting_store_adapter; + void freezeEnumDictionary() override; void mergeMemoryStats(vespalib::MemoryUsage & total) override; void applyUpdateValueChange(const Change & c, @@ -77,6 +84,8 @@ public: std::unique_ptr<attribute::SearchContext> getSearch(QueryTermSimpleUP term, const attribute::SearchContextParams & params) const override; + const IDocidPostingStore* as_docid_posting_store() const override { return &_posting_store_adapter; } + bool onAddDoc(DocId doc) override { return forwardedOnAddDoc(doc, this->_enumIndices.size(), this->_enumIndices.capacity()); } diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp index 85b0c095d76..5822ab2c786 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp @@ -4,6 +4,7 @@ #include "singlestringpostattribute.h" #include "single_string_enum_search_context.h" +#include "string_direct_posting_store_adapter.hpp" #include <vespa/searchcommon/attribute/config.h> #include <vespa/searchlib/query/query_term_ucs4.h> @@ -13,7 +14,8 @@ template <typename B> SingleValueStringPostingAttributeT<B>::SingleValueStringPostingAttributeT(const vespalib::string & name, const AttributeVector::Config & c) : SingleValueStringAttributeT<B>(name, c), - PostingParent(*this, this->getEnumStore()) + PostingParent(*this, this->getEnumStore()), + _posting_store_adapter(this->get_posting_store(), this->_enumStore, this->getIsFilter()) { } @@ -152,5 +154,5 @@ SingleValueStringPostingAttributeT<B>::getSearch(QueryTermSimpleUP qTerm, *this); } -} // namespace search +} |