diff options
author | Tor Egge <Tor.Egge@yahooinc.com> | 2023-09-20 14:20:53 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-20 14:20:53 +0200 |
commit | 062a17042f773ac28d5d8ebd80629191e6777c80 (patch) | |
tree | f0728b3e9ab8902c4c130efe8bfab7cb080d5b05 /searchlib | |
parent | 55f1dee010c13aecbc9e0317ccf1257f84c72c49 (diff) | |
parent | fddf3cefb9cecb68ec6cae5a3ae1ad25a7d8ae43 (diff) |
Merge pull request #28580 from vespa-engine/toregge/switch-sort-order-for-cased-string-enum-store
Switch sort order for cased string enum store.
Diffstat (limited to 'searchlib')
5 files changed, 155 insertions, 24 deletions
diff --git a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp index 8d101b5329a..1c7b8b2b695 100644 --- a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp +++ b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp @@ -11,6 +11,13 @@ using namespace vespalib::btree; using vespalib::datastore::AtomicEntryRef; +namespace vespalib::datastore { + +std::ostream & operator << (std::ostream& os, const EntryRef& ref) { + return os << "EntryRef(" << ref.ref() << ")"; +} + +} namespace search { using NumericEnumStore = EnumStoreT<int32_t>; @@ -25,6 +32,7 @@ using TreeType = BTreeRoot<AtomicEntryRef, BTreeNoLeafData, using NodeAllocator = TreeType::NodeAllocatorType; using attribute::DfaStringComparator; +using vespalib::datastore::EntryComparator; TEST(EnumComparatorTest, require_that_numeric_less_is_working) @@ -152,6 +160,13 @@ TEST(EnumComparatorTest, require_that_comparator_with_tree_is_working) m.reclaim_memory(g.get_oldest_used_generation()); } +using EnumIndexVector = std::vector<EnumIndex>; + +void sort_enum_indexes(EnumIndexVector &vec, const EntryComparator &compare) +{ + std::stable_sort(vec.begin(), vec.end(), [&compare](auto& lhs, auto& rhs) { return compare.less(lhs, rhs); }); +} + TEST(EnumComparatorTest, require_that_folded_less_is_working) { StringEnumStore es(false, DictionaryConfig::Type::BTREE); @@ -170,6 +185,18 @@ TEST(EnumComparatorTest, require_that_folded_less_is_working) EXPECT_FALSE(cmp2.less(e4, EnumIndex())); EXPECT_FALSE(cmp3.less(EnumIndex(), e4)); // similar when prefix EXPECT_FALSE(cmp3.less(e4, EnumIndex())); // similar when prefix + // Full sort, CompareStrategy::UNCASED_THEN_CASED + EnumIndexVector vec{e4, e3, e2, e1}; + sort_enum_indexes(vec, es.get_comparator()); + EXPECT_EQ((EnumIndexVector{e1, e2, e3, e4}), vec); + // Partial sort, CompareStrategy::UNCASED + EnumIndexVector vec2{e4, e3, e2, e1}; + sort_enum_indexes(vec2, cmp1); + EXPECT_EQ((EnumIndexVector{e2, e1, e3, e4}), vec2); + // Partial sort, CompareStrategy::UNCASED + EnumIndexVector vec3{e4, e3, e1, e2}; + sort_enum_indexes(vec3, cmp1); + EXPECT_EQ((EnumIndexVector{e1, e2, e3, e4}), vec3); } TEST(EnumComparatorTest, require_that_equal_is_working) @@ -190,6 +217,36 @@ TEST(EnumComparatorTest, require_that_equal_is_working) EXPECT_TRUE(cmp1.equal(e3, e3)); } +TEST(EnumComparatorTest, require_that_cased_less_is_working) +{ + StringEnumStore es(false, DictionaryConfig(DictionaryConfig::Type::BTREE, DictionaryConfig::Match::CASED)); + EnumIndex e1 = es.insert("Aa"); + EnumIndex e2 = es.insert("aa"); + EnumIndex e3 = es.insert("aB"); + EnumIndex e4 = es.insert("Folded"); + const auto & cmp1 = es.get_folded_comparator(); + EXPECT_TRUE(cmp1.less(e1, e2)); + EXPECT_FALSE(cmp1.less(e2, e1)); + EXPECT_FALSE(cmp1.less(e2, e3)); + EXPECT_TRUE(cmp1.less(e3, e2)); + auto cmp2 = es.make_folded_comparator("fol"); + auto cmp3 = es.make_folded_comparator_prefix("fol"); + EXPECT_FALSE(cmp2.less(EnumIndex(), e4)); // case mismatch + EXPECT_TRUE(cmp2.less(e4, EnumIndex())); // case mismatch + EXPECT_FALSE(cmp3.less(EnumIndex(), e4)); // case mismatch + EXPECT_TRUE(cmp3.less(e4, EnumIndex())); // case mismatch + auto cmp4 = es.make_folded_comparator("Fol"); + auto cmp5 = es.make_folded_comparator_prefix("Fol"); + EXPECT_TRUE(cmp4.less(EnumIndex(), e4)); // no match + EXPECT_FALSE(cmp4.less(e4, EnumIndex())); // no match + EXPECT_FALSE(cmp5.less(EnumIndex(), e4)); // prefix match + EXPECT_FALSE(cmp5.less(e4, EnumIndex())); // prefix match + // Full sort, CompareStrategy::CASED + EnumIndexVector vec{e4, e3, e2, e1}; + sort_enum_indexes(vec, es.get_comparator()); + EXPECT_EQ((EnumIndexVector{e1, e4, e3, e2}), vec); +} + TEST(DfaStringComparatorTest, require_that_less_is_working) { StringEnumStore es(false, DictionaryConfig::Type::BTREE); diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp index 2b01c266e80..7a11453f61b 100644 --- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp +++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp @@ -8,7 +8,9 @@ #include <vespa/log/log.h> LOG_SETUP("enumstore_test"); -using Type = search::DictionaryConfig::Type; +using DictionaryConfig = search::DictionaryConfig; +using Type = DictionaryConfig::Type; +using Match = DictionaryConfig::Match; using vespalib::datastore::AtomicEntryRef; using vespalib::datastore::CompactionStrategy; using vespalib::datastore::EntryRef; @@ -41,61 +43,91 @@ using StringEnumStore = EnumStoreT<const char*>; struct BTreeDoubleEnumStore { using EnumStoreType = DoubleEnumStore; static constexpr Type type = Type::BTREE; + static constexpr Match match = Match::UNCASED; }; struct HybridDoubleEnumStore { using EnumStoreType = DoubleEnumStore; static constexpr Type type = Type::BTREE_AND_HASH; + static constexpr Match match = Match::UNCASED; }; struct HashDoubleEnumStore { using EnumStoreType = DoubleEnumStore; static constexpr Type type = Type::HASH; + static constexpr Match match = Match::UNCASED; }; struct BTreeFloatEnumStore { using EnumStoreType = FloatEnumStore; static constexpr Type type = Type::BTREE; + static constexpr Match match = Match::UNCASED; }; struct HybridFloatEnumStore { using EnumStoreType = FloatEnumStore; static constexpr Type type = Type::BTREE_AND_HASH; + static constexpr Match match = Match::UNCASED; }; struct HashFloatEnumStore { using EnumStoreType = FloatEnumStore; static constexpr Type type = Type::HASH; + static constexpr Match match = Match::UNCASED; }; struct BTreeNumericEnumStore { using EnumStoreType = NumericEnumStore; static constexpr Type type = Type::BTREE; + static constexpr Match match = Match::UNCASED; }; struct HybridNumericEnumStore { using EnumStoreType = NumericEnumStore; static constexpr Type type = Type::BTREE_AND_HASH; + static constexpr Match match = Match::UNCASED; }; struct HashNumericEnumStore { using EnumStoreType = NumericEnumStore; static constexpr Type type = Type::HASH; + static constexpr Match match = Match::UNCASED; }; struct BTreeStringEnumStore { using EnumStoreType = StringEnumStore; static constexpr Type type = Type::BTREE; + static constexpr Match match = Match::UNCASED; }; struct HybridStringEnumStore { using EnumStoreType = StringEnumStore; static constexpr Type type = Type::BTREE_AND_HASH; + static constexpr Match match = Match::UNCASED; }; struct HashStringEnumStore { using EnumStoreType = StringEnumStore; static constexpr Type type = Type::HASH; + static constexpr Match match = Match::UNCASED; +}; + +struct BTreeCasedStringEnumStore { + using EnumStoreType = StringEnumStore; + static constexpr Type type = Type::BTREE; + static constexpr Match match = Match::CASED; +}; + +struct HybridCasedStringEnumStore { + using EnumStoreType = StringEnumStore; + static constexpr Type type = Type::BTREE_AND_HASH; + static constexpr Match match = Match::CASED; +}; + +struct HashCasedStringEnumStore { + using EnumStoreType = StringEnumStore; + static constexpr Type type = Type::HASH; + static constexpr Match match = Match::CASED; }; using StringVector = std::vector<std::string>; @@ -146,7 +178,7 @@ public: using EnumStoreType = typename EnumStoreTypeAndDictionaryType::EnumStoreType; EnumStoreType es; FloatEnumStoreTest() - : es(false, EnumStoreTypeAndDictionaryType::type) + : es(false, DictionaryConfig(EnumStoreTypeAndDictionaryType::type, EnumStoreTypeAndDictionaryType::match)) {} }; @@ -555,7 +587,7 @@ public: }; -using LoaderTestTypes = ::testing::Types<BTreeNumericEnumStore, BTreeFloatEnumStore, BTreeStringEnumStore, HybridNumericEnumStore, HybridFloatEnumStore, HybridStringEnumStore, HashNumericEnumStore, HashFloatEnumStore, HashStringEnumStore>; +using LoaderTestTypes = ::testing::Types<BTreeNumericEnumStore, BTreeFloatEnumStore, BTreeStringEnumStore, HybridNumericEnumStore, HybridFloatEnumStore, HybridStringEnumStore, HashNumericEnumStore, HashFloatEnumStore, HashStringEnumStore, BTreeCasedStringEnumStore, HybridCasedStringEnumStore, HashCasedStringEnumStore>; TYPED_TEST_SUITE(LoaderTest, LoaderTestTypes); TYPED_TEST(LoaderTest, store_is_instantiated_with_enumerated_loader) diff --git a/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp b/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp index b3ef9ed5ca0..651a6fd7edf 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp +++ b/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp @@ -12,25 +12,25 @@ EnumStoreComparator<EntryT>::equal_helper(const EntryT& lhs, const EntryT& rhs) return vespalib::datastore::UniqueStoreComparatorHelper<EntryT>::equal(lhs, rhs); } -EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, bool fold) +EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy) : ParentType(data_store, nullptr), - _fold(fold), + _compare_strategy(compare_strategy), _prefix(false), _prefix_len(0) { } -EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value) +EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value) : ParentType(data_store, lookup_value), - _fold(fold), + _compare_strategy(compare_strategy), _prefix(false), _prefix_len(0) { } -EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value, bool prefix) +EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value, bool prefix) : ParentType(data_store, lookup_value), - _fold(fold), + _compare_strategy(compare_strategy), _prefix(prefix), _prefix_len(0) { @@ -41,14 +41,21 @@ EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_s bool EnumStoreStringComparator::less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const { - return _fold - ? (use_prefix() + switch (_compare_strategy) { + case CompareStrategy::UNCASED: + return (use_prefix() ? (FoldedStringCompare::compareFoldedPrefix<true, true>(get(lhs), get(rhs), _prefix_len) < 0) - : (FoldedStringCompare::compareFolded<true, true>(get(lhs), get(rhs)) < 0)) - : (use_prefix() + : (FoldedStringCompare::compareFolded<true, true>(get(lhs), get(rhs)) < 0)); + case CompareStrategy::CASED: + return (use_prefix() + ? (FoldedStringCompare::compareFoldedPrefix<false, false>(get(lhs), get(rhs), _prefix_len) < 0) + : (FoldedStringCompare::compareFolded<false, false>(get(lhs), get(rhs)) < 0)); + case CompareStrategy::UNCASED_THEN_CASED: + default: + return (use_prefix() ? (FoldedStringCompare::comparePrefix(get(lhs), get(rhs), _prefix_len) < 0) : (FoldedStringCompare::compare(get(lhs), get(rhs)) < 0)); - + } } template class EnumStoreComparator<int8_t>; diff --git a/searchlib/src/vespa/searchlib/attribute/enumcomparator.h b/searchlib/src/vespa/searchlib/attribute/enumcomparator.h index 4bc09fcfe2a..3aec7c18461 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumcomparator.h +++ b/searchlib/src/vespa/searchlib/attribute/enumcomparator.h @@ -51,35 +51,59 @@ protected: private: using ParentType::get; + /* + * CompareStrategy determines how to compare string values: + * + * UNCASED_THEN_CASED compares the values ignoring case (i.e. + * performs case folding during the first compare). If the values + * are equal then they are compared again using case. + * + * UNCASED compares the values ignoring case. + * + * CASED compares the value, using case. + * + * Only UNCASED_THEN_CASED or CASED can be used for sorting. + * UNCASED can be used for lookup during search when sort order is + * UNCASED_THEN_CASED. + * + * UNCASED_THEN_CASED sort order: ["BAR", "bar", "FOO", "foo"] + * CASED sort order: ["BAR", "FOO", "bar", "foo"] + */ + enum class CompareStrategy : uint8_t { + UNCASED_THEN_CASED, + UNCASED, + CASED + }; + public: - EnumStoreStringComparator(const DataStoreType& data_store) - : EnumStoreStringComparator(data_store, false) + EnumStoreStringComparator(const DataStoreType& data_store, bool cased) + : EnumStoreStringComparator(data_store, cased ? CompareStrategy::CASED : CompareStrategy::UNCASED_THEN_CASED) {} private: - EnumStoreStringComparator(const DataStoreType& data_store, bool fold); + EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy); /** * Creates a comparator using the given low-level data store and that uses the * given value during compare if the enum index is invalid. */ - EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value); - EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value, bool prefix); + EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value); + EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value, bool prefix); public: bool less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const override; EnumStoreStringComparator make_folded() const { - return EnumStoreStringComparator(_store, true); + return EnumStoreStringComparator(_store, _compare_strategy == CompareStrategy::UNCASED_THEN_CASED ? CompareStrategy::UNCASED : _compare_strategy); } EnumStoreStringComparator make_for_lookup(const char* lookup_value) const { - return EnumStoreStringComparator(_store, _fold, lookup_value); + return EnumStoreStringComparator(_store, _compare_strategy, lookup_value); } EnumStoreStringComparator make_for_prefix_lookup(const char* lookup_value) const { - return EnumStoreStringComparator(_store, _fold, lookup_value, true); + return EnumStoreStringComparator(_store, _compare_strategy, lookup_value, true); } private: inline bool use_prefix() const noexcept { return _prefix; } - const bool _fold; + const CompareStrategy _compare_strategy; const bool _prefix; uint32_t _prefix_len; }; diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp index 140ac207d32..135bbcf5a72 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp @@ -26,6 +26,17 @@ namespace search { using vespalib::datastore::CompactionStrategy; using vespalib::datastore::EntryComparator; +template <typename Comparator, typename DataStore> +Comparator +make_enum_store_comparator(const DataStore& data_store, const DictionaryConfig& dict_cfg) +{ + if constexpr (std::is_same_v<Comparator, EnumStoreStringComparator>) { + return Comparator(data_store, dict_cfg.getMatch() == DictionaryConfig::Match::CASED); + } else { + return Comparator(data_store); + } +} + std::unique_ptr<vespalib::datastore::IUniqueStoreDictionary> make_enum_store_dictionary(IEnumStore &store, bool has_postings, const search::DictionaryConfig & dict_cfg, std::unique_ptr<EntryComparator> compare, @@ -74,7 +85,7 @@ EnumStoreT<EntryT>::load_unique_value(const void* src, size_t available, Index& template <typename EntryT> EnumStoreT<EntryT>::EnumStoreT(bool has_postings, const DictionaryConfig& dict_cfg, std::shared_ptr<vespalib::alloc::MemoryAllocator> memory_allocator, EntryType default_value) - : _store(std::move(memory_allocator)), + : _store(std::move(memory_allocator), [&dict_cfg](const auto& data_store) { return make_enum_store_comparator<ComparatorType>(data_store, dict_cfg); }), _dict(), _is_folded(dict_cfg.getMatch() == DictionaryConfig::Match::UNCASED), _foldedComparator(make_optionally_folded_comparator(is_folded())), |