summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@yahooinc.com>2023-09-20 14:20:53 +0200
committerGitHub <noreply@github.com>2023-09-20 14:20:53 +0200
commit062a17042f773ac28d5d8ebd80629191e6777c80 (patch)
treef0728b3e9ab8902c4c130efe8bfab7cb080d5b05 /searchlib
parent55f1dee010c13aecbc9e0317ccf1257f84c72c49 (diff)
parentfddf3cefb9cecb68ec6cae5a3ae1ad25a7d8ae43 (diff)
Merge pull request #28580 from vespa-engine/toregge/switch-sort-order-for-cased-string-enum-store
Switch sort order for cased string enum store.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp57
-rw-r--r--searchlib/src/tests/attribute/enumstore/enumstore_test.cpp38
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp29
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enumcomparator.h42
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enumstore.hpp13
5 files changed, 155 insertions, 24 deletions
diff --git a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
index 8d101b5329a..1c7b8b2b695 100644
--- a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
+++ b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
@@ -11,6 +11,13 @@ using namespace vespalib::btree;
using vespalib::datastore::AtomicEntryRef;
+namespace vespalib::datastore {
+
+std::ostream & operator << (std::ostream& os, const EntryRef& ref) {
+ return os << "EntryRef(" << ref.ref() << ")";
+}
+
+}
namespace search {
using NumericEnumStore = EnumStoreT<int32_t>;
@@ -25,6 +32,7 @@ using TreeType = BTreeRoot<AtomicEntryRef, BTreeNoLeafData,
using NodeAllocator = TreeType::NodeAllocatorType;
using attribute::DfaStringComparator;
+using vespalib::datastore::EntryComparator;
TEST(EnumComparatorTest, require_that_numeric_less_is_working)
@@ -152,6 +160,13 @@ TEST(EnumComparatorTest, require_that_comparator_with_tree_is_working)
m.reclaim_memory(g.get_oldest_used_generation());
}
+using EnumIndexVector = std::vector<EnumIndex>;
+
+void sort_enum_indexes(EnumIndexVector &vec, const EntryComparator &compare)
+{
+ std::stable_sort(vec.begin(), vec.end(), [&compare](auto& lhs, auto& rhs) { return compare.less(lhs, rhs); });
+}
+
TEST(EnumComparatorTest, require_that_folded_less_is_working)
{
StringEnumStore es(false, DictionaryConfig::Type::BTREE);
@@ -170,6 +185,18 @@ TEST(EnumComparatorTest, require_that_folded_less_is_working)
EXPECT_FALSE(cmp2.less(e4, EnumIndex()));
EXPECT_FALSE(cmp3.less(EnumIndex(), e4)); // similar when prefix
EXPECT_FALSE(cmp3.less(e4, EnumIndex())); // similar when prefix
+ // Full sort, CompareStrategy::UNCASED_THEN_CASED
+ EnumIndexVector vec{e4, e3, e2, e1};
+ sort_enum_indexes(vec, es.get_comparator());
+ EXPECT_EQ((EnumIndexVector{e1, e2, e3, e4}), vec);
+ // Partial sort, CompareStrategy::UNCASED
+ EnumIndexVector vec2{e4, e3, e2, e1};
+ sort_enum_indexes(vec2, cmp1);
+ EXPECT_EQ((EnumIndexVector{e2, e1, e3, e4}), vec2);
+ // Partial sort, CompareStrategy::UNCASED
+ EnumIndexVector vec3{e4, e3, e1, e2};
+ sort_enum_indexes(vec3, cmp1);
+ EXPECT_EQ((EnumIndexVector{e1, e2, e3, e4}), vec3);
}
TEST(EnumComparatorTest, require_that_equal_is_working)
@@ -190,6 +217,36 @@ TEST(EnumComparatorTest, require_that_equal_is_working)
EXPECT_TRUE(cmp1.equal(e3, e3));
}
+TEST(EnumComparatorTest, require_that_cased_less_is_working)
+{
+ StringEnumStore es(false, DictionaryConfig(DictionaryConfig::Type::BTREE, DictionaryConfig::Match::CASED));
+ EnumIndex e1 = es.insert("Aa");
+ EnumIndex e2 = es.insert("aa");
+ EnumIndex e3 = es.insert("aB");
+ EnumIndex e4 = es.insert("Folded");
+ const auto & cmp1 = es.get_folded_comparator();
+ EXPECT_TRUE(cmp1.less(e1, e2));
+ EXPECT_FALSE(cmp1.less(e2, e1));
+ EXPECT_FALSE(cmp1.less(e2, e3));
+ EXPECT_TRUE(cmp1.less(e3, e2));
+ auto cmp2 = es.make_folded_comparator("fol");
+ auto cmp3 = es.make_folded_comparator_prefix("fol");
+ EXPECT_FALSE(cmp2.less(EnumIndex(), e4)); // case mismatch
+ EXPECT_TRUE(cmp2.less(e4, EnumIndex())); // case mismatch
+ EXPECT_FALSE(cmp3.less(EnumIndex(), e4)); // case mismatch
+ EXPECT_TRUE(cmp3.less(e4, EnumIndex())); // case mismatch
+ auto cmp4 = es.make_folded_comparator("Fol");
+ auto cmp5 = es.make_folded_comparator_prefix("Fol");
+ EXPECT_TRUE(cmp4.less(EnumIndex(), e4)); // no match
+ EXPECT_FALSE(cmp4.less(e4, EnumIndex())); // no match
+ EXPECT_FALSE(cmp5.less(EnumIndex(), e4)); // prefix match
+ EXPECT_FALSE(cmp5.less(e4, EnumIndex())); // prefix match
+ // Full sort, CompareStrategy::CASED
+ EnumIndexVector vec{e4, e3, e2, e1};
+ sort_enum_indexes(vec, es.get_comparator());
+ EXPECT_EQ((EnumIndexVector{e1, e4, e3, e2}), vec);
+}
+
TEST(DfaStringComparatorTest, require_that_less_is_working)
{
StringEnumStore es(false, DictionaryConfig::Type::BTREE);
diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
index 2b01c266e80..7a11453f61b 100644
--- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
+++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
@@ -8,7 +8,9 @@
#include <vespa/log/log.h>
LOG_SETUP("enumstore_test");
-using Type = search::DictionaryConfig::Type;
+using DictionaryConfig = search::DictionaryConfig;
+using Type = DictionaryConfig::Type;
+using Match = DictionaryConfig::Match;
using vespalib::datastore::AtomicEntryRef;
using vespalib::datastore::CompactionStrategy;
using vespalib::datastore::EntryRef;
@@ -41,61 +43,91 @@ using StringEnumStore = EnumStoreT<const char*>;
struct BTreeDoubleEnumStore {
using EnumStoreType = DoubleEnumStore;
static constexpr Type type = Type::BTREE;
+ static constexpr Match match = Match::UNCASED;
};
struct HybridDoubleEnumStore {
using EnumStoreType = DoubleEnumStore;
static constexpr Type type = Type::BTREE_AND_HASH;
+ static constexpr Match match = Match::UNCASED;
};
struct HashDoubleEnumStore {
using EnumStoreType = DoubleEnumStore;
static constexpr Type type = Type::HASH;
+ static constexpr Match match = Match::UNCASED;
};
struct BTreeFloatEnumStore {
using EnumStoreType = FloatEnumStore;
static constexpr Type type = Type::BTREE;
+ static constexpr Match match = Match::UNCASED;
};
struct HybridFloatEnumStore {
using EnumStoreType = FloatEnumStore;
static constexpr Type type = Type::BTREE_AND_HASH;
+ static constexpr Match match = Match::UNCASED;
};
struct HashFloatEnumStore {
using EnumStoreType = FloatEnumStore;
static constexpr Type type = Type::HASH;
+ static constexpr Match match = Match::UNCASED;
};
struct BTreeNumericEnumStore {
using EnumStoreType = NumericEnumStore;
static constexpr Type type = Type::BTREE;
+ static constexpr Match match = Match::UNCASED;
};
struct HybridNumericEnumStore {
using EnumStoreType = NumericEnumStore;
static constexpr Type type = Type::BTREE_AND_HASH;
+ static constexpr Match match = Match::UNCASED;
};
struct HashNumericEnumStore {
using EnumStoreType = NumericEnumStore;
static constexpr Type type = Type::HASH;
+ static constexpr Match match = Match::UNCASED;
};
struct BTreeStringEnumStore {
using EnumStoreType = StringEnumStore;
static constexpr Type type = Type::BTREE;
+ static constexpr Match match = Match::UNCASED;
};
struct HybridStringEnumStore {
using EnumStoreType = StringEnumStore;
static constexpr Type type = Type::BTREE_AND_HASH;
+ static constexpr Match match = Match::UNCASED;
};
struct HashStringEnumStore {
using EnumStoreType = StringEnumStore;
static constexpr Type type = Type::HASH;
+ static constexpr Match match = Match::UNCASED;
+};
+
+struct BTreeCasedStringEnumStore {
+ using EnumStoreType = StringEnumStore;
+ static constexpr Type type = Type::BTREE;
+ static constexpr Match match = Match::CASED;
+};
+
+struct HybridCasedStringEnumStore {
+ using EnumStoreType = StringEnumStore;
+ static constexpr Type type = Type::BTREE_AND_HASH;
+ static constexpr Match match = Match::CASED;
+};
+
+struct HashCasedStringEnumStore {
+ using EnumStoreType = StringEnumStore;
+ static constexpr Type type = Type::HASH;
+ static constexpr Match match = Match::CASED;
};
using StringVector = std::vector<std::string>;
@@ -146,7 +178,7 @@ public:
using EnumStoreType = typename EnumStoreTypeAndDictionaryType::EnumStoreType;
EnumStoreType es;
FloatEnumStoreTest()
- : es(false, EnumStoreTypeAndDictionaryType::type)
+ : es(false, DictionaryConfig(EnumStoreTypeAndDictionaryType::type, EnumStoreTypeAndDictionaryType::match))
{}
};
@@ -555,7 +587,7 @@ public:
};
-using LoaderTestTypes = ::testing::Types<BTreeNumericEnumStore, BTreeFloatEnumStore, BTreeStringEnumStore, HybridNumericEnumStore, HybridFloatEnumStore, HybridStringEnumStore, HashNumericEnumStore, HashFloatEnumStore, HashStringEnumStore>;
+using LoaderTestTypes = ::testing::Types<BTreeNumericEnumStore, BTreeFloatEnumStore, BTreeStringEnumStore, HybridNumericEnumStore, HybridFloatEnumStore, HybridStringEnumStore, HashNumericEnumStore, HashFloatEnumStore, HashStringEnumStore, BTreeCasedStringEnumStore, HybridCasedStringEnumStore, HashCasedStringEnumStore>;
TYPED_TEST_SUITE(LoaderTest, LoaderTestTypes);
TYPED_TEST(LoaderTest, store_is_instantiated_with_enumerated_loader)
diff --git a/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp b/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp
index b3ef9ed5ca0..651a6fd7edf 100644
--- a/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/enumcomparator.cpp
@@ -12,25 +12,25 @@ EnumStoreComparator<EntryT>::equal_helper(const EntryT& lhs, const EntryT& rhs)
return vespalib::datastore::UniqueStoreComparatorHelper<EntryT>::equal(lhs, rhs);
}
-EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, bool fold)
+EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy)
: ParentType(data_store, nullptr),
- _fold(fold),
+ _compare_strategy(compare_strategy),
_prefix(false),
_prefix_len(0)
{
}
-EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value)
+EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value)
: ParentType(data_store, lookup_value),
- _fold(fold),
+ _compare_strategy(compare_strategy),
_prefix(false),
_prefix_len(0)
{
}
-EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value, bool prefix)
+EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value, bool prefix)
: ParentType(data_store, lookup_value),
- _fold(fold),
+ _compare_strategy(compare_strategy),
_prefix(prefix),
_prefix_len(0)
{
@@ -41,14 +41,21 @@ EnumStoreStringComparator::EnumStoreStringComparator(const DataStoreType& data_s
bool
EnumStoreStringComparator::less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const {
- return _fold
- ? (use_prefix()
+ switch (_compare_strategy) {
+ case CompareStrategy::UNCASED:
+ return (use_prefix()
? (FoldedStringCompare::compareFoldedPrefix<true, true>(get(lhs), get(rhs), _prefix_len) < 0)
- : (FoldedStringCompare::compareFolded<true, true>(get(lhs), get(rhs)) < 0))
- : (use_prefix()
+ : (FoldedStringCompare::compareFolded<true, true>(get(lhs), get(rhs)) < 0));
+ case CompareStrategy::CASED:
+ return (use_prefix()
+ ? (FoldedStringCompare::compareFoldedPrefix<false, false>(get(lhs), get(rhs), _prefix_len) < 0)
+ : (FoldedStringCompare::compareFolded<false, false>(get(lhs), get(rhs)) < 0));
+ case CompareStrategy::UNCASED_THEN_CASED:
+ default:
+ return (use_prefix()
? (FoldedStringCompare::comparePrefix(get(lhs), get(rhs), _prefix_len) < 0)
: (FoldedStringCompare::compare(get(lhs), get(rhs)) < 0));
-
+ }
}
template class EnumStoreComparator<int8_t>;
diff --git a/searchlib/src/vespa/searchlib/attribute/enumcomparator.h b/searchlib/src/vespa/searchlib/attribute/enumcomparator.h
index 4bc09fcfe2a..3aec7c18461 100644
--- a/searchlib/src/vespa/searchlib/attribute/enumcomparator.h
+++ b/searchlib/src/vespa/searchlib/attribute/enumcomparator.h
@@ -51,35 +51,59 @@ protected:
private:
using ParentType::get;
+ /*
+ * CompareStrategy determines how to compare string values:
+ *
+ * UNCASED_THEN_CASED compares the values ignoring case (i.e.
+ * performs case folding during the first compare). If the values
+ * are equal then they are compared again using case.
+ *
+ * UNCASED compares the values ignoring case.
+ *
+ * CASED compares the value, using case.
+ *
+ * Only UNCASED_THEN_CASED or CASED can be used for sorting.
+ * UNCASED can be used for lookup during search when sort order is
+ * UNCASED_THEN_CASED.
+ *
+ * UNCASED_THEN_CASED sort order: ["BAR", "bar", "FOO", "foo"]
+ * CASED sort order: ["BAR", "FOO", "bar", "foo"]
+ */
+ enum class CompareStrategy : uint8_t {
+ UNCASED_THEN_CASED,
+ UNCASED,
+ CASED
+ };
+
public:
- EnumStoreStringComparator(const DataStoreType& data_store)
- : EnumStoreStringComparator(data_store, false)
+ EnumStoreStringComparator(const DataStoreType& data_store, bool cased)
+ : EnumStoreStringComparator(data_store, cased ? CompareStrategy::CASED : CompareStrategy::UNCASED_THEN_CASED)
{}
private:
- EnumStoreStringComparator(const DataStoreType& data_store, bool fold);
+ EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy);
/**
* Creates a comparator using the given low-level data store and that uses the
* given value during compare if the enum index is invalid.
*/
- EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value);
- EnumStoreStringComparator(const DataStoreType& data_store, bool fold, const char* lookup_value, bool prefix);
+ EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value);
+ EnumStoreStringComparator(const DataStoreType& data_store, CompareStrategy compare_strategy, const char* lookup_value, bool prefix);
public:
bool less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const override;
EnumStoreStringComparator make_folded() const {
- return EnumStoreStringComparator(_store, true);
+ return EnumStoreStringComparator(_store, _compare_strategy == CompareStrategy::UNCASED_THEN_CASED ? CompareStrategy::UNCASED : _compare_strategy);
}
EnumStoreStringComparator make_for_lookup(const char* lookup_value) const {
- return EnumStoreStringComparator(_store, _fold, lookup_value);
+ return EnumStoreStringComparator(_store, _compare_strategy, lookup_value);
}
EnumStoreStringComparator make_for_prefix_lookup(const char* lookup_value) const {
- return EnumStoreStringComparator(_store, _fold, lookup_value, true);
+ return EnumStoreStringComparator(_store, _compare_strategy, lookup_value, true);
}
private:
inline bool use_prefix() const noexcept { return _prefix; }
- const bool _fold;
+ const CompareStrategy _compare_strategy;
const bool _prefix;
uint32_t _prefix_len;
};
diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp
index 140ac207d32..135bbcf5a72 100644
--- a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp
@@ -26,6 +26,17 @@ namespace search {
using vespalib::datastore::CompactionStrategy;
using vespalib::datastore::EntryComparator;
+template <typename Comparator, typename DataStore>
+Comparator
+make_enum_store_comparator(const DataStore& data_store, const DictionaryConfig& dict_cfg)
+{
+ if constexpr (std::is_same_v<Comparator, EnumStoreStringComparator>) {
+ return Comparator(data_store, dict_cfg.getMatch() == DictionaryConfig::Match::CASED);
+ } else {
+ return Comparator(data_store);
+ }
+}
+
std::unique_ptr<vespalib::datastore::IUniqueStoreDictionary>
make_enum_store_dictionary(IEnumStore &store, bool has_postings, const search::DictionaryConfig & dict_cfg,
std::unique_ptr<EntryComparator> compare,
@@ -74,7 +85,7 @@ EnumStoreT<EntryT>::load_unique_value(const void* src, size_t available, Index&
template <typename EntryT>
EnumStoreT<EntryT>::EnumStoreT(bool has_postings, const DictionaryConfig& dict_cfg, std::shared_ptr<vespalib::alloc::MemoryAllocator> memory_allocator, EntryType default_value)
- : _store(std::move(memory_allocator)),
+ : _store(std::move(memory_allocator), [&dict_cfg](const auto& data_store) { return make_enum_store_comparator<ComparatorType>(data_store, dict_cfg); }),
_dict(),
_is_folded(dict_cfg.getMatch() == DictionaryConfig::Match::UNCASED),
_foldedComparator(make_optionally_folded_comparator(is_folded())),