diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-09-15 16:47:34 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-15 16:47:34 +0200 |
commit | 6c18fe68fb0803d4ab1f93ff3033a43ba44a552d (patch) | |
tree | 500e6c0fba2f2a5bdff29dc00d13de060f4e84c7 | |
parent | 137c710f508a762ffffc000351e17e8b553b9c59 (diff) | |
parent | 7f2be58127647a95c9bc44d1f7100e65a3fcfc09 (diff) |
Merge pull request #28547 from vespa-engine/toregge/add-dfa-string-comparator
Add DfaStringComparator.
6 files changed, 95 insertions, 0 deletions
diff --git a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp index 4a02ee7e943..d1bb5260b45 100644 --- a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp +++ b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/searchlib/attribute/enumcomparator.h> +#include <vespa/searchlib/attribute/dfa_string_comparator.h> #include <vespa/vespalib/btree/btreeroot.h> #include <vespa/vespalib/gtest/gtest.h> @@ -23,6 +24,8 @@ using TreeType = BTreeRoot<AtomicEntryRef, BTreeNoLeafData, const vespalib::datastore::EntryComparatorWrapper>; using NodeAllocator = TreeType::NodeAllocatorType; +using attribute::DfaStringComparator; + TEST(EnumComparatorTest, require_that_numeric_less_is_working) { @@ -192,6 +195,28 @@ TEST(EnumComparatorTest, require_that_folded_equal_is_working) EXPECT_TRUE(cmp3.equal(EnumIndex(), EnumIndex())); // similar when prefix } +TEST(DfaStringComparatorTest, require_that_less_is_working) +{ + StringEnumStore es(false, DictionaryConfig::Type::BTREE); + EnumIndex e1 = es.insert("Aa"); + EnumIndex e2 = es.insert("aa"); + EnumIndex e3 = es.insert("aB"); + DfaStringComparator cmp1(es.get_data_store(), "aa"); + EXPECT_FALSE(cmp1.less(EnumIndex(), e1)); + EXPECT_FALSE(cmp1.less(EnumIndex(), e2)); + EXPECT_TRUE(cmp1.less(EnumIndex(), e3)); + EXPECT_FALSE(cmp1.less(e1, EnumIndex())); + EXPECT_FALSE(cmp1.less(e2, EnumIndex())); + EXPECT_FALSE(cmp1.less(e3, EnumIndex())); + DfaStringComparator cmp2(es.get_data_store(), "Aa"); + EXPECT_TRUE(cmp2.less(EnumIndex(), e1)); + EXPECT_TRUE(cmp2.less(EnumIndex(), e2)); + EXPECT_TRUE(cmp2.less(EnumIndex(), e3)); + EXPECT_FALSE(cmp2.less(e1, EnumIndex())); + EXPECT_FALSE(cmp2.less(e2, EnumIndex())); + EXPECT_FALSE(cmp2.less(e3, EnumIndex())); +} + } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt index c54169ffd11..26f714258eb 100644 --- a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt @@ -36,6 +36,7 @@ vespa_add_library(searchlib_attribute OBJECT createsinglefastsearch.cpp createsinglestd.cpp defines.cpp + dfa_string_comparator.cpp distance_metric_utils.cpp diversity.cpp dociditerator.cpp diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp new file mode 100644 index 00000000000..ddbe4fd110f --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp @@ -0,0 +1,31 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "dfa_string_comparator.h" +#include <vespa/searchlib/util/foldedstringcompare.h> + +namespace search::attribute { + +DfaStringComparator::DfaStringComparator(const DataStoreType& data_store, const char* candidate) + : ParentType(data_store, candidate) +{ +} + +bool +DfaStringComparator::less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const +{ + if (lhs.valid()) { + if (rhs.valid()) { + return FoldedStringCompare::compareFolded<true, true>(get(lhs), get(rhs)) < 0; + } else { + return FoldedStringCompare::compareFolded<true, false>(get(lhs), get(rhs)) < 0; + } + } else { + if (rhs.valid()) { + return FoldedStringCompare::compareFolded<false, true>(get(lhs), get(rhs)) < 0; + } else { + return FoldedStringCompare::compareFolded<false, false>(get(lhs), get(rhs)) < 0; + } + } +} + +} diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h new file mode 100644 index 00000000000..7ef14aa1719 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h @@ -0,0 +1,34 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_enum_store.h" +#include <vespa/vespalib/datastore/unique_store_string_comparator.h> + +namespace search::attribute { + +/** + * Less-than comparator used for comparing next candidate string + * (successor) from vespa::fuzzy::LevenshteinDfa with strings stored + * in an enum store as part of a dictionary iterator seek, skipping + * entries that don't match the fuzzy term. + * + * The code points from the candidate string are not folded during + * the comparison. + */ + +class DfaStringComparator : public vespalib::datastore::UniqueStoreStringComparator<IEnumStore::InternalIndex> +{ +public: + using ParentType = vespalib::datastore::UniqueStoreStringComparator<IEnumStore::InternalIndex>; + using DataStoreType = ParentType::DataStoreType; +private: + using ParentType::get; + +public: + DfaStringComparator(const DataStoreType& data_store, const char* candidate); + + bool less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const override; +}; + +} diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.h b/searchlib/src/vespa/searchlib/attribute/enumstore.h index f6467194d74..73980c5bcd3 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.h +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.h @@ -239,6 +239,9 @@ public: auto cmp = make_folded_comparator(value); return _dict->find_matching_enums(cmp); } + const vespalib::datastore::DataStoreT<IEnumStore::InternalIndex>& get_data_store() const noexcept { + return _store.get_data_store(); + } }; template <> diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h b/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h index e71dcd3aafb..579facd28b6 100644 --- a/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h +++ b/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h @@ -4,6 +4,7 @@ #include "entry_comparator.h" #include "unique_store_string_allocator.h" +#include <vespa/vespalib/stllike/hash_fun.h> namespace vespalib::datastore { |