summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2023-09-15 16:47:34 +0200
committerGitHub <noreply@github.com>2023-09-15 16:47:34 +0200
commit6c18fe68fb0803d4ab1f93ff3033a43ba44a552d (patch)
tree500e6c0fba2f2a5bdff29dc00d13de060f4e84c7
parent137c710f508a762ffffc000351e17e8b553b9c59 (diff)
parent7f2be58127647a95c9bc44d1f7100e65a3fcfc09 (diff)
Merge pull request #28547 from vespa-engine/toregge/add-dfa-string-comparator
Add DfaStringComparator.
-rw-r--r--searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp25
-rw-r--r--searchlib/src/vespa/searchlib/attribute/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp31
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h34
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enumstore.h3
-rw-r--r--vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h1
6 files changed, 95 insertions, 0 deletions
diff --git a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
index 4a02ee7e943..d1bb5260b45 100644
--- a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
+++ b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/searchlib/attribute/enumcomparator.h>
+#include <vespa/searchlib/attribute/dfa_string_comparator.h>
#include <vespa/vespalib/btree/btreeroot.h>
#include <vespa/vespalib/gtest/gtest.h>
@@ -23,6 +24,8 @@ using TreeType = BTreeRoot<AtomicEntryRef, BTreeNoLeafData,
const vespalib::datastore::EntryComparatorWrapper>;
using NodeAllocator = TreeType::NodeAllocatorType;
+using attribute::DfaStringComparator;
+
TEST(EnumComparatorTest, require_that_numeric_less_is_working)
{
@@ -192,6 +195,28 @@ TEST(EnumComparatorTest, require_that_folded_equal_is_working)
EXPECT_TRUE(cmp3.equal(EnumIndex(), EnumIndex())); // similar when prefix
}
+TEST(DfaStringComparatorTest, require_that_less_is_working)
+{
+ StringEnumStore es(false, DictionaryConfig::Type::BTREE);
+ EnumIndex e1 = es.insert("Aa");
+ EnumIndex e2 = es.insert("aa");
+ EnumIndex e3 = es.insert("aB");
+ DfaStringComparator cmp1(es.get_data_store(), "aa");
+ EXPECT_FALSE(cmp1.less(EnumIndex(), e1));
+ EXPECT_FALSE(cmp1.less(EnumIndex(), e2));
+ EXPECT_TRUE(cmp1.less(EnumIndex(), e3));
+ EXPECT_FALSE(cmp1.less(e1, EnumIndex()));
+ EXPECT_FALSE(cmp1.less(e2, EnumIndex()));
+ EXPECT_FALSE(cmp1.less(e3, EnumIndex()));
+ DfaStringComparator cmp2(es.get_data_store(), "Aa");
+ EXPECT_TRUE(cmp2.less(EnumIndex(), e1));
+ EXPECT_TRUE(cmp2.less(EnumIndex(), e2));
+ EXPECT_TRUE(cmp2.less(EnumIndex(), e3));
+ EXPECT_FALSE(cmp2.less(e1, EnumIndex()));
+ EXPECT_FALSE(cmp2.less(e2, EnumIndex()));
+ EXPECT_FALSE(cmp2.less(e3, EnumIndex()));
+}
+
}
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
index c54169ffd11..26f714258eb 100644
--- a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt
@@ -36,6 +36,7 @@ vespa_add_library(searchlib_attribute OBJECT
createsinglefastsearch.cpp
createsinglestd.cpp
defines.cpp
+ dfa_string_comparator.cpp
distance_metric_utils.cpp
diversity.cpp
dociditerator.cpp
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp
new file mode 100644
index 00000000000..ddbe4fd110f
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp
@@ -0,0 +1,31 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "dfa_string_comparator.h"
+#include <vespa/searchlib/util/foldedstringcompare.h>
+
+namespace search::attribute {
+
+DfaStringComparator::DfaStringComparator(const DataStoreType& data_store, const char* candidate)
+ : ParentType(data_store, candidate)
+{
+}
+
+bool
+DfaStringComparator::less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const
+{
+ if (lhs.valid()) {
+ if (rhs.valid()) {
+ return FoldedStringCompare::compareFolded<true, true>(get(lhs), get(rhs)) < 0;
+ } else {
+ return FoldedStringCompare::compareFolded<true, false>(get(lhs), get(rhs)) < 0;
+ }
+ } else {
+ if (rhs.valid()) {
+ return FoldedStringCompare::compareFolded<false, true>(get(lhs), get(rhs)) < 0;
+ } else {
+ return FoldedStringCompare::compareFolded<false, false>(get(lhs), get(rhs)) < 0;
+ }
+ }
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h
new file mode 100644
index 00000000000..7ef14aa1719
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h
@@ -0,0 +1,34 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "i_enum_store.h"
+#include <vespa/vespalib/datastore/unique_store_string_comparator.h>
+
+namespace search::attribute {
+
+/**
+ * Less-than comparator used for comparing next candidate string
+ * (successor) from vespa::fuzzy::LevenshteinDfa with strings stored
+ * in an enum store as part of a dictionary iterator seek, skipping
+ * entries that don't match the fuzzy term.
+ *
+ * The code points from the candidate string are not folded during
+ * the comparison.
+ */
+
+class DfaStringComparator : public vespalib::datastore::UniqueStoreStringComparator<IEnumStore::InternalIndex>
+{
+public:
+ using ParentType = vespalib::datastore::UniqueStoreStringComparator<IEnumStore::InternalIndex>;
+ using DataStoreType = ParentType::DataStoreType;
+private:
+ using ParentType::get;
+
+public:
+ DfaStringComparator(const DataStoreType& data_store, const char* candidate);
+
+ bool less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const override;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.h b/searchlib/src/vespa/searchlib/attribute/enumstore.h
index f6467194d74..73980c5bcd3 100644
--- a/searchlib/src/vespa/searchlib/attribute/enumstore.h
+++ b/searchlib/src/vespa/searchlib/attribute/enumstore.h
@@ -239,6 +239,9 @@ public:
auto cmp = make_folded_comparator(value);
return _dict->find_matching_enums(cmp);
}
+ const vespalib::datastore::DataStoreT<IEnumStore::InternalIndex>& get_data_store() const noexcept {
+ return _store.get_data_store();
+ }
};
template <>
diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h b/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h
index e71dcd3aafb..579facd28b6 100644
--- a/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h
+++ b/vespalib/src/vespa/vespalib/datastore/unique_store_string_comparator.h
@@ -4,6 +4,7 @@
#include "entry_comparator.h"
#include "unique_store_string_allocator.h"
+#include <vespa/vespalib/stllike/hash_fun.h>
namespace vespalib::datastore {