about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Tor Egge <Tor.Egge@online.no>, 2023-09-21 16:52:17 +0200
committer: Tor Egge <Tor.Egge@online.no>, 2023-09-21 16:52:17 +0200
commit: 884687f0d70161c67af5c918941b30ba98b797ec (patch)
tree: 0c15738d6a45bded5927c567b87f4b51455bd97f
parent: 95daa49ef952798c71b096b28a9ccd3c6f124478 (diff)
Use UTF-32 dfa fuzzy match successor.
-rw-r--r-- searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp 26
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h 4
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp 11
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h 4
4 files changed, 35 insertions, 10 deletions
diff --git a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
index 1c7b8b2b695..975a2918026 100644
--- a/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
+++ b/searchlib/src/tests/attribute/enum_comparator/enum_comparator_test.cpp
@@ -4,12 +4,16 @@
#include <vespa/searchlib/attribute/dfa_string_comparator.h>
#include <vespa/vespalib/btree/btreeroot.h>
#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vespalib/text/lowercase.h>
+#include <vespa/vespalib/text/utf8.h>
#include <vespa/searchlib/attribute/enumstore.hpp>
using namespace vespalib::btree;
using vespalib::datastore::AtomicEntryRef;
+using vespalib::LowerCase;
+using vespalib::Utf8ReaderForZTS;
namespace vespalib::datastore {
@@ -18,6 +22,22 @@ std::ostream & operator << (std::ostream& os, const EntryRef& ref) {
}
}
+
+namespace {
+
+std::vector<uint32_t> as_utf32(const char* key) // Test helper: collects every value that Utf8ReaderForZTS yields for `key` into a vector.
+{
+ std::vector<uint32_t> result;
+ Utf8ReaderForZTS reader(key); // presumably decodes `key` as zero-terminated UTF-8 (going by the reader's name) - confirm against vespalib/text/utf8.h
+ while (reader.hasMore()) {
+ uint32_t code_point = reader.getChar(); // one value is read per iteration and appended in reading order
+ result.push_back(code_point);
+ }
+ return result; // stays empty when the reader reports no input on the first hasMore() check
+}
+
+}
+
namespace search {
using NumericEnumStore = EnumStoreT<int32_t>;
@@ -253,14 +273,16 @@ TEST(DfaStringComparatorTest, require_that_less_is_working)
EnumIndex e1 = es.insert("Aa");
EnumIndex e2 = es.insert("aa");
EnumIndex e3 = es.insert("aB");
- DfaStringComparator cmp1(es.get_data_store(), "aa");
+ auto aa_utf32 = as_utf32("aa");
+ DfaStringComparator cmp1(es.get_data_store(), aa_utf32);
EXPECT_FALSE(cmp1.less(EnumIndex(), e1));
EXPECT_FALSE(cmp1.less(EnumIndex(), e2));
EXPECT_TRUE(cmp1.less(EnumIndex(), e3));
EXPECT_FALSE(cmp1.less(e1, EnumIndex()));
EXPECT_FALSE(cmp1.less(e2, EnumIndex()));
EXPECT_FALSE(cmp1.less(e3, EnumIndex()));
- DfaStringComparator cmp2(es.get_data_store(), "Aa");
+ auto Aa_utf32 = as_utf32("Aa");
+ DfaStringComparator cmp2(es.get_data_store(), Aa_utf32);
EXPECT_TRUE(cmp2.less(EnumIndex(), e1));
EXPECT_TRUE(cmp2.less(EnumIndex(), e2));
EXPECT_TRUE(cmp2.less(EnumIndex(), e3));
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index 6b873020994..fcba13f85a4 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -17,7 +17,7 @@ namespace search::attribute {
class DfaFuzzyMatcher {
private:
vespalib::fuzzy::LevenshteinDfa _dfa;
- std::string _successor;
+ std::vector<uint32_t> _successor;
public:
DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
@@ -29,7 +29,7 @@ public:
if (match.matches()) {
return true;
} else {
- DfaStringComparator cmp(data_store, _successor.c_str());
+ DfaStringComparator cmp(data_store, _successor);
itr.seek(vespalib::datastore::AtomicEntryRef(), cmp);
return false;
}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp
index ddbe4fd110f..e9710553ef1 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.cpp
@@ -5,8 +5,9 @@
namespace search::attribute {
-DfaStringComparator::DfaStringComparator(const DataStoreType& data_store, const char* candidate)
- : ParentType(data_store, candidate)
+DfaStringComparator::DfaStringComparator(const DataStoreType& data_store, const std::vector<uint32_t>& candidate) // candidate is now passed as a vector of uint32_t values instead of a C string
+ : ParentType(data_store), // the candidate is no longer forwarded to the parent comparator
+ _candidate(std::cref(candidate)) // stores a reference wrapper, not a copy: `candidate` must outlive this comparator
{
}
@@ -17,13 +18,13 @@ DfaStringComparator::less(const vespalib::datastore::EntryRef lhs, const vespali
if (rhs.valid()) {
return FoldedStringCompare::compareFolded<true, true>(get(lhs), get(rhs)) < 0;
} else {
- return FoldedStringCompare::compareFolded<true, false>(get(lhs), get(rhs)) < 0;
+ return FoldedStringCompare::compareFolded<true, false>(get(lhs), _candidate) < 0;
}
} else {
if (rhs.valid()) {
- return FoldedStringCompare::compareFolded<false, true>(get(lhs), get(rhs)) < 0;
+ return FoldedStringCompare::compareFolded<false, true>(_candidate, get(rhs)) < 0;
} else {
- return FoldedStringCompare::compareFolded<false, false>(get(lhs), get(rhs)) < 0;
+ return false;
}
}
}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h
index 7ef14aa1719..8c80035c8fb 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_string_comparator.h
@@ -4,6 +4,7 @@
#include "i_enum_store.h"
#include <vespa/vespalib/datastore/unique_store_string_comparator.h>
+#include <functional>
namespace search::attribute {
@@ -24,9 +25,10 @@ public:
using DataStoreType = ParentType::DataStoreType;
private:
using ParentType::get;
+ std::reference_wrapper<const std::vector<uint32_t>> _candidate;
public:
- DfaStringComparator(const DataStoreType& data_store, const char* candidate);
+ DfaStringComparator(const DataStoreType& data_store, const std::vector<uint32_t>& candidate);
bool less(const vespalib::datastore::EntryRef lhs, const vespalib::datastore::EntryRef rhs) const override;
};