diff options
author | Alexey Chernyshev <aleksei@spotify.com> | 2022-04-04 16:23:07 +0200 |
---|---|---|
committer | Alexey Chernyshev <aleksei@spotify.com> | 2022-04-07 14:44:30 +0200 |
commit | 7e9b33401201db9a9e22971dd419247e268bbfaa (patch) | |
tree | f5032a82e9fa74247b2fdeb3dcde4dc6cf98ce89 | |
parent | ad7cc1d11f0c19baa2344a643377576c559555f7 (diff) |
Propagating annotations for fuzzy query
31 files changed, 352 insertions, 82 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index dab61dfd46d..958dfaf0c65 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -548,7 +548,11 @@ "public" ], "methods": [ - "public void <init>(java.lang.String, boolean, java.lang.String)", + "public void <init>(java.lang.String, boolean, java.lang.String, int, int)", + "public void setMaxEditDistance(int)", + "public void setPrefixLength(int)", + "public int getPrefixLength()", + "public int getMaxEditDistance()", "public void setValue(java.lang.String)", "public java.lang.String getRawWord()", "public boolean isWords()", @@ -560,10 +564,13 @@ "public int getNumWords()", "public boolean equals(java.lang.Object)", "public int hashCode()", - "public java.lang.String toString()", + "protected void appendHeadingString(java.lang.StringBuilder)", "protected void encodeThis(java.nio.ByteBuffer)" ], - "fields": [] + "fields": [ + "public static int DefaultMaxEditDistance", + "public static int DefaultPrefixLength" + ] }, "com.yahoo.prelude.query.GeoLocationItem": { "superClass": "com.yahoo.prelude.query.TermItem", diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java index 74c31a5d1f0..fda96aa6ecc 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java @@ -1,7 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import com.yahoo.compress.IntegerCompressor; + import java.nio.ByteBuffer; +import java.util.Objects; /** * Fuzzy search term @@ -11,9 +14,37 @@ import java.nio.ByteBuffer; public class FuzzyItem extends TermItem { private String term; - public FuzzyItem(String indexName, boolean isFromQuery, String term) { + private int maxEditDistance; + private int prefixLength; + + public static int DefaultMaxEditDistance = 2; + public static int DefaultPrefixLength = 0; + + public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) { super(indexName, isFromQuery, null); setValue(term); + setMaxEditDistance(maxEditDistance); + setPrefixLength(prefixLength); + } + + public void setMaxEditDistance(int maxEditDistance) { + if (maxEditDistance < 0) + throw new IllegalArgumentException("Can not use negative maxEditDistance " + maxEditDistance); + this.maxEditDistance = maxEditDistance; + } + + public void setPrefixLength(int prefixLength) { + if (prefixLength < 0) + throw new IllegalArgumentException("Can not use negative prefixLength " + prefixLength); + this.prefixLength = prefixLength; + } + + public int getPrefixLength() { + return this.prefixLength; + } + + public int getMaxEditDistance() { + return this.maxEditDistance; } @Override @@ -73,35 +104,36 @@ public class FuzzyItem extends TermItem { return false; } FuzzyItem other = (FuzzyItem) obj; - if (term == null) { - if (other.term != null) { - return false; - } - } else if (!term.equals(other.term)) { - return false; - } + if (!this.term.equals(other.term)) return false; + if (this.maxEditDistance != other.maxEditDistance) return false; + if (this.prefixLength != other.prefixLength) return false; return true; } @Override public int hashCode() { - final int prime = 31; - int result = super.hashCode(); - result = prime * result + ((term == null) ? 0 : term.hashCode()); - return result; + return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength); } @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("FuzzyItem [term=").append(term).append("]"); - return builder.toString(); + protected void appendHeadingString(StringBuilder buffer) { + buffer.append(getName()); + buffer.append("("); + buffer.append(this.term); + buffer.append(","); + buffer.append(this.maxEditDistance); + buffer.append(","); + buffer.append(this.prefixLength); + buffer.append(")"); + buffer.append(" "); } @Override protected void encodeThis(ByteBuffer buffer) { super.encodeThis(buffer); putString(getIndexedString(), buffer); + IntegerCompressor.putCompressedPositiveNumber(this.maxEditDistance, buffer); + IntegerCompressor.putCompressedPositiveNumber(this.prefixLength, buffer); } } diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java index 320148ec01d..c1d1cdd566b 100644 --- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java +++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java @@ -61,6 +61,8 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import static com.yahoo.search.yql.YqlParser.MAX_EDIT_DISTANCE; +import static com.yahoo.search.yql.YqlParser.PREFIX_LENGTH; import static com.yahoo.slime.Type.ARRAY; import static com.yahoo.slime.Type.DOUBLE; import static com.yahoo.slime.Type.LONG; @@ -1161,9 +1163,16 @@ public class SelectParser implements Parser { private Item instantiateFuzzyItem(String field, String key, Inspector value) { HashMap<Integer, Inspector> children = childMap(value); + HashMap<String, Inspector> annotations = getAnnotationMap(value); + Preconditions.checkArgument(children.size() == 1, "Expected 1 argument, got %s.", children.size()); + String wordData = children.get(0).asString(); - FuzzyItem fuzzy = new FuzzyItem(field, true, wordData); + + Integer maxEditDistance = getIntegerAnnotation(MAX_EDIT_DISTANCE, annotations, FuzzyItem.DefaultMaxEditDistance); + Integer prefixLength = getIntegerAnnotation(PREFIX_LENGTH, annotations, FuzzyItem.DefaultPrefixLength); + + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength); return leafStyleSettings(getAnnotations(value), fuzzy); } diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java index e778798b0e5..4b511df5e5f 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java +++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java @@ -21,6 +21,7 @@ import static com.yahoo.search.yql.YqlParser.GEO_LOCATION; import static com.yahoo.search.yql.YqlParser.HIT_LIMIT; import static com.yahoo.search.yql.YqlParser.IMPLICIT_TRANSFORMS; import static com.yahoo.search.yql.YqlParser.LABEL; +import static com.yahoo.search.yql.YqlParser.MAX_EDIT_DISTANCE; import static com.yahoo.search.yql.YqlParser.NEAR; import static com.yahoo.search.yql.YqlParser.NEAREST_NEIGHBOR; import static com.yahoo.search.yql.YqlParser.NORMALIZE_CASE; @@ -31,6 +32,7 @@ import static com.yahoo.search.yql.YqlParser.ORIGIN_OFFSET; import static com.yahoo.search.yql.YqlParser.ORIGIN_ORIGINAL; import static com.yahoo.search.yql.YqlParser.PHRASE; import static com.yahoo.search.yql.YqlParser.PREFIX; +import static com.yahoo.search.yql.YqlParser.PREFIX_LENGTH; import static com.yahoo.search.yql.YqlParser.RANGE; import static com.yahoo.search.yql.YqlParser.RANK; import static com.yahoo.search.yql.YqlParser.RANKED; @@ -526,7 +528,8 @@ public class VespaSerializer { @Override boolean serialize(StringBuilder destination, FuzzyItem fuzzy) { - String annotations = leafAnnotations(fuzzy); + String annotations = fuzzyAnnotations(fuzzy); + destination.append(normalizeIndexName(fuzzy.getIndexName())).append(" contains "); if (annotations.length() > 0) { @@ -543,6 +546,30 @@ public class VespaSerializer { } return false; } + + static String fuzzyAnnotations(FuzzyItem fuzzyItem) { + boolean isMaxEditDistanceSet = fuzzyItem.getMaxEditDistance() != FuzzyItem.DefaultMaxEditDistance; + boolean isPrefixLengthSet = fuzzyItem.getPrefixLength() != FuzzyItem.DefaultPrefixLength; + boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet; + + StringBuilder builder = new StringBuilder(); + if (anyAnnotationSet) { + builder.append("[{"); + } + if (isMaxEditDistanceSet) { + builder.append(MAX_EDIT_DISTANCE + ":").append(fuzzyItem.getMaxEditDistance()); + } + if (isMaxEditDistanceSet && isPrefixLengthSet) { + builder.append(","); + } + if (isPrefixLengthSet) { + builder.append(PREFIX_LENGTH + ":").append(fuzzyItem.getPrefixLength()); + } + if (anyAnnotationSet) { + builder.append("}]"); + } + return builder.toString(); + } } private static class ONearSerializer extends Serializer<ONearItem> { diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java index 06ee0e706f3..fcb19dde10d 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java +++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java @@ -126,6 +126,8 @@ public class YqlParser implements Parser { private static final String RANKED_DESCRIPTION = "setting for whether to use term for ranking"; private static final String STEM_DESCRIPTION = "setting for whether to use stem if field implies it"; private static final String USE_POSITION_DATA_DESCRIPTION = "setting for whether to use position data for ranking this item"; + private static final String MAX_EDIT_DISTANCE_DESCRIPTION = "setting for an inclusive upper bound for a fuzzy edit-distance search"; + private static final String PREFIX_LENGTH_DESCRIPTION = "setting for a prefix length that is considered frozen for a fuzzy search"; private static final String USER_INPUT_ALLOW_EMPTY = "allowEmpty"; private static final String USER_INPUT_DEFAULT_INDEX = "defaultIndex"; private static final String USER_INPUT_GRAMMAR = "grammar"; @@ -194,6 +196,9 @@ public class YqlParser implements Parser { public static final String WEIGHT = "weight"; public static final String WEIGHTED_SET = "weightedSet"; public static final String FUZZY = "fuzzy"; + public static final String MAX_EDIT_DISTANCE = "maxEditDistance"; + public static final String PREFIX_LENGTH = "prefixLength"; + private final IndexFacts indexFacts; private final List<ConnectedItem> connectedItems = new ArrayList<>(); @@ -1313,7 +1318,21 @@ public class YqlParser implements Parser { String wordData = getStringContents(args.get(0)); - FuzzyItem fuzzy = new FuzzyItem(field, true, wordData); + Integer maxEditDistance = getAnnotation( + ast, + MAX_EDIT_DISTANCE, + Integer.class, + FuzzyItem.DefaultMaxEditDistance, + MAX_EDIT_DISTANCE_DESCRIPTION); + + Integer prefixLength = getAnnotation( + ast, + PREFIX_LENGTH, + Integer.class, + FuzzyItem.DefaultPrefixLength, + PREFIX_LENGTH_DESCRIPTION); + + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength); return leafStyleSettings(ast, fuzzy); } diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java index 1269c2a5aef..8a90d224003 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java @@ -449,4 +449,9 @@ public class VespaSerializerTestCase { parseAndConfirm("foo contains fuzzy(\"a\")"); } + @Test + public void testFuzzyAnnotations() { + parseAndConfirm("foo contains ([{maxEditDistance:3,prefixLength:5}]fuzzy(\"a\"))"); + } + } diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java index f40e212adde..0ee0597689f 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java @@ -47,9 +47,7 @@ import com.yahoo.search.searchchain.Execution; import org.junit.Test; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -389,6 +387,21 @@ public class YqlParserTestCase { assertSame(FuzzyItem.class, root.getClass()); assertEquals("baz", ((FuzzyItem) root).getIndexName()); assertEquals("a b", ((FuzzyItem) root).stringValue()); + assertEquals(FuzzyItem.DefaultMaxEditDistance, ((FuzzyItem) root).getMaxEditDistance()); + assertEquals(FuzzyItem.DefaultPrefixLength, ((FuzzyItem) root).getPrefixLength()); + } + + @Test + public void testFuzzyAnnotations() { + QueryTree x = parse( + "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10}fuzzy(\"a b\"))" + ); + Item root = x.getRoot(); + assertSame(FuzzyItem.class, root.getClass()); + assertEquals("baz", ((FuzzyItem) root).getIndexName()); + assertEquals("a b", ((FuzzyItem) root).stringValue()); + assertEquals(3, ((FuzzyItem) root).getMaxEditDistance()); + assertEquals(10, ((FuzzyItem) root).getPrefixLength()); } @Test diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp index aa64e944baa..947e4aa30c2 100644 --- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp +++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp @@ -402,7 +402,8 @@ SearchContextTest::buildTermQuery(std::vector<char> & buffer, const vespalib::st { uint32_t indexLen = index.size(); uint32_t termLen = term.size(); - uint32_t queryPacketSize = 1 + 2 * 4 + indexLen + termLen; + uint32_t fuzzyParametersSize = (termType == TermType::FUZZYTERM) ? 8 : 0; + uint32_t queryPacketSize = 1 + 2 * 4 + indexLen + termLen + fuzzyParametersSize; uint32_t p = 0; buffer.resize(queryPacketSize); switch (termType) { @@ -419,6 +420,12 @@ SearchContextTest::buildTermQuery(std::vector<char> & buffer, const vespalib::st p += vespalib::compress::Integer::compressPositive(termLen, &buffer[p]); memcpy(&buffer[p], term.c_str(), termLen); p += termLen; + + if (termType == TermType::FUZZYTERM) { + p += vespalib::compress::Integer::compressPositive(2, &buffer[p]); // max edit distance + p += vespalib::compress::Integer::compressPositive(0, &buffer[p]); // prefix length + } + buffer.resize(p); } diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp index 1f570e0a381..4deb287df0e 100644 --- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp +++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp @@ -388,8 +388,8 @@ testSingleValue(Attribute & svsa, Config &cfg) TEST("testSingleValue") { EXPECT_EQUAL(24u, sizeof(SearchContext)); - EXPECT_EQUAL(56u, sizeof(StringSearchHelper)); - EXPECT_EQUAL(104u, sizeof(attribute::SingleStringEnumSearchContext)); + EXPECT_EQUAL(88u, sizeof(StringSearchHelper)); + EXPECT_EQUAL(136u, sizeof(attribute::SingleStringEnumSearchContext)); { Config cfg(BasicType::STRING, CollectionType::SINGLE); SingleValueStringAttribute svsa("svsa", cfg); diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp index 0e8155e23c3..b1e979b5bcb 100644 --- a/searchlib/src/tests/query/customtypevisitor_test.cpp +++ b/searchlib/src/tests/query/customtypevisitor_test.cpp @@ -36,7 +36,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {}; struct MyStringTerm : InitTerm<StringTerm> {}; struct MySubstrTerm : InitTerm<SubstringTerm> {}; struct MySuffixTerm : InitTerm<SuffixTerm> {}; -struct MyFuzzyTerm : InitTerm<FuzzyTerm> {}; +struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0) {} }; struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} }; struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} }; struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} }; diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp index f770213e8e5..8e8be997be0 100644 --- a/searchlib/src/tests/query/query_visitor_test.cpp +++ b/searchlib/src/tests/query/query_visitor_test.cpp @@ -86,7 +86,7 @@ TEST("requireThatAllNodesCanBeVisited") { checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25)); checkVisit<TrueQueryNode>(new SimpleTrue()); checkVisit<FalseQueryNode>(new SimpleFalse()); - checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0))); + checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0)); } } // namespace diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp index 2ea566027c4..5b410879fa0 100644 --- a/searchlib/src/tests/query/querybuilder_test.cpp +++ b/searchlib/src/tests/query/querybuilder_test.cpp @@ -7,6 +7,8 @@ #include <vespa/searchlib/query/tree/querybuilder.h> #include <vespa/searchlib/query/tree/simplequery.h> #include <vespa/searchlib/query/tree/stackdumpcreator.h> +#include <vespa/searchlib/query/query_term_decoder.h> +#include <vespa/searchlib/query/query_term_simple.h> #include <vespa/vespalib/testkit/test_kit.h> #include <vespa/log/log.h> @@ -115,7 +117,7 @@ Node::UP createQueryTree() { builder.add_true_node(); builder.add_false_node(); } - builder.addFuzzyTerm(str[5], view[5], id[5], weight[5]); + builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1); } Node::UP node = builder.build(); ASSERT_TRUE(node.get()); @@ -311,6 +313,8 @@ void checkQueryTreeTypes(Node *node) { auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]); EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5])); + EXPECT_EQUAL(3u, fuzzy_term->getMaxEditDistance()); + EXPECT_EQUAL(1u, fuzzy_term->getPrefixLength()); } struct AbstractTypes { @@ -434,8 +438,9 @@ struct MyNearestNeighborTerm : NearestNeighborTerm { struct MyTrue : TrueQueryNode {}; struct MyFalse : FalseQueryNode {}; struct MyFuzzyTerm : FuzzyTerm { - MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w) - : FuzzyTerm(t, f, i, w) { + MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w, + uint32_t m, uint32_t p) + : FuzzyTerm(t, f, i, w, m, p) { } }; @@ -578,6 +583,7 @@ TEST("require that Query Tree Creator Can Replicate Queries") { TEST("require that Query Tree Creator Can Create Queries From Stack") { Node::UP node = createQueryTree<MyQueryNodeTypes>(); string stackDump = StackDumpCreator::create(*node); + SimpleQueryStackDumpIterator iterator(stackDump); Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator); @@ -618,6 +624,26 @@ TEST("require that All Range Syntaxes Work") { EXPECT_TRUE(range2 == range_term->getTerm()); } +TEST("require that fuzzy node can be created") { + QueryBuilder<SimpleQueryNodeTypes> builder; + builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1); + Node::UP node = builder.build(); + + string stackDump = StackDumpCreator::create(*node); + { + SimpleQueryStackDumpIterator iterator(stackDump); + Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator); + FuzzyTerm *fuzzy_node = as_node<FuzzyTerm>(new_node.get()); + EXPECT_EQUAL(3u, fuzzy_node->getMaxEditDistance()); + EXPECT_EQUAL(1u, fuzzy_node->getPrefixLength()); + } + { + search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump); + EXPECT_EQUAL(3u, queryTermSimple->getFuzzyMaxEditDistance()); + EXPECT_EQUAL(1u, queryTermSimple->getFuzzyPrefixLength()); + } +} + TEST("require that empty intermediate node can be added") { QueryBuilder<SimpleQueryNodeTypes> builder; builder.addAnd(0); diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp index ec292eda8eb..4b5433dd370 100644 --- a/searchlib/src/tests/query/streaming_query_test.cpp +++ b/searchlib/src/tests/query/streaming_query_test.cpp @@ -805,9 +805,9 @@ TEST("testSameElementEvaluate") { } TEST("Control the size of query terms") { - EXPECT_EQUAL(104u, sizeof(QueryTermSimple)); - EXPECT_EQUAL(120u, sizeof(QueryTermUCS4)); - EXPECT_EQUAL(264u, sizeof(QueryTerm)); + EXPECT_EQUAL(112u, sizeof(QueryTermSimple)); + EXPECT_EQUAL(128u, sizeof(QueryTermUCS4)); + EXPECT_EQUAL(272u, sizeof(QueryTerm)); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp index d876d80e73f..cd233b75438 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp @@ -27,7 +27,10 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) _term._char = term.getTerm(); _termLen = term.getTermLen(); } else if (isFuzzy()) { - _fuzzyMatcher = vespalib::FuzzyMatcher::from_term(term.getTerm()); + _fuzzyMatcher = vespalib::FuzzyMatcher::from_term( + term.getTerm(), + term.getFuzzyMaxEditDistance(), + term.getFuzzyPrefixLength()); } else { term.term(_term._ucs4); } diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp index 85b55284b35..f625a2b7fd2 100644 --- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp +++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp @@ -170,7 +170,6 @@ bool SimpleQueryStackDumpIterator::readNext() { case ParseItem::ITEM_EXACTSTRINGTERM: case ParseItem::ITEM_SUFFIXTERM: case ParseItem::ITEM_REGEXP: - case ParseItem::ITEM_FUZZY: _curr_index_name = read_stringref(p); _curr_term = read_stringref(p); _currArity = 0; @@ -188,6 +187,9 @@ bool SimpleQueryStackDumpIterator::readNext() { case ParseItem::ITEM_NEAREST_NEIGHBOR: if ( ! readNN(p)) return false; break; + case ParseItem::ITEM_FUZZY: + if (!readFuzzy(p)) return false; + break; case ParseItem::ITEM_TRUE: case ParseItem::ITEM_FALSE: // no content @@ -256,4 +258,14 @@ SimpleQueryStackDumpIterator::readComplexTerm(const char *& p) { return true; } +bool +SimpleQueryStackDumpIterator::readFuzzy(const char *&p) { + _curr_index_name = read_stringref(p); + _curr_term = read_stringref(p); // fuzzy term + _extraIntArg1 = readCompressedPositiveInt(p); // maxEditDistance + _extraIntArg2 = readCompressedPositiveInt(p); // prefixLength + _currArity = 0; + return true; +} + } diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h index 5cec16000b7..ca3e3f6be10 100644 --- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h +++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h @@ -54,6 +54,7 @@ private: VESPA_DLL_LOCAL bool readPredicate(const char *&p); VESPA_DLL_LOCAL bool readNN(const char *&p); VESPA_DLL_LOCAL bool readComplexTerm(const char *& p); + VESPA_DLL_LOCAL bool readFuzzy(const char *&p); VESPA_DLL_LOCAL bool readNext(); public: /** @@ -115,6 +116,10 @@ public: bool getAllowApproximate() const { return (_extraIntArg2 != 0); } uint32_t getExploreAdditionalHits() const { return _extraIntArg3; } + // fuzzy match arguments + uint32_t getFuzzyMaxEditDistance() const { return _extraIntArg1; } + uint32_t getFuzzyPrefixLength() const { return _extraIntArg2; } + query::PredicateQueryTerm::UP getPredicateQueryTerm() { return std::move(_predicate_query_term); } vespalib::stringref getIndexName() const { return _curr_index_name; } diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp index e0fcc8b690c..17e50216a23 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp @@ -210,7 +210,9 @@ QueryTermSimple::QueryTermSimple(const string & term_, Type type) _diversityCutoffStrict(false), _valid(true), _term(term_), - _diversityAttribute() + _diversityAttribute(), + _fuzzyMaxEditDistance(2), + _fuzzyPrefixLength(0) { if (isFullRange(_term)) { stringref rest(_term.c_str() + 1, _term.size() - 2); diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index 0d5dd116826..b84813cd12e 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -52,6 +52,8 @@ public: size_t getDiversityCutoffGroups() const { return _diversityCutoffGroups; } bool getDiversityCutoffStrict() const { return _diversityCutoffStrict; } vespalib::stringref getDiversityAttribute() const { return _diversityAttribute; } + size_t getFuzzyMaxEditDistance() const { return _fuzzyMaxEditDistance; } + size_t getFuzzyPrefixLength() const { return _fuzzyPrefixLength; } bool getAsIntegerTerm(int64_t & lower, int64_t & upper) const; bool getAsDoubleTerm(double & lower, double & upper) const; const char * getTerm() const { return _term.c_str(); } @@ -68,6 +70,7 @@ public: vespalib::string getClassName() const; bool isValid() const { return _valid; } const string & getTermString() const { return _term; } + private: bool getRangeInternal(int64_t & low, int64_t & high) const; template <typename N> @@ -84,6 +87,10 @@ private: stringref _diversityAttribute; template <typename T, typename D> bool getAsNumericTerm(T & lower, T & upper, D d) const; + +protected: + uint32_t _fuzzyMaxEditDistance; // set in QueryTerm + uint32_t _fuzzyPrefixLength; }; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 6f126c7a3eb..6d59886a4f5 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -140,6 +140,10 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor auto qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm); qt->setWeight(queryRep.GetWeight()); qt->setUniqueId(queryRep.getUniqueId()); + if (qt->isFuzzy()) { + qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance()); + qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength()); + } if (qt->encoding().isBase10Integer() || ! qt->encoding().isFloat() || ! factory.getRewriteFloatTerms() || diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 25f2b598413..34b1b87491e 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -85,6 +85,10 @@ public: void visitMembers(vespalib::ObjectVisitor &visitor) const override; void setIndex(const string & index_) override { _index = index_; } const string & getIndex() const override { return _index; } + void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; } + void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; } + uint32_t fuzzyMaxEditDistance() const { return _fuzzyMaxEditDistance; } + uint32_t fuzzyPrefixLength() const { return _fuzzyPrefixLength; } protected: using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>; string _index; diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h index ee3a944cce1..2273dadbfcf 100644 --- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h +++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h @@ -222,8 +222,9 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st } template <class NodeTypes> typename NodeTypes::FuzzyTerm * -createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight) { - return new typename NodeTypes::FuzzyTerm(term, view, id, weight); +createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight, + uint32_t maxEditDistance, uint32_t prefixLength) { + return new typename NodeTypes::FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength); } @@ -333,9 +334,10 @@ public: adjustWeight(weight); return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight)); } - typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight) { + typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight, + uint32_t maxEditDistance, uint32_t prefixLength) { adjustWeight(weight); - return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight)); + return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, maxEditDistance, prefixLength)); } typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name, int32_t id, Weight weight, uint32_t target_num_hits, diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h index ecaee350b21..52cfbc5effb 100644 --- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h +++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h @@ -197,7 +197,8 @@ private: void visit(FuzzyTerm &node) override { replicate(node, _builder.addFuzzyTerm( node.getTerm(), node.getView(), - node.getId(), node.getWeight())); + node.getId(), node.getWeight(), + node.getMaxEditDistance(), node.getPrefixLength())); } }; diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h index 00dad2597ce..05e731c0c5c 100644 --- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h +++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h @@ -154,8 +154,9 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm { }; struct SimpleFuzzyTerm : FuzzyTerm { SimpleFuzzyTerm(const Type &term, vespalib::stringref view, - int32_t id, Weight weight) - : FuzzyTerm(term, view, id, weight) { + int32_t id, Weight weight, + uint32_t maxEditDistance, uint32_t prefixLength) + : FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength) { } ~SimpleFuzzyTerm() override; }; diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp index f36410d1845..5ed33ec18a1 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp @@ -280,6 +280,8 @@ class QueryNodeConverter : public QueryVisitor { void visit(FuzzyTerm &node) override { createTerm(node, ParseItem::ITEM_FUZZY); + appendCompressedPositiveNumber(node.getMaxEditDistance()); + appendCompressedPositiveNumber(node.getPrefixLength()); } void visit(NearestNeighborTerm &node) override { diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h index a5f3be3e618..4103c3b1766 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h @@ -198,7 +198,9 @@ private: } else if (type == ParseItem::ITEM_REGEXP) { t = &builder.addRegExpTerm(term, view, id, weight); } else if (type == ParseItem::ITEM_FUZZY) { - t = &builder.addFuzzyTerm(term, view, id, weight); + uint32_t maxEditDistance = queryStack.getFuzzyMaxEditDistance(); + uint32_t prefixLength = queryStack.getFuzzyPrefixLength(); + t = &builder.addFuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength); } else { vespalib::Issue::report("query builder: Unable to create query tree from stack dump. node type = %d.", type); } diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h index 7aa867e25ed..5604a3afb52 100644 --- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h +++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h @@ -117,13 +117,21 @@ public: //----------------------------------------------------------------------------- -class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> -{ +class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> { +private: + uint32_t _maxEditDistance; + uint32_t _prefixLength; public: FuzzyTerm(const Type &term, vespalib::stringref view, - int32_t id, Weight weight) - : QueryNodeMixinType(term, view, id, weight) + int32_t id, Weight weight, uint32_t maxEditDistance, uint32_t prefixLength) + : QueryNodeMixinType(term, view, id, weight), + _maxEditDistance(maxEditDistance), + _prefixLength(prefixLength) {} + + uint32_t getMaxEditDistance() const { return _maxEditDistance; } + uint32_t getPrefixLength() const { return _prefixLength; } + virtual ~FuzzyTerm() = 0; }; diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp index 60a4eab3f57..1395915fa5f 100644 --- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp +++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp @@ -6,12 +6,34 @@ using namespace vespalib; -FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) { - return {LowerCase::convert_to_ucs4(term), threshold, prefix_size}; + +template <typename T> +void assert_span(std::span<const T> left, std::vector<T> right) { + EXPECT_TRUE(std::equal(left.begin(), left.end(), right.begin(), right.end())); +} + +TEST(FuzzyMatcherTest, get_prefix_edge_cases) { + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 0), {}); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 1), {1 }); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 2), {1, 2}); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 3), {1, 2, 3}); + assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 10),{1, 2, 3}); + assert_span(FuzzyMatcher::get_prefix({}, 0), {}); + assert_span(FuzzyMatcher::get_prefix({}, 10), {}); +} + +TEST(FuzzyMatcherTest, get_suffix_edge_cases) { + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 0), {1, 2, 3}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 1), {2, 3}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 2), {3}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 3), {}); + assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 10), {}); + assert_span(FuzzyMatcher::get_suffix({}, 0), {}); + assert_span(FuzzyMatcher::get_suffix({}, 10),{}); } TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { - FuzzyMatcher fuzzy = from_term("abc", 2, 0); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abc", 2, 0); EXPECT_TRUE(fuzzy.isMatch("abc")); EXPECT_TRUE(fuzzy.isMatch("ABC")); EXPECT_TRUE(fuzzy.isMatch("ab1")); @@ -20,22 +42,30 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { } TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { - FuzzyMatcher fuzzy = from_term("abcdef", 2, 2); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcdef", 2, 2); EXPECT_TRUE(fuzzy.isMatch("abcdef")); EXPECT_TRUE(fuzzy.isMatch("ABCDEF")); EXPECT_TRUE(fuzzy.isMatch("abcde1")); EXPECT_TRUE(fuzzy.isMatch("abcd12")); EXPECT_FALSE(fuzzy.isMatch("abc123")); - EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced + EXPECT_FALSE(fuzzy.isMatch("12cdef")); } TEST(FuzzyMatcherTest, get_prefix_is_empty) { - FuzzyMatcher fuzzy = from_term("whatever", 2, 0); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("whatever", 2, 0); EXPECT_EQ(fuzzy.getPrefix(), ""); } +TEST(FuzzyMatcherTest, term_is_empty) { + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("", 2, 0); + EXPECT_TRUE(fuzzy.isMatch("")); + EXPECT_TRUE(fuzzy.isMatch("a")); + EXPECT_TRUE(fuzzy.isMatch("aa")); + EXPECT_FALSE(fuzzy.isMatch("aaa")); +} + TEST(FuzzyMatcherTest, get_prefix_non_empty) { - FuzzyMatcher fuzzy = from_term("abcd", 2, 2); + FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcd", 2, 2); EXPECT_EQ(fuzzy.getPrefix(), "ab"); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp index e170f83c541..21fdb86a48d 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp @@ -6,24 +6,51 @@ #include <vespa/vespalib/text/lowercase.h> #include <vespa/vespalib/text/utf8.h> -vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) { - return FuzzyMatcher(LowerCase::convert_to_ucs4(term)); +vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength) { + return FuzzyMatcher(LowerCase::convert_to_ucs4(term), maxEditDistance, prefixLength); +} + +std::span<const uint32_t> vespalib::FuzzyMatcher::get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) { + if (prefixLength == 0 || termCodepoints.empty()) { + return {}; + } else { + uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size())); + return {termCodepoints.begin(), termCodepoints.begin() + actualPrefixLength}; + } +} + +std::span<const uint32_t> vespalib::FuzzyMatcher::get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) { + if (termCodepoints.empty()) { + return {}; + } else { + uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size())); + return {termCodepoints.begin() + actualPrefixLength, termCodepoints.end()}; + } } bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const { std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target); + if (_prefix_size > 0) { // prefix comparison is meaningless if it's empty + std::span<const uint32_t> targetPrefix = get_prefix(targetCodepoints, _prefix_size); + // if prefix does not match, early stop + if (!std::equal(_folded_term_codepoints_prefix.begin(), _folded_term_codepoints_prefix.end(), + targetPrefix.begin(), targetPrefix.end())) { + return false; + } + } + return LevenshteinDistance::calculate( - _folded_term_codepoints, - targetCodepoints, + _folded_term_codepoints_suffix, + get_suffix(targetCodepoints, _prefix_size), _max_edit_distance).has_value(); } vespalib::string vespalib::FuzzyMatcher::getPrefix() const { vespalib::string prefix; Utf8Writer writer(prefix); - for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) { - writer.putChar(_folded_term_codepoints.at(i)); + for (const uint32_t& code: _folded_term_codepoints_prefix) { + writer.putChar(code); } return prefix; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h index d4ee0e6d40f..12cc039706f 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h @@ -3,6 +3,7 @@ #include <string_view> #include <vector> +#include <span> #include <vespa/vespalib/stllike/string.h> @@ -13,37 +14,46 @@ namespace vespalib { * Class has two main parameters: * - prefix size, i.e the size of the prefix that is considered frozen. * - max edit distance, i.e an upper bound for edit distance for it to be a match between terms - * Note that prefix size is not impacting matching logic, - * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic. - * But there is no code to actually enforcing it. + * Prefix size dictates of how match of a prefix is frozen, + * i.e. if prefixes between the document and the query do not match (after lowercase) + * matcher would return false early, without fuzzy match. */ class FuzzyMatcher { private: - static constexpr uint8_t DefaultPrefixSize = 0u; - static constexpr uint8_t DefaultMaxEditDistance = 2u; + static constexpr uint32_t DefaultPrefixSize = 0u; + static constexpr uint32_t DefaultMaxEditDistance = 2u; + + uint32_t _max_edit_distance; // max edit distance + uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy std::vector<uint32_t> _folded_term_codepoints; - uint8_t _max_edit_distance; // max edit distance - uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy + std::span<const uint32_t> _folded_term_codepoints_prefix; + std::span<const uint32_t> _folded_term_codepoints_suffix; public: FuzzyMatcher(): - _folded_term_codepoints(), _max_edit_distance(DefaultMaxEditDistance), - _prefix_size(DefaultPrefixSize) + _prefix_size(DefaultPrefixSize), + _folded_term_codepoints(), + _folded_term_codepoints_prefix(), + _folded_term_codepoints_suffix() {} - FuzzyMatcher(std::vector<uint32_t> codepoints): - _folded_term_codepoints(std::move(codepoints)), + FuzzyMatcher(const std::vector<uint32_t>&& codepoints): _max_edit_distance(DefaultMaxEditDistance), - _prefix_size(DefaultPrefixSize) + _prefix_size(DefaultPrefixSize), + _folded_term_codepoints(codepoints), + _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), + _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) {} - FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size): - _folded_term_codepoints(std::move(codepoints)), + FuzzyMatcher(const std::vector<uint32_t>&& codepoints, uint32_t max_edit_distance, uint32_t prefix_size): _max_edit_distance(max_edit_distance), - _prefix_size(prefix_size) + _prefix_size(prefix_size), + _folded_term_codepoints(codepoints), + _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), + _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) {} [[nodiscard]] bool isMatch(std::string_view target) const; @@ -52,7 +62,11 @@ public: /// - static FuzzyMatcher from_term(std::string_view term); + static FuzzyMatcher from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength); + + static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength); + + static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength); }; diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp index 86ced3e52db..5f97a652f52 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp @@ -4,9 +4,10 @@ #include "levenshtein_distance.h" #include <limits> +#include <vector> std::optional<uint32_t> -vespalib::LevenshteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) +vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold) { uint32_t n = left.size(); uint32_t m = right.size(); diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h index 311211aba7a..c1a3435db44 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h @@ -3,7 +3,7 @@ #include <optional> #include <cstdint> -#include <vector> +#include <span> namespace vespalib { @@ -19,7 +19,7 @@ namespace vespalib { */ class LevenshteinDistance { public: - static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold); + static std::optional<uint32_t> calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold); }; } |