diff options
62 files changed, 757 insertions, 15 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index b7aa1a8d0ef..6249988a5ee 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -541,6 +541,30 @@ ], "fields": [] }, + "com.yahoo.prelude.query.FuzzyItem": { + "superClass": "com.yahoo.prelude.query.TermItem", + "interfaces": [], + "attributes": [ + "public" + ], + "methods": [ + "public void <init>(java.lang.String, boolean, java.lang.String)", + "public void setValue(java.lang.String)", + "public java.lang.String getRawWord()", + "public boolean isWords()", + "public com.yahoo.prelude.query.Item$ItemType getItemType()", + "public java.lang.String getName()", + "public java.lang.String stringValue()", + "public boolean isStemmed()", + "public java.lang.String getIndexedString()", + "public int getNumWords()", + "public boolean equals(java.lang.Object)", + "public int hashCode()", + "public java.lang.String toString()", + "protected void encodeThis(java.nio.ByteBuffer)" + ], + "fields": [] + }, "com.yahoo.prelude.query.GeoLocationItem": { "superClass": "com.yahoo.prelude.query.TermItem", "interfaces": [], @@ -742,6 +766,7 @@ "public static final enum com.yahoo.prelude.query.Item$ItemType GEO_LOCATION_TERM", "public static final enum com.yahoo.prelude.query.Item$ItemType TRUE", "public static final enum com.yahoo.prelude.query.Item$ItemType FALSE", + "public static final enum com.yahoo.prelude.query.Item$ItemType FUZZY", "public final int code" ] }, diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java new file mode 100644 index 00000000000..74c31a5d1f0 --- /dev/null +++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java @@ -0,0 +1,107 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.prelude.query; + +import java.nio.ByteBuffer; + +/** + * Fuzzy search term + * + * @author alexeyche + */ +public class FuzzyItem extends TermItem { + private String term; + + public FuzzyItem(String indexName, boolean isFromQuery, String term) { + super(indexName, isFromQuery, null); + setValue(term); + } + + @Override + public void setValue(String value) { + this.term = value; + } + + @Override + public String getRawWord() { + return stringValue(); + } + + @Override + public boolean isWords() { + return false; + } + + @Override + public ItemType getItemType() { + return ItemType.FUZZY; + } + + @Override + public String getName() { + return "FUZZY"; + } + + @Override + public String stringValue() { + return term; + } + + @Override + public boolean isStemmed() { + return false; + } + + @Override + public String getIndexedString() { + return stringValue(); + } + + @Override + public int getNumWords() { + return 1; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!super.equals(obj)) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + FuzzyItem other = (FuzzyItem) obj; + if (term == null) { + if (other.term != null) { + return false; + } + } else if (!term.equals(other.term)) { + return false; + } + return true; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("FuzzyItem [term=").append(term).append("]"); + return builder.toString(); + } + + @Override + protected void encodeThis(ByteBuffer buffer) { + super.encodeThis(buffer); + putString(getIndexedString(), buffer); + } +} + diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java index 2e0c3cf8593..02b208b6ce1 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java @@ -56,7 +56,8 @@ public abstract class Item implements Cloneable { NEAREST_NEIGHBOR(26), GEO_LOCATION_TERM(27), TRUE(28), - FALSE(29); + FALSE(29), + FUZZY(30); public final int code; diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java index 1805a11ff5e..320148ec01d 100644 --- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java +++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java @@ -16,6 +16,7 @@ import com.yahoo.prelude.query.CompositeItem; import com.yahoo.prelude.query.DotProductItem; import com.yahoo.prelude.query.EquivItem; import com.yahoo.prelude.query.ExactStringItem; +import com.yahoo.prelude.query.FuzzyItem; import com.yahoo.prelude.query.IntItem; import com.yahoo.prelude.query.Item; import com.yahoo.prelude.query.Limit; @@ -82,6 +83,7 @@ import static com.yahoo.search.yql.YqlParser.DISTANCE_THRESHOLD; import static com.yahoo.search.yql.YqlParser.DOT_PRODUCT; import static com.yahoo.search.yql.YqlParser.EQUIV; import static com.yahoo.search.yql.YqlParser.FILTER; +import static com.yahoo.search.yql.YqlParser.FUZZY; import static com.yahoo.search.yql.YqlParser.GEO_LOCATION; import static com.yahoo.search.yql.YqlParser.HIT_LIMIT; import static com.yahoo.search.yql.YqlParser.HNSW_EXPLORE_ADDITIONAL_HITS; @@ -926,6 +928,8 @@ public class SelectParser implements Parser { return instantiateONearItem(field, key, value); case EQUIV: return instantiateEquivItem(field, key, value); + case FUZZY: + return instantiateFuzzyItem(field, key, value); case ALTERNATIVES: return instantiateWordAlternativesItem(field, key, value); default: @@ -1155,6 +1159,15 @@ public class SelectParser implements Parser { return leafStyleSettings(getAnnotations(value), equiv); } + private Item instantiateFuzzyItem(String field, String key, Inspector value) { + HashMap<Integer, Inspector> children = childMap(value); + Preconditions.checkArgument(children.size() == 1, "Expected 1 argument, got %s.", children.size()); + String wordData = children.get(0).asString(); + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData); + + return leafStyleSettings(getAnnotations(value), fuzzy); + } + private Item instantiateWordAlternativesItem(String field, String key, Inspector value) { HashMap<Integer, Inspector> children = childMap(value); Preconditions.checkArgument(children.size() >= 1, "Expected 1 or more arguments, got %s.", children.size()); diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java index cc441eb0c3d..e778798b0e5 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java +++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java @@ -16,6 +16,7 @@ import static com.yahoo.search.yql.YqlParser.DOT_PRODUCT; import static com.yahoo.search.yql.YqlParser.END_ANCHOR; import static com.yahoo.search.yql.YqlParser.EQUIV; import static com.yahoo.search.yql.YqlParser.FILTER; +import static com.yahoo.search.yql.YqlParser.FUZZY; import static com.yahoo.search.yql.YqlParser.GEO_LOCATION; import static com.yahoo.search.yql.YqlParser.HIT_LIMIT; import static com.yahoo.search.yql.YqlParser.IMPLICIT_TRANSFORMS; @@ -70,6 +71,7 @@ import com.yahoo.prelude.query.BoolItem; import com.yahoo.prelude.query.DotProductItem; import com.yahoo.prelude.query.EquivItem; import com.yahoo.prelude.query.FalseItem; +import com.yahoo.prelude.query.FuzzyItem; import com.yahoo.prelude.query.ExactStringItem; import com.yahoo.prelude.query.IndexedItem; import com.yahoo.prelude.query.IntItem; @@ -517,6 +519,32 @@ public class VespaSerializer { } } + private static class FuzzySerializer extends Serializer<FuzzyItem> { + + @Override + void onExit(StringBuilder destination, FuzzyItem item) { } + + @Override + boolean serialize(StringBuilder destination, FuzzyItem fuzzy) { + String annotations = leafAnnotations(fuzzy); + destination.append(normalizeIndexName(fuzzy.getIndexName())).append(" contains "); + + if (annotations.length() > 0) { + destination.append('(').append(annotations); + } + + destination.append(FUZZY).append('('); + destination.append('"'); + escape(fuzzy.getIndexedString(), destination).append('"'); + destination.append(')'); + + if (annotations.length() > 0) { + destination.append(')'); + } + return false; + } + } + private static class ONearSerializer extends Serializer<ONearItem> { @Override @@ -1239,6 +1267,7 @@ public class VespaSerializer { dispatchBuilder.put(WordItem.class, new WordSerializer()); dispatchBuilder.put(RegExpItem.class, new RegExpSerializer()); dispatchBuilder.put(UriItem.class, new UriSerializer()); + dispatchBuilder.put(FuzzyItem.class, new FuzzySerializer()); dispatch = ImmutableMap.copyOf(dispatchBuilder); } diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java index 26508fec3c4..06ee0e706f3 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java +++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java @@ -34,6 +34,7 @@ import com.yahoo.prelude.query.CompositeItem; import com.yahoo.prelude.query.DotProductItem; import com.yahoo.prelude.query.EquivItem; import com.yahoo.prelude.query.FalseItem; +import com.yahoo.prelude.query.FuzzyItem; import com.yahoo.prelude.query.ExactStringItem; import com.yahoo.prelude.query.IntItem; import com.yahoo.prelude.query.Item; @@ -192,6 +193,7 @@ public class YqlParser implements Parser { public static final String WEAK_AND = "weakAnd"; public static final String WEIGHT = "weight"; public static final String WEIGHTED_SET = "weightedSet"; + public static final String FUZZY = "fuzzy"; private final IndexFacts indexFacts; private final List<ConnectedItem> connectedItems = new ArrayList<>(); @@ -1171,7 +1173,7 @@ public class YqlParser implements Parser { assertHasOperator(ast, ExpressionOperator.CONTAINS); String field = getIndex(ast.getArgument(0)); if (userQuery != null && indexFactsSession.getIndex(field).isAttribute()) { - userQuery.trace("Field '" + field + "' is an attribute, 'contains' will only match exactly", 2); + userQuery.trace("Field '" + field + "' is an attribute, 'contains' will only match exactly (unless fuzzy is used)", 2); } return instantiateLeafItem(field, ast.<OperatorNode<ExpressionOperator>> getArgument(1)); } @@ -1298,11 +1300,23 @@ public class YqlParser implements Parser { return instantiateWordAlternativesItem(field, ast); case URI: return instantiateUriItem(field, ast); + case FUZZY: + return instantiateFuzzyItem(field, ast); default: - throw newUnexpectedArgumentException(names.get(0), EQUIV, NEAR, ONEAR, PHRASE, SAME_ELEMENT, URI); + throw newUnexpectedArgumentException(names.get(0), EQUIV, NEAR, ONEAR, PHRASE, SAME_ELEMENT, URI, FUZZY); } } + private Item instantiateFuzzyItem(String field, OperatorNode<ExpressionOperator> ast) { + List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1); + Preconditions.checkArgument(args.size() == 1, "Expected 1 argument, got %s.", args.size()); + + String wordData = getStringContents(args.get(0)); + + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData); + return leafStyleSettings(ast, fuzzy); + } + private Item instantiateEquivItem(String field, OperatorNode<ExpressionOperator> ast) { List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1); Preconditions.checkArgument(args.size() >= 2, "Expected 2 or more arguments, got %s.", args.size()); diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java index 7057f996041..1269c2a5aef 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java @@ -443,4 +443,10 @@ public class VespaSerializerTestCase { + "alternatives({\"trees\": 1.0, \"tree\": 0.7}))" + ")"); } + + @Test + public void testFuzzy() { + parseAndConfirm("foo contains fuzzy(\"a\")"); + } + } diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java index 15713dc1f97..f40e212adde 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java @@ -11,6 +11,7 @@ import com.yahoo.prelude.SearchDefinition; import com.yahoo.prelude.query.AndItem; import com.yahoo.prelude.query.BoolItem; import com.yahoo.prelude.query.ExactStringItem; +import com.yahoo.prelude.query.FuzzyItem; import com.yahoo.prelude.query.IndexedItem; import com.yahoo.prelude.query.Item; import com.yahoo.prelude.query.MarkerWordItem; @@ -382,6 +383,15 @@ public class YqlParserTestCase { } @Test + public void testFuzzy() { + QueryTree x = parse("select foo from bar where baz contains fuzzy(\"a b\")"); + Item root = x.getRoot(); + assertSame(FuzzyItem.class, root.getClass()); + assertEquals("baz", ((FuzzyItem) root).getIndexName()); + assertEquals("a b", ((FuzzyItem) root).stringValue()); + } + + @Test public void testStemming() { assertTrue(getRootWord("select foo from bar where baz contains " + "([ {stem: false} ]\"colors\")").isStemmed()); diff --git a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java index 0dcfb8392ef..9b867be1484 100644 --- a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java +++ b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java @@ -6,6 +6,7 @@ import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; import com.yahoo.prelude.query.AndItem; import com.yahoo.prelude.query.ExactStringItem; +import com.yahoo.prelude.query.FuzzyItem; import com.yahoo.prelude.query.Item; import com.yahoo.prelude.query.PhraseItem; import com.yahoo.prelude.query.PrefixItem; @@ -674,6 +675,15 @@ public class SelectTestCase { checkWordAlternativesContent(alternatives); } + @Test + public void testFuzzy() { + QueryTree x = parseWhere("{ \"contains\": [\"description\", { \"fuzzy\": [\"a b\"] }] }"); + Item root = x.getRoot(); + assertSame(FuzzyItem.class, root.getClass()); + assertEquals("description", ((FuzzyItem) root).getIndexName()); + assertEquals("a b", ((FuzzyItem) root).stringValue()); + } + //------------------------------------------------------------------- grouping tests @Test diff --git a/docproc/abi-spec.json b/docproc/abi-spec.json index e2bbfd8ec10..5ad2c6d454d 100644 --- a/docproc/abi-spec.json +++ b/docproc/abi-spec.json @@ -251,10 +251,10 @@ "methods": [ "public void <init>()", "public abstract com.yahoo.docproc.DocumentProcessor$Progress process(com.yahoo.docproc.Processing)", - "public java.lang.String toString()", "public void setFieldMap(java.util.Map)", "public java.util.Map getFieldMap()", - "public java.util.Map getDocMap(java.lang.String)" + "public java.util.Map getDocMap(java.lang.String)", + "public java.lang.String toString()" ], "fields": [] }, diff --git a/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp b/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp index 4bcf87c14a6..9c67b3b3ef7 100644 --- a/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp +++ b/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <mirror_with_all.h> +#include "mirror_with_all.h" + #include <vespa/slobrok/sbmirror.h> #include <vespa/fnet/frt/supervisor.h> #include <vespa/fnet/transport.h> diff --git a/searchcore/src/tests/proton/matching/query_test.cpp b/searchcore/src/tests/proton/matching/query_test.cpp index a09437be46b..fd3dd0da518 100644 --- a/searchcore/src/tests/proton/matching/query_test.cpp +++ b/searchcore/src/tests/proton/matching/query_test.cpp @@ -278,6 +278,7 @@ public: void visit(ProtonSubstringTerm &n) override { checkNode(n, 0, true); } void visit(ProtonSuffixTerm &n) override { checkNode(n, 2, false); } void visit(ProtonPhrase &n) override { checkNode(n, 0, true); } + void visit(ProtonFuzzyTerm &n) override { checkNode(n, 1, false); } void visit(ProtonWeightedSetTerm &) override {} void visit(ProtonDotProduct &) override {} void visit(ProtonWandTerm &) override {} @@ -435,6 +436,7 @@ public: void visit(ProtonPredicateQuery &) override {} void visit(ProtonRegExpTerm &) override {} void visit(ProtonNearestNeighborTerm &) override {} + void visit(ProtonFuzzyTerm &) override {} }; void Test::requireThatTermDataIsFilledIn() { diff --git a/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp b/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp index ffd08b3bbef..78486f54704 100644 --- a/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp +++ b/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp @@ -69,6 +69,7 @@ struct DumpQuery : QueryVisitor { void visit(NearestNeighborTerm &) override {} void visit(TrueQueryNode &) override {} void visit(FalseQueryNode &) override {} + void visit(FuzzyTerm &) override {} }; std::string dump_query(Node &root) { diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp index 9be5920c0ed..92d7e385293 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp @@ -165,6 +165,7 @@ protected: void visit(ProtonFalse &) override { _result = std::make_unique<EmptyBlueprint>(); } + void visit(ProtonFuzzyTerm &n) override { buildTerm(n); } public: BlueprintBuilderVisitor(const IRequestContext & requestContext, ISearchContext &context) : diff --git a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h index e7817dcecd2..d8060832d5d 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h +++ b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h @@ -137,6 +137,7 @@ typedef ProtonTerm<search::query::WandTerm> ProtonWandTerm; typedef ProtonTerm<search::query::PredicateQuery> ProtonPredicateQuery; typedef ProtonTerm<search::query::RegExpTerm> ProtonRegExpTerm; typedef ProtonTerm<search::query::NearestNeighborTerm> ProtonNearestNeighborTerm; +typedef ProtonTerm<search::query::FuzzyTerm> ProtonFuzzyTerm; struct ProtonNodeTypes { typedef ProtonAnd And; @@ -164,6 +165,7 @@ struct ProtonNodeTypes { typedef ProtonNearestNeighborTerm NearestNeighborTerm; typedef ProtonTrue TrueQueryNode; typedef ProtonFalse FalseQueryNode; + typedef ProtonFuzzyTerm FuzzyTerm; }; } diff --git a/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp b/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp index 34cb5369c1e..574df384a63 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp @@ -69,6 +69,7 @@ public: void visit(ProtonNearestNeighborTerm &) override {} void visit(ProtonTrue &) override {} void visit(ProtonFalse &) override {} + void visit(ProtonFuzzyTerm &n) override { visitTerm(n); } }; } // namespace proton::matching::<unnamed> diff --git a/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp b/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp index c2abeaa36ee..a32107836ca 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp @@ -45,6 +45,7 @@ struct TermDataFromTermVisitor void visit(ProtonPredicateQuery &) override { } void visit(ProtonRegExpTerm &n) override { visitTerm(n); } void visit(ProtonNearestNeighborTerm &n) override { visitTerm(n); } + void visit(ProtonFuzzyTerm &n) override { visitTerm(n); } }; } // namespace diff --git a/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp b/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp index 72a53a1a3d2..72e0153b5c2 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp @@ -59,6 +59,7 @@ struct TermExpander : QueryVisitor { void visit(NearestNeighborTerm &) override {} void visit(TrueQueryNode &) override {} void visit(FalseQueryNode &) override {} + void visit(FuzzyTerm &) override {} void flush(Intermediate &parent) { for (Node::UP &term: terms) { diff --git a/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp b/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp index 5592215f9e2..d69f7d1b0a4 100644 --- a/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp +++ b/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp @@ -205,6 +205,7 @@ private: void visit(PredicateQuery &n) override { visitTerm(n); } void visit(RegExpTerm &n) override { visitTerm(n); } void visit(NearestNeighborTerm &n) override { visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } public: CreateBlueprintVisitor(const IIndexCollection &indexes, diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp index 4f037415b35..65de302ae04 100644 --- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp +++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp @@ -242,6 +242,12 @@ private: void testPrefixSearch(const AttributePtr & ptr); void testPrefixSearch(); + // test fuzzy search + void performFuzzySearch(const StringAttribute & vec, const vespalib::string & term, + const DocSet & expected, TermType termType); + void testFuzzySearch(const AttributePtr & ptr); + void testFuzzySearch(); + // test that search is working after clear doc template <typename VectorType, typename ValueType> void requireThatSearchIsWorkingAfterClearDoc(const vespalib::string & name, const Config & cfg, @@ -402,6 +408,7 @@ SearchContextTest::buildTermQuery(std::vector<char> & buffer, const vespalib::st switch (termType) { case TermType::PREFIXTERM: buffer[p++] = ParseItem::ITEM_PREFIXTERM; break; case TermType::REGEXP: buffer[p++] = ParseItem::ITEM_REGEXP; break; + case TermType::FUZZYTERM: buffer[p++] = ParseItem::ITEM_FUZZY; break; default: buffer[p++] = ParseItem::ITEM_TERM; break; @@ -1498,6 +1505,70 @@ SearchContextTest::testPrefixSearch() } } +//----------------------------------------------------------------------------- +// Test fuzzy search +//----------------------------------------------------------------------------- + +void +SearchContextTest::performFuzzySearch(const StringAttribute & vec, const vespalib::string & term, + const DocSet & expected, TermType termType) +{ + performSearch(vec, term, expected, termType); +} + +void +SearchContextTest::testFuzzySearch(const AttributePtr & ptr) +{ + LOG(info, "testFuzzySearch: vector '%s'", ptr->getName().c_str()); + + auto & vec = dynamic_cast<StringAttribute &>(*ptr.get()); + + uint32_t numDocs = 2; + addDocs(*ptr.get(), numDocs); + + const char * strings [] = {"fuzzysearch", "FUZZYSEARCH"}; + const char * terms[][2] = { + {"fuzzysearch", "FUZZYSEARCH"}, + {"fuzzysearck", "FUZZYSEARCK"}, + {"fuzzysekkkk", "FUZZYSEKKKK"} + }; + + for (uint32_t doc = 1; doc < numDocs + 1; ++doc) { + ASSERT_TRUE(doc < vec.getNumDocs()); + EXPECT_TRUE(vec.update(doc, strings[doc - 1])); + } + + ptr->commit(true); + + std::vector<DocSet> expected; + DocSet empty; + { + uint32_t docs[] = {1, 2}; + expected.emplace_back(docs, docs + 2); // normal search + } + { + uint32_t docs[] = {1, 2}; + expected.emplace_back(docs, docs + 2); // fuzzy search + } + + expected.emplace_back(); // results + + for (uint32_t i = 0; i < 3; ++i) { + for (uint32_t j = 0; j < 2; ++j) { + performFuzzySearch(vec, terms[i][j], expected[i], TermType::FUZZYTERM); + } + } +} + +void +SearchContextTest::testFuzzySearch() +{ + for (const auto & cfg : _stringCfg) { + testFuzzySearch(AttributeFactory::createAttribute(cfg.first, cfg.second)); + } +} + + template <typename VectorType, typename ValueType> void SearchContextTest::requireThatSearchIsWorkingAfterClearDoc(const vespalib::string & name, @@ -2028,6 +2099,7 @@ SearchContextTest::Main() testPrefixSearch(); testSearchIteratorConformance(); testSearchIteratorUnpacking(); + testFuzzySearch(); TEST_DO(requireThatSearchIsWorkingAfterClearDoc()); TEST_DO(requireThatSearchIsWorkingAfterLoadAndClearDoc()); TEST_DO(requireThatSearchIsWorkingAfterUpdates()); diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp index 8a17114057c..d8619386892 100644 --- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp +++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp @@ -386,8 +386,8 @@ testSingleValue(Attribute & svsa, Config &cfg) TEST("testSingleValue") { EXPECT_EQUAL(24u, sizeof(AttributeVector::SearchContext)); - EXPECT_EQUAL(24u, sizeof(StringSearchHelper)); - EXPECT_EQUAL(56u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext)); + EXPECT_EQUAL(56u, sizeof(StringSearchHelper)); + EXPECT_EQUAL(88u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext)); { Config cfg(BasicType::STRING, CollectionType::SINGLE); SingleValueStringAttribute svsa("svsa", cfg); @@ -494,4 +494,20 @@ TEST("test cased regex match") { EXPECT_FALSE(helper.isMatch("xY")); } +TEST("test fuzzy match") { + QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::FUZZYTERM); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_TRUE(helper.isFuzzy()); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_TRUE(helper.isMatch("xyza")); + EXPECT_TRUE(helper.isMatch("xyv")); + EXPECT_TRUE(helper.isMatch("xy")); + EXPECT_TRUE(helper.isMatch("x")); + EXPECT_TRUE(helper.isMatch("xvv")); + EXPECT_FALSE(helper.isMatch("vvv")); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp index 35280fb0bd8..0e8155e23c3 100644 --- a/searchlib/src/tests/query/customtypevisitor_test.cpp +++ b/searchlib/src/tests/query/customtypevisitor_test.cpp @@ -36,6 +36,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {}; struct MyStringTerm : InitTerm<StringTerm> {}; struct MySubstrTerm : InitTerm<SubstringTerm> {}; struct MySuffixTerm : InitTerm<SuffixTerm> {}; +struct MyFuzzyTerm : InitTerm<FuzzyTerm> {}; struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} }; struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} }; struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} }; @@ -65,6 +66,7 @@ struct MyQueryNodeTypes { typedef MyStringTerm StringTerm; typedef MySubstrTerm SubstringTerm; typedef MySuffixTerm SuffixTerm; + typedef MyFuzzyTerm FuzzyTerm; typedef MyWeakAnd WeakAnd; typedef MyWeightedSetTerm WeightedSetTerm; typedef MyDotProduct DotProduct; @@ -112,6 +114,7 @@ public: void visit(MyNearestNeighborTerm &) override { setVisited<MyNearestNeighborTerm>(); } void visit(MyTrue &) override { setVisited<MyTrue>(); } void visit(MyFalse &) override { setVisited<MyFalse>(); } + void visit(MyFuzzyTerm &) override { setVisited<MyFuzzyTerm>(); } }; template <class T> @@ -148,6 +151,7 @@ TEST("customtypevisitor_test") { requireThatNodeIsVisited<MyNearestNeighborTerm>(); requireThatNodeIsVisited<MyTrue>(); requireThatNodeIsVisited<MyFalse>(); + requireThatNodeIsVisited<MyFuzzyTerm>(); } } // namespace diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp index 9f73c1ff585..f770213e8e5 100644 --- a/searchlib/src/tests/query/query_visitor_test.cpp +++ b/searchlib/src/tests/query/query_visitor_test.cpp @@ -49,6 +49,7 @@ public: void visit(NearestNeighborTerm &) override { isVisited<NearestNeighborTerm>() = true; } void visit(TrueQueryNode &) override { isVisited<TrueQueryNode>() = true; } void visit(FalseQueryNode &) override { isVisited<FalseQueryNode>() = true; } + void visit(FuzzyTerm &) override { isVisited<FuzzyTerm>() = true; } }; template <class T> @@ -85,6 +86,7 @@ TEST("requireThatAllNodesCanBeVisited") { checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25)); checkVisit<TrueQueryNode>(new SimpleTrue()); checkVisit<FalseQueryNode>(new SimpleFalse()); + checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0))); } } // namespace diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp index 93cfad27742..2ea566027c4 100644 --- a/searchlib/src/tests/query/querybuilder_test.cpp +++ b/searchlib/src/tests/query/querybuilder_test.cpp @@ -47,7 +47,7 @@ PredicateQueryTerm::UP getPredicateQueryTerm() { template <class NodeTypes> Node::UP createQueryTree() { QueryBuilder<NodeTypes> builder; - builder.addAnd(12); + builder.addAnd(13); { builder.addRank(2); { @@ -115,6 +115,7 @@ Node::UP createQueryTree() { builder.add_true_node(); builder.add_false_node(); } + builder.addFuzzyTerm(str[5], view[5], id[5], weight[5]); } Node::UP node = builder.build(); ASSERT_TRUE(node.get()); @@ -179,10 +180,11 @@ void checkQueryTreeTypes(Node *node) { typedef typename NodeTypes::RegExpTerm RegExpTerm; typedef typename NodeTypes::TrueQueryNode TrueNode; typedef typename NodeTypes::FalseQueryNode FalseNode; + typedef typename NodeTypes::FuzzyTerm FuzzyTerm; ASSERT_TRUE(node); auto* and_node = as_node<And>(node); - EXPECT_EQUAL(12u, and_node->getChildren().size()); + EXPECT_EQUAL(13u, and_node->getChildren().size()); auto* rank = as_node<Rank>(and_node->getChildren()[0]); EXPECT_EQUAL(2u, rank->getChildren().size()); @@ -306,6 +308,9 @@ void checkQueryTreeTypes(Node *node) { auto* false_node = as_node<FalseNode>(and_not->getChildren()[1]); EXPECT_TRUE(true_node); EXPECT_TRUE(false_node); + + auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]); + EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5])); } struct AbstractTypes { @@ -332,6 +337,7 @@ struct AbstractTypes { typedef search::query::RegExpTerm RegExpTerm; typedef search::query::TrueQueryNode TrueQueryNode; typedef search::query::FalseQueryNode FalseQueryNode; + typedef search::query::FuzzyTerm FuzzyTerm; }; // Builds a tree with simplequery and checks that the results have the @@ -427,6 +433,11 @@ struct MyNearestNeighborTerm : NearestNeighborTerm { }; struct MyTrue : TrueQueryNode {}; struct MyFalse : FalseQueryNode {}; +struct MyFuzzyTerm : FuzzyTerm { + MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w) + : FuzzyTerm(t, f, i, w) { + } +}; struct MyQueryNodeTypes { typedef MyAnd And; @@ -454,6 +465,7 @@ struct MyQueryNodeTypes { typedef MyNearestNeighborTerm NearestNeighborTerm; typedef MyTrue TrueQueryNode; typedef MyFalse FalseQueryNode; + typedef MyFuzzyTerm FuzzyTerm; }; TEST("require that Custom Query Trees Can Be Built") { diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index 906400f50a5..f14966dbfc8 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -779,6 +779,8 @@ public: n.get_distance_threshold(), getRequestContext().get_attribute_blueprint_params().nearest_neighbor_brute_force_limit)); } + + void visit(query::FuzzyTerm &n) override { visitTerm(n); } }; template <typename WS> diff --git a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp index ca440c2a249..c7e092799d1 100644 --- a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp @@ -125,6 +125,10 @@ StringTemplSearchContext(QueryTermSimpleUP qTerm, const AttrType & toBeSearched) vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); lookupRange(comp, comp); + } else if (this->isFuzzy()) { + vespalib::string prefix(this->getFuzzyMatcher().getPrefix()); + auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); + lookupRange(comp, comp); } else { auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm()); lookupTerm(comp); diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 97c2c7d2b63..a0fbdf89f15 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -282,6 +282,10 @@ StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const Att vespalib::string prefix(RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = _enumStore.make_folded_comparator_prefix(prefix.c_str()); this->lookupRange(comp, comp); + } else if (this->isFuzzy()) { + vespalib::string prefix(this->getFuzzyMatcher().getPrefix()); + auto comp = _enumStore.make_folded_comparator_prefix(prefix.c_str()); + this->lookupRange(comp, comp); } else { auto comp = _enumStore.make_folded_comparator(this->queryTerm()->getTerm()); this->lookupTerm(comp); @@ -301,6 +305,8 @@ StringPostingSearchContext<BaseSC, AttrT, DataT>::useThis(const PostingListSearc : false; } else if ( this->isCased() ) { return this->isMatch(_enumStore.get_value(it.getKey().load_acquire())); + } else if (this->isFuzzy()) { + return this->getFuzzyMatcher().isMatch(_enumStore.get_value(it.getKey().load_acquire())); } return true; } diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp index 730ad1107a7..7c8815af30b 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp @@ -61,6 +61,10 @@ SingleValueStringAttributeT<B>::StringTemplSearchContext::StringTemplSearchConte vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); lookupRange(comp, comp); + } else if (this->isFuzzy()) { + vespalib::string prefix(this->getFuzzyMatcher().getPrefix()); + auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); + lookupRange(comp, comp); } else { auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm()); lookupTerm(comp); diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp index 6062c4f2096..4baaaa45d3a 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp @@ -18,11 +18,13 @@ namespace search { StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) : _regex(), + _fuzzyMatcher(), _term(), _termLen(), _isPrefix(term.isPrefix()), _isRegex(term.isRegex()), - _isCased(cased) + _isCased(cased), + _isFuzzy(term.isFuzzy()) { if (isRegex()) { if (isCased()) { @@ -33,6 +35,8 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) } else if (isCased()) { _term._char = term.getTerm(); _termLen = term.getTermLen(); + } else if (isFuzzy()) { + _fuzzyMatcher = vespalib::FuzzyMatcher::from_term(term.getTerm()); } else { term.term(_term._ucs4); } @@ -54,6 +58,9 @@ StringSearchHelper::isMatch(const char *src) const { int res = strncmp(_term._char, src, _termLen); return (res == 0) && (src[_termLen] == 0 || isPrefix()); } + if (__builtin_expect(isFuzzy(), false)) { + return getFuzzyMatcher().isMatch(src); + } vespalib::Utf8ReaderForZTS u8reader(src); uint32_t j = 0; uint32_t val; diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h index 495427d3e45..6ba4c801590 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.h +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h @@ -10,6 +10,7 @@ #include <vespa/vespalib/regex/regex.h> #include <vespa/vespalib/text/lowercase.h> #include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/fuzzy/fuzzy_matcher.h> #include <optional> namespace search { @@ -26,9 +27,12 @@ public: bool isPrefix() const { return _isPrefix; } bool isRegex() const { return _isRegex; } bool isCased() const { return _isCased; } + bool isFuzzy() const { return _isFuzzy; } const vespalib::Regex & getRegex() const { return _regex; } + const vespalib::FuzzyMatcher & getFuzzyMatcher() const { return _fuzzyMatcher; } private: vespalib::Regex _regex; + vespalib::FuzzyMatcher _fuzzyMatcher; union { const ucs4_t *_ucs4; const char *_char; @@ -37,6 +41,7 @@ private: bool _isPrefix; bool _isRegex; bool _isCased; + bool _isFuzzy; }; class ReaderBase; @@ -126,7 +131,9 @@ protected: bool isPrefix() const { return _helper.isPrefix(); } bool isRegex() const { return _helper.isRegex(); } bool isCased() const { return _helper.isCased(); } + bool isFuzzy() const { return _helper.isFuzzy(); } const vespalib::Regex & getRegex() const { return _helper.getRegex(); } + const vespalib::FuzzyMatcher & getFuzzyMatcher() const { return _helper.getFuzzyMatcher(); } class CollectHitCount { public: diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp index 7cde1102bc1..eb8054317dc 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp @@ -409,6 +409,7 @@ public: void visit(RegExpTerm &n) override { visitTerm(n); } void visit(PredicateQuery &n) override { not_supported(n); } void visit(NearestNeighborTerm &n) override { not_supported(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } }; Blueprint::UP diff --git a/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp index 330320d5047..f8ad85859fa 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp @@ -28,6 +28,7 @@ using index::IFieldLengthInspector; using index::IndexBuilder; using index::Schema; using index::SchemaUtil; +using query::FuzzyTerm; using query::LocationTerm; using query::NearestNeighborTerm; using query::Node; @@ -168,6 +169,7 @@ public: void visit(SubstringTerm &n) override { visitTerm(n); } void visit(SuffixTerm &n) override { visitTerm(n); } void visit(RegExpTerm &n) override { visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } void visit(PredicateQuery &n) override { not_supported(n); } void visit(NearestNeighborTerm &n) override { not_supported(n); } diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h index 34ea692c370..0d665d1f04d 100644 --- a/searchlib/src/vespa/searchlib/parsequery/parse.h +++ b/searchlib/src/vespa/searchlib/parsequery/parse.h @@ -56,8 +56,9 @@ public: ITEM_GEO_LOCATION_TERM = 27, ITEM_TRUE = 28, ITEM_FALSE = 29, - ITEM_MAX = 30, // Indicates how long tables must be. - ITEM_UNDEF = 31, + ITEM_FUZZY = 30, + ITEM_MAX = 31, // Indicates how long tables must be. + ITEM_UNDEF = 32, }; /** A tag identifying the origin of this query node. diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp index aa13c93810a..85b55284b35 100644 --- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp +++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp @@ -170,6 +170,7 @@ bool SimpleQueryStackDumpIterator::readNext() { case ParseItem::ITEM_EXACTSTRINGTERM: case ParseItem::ITEM_SUFFIXTERM: case ParseItem::ITEM_REGEXP: + case ParseItem::ITEM_FUZZY: _curr_index_name = read_stringref(p); _curr_term = read_stringref(p); _currArity = 0; diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index 433ab7d56dd..0d5dd116826 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -22,7 +22,8 @@ public: EXACTSTRINGTERM = 3, SUFFIXTERM = 4, REGEXP = 5, - GEO_LOCATION = 6 + GEO_LOCATION = 6, + FUZZYTERM = 7 }; template <typename N> @@ -61,6 +62,7 @@ public: bool isWord() const { return (_type == Type::WORD); } bool isRegex() const { return (_type == Type::REGEXP); } bool isGeoLoc() const { return (_type == Type::GEO_LOCATION); } + bool isFuzzy() const { return (_type == Type::FUZZYTERM); } bool empty() const { return _term.empty(); } virtual void visitMembers(vespalib::ObjectVisitor &visitor) const; vespalib::string getClassName() const; diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 77fc97913a4..6f126c7a3eb 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -86,6 +86,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_SUFFIXTERM: case ParseItem::ITEM_PURE_WEIGHTED_STRING: case ParseItem::ITEM_PURE_WEIGHTED_LONG: + case ParseItem::ITEM_FUZZY: { vespalib::string index = queryRep.getIndexName(); if (index.empty()) { @@ -116,6 +117,9 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_SUFFIXTERM: sTerm = TermType::SUFFIXTERM; break; + case ParseItem::ITEM_FUZZY: + sTerm = TermType::FUZZYTERM; + break; default: break; } diff --git a/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h b/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h index 9f29c34aa05..abc48db9d87 100644 --- a/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h +++ b/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h @@ -52,6 +52,7 @@ public: virtual void visit(typename NodeTypes::NearestNeighborTerm &) = 0; virtual void visit(typename NodeTypes::TrueQueryNode &) = 0; virtual void visit(typename NodeTypes::FalseQueryNode &) = 0; + virtual void visit(typename NodeTypes::FuzzyTerm &) = 0; private: // Route QueryVisit requests to the correct custom type. @@ -81,6 +82,7 @@ private: typedef typename NodeTypes::NearestNeighborTerm TNearestNeighborTerm; typedef typename NodeTypes::TrueQueryNode TTrueQueryNode; typedef typename NodeTypes::FalseQueryNode TFalseQueryNode; + typedef typename NodeTypes::FuzzyTerm TFuzzyTerm; void visit(And &n) override { visit(static_cast<TAnd&>(n)); } void visit(AndNot &n) override { visit(static_cast<TAndNot&>(n)); } @@ -107,6 +109,7 @@ private: void visit(NearestNeighborTerm &n) override { visit(static_cast<TNearestNeighborTerm&>(n)); } void visit(TrueQueryNode &n) override { visit(static_cast<TTrueQueryNode&>(n)); } void visit(FalseQueryNode &n) override { visit(static_cast<TFalseQueryNode&>(n)); } + void visit(FuzzyTerm &n) override { visit(static_cast<TFuzzyTerm &>(n)); } }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h index 9631e2afded..ee3a944cce1 100644 --- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h +++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h @@ -220,6 +220,12 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st target_num_hits, allow_approximate, explore_additional_hits, distance_threshold); } +template <class NodeTypes> +typename NodeTypes::FuzzyTerm * +createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight) { + return new typename NodeTypes::FuzzyTerm(term, view, id, weight); +} + template <class NodeTypes> class QueryBuilder : public QueryBuilderBase { @@ -327,6 +333,10 @@ public: adjustWeight(weight); return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight)); } + typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight) { + adjustWeight(weight); + return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight)); + } typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name, int32_t id, Weight weight, uint32_t target_num_hits, bool allow_approximate, uint32_t explore_additional_hits, diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h index 3fb72f93b23..ecaee350b21 100644 --- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h +++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h @@ -193,6 +193,12 @@ private: void visit(FalseQueryNode &) override { _builder.add_false_node(); } + + void visit(FuzzyTerm &node) override { + replicate(node, _builder.addFuzzyTerm( + node.getTerm(), node.getView(), + node.getId(), node.getWeight())); + } }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h b/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h index 02887975085..90faa25bd99 100644 --- a/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h +++ b/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h @@ -29,6 +29,7 @@ class SameElement; class NearestNeighborTerm; class TrueQueryNode; class FalseQueryNode; +class FuzzyTerm; struct QueryVisitor { virtual ~QueryVisitor() {} @@ -58,6 +59,7 @@ struct QueryVisitor { virtual void visit(NearestNeighborTerm &) = 0; virtual void visit(TrueQueryNode &) = 0; virtual void visit(FalseQueryNode &) = 0; + virtual void visit(FuzzyTerm &) = 0; }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp b/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp index cad97279b4c..e3cad4ed33a 100644 --- a/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp @@ -52,4 +52,6 @@ SimpleRegExpTerm::~SimpleRegExpTerm() = default; SimpleNearestNeighborTerm::~SimpleNearestNeighborTerm() = default; +SimpleFuzzyTerm::~SimpleFuzzyTerm() = default; + } diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h index 5047e072cb7..00dad2597ce 100644 --- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h +++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h @@ -152,7 +152,13 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm { {} ~SimpleNearestNeighborTerm() override; }; - +struct SimpleFuzzyTerm : FuzzyTerm { + SimpleFuzzyTerm(const Type &term, vespalib::stringref view, + int32_t id, Weight weight) + : FuzzyTerm(term, view, id, weight) { + } + ~SimpleFuzzyTerm() override; +}; struct SimpleQueryNodeTypes { using And = SimpleAnd; @@ -180,6 +186,7 @@ struct SimpleQueryNodeTypes { using PredicateQuery = SimplePredicateQuery; using RegExpTerm = SimpleRegExpTerm; using NearestNeighborTerm = SimpleNearestNeighborTerm; + using FuzzyTerm = SimpleFuzzyTerm; }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp index d45a72d316a..f36410d1845 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp @@ -278,6 +278,10 @@ class QueryNodeConverter : public QueryVisitor { createTerm(node, ParseItem::ITEM_REGEXP); } + void visit(FuzzyTerm &node) override { + createTerm(node, ParseItem::ITEM_FUZZY); + } + void visit(NearestNeighborTerm &node) override { createTermNode(node, ParseItem::ITEM_NEAREST_NEIGHBOR); appendString(node.get_query_tensor_name()); diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h index 5a6f315205e..a5f3be3e618 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h @@ -197,6 +197,8 @@ private: t = &builder.addPredicateQuery(queryStack.getPredicateQueryTerm(), view, id, weight); } else if (type == ParseItem::ITEM_REGEXP) { t = &builder.addRegExpTerm(term, view, id, weight); + } else if (type == ParseItem::ITEM_FUZZY) { + t = &builder.addFuzzyTerm(term, view, id, weight); } else { vespalib::Issue::report("query builder: Unable to create query tree from stack dump. node type = %d.", type); } diff --git a/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h b/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h index fc3570f44d8..a6eae257afd 100644 --- a/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h +++ b/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h @@ -32,6 +32,7 @@ class TemplateTermVisitor : public CustomTypeTermVisitor<NodeTypes> { void visit(typename NodeTypes::PredicateQuery &n) override { myVisit(n); } void visit(typename NodeTypes::RegExpTerm &n) override { myVisit(n); } void visit(typename NodeTypes::NearestNeighborTerm &n) override { myVisit(n); } + void visit(typename NodeTypes::FuzzyTerm &n) override { myVisit(n); } // Phrases are terms with children. This visitor will not visit // the phrase's children, unless this member function is diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp b/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp index dcf0533ff7a..6e889e76f21 100644 --- a/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp @@ -24,6 +24,7 @@ RegExpTerm::~RegExpTerm() = default; WeightedSetTerm::~WeightedSetTerm() = default; DotProduct::~DotProduct() = default; WandTerm::~WandTerm() = default; +FuzzyTerm::~FuzzyTerm() = default; namespace { diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h index a728b674999..7aa867e25ed 100644 --- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h +++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h @@ -115,6 +115,18 @@ public: virtual ~RegExpTerm() = 0; }; +//----------------------------------------------------------------------------- + +class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> +{ +public: + FuzzyTerm(const Type &term, vespalib::stringref view, + int32_t id, Weight weight) + : QueryNodeMixinType(term, view, id, weight) + {} + virtual ~FuzzyTerm() = 0; +}; + /** * Term matching the K nearest neighbors in a multi-dimensional vector space. * diff --git a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h index 86cde64a197..30c7e1722fb 100644 --- a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h +++ b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h @@ -73,6 +73,7 @@ public: void visit(query::SuffixTerm &n) override = 0; void visit(query::RegExpTerm &n) override = 0; void visit(query::NearestNeighborTerm &n) override = 0; + void visit(query::FuzzyTerm &n) override = 0; void visit(query::TrueQueryNode &) final override; void visit(query::FalseQueryNode &) final override; diff --git a/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp b/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp index 519f6e81774..614f219cbcb 100644 --- a/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp @@ -6,6 +6,7 @@ #include "create_blueprint_visitor_helper.h" #include <vespa/vespalib/objects/visit.h> +using search::query::FuzzyTerm; using search::query::LocationTerm; using search::query::NearestNeighborTerm; using search::query::Node; @@ -66,6 +67,7 @@ public: void visit(PredicateQuery &n) override { visitTerm(n); } void visit(RegExpTerm &n) override { visitTerm(n); } void visit(NearestNeighborTerm &n) override { visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } }; template <class Map> diff --git a/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp b/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp index 08c0280ee68..63bf16e6016 100644 --- a/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp @@ -17,6 +17,7 @@ using search::query::AndNot; using search::query::DotProduct; using search::query::Equiv; using search::query::FalseQueryNode; +using search::query::FuzzyTerm; using search::query::LocationTerm; using search::query::Near; using search::query::NearestNeighborTerm; @@ -105,6 +106,7 @@ struct TermAsStringVisitor : public QueryVisitor { void visit(SubstringTerm &n) override {visitTerm(n); } void visit(SuffixTerm &n) override {visitTerm(n); } void visit(RegExpTerm &n) override {visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } void visit(PredicateQuery &) override {illegalVisit(); } void visit(NearestNeighborTerm &) override { illegalVisit(); } void visit(TrueQueryNode &) override { illegalVisit(); } diff --git a/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp b/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp index f3706bfc7c1..7f9557232b0 100644 --- a/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp +++ b/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp @@ -158,6 +158,7 @@ SimpleQueryStackItem::AppendBuffer(RawBuf *buf) const case ITEM_EXACTSTRINGTERM: case ITEM_SUFFIXTERM: case ITEM_REGEXP: + case ITEM_FUZZY: buf->appendCompressedPositiveNumber(indexLen); buf->append(_indexName.c_str(), indexLen); buf->appendCompressedPositiveNumber(termLen); diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index 29a0c1e7bb8..c3c7fee2cef 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -62,6 +62,7 @@ vespa_define_module( src/tests/explore_modern_cpp src/tests/false src/tests/fiddle + src/tests/fuzzy src/tests/gencnt src/tests/guard src/tests/host_name @@ -169,6 +170,7 @@ vespa_define_module( src/vespa/vespalib/data src/vespa/vespalib/data/slime src/vespa/vespalib/datastore + src/vespa/vespalib/fuzzy src/vespa/vespalib/geo src/vespa/vespalib/hwaccelrated src/vespa/vespalib/io diff --git a/vespalib/src/tests/fuzzy/.gitignore b/vespalib/src/tests/fuzzy/.gitignore new file mode 100644 index 00000000000..cd5ca9b3b88 --- /dev/null +++ b/vespalib/src/tests/fuzzy/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +fuzzy_test +vespalib_fuzzy_test_app diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt new file mode 100644 index 00000000000..2a415a9ad62 --- /dev/null +++ b/vespalib/src/tests/fuzzy/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_fuzzy_matcher_test_app TEST + SOURCES + fuzzy_matcher_test.cpp + DEPENDS + vespalib + GTest::GTest + ) +vespa_add_test(NAME vespalib_fuzzy_matcher_test_app COMMAND vespalib_fuzzy_matcher_test_app) + +vespa_add_executable(vespalib_levenstein_distance_test_app TEST + SOURCES + levenstein_distance_test.cpp + DEPENDS + vespalib + GTest::GTest + ) +vespa_add_test(NAME vespalib_levenstein_distance_test_app COMMAND vespalib_levenstein_distance_test_app)
\ No newline at end of file diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp new file mode 100644 index 00000000000..60a4eab3f57 --- /dev/null +++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp @@ -0,0 +1,42 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/fuzzy/fuzzy_matcher.h> +#include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/gtest/gtest.h> + +using namespace vespalib; + +FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) { + return {LowerCase::convert_to_ucs4(term), threshold, prefix_size}; +} + +TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { + FuzzyMatcher fuzzy = from_term("abc", 2, 0); + EXPECT_TRUE(fuzzy.isMatch("abc")); + EXPECT_TRUE(fuzzy.isMatch("ABC")); + EXPECT_TRUE(fuzzy.isMatch("ab1")); + EXPECT_TRUE(fuzzy.isMatch("a12")); + EXPECT_FALSE(fuzzy.isMatch("123")); +} + +TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { + FuzzyMatcher fuzzy = from_term("abcdef", 2, 2); + EXPECT_TRUE(fuzzy.isMatch("abcdef")); + EXPECT_TRUE(fuzzy.isMatch("ABCDEF")); + EXPECT_TRUE(fuzzy.isMatch("abcde1")); + EXPECT_TRUE(fuzzy.isMatch("abcd12")); + EXPECT_FALSE(fuzzy.isMatch("abc123")); + EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced +} + +TEST(FuzzyMatcherTest, get_prefix_is_empty) { + FuzzyMatcher fuzzy = from_term("whatever", 2, 0); + EXPECT_EQ(fuzzy.getPrefix(), ""); +} + +TEST(FuzzyMatcherTest, get_prefix_non_empty) { + FuzzyMatcher fuzzy = from_term("abcd", 2, 2); + EXPECT_EQ(fuzzy.getPrefix(), "ab"); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp new file mode 100644 index 00000000000..efdcc82fce1 --- /dev/null +++ b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp @@ -0,0 +1,39 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/fuzzy/levenstein_distance.h> +#include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/gtest/gtest.h> + +std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) { + std::vector<uint32_t> leftCodepoints = vespalib::LowerCase::convert_to_ucs4(left); + std::vector<uint32_t> rightCodepoints = vespalib::LowerCase::convert_to_ucs4(right); + + std::optional<uint32_t> leftRight = vespalib::LevensteinDistance::calculate(leftCodepoints,rightCodepoints, threshold); + std::optional<uint32_t> rightLeft = vespalib::LevensteinDistance::calculate(rightCodepoints,leftCodepoints, threshold); + + EXPECT_EQ(leftRight, rightLeft); // should be independent whether left or right strings are swapped + + return leftRight; +} + +TEST(LevensteinDistance, calculate_edgecases) { + EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0}); + EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "1bc", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "a1c", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "ab", 2), std::optional{1}); + EXPECT_EQ(calculate("abc", "abcd", 2), std::optional{1}); + EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("cd", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("ad", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "a12", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("a", "", 2), std::optional{1}); + EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "", 2), std::nullopt); + EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); +} + +GTEST_MAIN_RUN_ALL_TESTS() + diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt index dea0f509bad..1fffddf2f2f 100644 --- a/vespalib/src/vespa/vespalib/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(vespalib $<TARGET_OBJECTS:vespalib_vespalib_data> $<TARGET_OBJECTS:vespalib_vespalib_data_slime> $<TARGET_OBJECTS:vespalib_vespalib_datastore> + $<TARGET_OBJECTS:vespalib_vespalib_fuzzy> $<TARGET_OBJECTS:vespalib_vespalib_geo> $<TARGET_OBJECTS:vespalib_vespalib_hwaccelrated> $<TARGET_OBJECTS:vespalib_vespalib_io> diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt new file mode 100644 index 00000000000..0f7583da1d7 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(vespalib_vespalib_fuzzy OBJECT + SOURCES + fuzzy_matcher.cpp + levenstein_distance.cpp + DEPENDS + ) + diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp new file mode 100644 index 00000000000..495e08c42e3 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp @@ -0,0 +1,29 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "fuzzy_matcher.h" +#include "levenstein_distance.h" + +#include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/text/utf8.h> + +vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) { + return FuzzyMatcher(LowerCase::convert_to_ucs4(term)); +} + +bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const { + std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target); + + return LevensteinDistance::calculate( + _folded_term_codepoints, + targetCodepoints, + _max_edit_distance).has_value(); +} + +vespalib::string vespalib::FuzzyMatcher::getPrefix() const { + vespalib::string prefix; + Utf8Writer writer(prefix); + for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) { + writer.putChar(_folded_term_codepoints.at(i)); + } + return prefix; +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h new file mode 100644 index 00000000000..bfcf98d79fd --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h @@ -0,0 +1,59 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <string_view> +#include <vector> + +#include <vespa/vespalib/stllike/string.h> + +namespace vespalib { + +/** + * Fuzzy matching between a lowercased instances of query and document terms based on Levenstein distance. + * Class has two main parameters: + * - prefix size, i.e the size of the prefix that is considered frozen. + * - max edit distance, i.e an upper bound for edit distance for it to be a match between terms + * Note that prefix size is not impacting matching logic, + * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic. + * But there is no code to actually enforcing it. + */ +class FuzzyMatcher { +private: + static constexpr uint8_t DefaultPrefixSize = 0u; + static constexpr uint8_t DefaultMaxEditDistance = 2u; + + std::vector<uint32_t> _folded_term_codepoints; + + uint8_t _max_edit_distance; // max edit distance + uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy + +public: + FuzzyMatcher(): + _folded_term_codepoints(), + _max_edit_distance(DefaultMaxEditDistance), + _prefix_size(DefaultPrefixSize) + {} + + FuzzyMatcher(std::vector<uint32_t> codepoints): + _folded_term_codepoints(std::move(codepoints)), + _max_edit_distance(DefaultMaxEditDistance), + _prefix_size(DefaultPrefixSize) + {} + + FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size): + _folded_term_codepoints(std::move(codepoints)), + _max_edit_distance(max_edit_distance), + _prefix_size(prefix_size) + {} + + [[nodiscard]] bool isMatch(std::string_view target) const; + + [[nodiscard]] vespalib::string getPrefix() const; + + /// + + static FuzzyMatcher from_term(std::string_view term); + +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp new file mode 100644 index 00000000000..5ee0f2d4a2d --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp @@ -0,0 +1,86 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Levenstein distance algorithm is based off Java implementation from apache commons-text library licensed under the Apache 2.0 license. + +#include "levenstein_distance.h" + +#include <limits> + +std::optional<uint32_t> vespalib::LevensteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) { + uint32_t n = left.size(); + uint32_t m = right.size(); + + if (n > m) { + return calculate(right, left, threshold); + } + + // if one string is empty, the edit distance is necessarily the length + // of the other + if (n == 0) { + return m <= threshold ? std::optional(m) : std::nullopt; + } + if (m == 0) { + return n <= threshold ? std::optional(n) : std::nullopt; + } + + + // the edit distance cannot be less than the length difference + if (m - n > threshold) { + return std::nullopt; + } + + std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally + std::vector<uint32_t> d(n+1); // cost array, horizontally + + const uint32_t boundary = std::min(n, threshold) + 1; + + for (uint32_t i = 0; i < boundary; ++i) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + for (uint32_t i = boundary; i < p.size(); ++i) { + p[i] = std::numeric_limits<uint32_t>::max(); + } + for (uint32_t i = 0; i < d.size(); ++i) { + d[i] = std::numeric_limits<uint32_t>::max(); + } + + // iterates through t + for (uint32_t j = 1; j <= m; ++j) { + uint32_t rightJ = right[j - 1]; // jth character of right + d[0] = j; + + int32_t min = std::max(1, static_cast<int32_t>(j) - static_cast<int32_t>(threshold)); + + uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ? + n : std::min(n, j + threshold); + + // ignore entry left of leftmost + if (min > 1) { + d[min - 1] = std::numeric_limits<uint32_t>::max(); + } + + uint32_t lowerBound = std::numeric_limits<uint32_t>::max(); + + for (uint32_t i = min; i <= max; ++i) { + if (left[i - 1] == rightJ) { + // diagonally left and up + d[i] = p[i - 1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally + // left and up + + d[i] = 1 + std::min(std::min(d[i - 1], p[i]), p[i - 1]); + } + lowerBound = std::min(lowerBound, d[i]); + } + if (lowerBound > threshold) { + return std::nullopt; + } + std::swap(p, d); + } + if (p[n] <= threshold) { + return {p[n]}; + } + return std::nullopt; +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h new file mode 100644 index 00000000000..bce7599fc23 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h @@ -0,0 +1,25 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <optional> +#include <cstdint> +#include <vector> + +namespace vespalib { + +/** + * LevensteinDistance::calculate implements basic Levenstein distance algorithm + * with early stopping if the distance is already too high. + * If the distance is above threshold method would return empty optional, + * if the distance is below or equal to it, the distance will be wrapped in optional. + * The types it's built upon are uint32 and it used to compare codepoints from terms, + * but in future code can be generalized. + * + * Algorithm is based off Java implementation from commons-text library + */ +class LevensteinDistance { +public: + static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold); +}; + +} |