diff options
58 files changed, 695 insertions, 247 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 73d4b99b382..d634fe7795f 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -541,6 +541,30 @@ ], "fields": [] }, + "com.yahoo.prelude.query.FuzzyItem": { + "superClass": "com.yahoo.prelude.query.TermItem", + "interfaces": [], + "attributes": [ + "public" + ], + "methods": [ + "public void <init>(java.lang.String, boolean, java.lang.String)", + "public void setValue(java.lang.String)", + "public java.lang.String getRawWord()", + "public boolean isWords()", + "public com.yahoo.prelude.query.Item$ItemType getItemType()", + "public java.lang.String getName()", + "public java.lang.String stringValue()", + "public boolean isStemmed()", + "public java.lang.String getIndexedString()", + "public int getNumWords()", + "public boolean equals(java.lang.Object)", + "public int hashCode()", + "public java.lang.String toString()", + "protected void encodeThis(java.nio.ByteBuffer)" + ], + "fields": [] + }, "com.yahoo.prelude.query.GeoLocationItem": { "superClass": "com.yahoo.prelude.query.TermItem", "interfaces": [], @@ -742,6 +766,7 @@ "public static final enum com.yahoo.prelude.query.Item$ItemType GEO_LOCATION_TERM", "public static final enum com.yahoo.prelude.query.Item$ItemType TRUE", "public static final enum com.yahoo.prelude.query.Item$ItemType FALSE", + "public static final enum com.yahoo.prelude.query.Item$ItemType FUZZY", "public final int code" ] }, diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java new file mode 100644 index 00000000000..9a710deca67 --- /dev/null +++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java @@ -0,0 +1,106 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.prelude.query; + +import java.nio.ByteBuffer; + +/** + * Fuzzy search term + * + * @author alexeyche + */ +public class FuzzyItem extends TermItem { + private String fuzzyQuery; + + public FuzzyItem(String indexName, boolean isFromQuery, String fuzzyQuery) { + super(indexName, isFromQuery, null); + setValue(fuzzyQuery); + } + + @Override + public void setValue(String value) { + this.fuzzyQuery = value; + } + + @Override + public String getRawWord() { + return stringValue(); + } + + @Override + public boolean isWords() { + return false; + } + + @Override + public ItemType getItemType() { + return ItemType.FUZZY; + } + + @Override + public String getName() { + return "FUZZY"; + } + + @Override + public String stringValue() { + return fuzzyQuery; + } + + @Override + public boolean isStemmed() { + return false; + } + + @Override + public String getIndexedString() { + return stringValue(); + } + + @Override + public int getNumWords() { + return 1; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!super.equals(obj)) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + FuzzyItem other = (FuzzyItem) obj; + if (fuzzyQuery == null) { + if (other.fuzzyQuery != null) { + return false; + } + } else if (!fuzzyQuery.equals(other.fuzzyQuery)) { + return false; + } + return true; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((fuzzyQuery == null) ? 0 : fuzzyQuery.hashCode()); + return result; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("FuzzyItem [fuzzyQuery=").append(fuzzyQuery).append("]"); + return builder.toString(); + } + + protected void encodeThis(ByteBuffer buffer) { + super.encodeThis(buffer); + putString(getIndexedString(), buffer); + } +} + diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java index 2e0c3cf8593..02b208b6ce1 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java @@ -56,7 +56,8 @@ public abstract class Item implements Cloneable { NEAREST_NEIGHBOR(26), GEO_LOCATION_TERM(27), TRUE(28), - FALSE(29); + FALSE(29), + FUZZY(30); public final int code; diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java index 1805a11ff5e..6f7aff798a5 100644 --- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java +++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java @@ -2,6 +2,7 @@ package com.yahoo.search.query; import com.google.common.base.Preconditions; +import com.yahoo.prelude.query.*; import com.yahoo.processing.IllegalInputException; import com.yahoo.collections.LazyMap; import com.yahoo.geo.DistanceParser; @@ -10,38 +11,6 @@ import com.yahoo.language.Language; import com.yahoo.language.process.Normalizer; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.Location; -import com.yahoo.prelude.query.AndItem; -import com.yahoo.prelude.query.BoolItem; -import com.yahoo.prelude.query.CompositeItem; -import com.yahoo.prelude.query.DotProductItem; -import com.yahoo.prelude.query.EquivItem; -import com.yahoo.prelude.query.ExactStringItem; -import com.yahoo.prelude.query.IntItem; -import com.yahoo.prelude.query.Item; -import com.yahoo.prelude.query.Limit; -import com.yahoo.prelude.query.GeoLocationItem; -import com.yahoo.prelude.query.NearItem; -import com.yahoo.prelude.query.NearestNeighborItem; -import com.yahoo.prelude.query.NotItem; -import com.yahoo.prelude.query.ONearItem; -import com.yahoo.prelude.query.OrItem; -import com.yahoo.prelude.query.PhraseItem; -import com.yahoo.prelude.query.PredicateQueryItem; -import com.yahoo.prelude.query.PrefixItem; -import com.yahoo.prelude.query.RangeItem; -import com.yahoo.prelude.query.RankItem; -import com.yahoo.prelude.query.RegExpItem; -import com.yahoo.prelude.query.SameElementItem; -import com.yahoo.prelude.query.SegmentingRule; -import com.yahoo.prelude.query.Substring; -import com.yahoo.prelude.query.SubstringItem; -import com.yahoo.prelude.query.SuffixItem; -import com.yahoo.prelude.query.TaggableItem; -import com.yahoo.prelude.query.WandItem; -import com.yahoo.prelude.query.WeakAndItem; -import com.yahoo.prelude.query.WeightedSetItem; -import com.yahoo.prelude.query.WordAlternativesItem; -import com.yahoo.prelude.query.WordItem; import com.yahoo.search.grouping.request.GroupingOperation; import com.yahoo.search.query.parser.Parsable; import com.yahoo.search.query.parser.Parser; @@ -60,65 +29,13 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import static com.yahoo.search.yql.YqlParser.*; import static com.yahoo.slime.Type.ARRAY; import static com.yahoo.slime.Type.DOUBLE; import static com.yahoo.slime.Type.LONG; import static com.yahoo.slime.Type.OBJECT; import static com.yahoo.slime.Type.STRING; -import static com.yahoo.search.yql.YqlParser.ACCENT_DROP; -import static com.yahoo.search.yql.YqlParser.ALTERNATIVES; -import static com.yahoo.search.yql.YqlParser.AND_SEGMENTING; -import static com.yahoo.search.yql.YqlParser.ANNOTATIONS; -import static com.yahoo.search.yql.YqlParser.APPROXIMATE; -import static com.yahoo.search.yql.YqlParser.ASCENDING_HITS_ORDER; -import static com.yahoo.search.yql.YqlParser.CONNECTION_ID; -import static com.yahoo.search.yql.YqlParser.CONNECTION_WEIGHT; -import static com.yahoo.search.yql.YqlParser.CONNECTIVITY; -import static com.yahoo.search.yql.YqlParser.DEFAULT_TARGET_NUM_HITS; -import static com.yahoo.search.yql.YqlParser.DESCENDING_HITS_ORDER; -import static com.yahoo.search.yql.YqlParser.DISTANCE; -import static com.yahoo.search.yql.YqlParser.DISTANCE_THRESHOLD; -import static com.yahoo.search.yql.YqlParser.DOT_PRODUCT; -import static com.yahoo.search.yql.YqlParser.EQUIV; -import static com.yahoo.search.yql.YqlParser.FILTER; -import static com.yahoo.search.yql.YqlParser.GEO_LOCATION; -import static com.yahoo.search.yql.YqlParser.HIT_LIMIT; -import static com.yahoo.search.yql.YqlParser.HNSW_EXPLORE_ADDITIONAL_HITS; -import static com.yahoo.search.yql.YqlParser.IMPLICIT_TRANSFORMS; -import static com.yahoo.search.yql.YqlParser.LABEL; -import static com.yahoo.search.yql.YqlParser.NEAR; -import static com.yahoo.search.yql.YqlParser.NEAREST_NEIGHBOR; -import static com.yahoo.search.yql.YqlParser.NFKC; -import static com.yahoo.search.yql.YqlParser.NORMALIZE_CASE; -import static com.yahoo.search.yql.YqlParser.ONEAR; -import static com.yahoo.search.yql.YqlParser.ORIGIN; -import static com.yahoo.search.yql.YqlParser.ORIGIN_LENGTH; -import static com.yahoo.search.yql.YqlParser.ORIGIN_OFFSET; -import static com.yahoo.search.yql.YqlParser.ORIGIN_ORIGINAL; -import static com.yahoo.search.yql.YqlParser.PHRASE; -import static com.yahoo.search.yql.YqlParser.PREDICATE; -import static com.yahoo.search.yql.YqlParser.PREFIX; -import static com.yahoo.search.yql.YqlParser.RANGE; -import static com.yahoo.search.yql.YqlParser.RANK; -import static com.yahoo.search.yql.YqlParser.RANKED; -import static com.yahoo.search.yql.YqlParser.SAME_ELEMENT; -import static com.yahoo.search.yql.YqlParser.SCORE_THRESHOLD; -import static com.yahoo.search.yql.YqlParser.SIGNIFICANCE; -import static com.yahoo.search.yql.YqlParser.STEM; -import static com.yahoo.search.yql.YqlParser.SUBSTRING; -import static com.yahoo.search.yql.YqlParser.SUFFIX; -import static com.yahoo.search.yql.YqlParser.TARGET_HITS; -import static com.yahoo.search.yql.YqlParser.TARGET_NUM_HITS; -import static com.yahoo.search.yql.YqlParser.THRESHOLD_BOOST_FACTOR; -import static com.yahoo.search.yql.YqlParser.UNIQUE_ID; -import static com.yahoo.search.yql.YqlParser.USE_POSITION_DATA; -import static com.yahoo.search.yql.YqlParser.USER_INPUT_LANGUAGE; -import static com.yahoo.search.yql.YqlParser.WAND; -import static com.yahoo.search.yql.YqlParser.WEAK_AND; -import static com.yahoo.search.yql.YqlParser.WEIGHT; -import static com.yahoo.search.yql.YqlParser.WEIGHTED_SET; - /** * The Select query language. * @@ -926,6 +843,8 @@ public class SelectParser implements Parser { return instantiateONearItem(field, key, value); case EQUIV: return instantiateEquivItem(field, key, value); + case FUZZY: + return instantiateFuzzyItem(field, key, value); case ALTERNATIVES: return instantiateWordAlternativesItem(field, key, value); default: @@ -1155,6 +1074,15 @@ public class SelectParser implements Parser { return leafStyleSettings(getAnnotations(value), equiv); } + private Item instantiateFuzzyItem(String field, String key, Inspector value) { + HashMap<Integer, Inspector> children = childMap(value); + Preconditions.checkArgument(children.size() == 1, "Expected 1 argument, got %s.", children.size()); + String wordData = children.get(0).asString(); + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData); + + return leafStyleSettings(getAnnotations(value), fuzzy); + } + private Item instantiateWordAlternativesItem(String field, String key, Inspector value) { HashMap<Integer, Inspector> children = childMap(value); Preconditions.checkArgument(children.size() >= 1, "Expected 1 or more arguments, got %s.", children.size()); diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java index cc441eb0c3d..afc903ba7cf 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java +++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java @@ -1,55 +1,6 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.yql; -import static com.yahoo.search.yql.YqlParser.ACCENT_DROP; -import static com.yahoo.search.yql.YqlParser.ALTERNATIVES; -import static com.yahoo.search.yql.YqlParser.AND_SEGMENTING; -import static com.yahoo.search.yql.YqlParser.BOUNDS; -import static com.yahoo.search.yql.YqlParser.BOUNDS_LEFT_OPEN; -import static com.yahoo.search.yql.YqlParser.BOUNDS_OPEN; -import static com.yahoo.search.yql.YqlParser.BOUNDS_RIGHT_OPEN; -import static com.yahoo.search.yql.YqlParser.CONNECTION_ID; -import static com.yahoo.search.yql.YqlParser.CONNECTION_WEIGHT; -import static com.yahoo.search.yql.YqlParser.CONNECTIVITY; -import static com.yahoo.search.yql.YqlParser.DISTANCE; -import static com.yahoo.search.yql.YqlParser.DOT_PRODUCT; -import static com.yahoo.search.yql.YqlParser.END_ANCHOR; -import static com.yahoo.search.yql.YqlParser.EQUIV; -import static com.yahoo.search.yql.YqlParser.FILTER; -import static com.yahoo.search.yql.YqlParser.GEO_LOCATION; -import static com.yahoo.search.yql.YqlParser.HIT_LIMIT; -import static com.yahoo.search.yql.YqlParser.IMPLICIT_TRANSFORMS; -import static com.yahoo.search.yql.YqlParser.LABEL; -import static com.yahoo.search.yql.YqlParser.NEAR; -import static com.yahoo.search.yql.YqlParser.NEAREST_NEIGHBOR; -import static com.yahoo.search.yql.YqlParser.NORMALIZE_CASE; -import static com.yahoo.search.yql.YqlParser.ONEAR; -import static com.yahoo.search.yql.YqlParser.ORIGIN; -import static com.yahoo.search.yql.YqlParser.ORIGIN_LENGTH; -import static com.yahoo.search.yql.YqlParser.ORIGIN_OFFSET; -import static com.yahoo.search.yql.YqlParser.ORIGIN_ORIGINAL; -import static com.yahoo.search.yql.YqlParser.PHRASE; -import static com.yahoo.search.yql.YqlParser.PREFIX; -import static com.yahoo.search.yql.YqlParser.RANGE; -import static com.yahoo.search.yql.YqlParser.RANK; -import static com.yahoo.search.yql.YqlParser.RANKED; -import static com.yahoo.search.yql.YqlParser.SAME_ELEMENT; -import static com.yahoo.search.yql.YqlParser.SCORE_THRESHOLD; -import static com.yahoo.search.yql.YqlParser.SIGNIFICANCE; -import static com.yahoo.search.yql.YqlParser.START_ANCHOR; -import static com.yahoo.search.yql.YqlParser.STEM; -import static com.yahoo.search.yql.YqlParser.SUBSTRING; -import static com.yahoo.search.yql.YqlParser.SUFFIX; -import static com.yahoo.search.yql.YqlParser.TARGET_NUM_HITS; -import static com.yahoo.search.yql.YqlParser.THRESHOLD_BOOST_FACTOR; -import static com.yahoo.search.yql.YqlParser.UNIQUE_ID; -import static com.yahoo.search.yql.YqlParser.URI; -import static com.yahoo.search.yql.YqlParser.USE_POSITION_DATA; -import static com.yahoo.search.yql.YqlParser.WAND; -import static com.yahoo.search.yql.YqlParser.WEAK_AND; -import static com.yahoo.search.yql.YqlParser.WEIGHT; -import static com.yahoo.search.yql.YqlParser.WEIGHTED_SET; - import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collection; @@ -64,51 +15,15 @@ import java.util.Map; import java.util.Map.Entry; import com.google.common.collect.ImmutableMap; -import com.yahoo.prelude.query.AndItem; -import com.yahoo.prelude.query.AndSegmentItem; -import com.yahoo.prelude.query.BoolItem; -import com.yahoo.prelude.query.DotProductItem; -import com.yahoo.prelude.query.EquivItem; -import com.yahoo.prelude.query.FalseItem; -import com.yahoo.prelude.query.ExactStringItem; -import com.yahoo.prelude.query.IndexedItem; -import com.yahoo.prelude.query.IntItem; -import com.yahoo.prelude.query.Item; -import com.yahoo.prelude.query.GeoLocationItem; -import com.yahoo.prelude.query.MarkerWordItem; -import com.yahoo.prelude.query.NearItem; -import com.yahoo.prelude.query.NearestNeighborItem; -import com.yahoo.prelude.query.NotItem; -import com.yahoo.prelude.query.NullItem; -import com.yahoo.prelude.query.ONearItem; -import com.yahoo.prelude.query.OrItem; -import com.yahoo.prelude.query.PhraseItem; -import com.yahoo.prelude.query.PhraseSegmentItem; -import com.yahoo.prelude.query.PredicateQueryItem; -import com.yahoo.prelude.query.PrefixItem; -import com.yahoo.prelude.query.RangeItem; -import com.yahoo.prelude.query.RankItem; -import com.yahoo.prelude.query.RegExpItem; -import com.yahoo.prelude.query.SameElementItem; -import com.yahoo.prelude.query.SegmentingRule; -import com.yahoo.prelude.query.Substring; -import com.yahoo.prelude.query.SubstringItem; -import com.yahoo.prelude.query.SuffixItem; -import com.yahoo.prelude.query.TaggableItem; -import com.yahoo.prelude.query.ToolBox; +import com.yahoo.prelude.query.*; import com.yahoo.prelude.query.ToolBox.QueryVisitor; -import com.yahoo.prelude.query.TrueItem; -import com.yahoo.prelude.query.UriItem; -import com.yahoo.prelude.query.WandItem; -import com.yahoo.prelude.query.WeakAndItem; -import com.yahoo.prelude.query.WeightedSetItem; -import com.yahoo.prelude.query.WordAlternativesItem; -import com.yahoo.prelude.query.WordItem; import com.yahoo.search.Query; import com.yahoo.search.grouping.Continuation; import com.yahoo.search.grouping.GroupingRequest; import com.yahoo.search.query.QueryTree; +import static com.yahoo.search.yql.YqlParser.*; + /** * Serialize Vespa query trees to YQL+ strings. * @@ -517,6 +432,32 @@ public class VespaSerializer { } } + private static class FuzzySerializer extends Serializer<FuzzyItem> { + + @Override + void onExit(StringBuilder destination, FuzzyItem item) { } + + @Override + boolean serialize(StringBuilder destination, FuzzyItem fuzzy) { + String annotations = leafAnnotations(fuzzy); + destination.append(normalizeIndexName(fuzzy.getIndexName())).append(" contains "); + + if (annotations.length() > 0) { + destination.append('(').append(annotations); + } + + destination.append(FUZZY).append('('); + destination.append('"'); + escape(fuzzy.getIndexedString(), destination).append('"'); + destination.append(')'); + + if (annotations.length() > 0) { + destination.append(')'); + } + return false; + } + } + private static class ONearSerializer extends Serializer<ONearItem> { @Override @@ -1239,6 +1180,7 @@ public class VespaSerializer { dispatchBuilder.put(WordItem.class, new WordSerializer()); dispatchBuilder.put(RegExpItem.class, new RegExpSerializer()); dispatchBuilder.put(UriItem.class, new UriSerializer()); + dispatchBuilder.put(FuzzyItem.class, new FuzzySerializer()); dispatch = ImmutableMap.copyOf(dispatchBuilder); } diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java index 26508fec3c4..99a449026d8 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java +++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java @@ -27,48 +27,8 @@ import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.Segmenter; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.Location; -import com.yahoo.prelude.query.AndItem; -import com.yahoo.prelude.query.AndSegmentItem; -import com.yahoo.prelude.query.BoolItem; -import com.yahoo.prelude.query.CompositeItem; -import com.yahoo.prelude.query.DotProductItem; -import com.yahoo.prelude.query.EquivItem; -import com.yahoo.prelude.query.FalseItem; -import com.yahoo.prelude.query.ExactStringItem; -import com.yahoo.prelude.query.IntItem; -import com.yahoo.prelude.query.Item; -import com.yahoo.prelude.query.Limit; -import com.yahoo.prelude.query.GeoLocationItem; -import com.yahoo.prelude.query.NearItem; -import com.yahoo.prelude.query.NearestNeighborItem; -import com.yahoo.prelude.query.NotItem; -import com.yahoo.prelude.query.NullItem; -import com.yahoo.prelude.query.ONearItem; -import com.yahoo.prelude.query.OrItem; -import com.yahoo.prelude.query.PhraseItem; -import com.yahoo.prelude.query.PhraseSegmentItem; -import com.yahoo.prelude.query.PredicateQueryItem; -import com.yahoo.prelude.query.PrefixItem; -import com.yahoo.prelude.query.RangeItem; -import com.yahoo.prelude.query.RankItem; -import com.yahoo.prelude.query.RegExpItem; -import com.yahoo.prelude.query.SameElementItem; -import com.yahoo.prelude.query.SegmentItem; -import com.yahoo.prelude.query.SegmentingRule; -import com.yahoo.prelude.query.Substring; -import com.yahoo.prelude.query.SubstringItem; -import com.yahoo.prelude.query.SuffixItem; -import com.yahoo.prelude.query.TaggableItem; -import com.yahoo.prelude.query.TermItem; -import com.yahoo.prelude.query.ToolBox; +import com.yahoo.prelude.query.*; import com.yahoo.prelude.query.ToolBox.QueryVisitor; -import com.yahoo.prelude.query.TrueItem; -import com.yahoo.prelude.query.UriItem; -import com.yahoo.prelude.query.WandItem; -import com.yahoo.prelude.query.WeakAndItem; -import com.yahoo.prelude.query.WeightedSetItem; -import com.yahoo.prelude.query.WordAlternativesItem; -import com.yahoo.prelude.query.WordItem; import com.yahoo.processing.IllegalInputException; import com.yahoo.search.Query; import com.yahoo.search.grouping.Continuation; @@ -192,6 +152,7 @@ public class YqlParser implements Parser { public static final String WEAK_AND = "weakAnd"; public static final String WEIGHT = "weight"; public static final String WEIGHTED_SET = "weightedSet"; + public static final String FUZZY = "fuzzy"; private final IndexFacts indexFacts; private final List<ConnectedItem> connectedItems = new ArrayList<>(); @@ -1171,7 +1132,7 @@ public class YqlParser implements Parser { assertHasOperator(ast, ExpressionOperator.CONTAINS); String field = getIndex(ast.getArgument(0)); if (userQuery != null && indexFactsSession.getIndex(field).isAttribute()) { - userQuery.trace("Field '" + field + "' is an attribute, 'contains' will only match exactly", 2); + userQuery.trace("Field '" + field + "' is an attribute, 'contains' will only match exactly (unless fuzzy query is used)", 2); } return instantiateLeafItem(field, ast.<OperatorNode<ExpressionOperator>> getArgument(1)); } @@ -1298,11 +1259,23 @@ public class YqlParser implements Parser { return instantiateWordAlternativesItem(field, ast); case URI: return instantiateUriItem(field, ast); + case FUZZY: + return instantiateFuzzyItem(field, ast); default: - throw newUnexpectedArgumentException(names.get(0), EQUIV, NEAR, ONEAR, PHRASE, SAME_ELEMENT, URI); + throw newUnexpectedArgumentException(names.get(0), EQUIV, NEAR, ONEAR, PHRASE, SAME_ELEMENT, URI, FUZZY); } } + private Item instantiateFuzzyItem(String field, OperatorNode<ExpressionOperator> ast) { + List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1); + Preconditions.checkArgument(args.size() == 1, "Expected 1 argument, got %s.", args.size()); + + String wordData = getStringContents(args.get(0)); + + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData); + return leafStyleSettings(ast, fuzzy); + } + private Item instantiateEquivItem(String field, OperatorNode<ExpressionOperator> ast) { List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1); Preconditions.checkArgument(args.size() >= 2, "Expected 2 or more arguments, got %s.", args.size()); diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java index 7057f996041..1269c2a5aef 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java @@ -443,4 +443,10 @@ public class VespaSerializerTestCase { + "alternatives({\"trees\": 1.0, \"tree\": 0.7}))" + ")"); } + + @Test + public void testFuzzy() { + parseAndConfirm("foo contains fuzzy(\"a\")"); + } + } diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java index 15713dc1f97..a057d6f7c16 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java @@ -8,24 +8,7 @@ import com.yahoo.prelude.Index; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.IndexModel; import com.yahoo.prelude.SearchDefinition; -import com.yahoo.prelude.query.AndItem; -import com.yahoo.prelude.query.BoolItem; -import com.yahoo.prelude.query.ExactStringItem; -import com.yahoo.prelude.query.IndexedItem; -import com.yahoo.prelude.query.Item; -import com.yahoo.prelude.query.MarkerWordItem; -import com.yahoo.prelude.query.PhraseItem; -import com.yahoo.prelude.query.PhraseSegmentItem; -import com.yahoo.prelude.query.PrefixItem; -import com.yahoo.prelude.query.QueryCanonicalizer; -import com.yahoo.prelude.query.RegExpItem; -import com.yahoo.prelude.query.SegmentingRule; -import com.yahoo.prelude.query.Substring; -import com.yahoo.prelude.query.SubstringItem; -import com.yahoo.prelude.query.SuffixItem; -import com.yahoo.prelude.query.WeakAndItem; -import com.yahoo.prelude.query.WordAlternativesItem; -import com.yahoo.prelude.query.WordItem; +import com.yahoo.prelude.query.*; import com.yahoo.prelude.querytransform.QueryRewrite; import com.yahoo.processing.IllegalInputException; import com.yahoo.search.Query; @@ -382,6 +365,12 @@ public class YqlParserTestCase { } @Test + public void testFuzzy() { + assertParse("select foo from bar where baz contains fuzzy(\"a\")", + "FUZZY baz:a"); + } + + @Test public void testStemming() { assertTrue(getRootWord("select foo from bar where baz contains " + "([ {stem: false} ]\"colors\")").isStemmed()); diff --git a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java index 0dcfb8392ef..1c8433541e1 100644 --- a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java +++ b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java @@ -674,6 +674,12 @@ public class SelectTestCase { checkWordAlternativesContent(alternatives); } + @Test + public void testFuzzy() { + assertParse("{ \"contains\": [\"description\", { \"fuzzy\": [\"a\"] }] }", + "FUZZY description:a"); + } + //------------------------------------------------------------------- grouping tests @Test diff --git a/docproc/abi-spec.json b/docproc/abi-spec.json index e2bbfd8ec10..5ad2c6d454d 100644 --- a/docproc/abi-spec.json +++ b/docproc/abi-spec.json @@ -251,10 +251,10 @@ "methods": [ "public void <init>()", "public abstract com.yahoo.docproc.DocumentProcessor$Progress process(com.yahoo.docproc.Processing)", - "public java.lang.String toString()", "public void setFieldMap(java.util.Map)", "public java.util.Map getFieldMap()", - "public java.util.Map getDocMap(java.lang.String)" + "public java.util.Map getDocMap(java.lang.String)", + "public java.lang.String toString()" ], "fields": [] }, diff --git a/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp b/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp index 4bcf87c14a6..9c67b3b3ef7 100644 --- a/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp +++ b/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <mirror_with_all.h> +#include "mirror_with_all.h" + #include <vespa/slobrok/sbmirror.h> #include <vespa/fnet/frt/supervisor.h> #include <vespa/fnet/transport.h> diff --git a/searchcore/src/tests/proton/matching/query_test.cpp b/searchcore/src/tests/proton/matching/query_test.cpp index a09437be46b..fd3dd0da518 100644 --- a/searchcore/src/tests/proton/matching/query_test.cpp +++ b/searchcore/src/tests/proton/matching/query_test.cpp @@ -278,6 +278,7 @@ public: void visit(ProtonSubstringTerm &n) override { checkNode(n, 0, true); } void visit(ProtonSuffixTerm &n) override { checkNode(n, 2, false); } void visit(ProtonPhrase &n) override { checkNode(n, 0, true); } + void visit(ProtonFuzzyTerm &n) override { checkNode(n, 1, false); } void visit(ProtonWeightedSetTerm &) override {} void visit(ProtonDotProduct &) override {} void visit(ProtonWandTerm &) override {} @@ -435,6 +436,7 @@ public: void visit(ProtonPredicateQuery &) override {} void visit(ProtonRegExpTerm &) override {} void visit(ProtonNearestNeighborTerm &) override {} + void visit(ProtonFuzzyTerm &) override {} }; void Test::requireThatTermDataIsFilledIn() { diff --git a/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp b/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp index ffd08b3bbef..78486f54704 100644 --- a/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp +++ b/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp @@ -69,6 +69,7 @@ struct DumpQuery : QueryVisitor { void visit(NearestNeighborTerm &) override {} void visit(TrueQueryNode &) override {} void visit(FalseQueryNode &) override {} + void visit(FuzzyTerm &) override {} }; std::string dump_query(Node &root) { diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp index 9be5920c0ed..92d7e385293 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp @@ -165,6 +165,7 @@ protected: void visit(ProtonFalse &) override { _result = std::make_unique<EmptyBlueprint>(); } + void visit(ProtonFuzzyTerm &n) override { buildTerm(n); } public: BlueprintBuilderVisitor(const IRequestContext & requestContext, ISearchContext &context) : diff --git a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h index e7817dcecd2..d8060832d5d 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h +++ b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h @@ -137,6 +137,7 @@ typedef ProtonTerm<search::query::WandTerm> ProtonWandTerm; typedef ProtonTerm<search::query::PredicateQuery> ProtonPredicateQuery; typedef ProtonTerm<search::query::RegExpTerm> ProtonRegExpTerm; typedef ProtonTerm<search::query::NearestNeighborTerm> ProtonNearestNeighborTerm; +typedef ProtonTerm<search::query::FuzzyTerm> ProtonFuzzyTerm; struct ProtonNodeTypes { typedef ProtonAnd And; @@ -164,6 +165,7 @@ struct ProtonNodeTypes { typedef ProtonNearestNeighborTerm NearestNeighborTerm; typedef ProtonTrue TrueQueryNode; typedef ProtonFalse FalseQueryNode; + typedef ProtonFuzzyTerm FuzzyTerm; }; } diff --git a/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp b/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp index 34cb5369c1e..574df384a63 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp @@ -69,6 +69,7 @@ public: void visit(ProtonNearestNeighborTerm &) override {} void visit(ProtonTrue &) override {} void visit(ProtonFalse &) override {} + void visit(ProtonFuzzyTerm &n) override { visitTerm(n); } }; } // namespace proton::matching::<unnamed> diff --git a/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp b/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp index c2abeaa36ee..a32107836ca 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp @@ -45,6 +45,7 @@ struct TermDataFromTermVisitor void visit(ProtonPredicateQuery &) override { } void visit(ProtonRegExpTerm &n) override { visitTerm(n); } void visit(ProtonNearestNeighborTerm &n) override { visitTerm(n); } + void visit(ProtonFuzzyTerm &n) override { visitTerm(n); } }; } // namespace diff --git a/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp b/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp index 72a53a1a3d2..72e0153b5c2 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp @@ -59,6 +59,7 @@ struct TermExpander : QueryVisitor { void visit(NearestNeighborTerm &) override {} void visit(TrueQueryNode &) override {} void visit(FalseQueryNode &) override {} + void visit(FuzzyTerm &) override {} void flush(Intermediate &parent) { for (Node::UP &term: terms) { diff --git a/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp b/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp index 5592215f9e2..d69f7d1b0a4 100644 --- a/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp +++ b/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp @@ -205,6 +205,7 @@ private: void visit(PredicateQuery &n) override { visitTerm(n); } void visit(RegExpTerm &n) override { visitTerm(n); } void visit(NearestNeighborTerm &n) override { visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } public: CreateBlueprintVisitor(const IIndexCollection &indexes, diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp index 4f037415b35..65de302ae04 100644 --- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp +++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp @@ -242,6 +242,12 @@ private: void testPrefixSearch(const AttributePtr & ptr); void testPrefixSearch(); + // test fuzzy search + void performFuzzySearch(const StringAttribute & vec, const vespalib::string & term, + const DocSet & expected, TermType termType); + void testFuzzySearch(const AttributePtr & ptr); + void testFuzzySearch(); + // test that search is working after clear doc template <typename VectorType, typename ValueType> void requireThatSearchIsWorkingAfterClearDoc(const vespalib::string & name, const Config & cfg, @@ -402,6 +408,7 @@ SearchContextTest::buildTermQuery(std::vector<char> & buffer, const vespalib::st switch (termType) { case TermType::PREFIXTERM: buffer[p++] = ParseItem::ITEM_PREFIXTERM; break; case TermType::REGEXP: buffer[p++] = ParseItem::ITEM_REGEXP; break; + case TermType::FUZZYTERM: buffer[p++] = ParseItem::ITEM_FUZZY; break; default: buffer[p++] = ParseItem::ITEM_TERM; break; @@ -1498,6 +1505,70 @@ SearchContextTest::testPrefixSearch() } } +//----------------------------------------------------------------------------- +// Test fuzzy search +//----------------------------------------------------------------------------- + +void +SearchContextTest::performFuzzySearch(const StringAttribute & vec, const vespalib::string & term, + const DocSet & expected, TermType termType) +{ + performSearch(vec, term, expected, termType); +} + +void +SearchContextTest::testFuzzySearch(const AttributePtr & ptr) +{ + LOG(info, "testFuzzySearch: vector '%s'", ptr->getName().c_str()); + + auto & vec = dynamic_cast<StringAttribute &>(*ptr.get()); + + uint32_t numDocs = 2; + addDocs(*ptr.get(), numDocs); + + const char * strings [] = {"fuzzysearch", "FUZZYSEARCH"}; + const char * terms[][2] = { + {"fuzzysearch", "FUZZYSEARCH"}, + {"fuzzysearck", "FUZZYSEARCK"}, + {"fuzzysekkkk", "FUZZYSEKKKK"} + }; + + for (uint32_t doc = 1; doc < numDocs + 1; ++doc) { + ASSERT_TRUE(doc < vec.getNumDocs()); + EXPECT_TRUE(vec.update(doc, strings[doc - 1])); + } + + ptr->commit(true); + + std::vector<DocSet> expected; + DocSet empty; + { + uint32_t docs[] = {1, 2}; + expected.emplace_back(docs, docs + 2); // normal search + } + { + uint32_t docs[] = {1, 2}; + expected.emplace_back(docs, docs + 2); // fuzzy search + } + + expected.emplace_back(); // results + + for (uint32_t i = 0; i < 3; ++i) { + for (uint32_t j = 0; j < 2; ++j) { + performFuzzySearch(vec, terms[i][j], expected[i], TermType::FUZZYTERM); + } + } +} + +void +SearchContextTest::testFuzzySearch() +{ + for (const auto & cfg : _stringCfg) { + testFuzzySearch(AttributeFactory::createAttribute(cfg.first, cfg.second)); + } +} + + template <typename VectorType, typename ValueType> void SearchContextTest::requireThatSearchIsWorkingAfterClearDoc(const vespalib::string & name, @@ -2028,6 +2099,7 @@ SearchContextTest::Main() testPrefixSearch(); testSearchIteratorConformance(); testSearchIteratorUnpacking(); + testFuzzySearch(); TEST_DO(requireThatSearchIsWorkingAfterClearDoc()); TEST_DO(requireThatSearchIsWorkingAfterLoadAndClearDoc()); TEST_DO(requireThatSearchIsWorkingAfterUpdates()); diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp index 8a17114057c..2f0f0d5a6ae 100644 --- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp +++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp @@ -386,8 +386,8 @@ testSingleValue(Attribute & svsa, Config &cfg) TEST("testSingleValue") { EXPECT_EQUAL(24u, sizeof(AttributeVector::SearchContext)); - EXPECT_EQUAL(24u, sizeof(StringSearchHelper)); - EXPECT_EQUAL(56u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext)); + EXPECT_EQUAL(32u, sizeof(StringSearchHelper)); + EXPECT_EQUAL(64u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext)); { Config cfg(BasicType::STRING, CollectionType::SINGLE); SingleValueStringAttribute svsa("svsa", cfg); @@ -494,4 +494,20 @@ TEST("test cased regex match") { EXPECT_FALSE(helper.isMatch("xY")); } +TEST("test fuzzy match") { + QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::FUZZYTERM); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_TRUE(helper.isFuzzy()); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_TRUE(helper.isMatch("xyza")); + EXPECT_TRUE(helper.isMatch("xyv")); + EXPECT_TRUE(helper.isMatch("xy")); + EXPECT_TRUE(helper.isMatch("x")); + EXPECT_TRUE(helper.isMatch("xvv")); + EXPECT_FALSE(helper.isMatch("vvv")); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp index 35280fb0bd8..0e8155e23c3 100644 --- a/searchlib/src/tests/query/customtypevisitor_test.cpp +++ b/searchlib/src/tests/query/customtypevisitor_test.cpp @@ -36,6 +36,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {}; struct MyStringTerm : InitTerm<StringTerm> {}; struct MySubstrTerm : InitTerm<SubstringTerm> {}; struct MySuffixTerm : InitTerm<SuffixTerm> {}; +struct MyFuzzyTerm : InitTerm<FuzzyTerm> {}; struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} }; struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} }; struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} }; @@ -65,6 +66,7 @@ struct MyQueryNodeTypes { typedef MyStringTerm StringTerm; typedef MySubstrTerm SubstringTerm; typedef MySuffixTerm SuffixTerm; + typedef MyFuzzyTerm FuzzyTerm; typedef MyWeakAnd WeakAnd; typedef MyWeightedSetTerm WeightedSetTerm; typedef MyDotProduct DotProduct; @@ -112,6 +114,7 @@ public: void visit(MyNearestNeighborTerm &) override { setVisited<MyNearestNeighborTerm>(); } void visit(MyTrue &) override { setVisited<MyTrue>(); } void visit(MyFalse &) override { setVisited<MyFalse>(); } + void visit(MyFuzzyTerm &) override { setVisited<MyFuzzyTerm>(); } }; template <class T> @@ -148,6 +151,7 @@ TEST("customtypevisitor_test") { requireThatNodeIsVisited<MyNearestNeighborTerm>(); requireThatNodeIsVisited<MyTrue>(); requireThatNodeIsVisited<MyFalse>(); + requireThatNodeIsVisited<MyFuzzyTerm>(); } } // namespace diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp index 9f73c1ff585..f770213e8e5 100644 --- a/searchlib/src/tests/query/query_visitor_test.cpp +++ b/searchlib/src/tests/query/query_visitor_test.cpp @@ -49,6 +49,7 @@ public: void visit(NearestNeighborTerm &) override { isVisited<NearestNeighborTerm>() = true; } void visit(TrueQueryNode &) override { isVisited<TrueQueryNode>() = true; } void visit(FalseQueryNode &) override { isVisited<FalseQueryNode>() = true; } + void visit(FuzzyTerm &) override { isVisited<FuzzyTerm>() = true; } }; template <class T> @@ -85,6 +86,7 @@ TEST("requireThatAllNodesCanBeVisited") { checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25)); checkVisit<TrueQueryNode>(new SimpleTrue()); checkVisit<FalseQueryNode>(new SimpleFalse()); + checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0))); } } // namespace diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp index 93cfad27742..2ea566027c4 100644 --- a/searchlib/src/tests/query/querybuilder_test.cpp +++ b/searchlib/src/tests/query/querybuilder_test.cpp @@ -47,7 +47,7 @@ PredicateQueryTerm::UP getPredicateQueryTerm() { template <class NodeTypes> Node::UP createQueryTree() { QueryBuilder<NodeTypes> builder; - builder.addAnd(12); + builder.addAnd(13); { builder.addRank(2); { @@ -115,6 +115,7 @@ Node::UP createQueryTree() { builder.add_true_node(); builder.add_false_node(); } + builder.addFuzzyTerm(str[5], view[5], id[5], weight[5]); } Node::UP node = builder.build(); ASSERT_TRUE(node.get()); @@ -179,10 +180,11 @@ void checkQueryTreeTypes(Node *node) { typedef typename NodeTypes::RegExpTerm RegExpTerm; typedef typename NodeTypes::TrueQueryNode TrueNode; typedef typename NodeTypes::FalseQueryNode FalseNode; + typedef typename NodeTypes::FuzzyTerm FuzzyTerm; ASSERT_TRUE(node); auto* and_node = as_node<And>(node); - EXPECT_EQUAL(12u, and_node->getChildren().size()); + EXPECT_EQUAL(13u, and_node->getChildren().size()); auto* rank = as_node<Rank>(and_node->getChildren()[0]); EXPECT_EQUAL(2u, rank->getChildren().size()); @@ -306,6 +308,9 @@ void checkQueryTreeTypes(Node *node) { auto* false_node = as_node<FalseNode>(and_not->getChildren()[1]); EXPECT_TRUE(true_node); EXPECT_TRUE(false_node); + + auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]); + EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5])); } struct AbstractTypes { @@ -332,6 +337,7 @@ struct AbstractTypes { typedef search::query::RegExpTerm RegExpTerm; typedef search::query::TrueQueryNode TrueQueryNode; typedef search::query::FalseQueryNode FalseQueryNode; + typedef search::query::FuzzyTerm FuzzyTerm; }; // Builds a tree with simplequery and checks that the results have the @@ -427,6 +433,11 @@ struct MyNearestNeighborTerm : NearestNeighborTerm { }; struct MyTrue : TrueQueryNode {}; struct MyFalse : FalseQueryNode {}; +struct MyFuzzyTerm : FuzzyTerm { + MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w) + : FuzzyTerm(t, f, i, w) { + } +}; struct MyQueryNodeTypes { typedef MyAnd And; @@ -454,6 +465,7 @@ struct MyQueryNodeTypes { typedef MyNearestNeighborTerm NearestNeighborTerm; typedef MyTrue TrueQueryNode; typedef MyFalse FalseQueryNode; + typedef MyFuzzyTerm FuzzyTerm; }; TEST("require that Custom Query Trees Can Be Built") { diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index 906400f50a5..f14966dbfc8 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -779,6 +779,8 @@ public: n.get_distance_threshold(), getRequestContext().get_attribute_blueprint_params().nearest_neighbor_brute_force_limit)); } + + void visit(query::FuzzyTerm &n) override { visitTerm(n); } }; template <typename WS> diff --git a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp index ca440c2a249..e28b576319f 100644 --- a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp @@ -125,6 +125,10 @@ StringTemplSearchContext(QueryTermSimpleUP qTerm, const AttrType & toBeSearched) vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); lookupRange(comp, comp); + } else if (this->isFuzzy()) { + vespalib::string prefix(this->getFuzzy().getPrefix()); + auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); + lookupRange(comp, comp); } else { auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm()); lookupTerm(comp); diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 97c2c7d2b63..a1d79c6131b 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -282,6 +282,10 @@ StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const Att vespalib::string prefix(RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = _enumStore.make_folded_comparator_prefix(prefix.c_str()); this->lookupRange(comp, comp); + } else if (this->isFuzzy()) { + vespalib::string prefix(this->getFuzzy().getPrefix()); + auto comp = _enumStore.make_folded_comparator_prefix(prefix.c_str()); + this->lookupRange(comp, comp); } else { auto comp = _enumStore.make_folded_comparator(this->queryTerm()->getTerm()); this->lookupTerm(comp); @@ -301,6 +305,8 @@ StringPostingSearchContext<BaseSC, AttrT, DataT>::useThis(const PostingListSearc : false; } else if ( this->isCased() ) { return this->isMatch(_enumStore.get_value(it.getKey().load_acquire())); + } else if (this->isFuzzy()) { + return this->getFuzzy().isMatch(_enumStore.get_value(it.getKey().load_acquire())); } return true; } diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp index 730ad1107a7..a6feadac724 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp @@ -61,6 +61,10 @@ SingleValueStringAttributeT<B>::StringTemplSearchContext::StringTemplSearchConte vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); lookupRange(comp, comp); + } else if (this->isFuzzy()) { + vespalib::string prefix(this->getFuzzy().getPrefix()); + auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str()); + lookupRange(comp, comp); } else { auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm()); lookupTerm(comp); diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp index 6062c4f2096..9cccce4b19d 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp @@ -18,11 +18,13 @@ namespace search { StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) : _regex(), + _fuzzy(), _term(), _termLen(), _isPrefix(term.isPrefix()), _isRegex(term.isRegex()), - _isCased(cased) + _isCased(cased), + _isFuzzy(term.isFuzzy()) { if (isRegex()) { if (isCased()) { @@ -33,6 +35,8 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) } else if (isCased()) { _term._char = term.getTerm(); _termLen = term.getTermLen(); + } else if (isFuzzy()) { + _fuzzy = vespalib::Fuzzy::from_term(term.getTerm()); } else { term.term(_term._ucs4); } @@ -54,6 +58,9 @@ StringSearchHelper::isMatch(const char *src) const { int res = strncmp(_term._char, src, _termLen); return (res == 0) && (src[_termLen] == 0 || isPrefix()); } + if (__builtin_expect(isFuzzy(), false)) { + return getFuzzy().isMatch(src); + } vespalib::Utf8ReaderForZTS u8reader(src); uint32_t j = 0; uint32_t val; diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h index 495427d3e45..175f56f8b45 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.h +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h @@ -10,6 +10,7 @@ #include <vespa/vespalib/regex/regex.h> #include <vespa/vespalib/text/lowercase.h> #include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/fuzzy/fuzzy.h> #include <optional> namespace search { @@ -26,9 +27,12 @@ public: bool isPrefix() const { return _isPrefix; } bool isRegex() const { return _isRegex; } bool isCased() const { return _isCased; } + bool isFuzzy() const { return _isFuzzy; } const vespalib::Regex & getRegex() const { return _regex; } + const vespalib::Fuzzy & getFuzzy() const { return _fuzzy; } private: vespalib::Regex _regex; + vespalib::Fuzzy _fuzzy; union { const ucs4_t *_ucs4; const char *_char; @@ -37,6 +41,7 @@ private: bool _isPrefix; bool _isRegex; bool _isCased; + bool _isFuzzy; }; class ReaderBase; @@ -126,7 +131,9 @@ protected: bool isPrefix() const { return _helper.isPrefix(); } bool isRegex() const { return _helper.isRegex(); } bool isCased() const { return _helper.isCased(); } + bool isFuzzy() const { return _helper.isFuzzy(); } const vespalib::Regex & getRegex() const { return _helper.getRegex(); } + const vespalib::Fuzzy & getFuzzy() const { return _helper.getFuzzy(); } class CollectHitCount { public: diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp index 7cde1102bc1..eb8054317dc 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp @@ -409,6 +409,7 @@ public: void visit(RegExpTerm &n) override { visitTerm(n); } void visit(PredicateQuery &n) override { not_supported(n); } void visit(NearestNeighborTerm &n) override { not_supported(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } }; Blueprint::UP diff --git a/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp index 330320d5047..f8ad85859fa 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp @@ -28,6 +28,7 @@ using index::IFieldLengthInspector; using index::IndexBuilder; using index::Schema; using index::SchemaUtil; +using query::FuzzyTerm; using query::LocationTerm; using query::NearestNeighborTerm; using query::Node; @@ -168,6 +169,7 @@ public: void visit(SubstringTerm &n) override { visitTerm(n); } void visit(SuffixTerm &n) override { visitTerm(n); } void visit(RegExpTerm &n) override { visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } void visit(PredicateQuery &n) override { not_supported(n); } void visit(NearestNeighborTerm &n) override { not_supported(n); } diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h index 34ea692c370..0d665d1f04d 100644 --- a/searchlib/src/vespa/searchlib/parsequery/parse.h +++ b/searchlib/src/vespa/searchlib/parsequery/parse.h @@ -56,8 +56,9 @@ public: ITEM_GEO_LOCATION_TERM = 27, ITEM_TRUE = 28, ITEM_FALSE = 29, - ITEM_MAX = 30, // Indicates how long tables must be. - ITEM_UNDEF = 31, + ITEM_FUZZY = 30, + ITEM_MAX = 31, // Indicates how long tables must be. + ITEM_UNDEF = 32, }; /** A tag identifying the origin of this query node. diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp index aa13c93810a..85b55284b35 100644 --- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp +++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp @@ -170,6 +170,7 @@ bool SimpleQueryStackDumpIterator::readNext() { case ParseItem::ITEM_EXACTSTRINGTERM: case ParseItem::ITEM_SUFFIXTERM: case ParseItem::ITEM_REGEXP: + case ParseItem::ITEM_FUZZY: _curr_index_name = read_stringref(p); _curr_term = read_stringref(p); _currArity = 0; diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index 433ab7d56dd..0d5dd116826 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -22,7 +22,8 @@ public: EXACTSTRINGTERM = 3, SUFFIXTERM = 4, REGEXP = 5, - GEO_LOCATION = 6 + GEO_LOCATION = 6, + FUZZYTERM = 7 }; template <typename N> @@ -61,6 +62,7 @@ public: bool isWord() const { return (_type == Type::WORD); } bool isRegex() const { return (_type == Type::REGEXP); } bool isGeoLoc() const { return (_type == Type::GEO_LOCATION); } + bool isFuzzy() const { return (_type == Type::FUZZYTERM); } bool empty() const { return _term.empty(); } virtual void visitMembers(vespalib::ObjectVisitor &visitor) const; vespalib::string getClassName() const; diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 77fc97913a4..6f126c7a3eb 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -86,6 +86,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_SUFFIXTERM: case ParseItem::ITEM_PURE_WEIGHTED_STRING: case ParseItem::ITEM_PURE_WEIGHTED_LONG: + case ParseItem::ITEM_FUZZY: { vespalib::string index = queryRep.getIndexName(); if (index.empty()) { @@ -116,6 +117,9 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_SUFFIXTERM: sTerm = TermType::SUFFIXTERM; break; + case ParseItem::ITEM_FUZZY: + sTerm = TermType::FUZZYTERM; + break; default: break; } diff --git a/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h b/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h index 9f29c34aa05..abc48db9d87 100644 --- a/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h +++ b/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h @@ -52,6 +52,7 @@ public: virtual void visit(typename NodeTypes::NearestNeighborTerm &) = 0; virtual void visit(typename NodeTypes::TrueQueryNode &) = 0; virtual void visit(typename NodeTypes::FalseQueryNode &) = 0; + virtual void visit(typename NodeTypes::FuzzyTerm &) = 0; private: // Route QueryVisit requests to the correct custom type. @@ -81,6 +82,7 @@ private: typedef typename NodeTypes::NearestNeighborTerm TNearestNeighborTerm; typedef typename NodeTypes::TrueQueryNode TTrueQueryNode; typedef typename NodeTypes::FalseQueryNode TFalseQueryNode; + typedef typename NodeTypes::FuzzyTerm TFuzzyTerm; void visit(And &n) override { visit(static_cast<TAnd&>(n)); } void visit(AndNot &n) override { visit(static_cast<TAndNot&>(n)); } @@ -107,6 +109,7 @@ private: void visit(NearestNeighborTerm &n) override { visit(static_cast<TNearestNeighborTerm&>(n)); } void visit(TrueQueryNode &n) override { visit(static_cast<TTrueQueryNode&>(n)); } void visit(FalseQueryNode &n) override { visit(static_cast<TFalseQueryNode&>(n)); } + void visit(FuzzyTerm &n) override { visit(static_cast<TFuzzyTerm &>(n)); } }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h index 9631e2afded..ee3a944cce1 100644 --- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h +++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h @@ -220,6 +220,12 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st target_num_hits, allow_approximate, explore_additional_hits, distance_threshold); } +template <class NodeTypes> +typename NodeTypes::FuzzyTerm * +createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight) { + return new typename NodeTypes::FuzzyTerm(term, view, id, weight); +} + template <class NodeTypes> class QueryBuilder : public QueryBuilderBase { @@ -327,6 +333,10 @@ public: adjustWeight(weight); return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight)); } + typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight) { + adjustWeight(weight); + return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight)); + } typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name, int32_t id, Weight weight, uint32_t target_num_hits, bool allow_approximate, uint32_t explore_additional_hits, diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h index 3fb72f93b23..ecaee350b21 100644 --- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h +++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h @@ -193,6 +193,12 @@ private: void visit(FalseQueryNode &) override { _builder.add_false_node(); } + + void visit(FuzzyTerm &node) override { + replicate(node, _builder.addFuzzyTerm( + node.getTerm(), node.getView(), + node.getId(), node.getWeight())); + } }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h b/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h index 02887975085..90faa25bd99 100644 --- a/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h +++ b/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h @@ -29,6 +29,7 @@ class SameElement; class NearestNeighborTerm; class TrueQueryNode; class FalseQueryNode; +class FuzzyTerm; struct QueryVisitor { virtual ~QueryVisitor() {} @@ -58,6 +59,7 @@ struct QueryVisitor { virtual void visit(NearestNeighborTerm &) = 0; virtual void visit(TrueQueryNode &) = 0; virtual void visit(FalseQueryNode &) = 0; + virtual void visit(FuzzyTerm &) = 0; }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp b/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp index cad97279b4c..e3cad4ed33a 100644 --- a/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp @@ -52,4 +52,6 @@ SimpleRegExpTerm::~SimpleRegExpTerm() = default; SimpleNearestNeighborTerm::~SimpleNearestNeighborTerm() = default; +SimpleFuzzyTerm::~SimpleFuzzyTerm() = default; + } diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h index 5047e072cb7..00dad2597ce 100644 --- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h +++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h @@ -152,7 +152,13 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm { {} ~SimpleNearestNeighborTerm() override; }; - +struct SimpleFuzzyTerm : FuzzyTerm { + SimpleFuzzyTerm(const Type &term, vespalib::stringref view, + int32_t id, Weight weight) + : FuzzyTerm(term, view, id, weight) { + } + ~SimpleFuzzyTerm() override; +}; struct SimpleQueryNodeTypes { using And = SimpleAnd; @@ -180,6 +186,7 @@ struct SimpleQueryNodeTypes { using PredicateQuery = SimplePredicateQuery; using RegExpTerm = SimpleRegExpTerm; using NearestNeighborTerm = SimpleNearestNeighborTerm; + using FuzzyTerm = SimpleFuzzyTerm; }; } diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp index d45a72d316a..f36410d1845 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp @@ -278,6 +278,10 @@ class QueryNodeConverter : public QueryVisitor { createTerm(node, ParseItem::ITEM_REGEXP); } + void visit(FuzzyTerm &node) override { + createTerm(node, ParseItem::ITEM_FUZZY); + } + void visit(NearestNeighborTerm &node) override { createTermNode(node, ParseItem::ITEM_NEAREST_NEIGHBOR); appendString(node.get_query_tensor_name()); diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h index 5a6f315205e..a5f3be3e618 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h @@ -197,6 +197,8 @@ private: t = &builder.addPredicateQuery(queryStack.getPredicateQueryTerm(), view, id, weight); } else if (type == ParseItem::ITEM_REGEXP) { t = &builder.addRegExpTerm(term, view, id, weight); + } else if (type == ParseItem::ITEM_FUZZY) { + t = &builder.addFuzzyTerm(term, view, id, weight); } else { vespalib::Issue::report("query builder: Unable to create query tree from stack dump. node type = %d.", type); } diff --git a/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h b/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h index fc3570f44d8..a6eae257afd 100644 --- a/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h +++ b/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h @@ -32,6 +32,7 @@ class TemplateTermVisitor : public CustomTypeTermVisitor<NodeTypes> { void visit(typename NodeTypes::PredicateQuery &n) override { myVisit(n); } void visit(typename NodeTypes::RegExpTerm &n) override { myVisit(n); } void visit(typename NodeTypes::NearestNeighborTerm &n) override { myVisit(n); } + void visit(typename NodeTypes::FuzzyTerm &n) override { myVisit(n); } // Phrases are terms with children. This visitor will not visit // the phrase's children, unless this member function is diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp b/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp index dcf0533ff7a..6e889e76f21 100644 --- a/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp @@ -24,6 +24,7 @@ RegExpTerm::~RegExpTerm() = default; WeightedSetTerm::~WeightedSetTerm() = default; DotProduct::~DotProduct() = default; WandTerm::~WandTerm() = default; +FuzzyTerm::~FuzzyTerm() = default; namespace { diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h index a728b674999..7aa867e25ed 100644 --- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h +++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h @@ -115,6 +115,18 @@ public: virtual ~RegExpTerm() = 0; }; +//----------------------------------------------------------------------------- + +class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> +{ +public: + FuzzyTerm(const Type &term, vespalib::stringref view, + int32_t id, Weight weight) + : QueryNodeMixinType(term, view, id, weight) + {} + virtual ~FuzzyTerm() = 0; +}; + /** * Term matching the K nearest neighbors in a multi-dimensional vector space. * diff --git a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h index 86cde64a197..30c7e1722fb 100644 --- a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h +++ b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h @@ -73,6 +73,7 @@ public: void visit(query::SuffixTerm &n) override = 0; void visit(query::RegExpTerm &n) override = 0; void visit(query::NearestNeighborTerm &n) override = 0; + void visit(query::FuzzyTerm &n) override = 0; void visit(query::TrueQueryNode &) final override; void visit(query::FalseQueryNode &) final override; diff --git a/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp b/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp index 519f6e81774..614f219cbcb 100644 --- a/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp @@ -6,6 +6,7 @@ #include "create_blueprint_visitor_helper.h" #include <vespa/vespalib/objects/visit.h> +using search::query::FuzzyTerm; using search::query::LocationTerm; using search::query::NearestNeighborTerm; using search::query::Node; @@ -66,6 +67,7 @@ public: void visit(PredicateQuery &n) override { visitTerm(n); } void visit(RegExpTerm &n) override { visitTerm(n); } void visit(NearestNeighborTerm &n) override { visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } }; template <class Map> diff --git a/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp b/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp index 08c0280ee68..63bf16e6016 100644 --- a/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp @@ -17,6 +17,7 @@ using search::query::AndNot; using search::query::DotProduct; using search::query::Equiv; using search::query::FalseQueryNode; +using search::query::FuzzyTerm; using search::query::LocationTerm; using search::query::Near; using search::query::NearestNeighborTerm; @@ -105,6 +106,7 @@ struct TermAsStringVisitor : public QueryVisitor { void visit(SubstringTerm &n) override {visitTerm(n); } void visit(SuffixTerm &n) override {visitTerm(n); } void visit(RegExpTerm &n) override {visitTerm(n); } + void visit(FuzzyTerm &n) override { visitTerm(n); } void visit(PredicateQuery &) override {illegalVisit(); } void visit(NearestNeighborTerm &) override { illegalVisit(); } void visit(TrueQueryNode &) override { illegalVisit(); } diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index 29a0c1e7bb8..c3c7fee2cef 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -62,6 +62,7 @@ vespa_define_module( src/tests/explore_modern_cpp src/tests/false src/tests/fiddle + src/tests/fuzzy src/tests/gencnt src/tests/guard src/tests/host_name @@ -169,6 +170,7 @@ vespa_define_module( src/vespa/vespalib/data src/vespa/vespalib/data/slime src/vespa/vespalib/datastore + src/vespa/vespalib/fuzzy src/vespa/vespalib/geo src/vespa/vespalib/hwaccelrated src/vespa/vespalib/io diff --git a/vespalib/src/tests/fuzzy/.gitignore b/vespalib/src/tests/fuzzy/.gitignore new file mode 100644 index 00000000000..cd5ca9b3b88 --- /dev/null +++ b/vespalib/src/tests/fuzzy/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +fuzzy_test +vespalib_fuzzy_test_app diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt new file mode 100644 index 00000000000..f58602296a5 --- /dev/null +++ b/vespalib/src/tests/fuzzy/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_fuzzy_test_app TEST + SOURCES + fuzzy.cpp + DEPENDS + vespalib + ) +vespa_add_test(NAME vespalib_fuzzy_test_app COMMAND vespalib_fuzzy_test_app) diff --git a/vespalib/src/tests/fuzzy/fuzzy.cpp b/vespalib/src/tests/fuzzy/fuzzy.cpp new file mode 100644 index 00000000000..9ffb77b3742 --- /dev/null +++ b/vespalib/src/tests/fuzzy/fuzzy.cpp @@ -0,0 +1,33 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/vespalib/fuzzy/fuzzy.h> + +using namespace vespalib; + + +TEST("require that levenstein distance works") { + EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "abc", 2).value()); + EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "ABC", 2).value()); + EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("abc", "abd", 2).value()); + EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("ABC", "abd", 2).value()); + EXPECT_EQUAL(2u, Fuzzy::levenstein_distance("ABC", "add", 2).value()); + EXPECT_FALSE(Fuzzy::levenstein_distance("ABC", "ddd", 2).has_value()); +} + +TEST("require that extracting of a prefix works") { + Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 2, 2); + EXPECT_EQUAL("pr", fuzzy.getPrefix()); +} + +TEST("require that empty prefix works") { + Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 0, 2); + EXPECT_EQUAL("", fuzzy.getPrefix()); +} + +TEST("require that longer prefix size works") { + Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 100, 2); + EXPECT_EQUAL("prefix", fuzzy.getPrefix()); +} + + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt index dea0f509bad..1fffddf2f2f 100644 --- a/vespalib/src/vespa/vespalib/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(vespalib $<TARGET_OBJECTS:vespalib_vespalib_data> $<TARGET_OBJECTS:vespalib_vespalib_data_slime> $<TARGET_OBJECTS:vespalib_vespalib_datastore> + $<TARGET_OBJECTS:vespalib_vespalib_fuzzy> $<TARGET_OBJECTS:vespalib_vespalib_geo> $<TARGET_OBJECTS:vespalib_vespalib_hwaccelrated> $<TARGET_OBJECTS:vespalib_vespalib_io> diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt new file mode 100644 index 00000000000..0c6c8f5c446 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(vespalib_vespalib_fuzzy OBJECT + SOURCES + fuzzy.cpp + DEPENDS + ) + diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp new file mode 100644 index 00000000000..55d11b74123 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Levenstein distance algorithm is based off Java implementation from apache commons-text library + */ + +#include "fuzzy.h" + +#include <limits> + +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/text/lowercase.h> + +vespalib::Fuzzy vespalib::Fuzzy::from_term(std::string_view term) { + return Fuzzy(folded_codepoints(term)); +} + +std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(const char* src, size_t srcSize) { + std::vector<uint32_t> result; + result.reserve(srcSize); + Utf8ReaderForZTS srcReader(src); + while (srcReader.hasMore()) { + result.emplace_back(LowerCase::convert(srcReader.getChar())); + } + return result; +} + +std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(std::string_view src) { + return folded_codepoints(src.data(), src.size()); +} + +std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold) { + std::vector<uint32_t> sourceCodepoints = folded_codepoints(source.data(), source.size()); + std::vector<uint32_t> targetCodepoints = folded_codepoints(target.data(), target.size()); + return levenstein_distance(sourceCodepoints, targetCodepoints, threshold); +} + +std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) { + uint32_t n = left.size(); + uint32_t m = right.size(); + + if (n > m) { + return levenstein_distance(right, left, threshold); + } + + // if one string is empty, the edit distance is necessarily the length + // of the other + if (n == 0) { + return m <= threshold ? std::optional(m) : std::nullopt; + } + if (m == 0) { + return n <= threshold ? std::optional(n) : std::nullopt; + } + + + // the edit distance cannot be less than the length difference + if (m - n > threshold) { + return std::nullopt; + } + + std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally + std::vector<uint32_t> d(n+1); // cost array, horizontally + + const uint32_t boundary = std::min(n, threshold) + 1; + + for (uint32_t i = 0; i < boundary; ++i) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + for (uint32_t i = boundary; i < p.size(); ++i) { + p[i] = std::numeric_limits<uint32_t>::max(); + } + for (uint32_t i = 0; i < d.size(); ++i) { + d[i] = std::numeric_limits<uint32_t>::max(); + } + + // iterates through t + for (uint32_t j = 1; j <= m; ++j) { + uint32_t rightJ = right[j - 1]; // jth character of right + d[0] = j; + + int32_t min = std::max(1, static_cast<int32_t>(j) - static_cast<int32_t>(threshold)); + + uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ? + n : std::min(n, j + threshold); + + // ignore entry left of leftmost + // printf("j = %d, min = %d, max = %d, n = %d\n", j, min, max, n); + if (min > 1) { + d[min - 1] = std::numeric_limits<uint32_t>::max(); + } + + uint32_t lowerBound = std::numeric_limits<uint32_t>::max(); + + for (uint32_t i = min; i <= max; ++i) { + if (left[i - 1] == rightJ) { + // diagonally left and up + d[i] = p[i - 1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally + // left and up + + d[i] = 1 + std::min(std::min(d[i - 1], p[i]), p[i - 1]); + } + lowerBound = std::min(lowerBound, d[i]); + } + if (lowerBound > threshold) { + return std::nullopt; + } + std::swap(p, d); + } + if (p[n] <= threshold) { + return {p[n]}; + } + return std::nullopt; +} + +bool vespalib::Fuzzy::isMatch(std::string_view src) const { + std::vector<uint32_t> srcCodepoints = folded_codepoints(src); + return levenstein_distance(_folded_term_codepoints, srcCodepoints, _edit_distance).has_value(); +} + +vespalib::string vespalib::Fuzzy::getPrefix() const { + vespalib::string prefix; + Utf8Writer writer(prefix); + for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) { + writer.putChar(_folded_term_codepoints.at(i)); + } + return prefix; +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h new file mode 100644 index 00000000000..775c25445e3 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h @@ -0,0 +1,60 @@ +#pragma once + +#include <string_view> +#include <utility> +#include <vector> +#include <optional> + +#include <vespa/vespalib/stllike/string.h> + +namespace vespalib { + +class Fuzzy { +private: + static constexpr uint8_t DefaultPrefixSize = 0u; + static constexpr uint8_t DefaultEditDistance = 2u; + + std::vector<uint32_t> _folded_term_codepoints; + + uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e non-fuzzy + uint8_t _edit_distance; // max edit distance + + +public: + Fuzzy(): + _folded_term_codepoints(), + _prefix_size(DefaultPrefixSize), + _edit_distance(DefaultEditDistance) + {} + + Fuzzy(std::vector<uint32_t> codepoints): + _folded_term_codepoints(std::move(codepoints)), + _prefix_size(DefaultPrefixSize), + _edit_distance(DefaultEditDistance) + {} + + Fuzzy(std::vector<uint32_t> codepoints, uint8_t prefix_size, uint8_t edit_distance): + _folded_term_codepoints(std::move(codepoints)), + _prefix_size(prefix_size), + _edit_distance(edit_distance) + {} + + [[nodiscard]] bool isMatch(std::string_view src) const; + + [[nodiscard]] vespalib::string getPrefix() const; + + /// + + static Fuzzy from_term(std::string_view term); + + static std::optional<uint32_t> levenstein_distance(const std::vector<uint32_t>& source, const std::vector<uint32_t>& target, uint32_t threshold); + + static std::optional<uint32_t> levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold); + + static std::vector<uint32_t> folded_codepoints(const char* src, size_t srcSize); + + static std::vector<uint32_t> folded_codepoints(std::string_view src); + +}; + +} |