summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2022-03-24 10:47:38 +0100
committerGitHub <noreply@github.com>2022-03-24 10:47:38 +0100
commitb5ed37b45465f0f7a63cd7bbae8e8487208adf2f (patch)
tree66ef9e3a8b9b7153c549ac583777d72272cbc0fd
parente1e8e116f74c3b32faf5d901a07bbe14c67d1249 (diff)
parent8b2e9391720de4a568f9594d129a42c32a275c42 (diff)
Merge pull request #21689 from alexeyche/fuzzy-attribute-search
Introducing fuzzy search [WIP]
-rw-r--r--container-search/abi-spec.json25
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java107
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/Item.java3
-rw-r--r--container-search/src/main/java/com/yahoo/search/query/SelectParser.java13
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java29
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/YqlParser.java18
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java6
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java10
-rw-r--r--container-search/src/test/java/com/yahoo/select/SelectTestCase.java10
-rw-r--r--docproc/abi-spec.json4
-rw-r--r--documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp3
-rw-r--r--searchcore/src/tests/proton/matching/query_test.cpp2
-rw-r--r--searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp1
-rw-r--r--searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp1
-rw-r--r--searchcore/src/vespa/searchcore/proton/matching/querynodes.h2
-rw-r--r--searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp1
-rw-r--r--searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp1
-rw-r--r--searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp1
-rw-r--r--searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp1
-rw-r--r--searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp72
-rw-r--r--searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp20
-rw-r--r--searchlib/src/tests/query/customtypevisitor_test.cpp4
-rw-r--r--searchlib/src/tests/query/query_visitor_test.cpp2
-rw-r--r--searchlib/src/tests/query/querybuilder_test.cpp16
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h6
-rw-r--r--searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/stringbase.cpp9
-rw-r--r--searchlib/src/vespa/searchlib/attribute/stringbase.h7
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/diskindex.cpp1
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/parse.h5
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp1
-rw-r--r--searchlib/src/vespa/searchlib/query/query_term_simple.h4
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/querynode.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h3
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/querybuilder.h10
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/queryreplicator.h6
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/queryvisitor.h2
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/simplequery.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/simplequery.h9
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h2
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h1
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/termnodes.cpp1
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/termnodes.h12
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h1
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/termasstring.cpp2
-rw-r--r--searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp1
-rw-r--r--vespalib/CMakeLists.txt2
-rw-r--r--vespalib/src/tests/fuzzy/.gitignore4
-rw-r--r--vespalib/src/tests/fuzzy/CMakeLists.txt18
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp42
-rw-r--r--vespalib/src/tests/fuzzy/levenstein_distance_test.cpp39
-rw-r--r--vespalib/src/vespa/vespalib/CMakeLists.txt1
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt8
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp29
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h59
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp86
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h25
62 files changed, 757 insertions, 15 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json
index b7aa1a8d0ef..6249988a5ee 100644
--- a/container-search/abi-spec.json
+++ b/container-search/abi-spec.json
@@ -541,6 +541,30 @@
],
"fields": []
},
+ "com.yahoo.prelude.query.FuzzyItem": {
+ "superClass": "com.yahoo.prelude.query.TermItem",
+ "interfaces": [],
+ "attributes": [
+ "public"
+ ],
+ "methods": [
+ "public void <init>(java.lang.String, boolean, java.lang.String)",
+ "public void setValue(java.lang.String)",
+ "public java.lang.String getRawWord()",
+ "public boolean isWords()",
+ "public com.yahoo.prelude.query.Item$ItemType getItemType()",
+ "public java.lang.String getName()",
+ "public java.lang.String stringValue()",
+ "public boolean isStemmed()",
+ "public java.lang.String getIndexedString()",
+ "public int getNumWords()",
+ "public boolean equals(java.lang.Object)",
+ "public int hashCode()",
+ "public java.lang.String toString()",
+ "protected void encodeThis(java.nio.ByteBuffer)"
+ ],
+ "fields": []
+ },
"com.yahoo.prelude.query.GeoLocationItem": {
"superClass": "com.yahoo.prelude.query.TermItem",
"interfaces": [],
@@ -742,6 +766,7 @@
"public static final enum com.yahoo.prelude.query.Item$ItemType GEO_LOCATION_TERM",
"public static final enum com.yahoo.prelude.query.Item$ItemType TRUE",
"public static final enum com.yahoo.prelude.query.Item$ItemType FALSE",
+ "public static final enum com.yahoo.prelude.query.Item$ItemType FUZZY",
"public final int code"
]
},
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
new file mode 100644
index 00000000000..74c31a5d1f0
--- /dev/null
+++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
@@ -0,0 +1,107 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.prelude.query;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Fuzzy search term
+ *
+ * @author alexeyche
+ */
+public class FuzzyItem extends TermItem {
+ private String term;
+
+ public FuzzyItem(String indexName, boolean isFromQuery, String term) {
+ super(indexName, isFromQuery, null);
+ setValue(term);
+ }
+
+ @Override
+ public void setValue(String value) {
+ this.term = value;
+ }
+
+ @Override
+ public String getRawWord() {
+ return stringValue();
+ }
+
+ @Override
+ public boolean isWords() {
+ return false;
+ }
+
+ @Override
+ public ItemType getItemType() {
+ return ItemType.FUZZY;
+ }
+
+ @Override
+ public String getName() {
+ return "FUZZY";
+ }
+
+ @Override
+ public String stringValue() {
+ return term;
+ }
+
+ @Override
+ public boolean isStemmed() {
+ return false;
+ }
+
+ @Override
+ public String getIndexedString() {
+ return stringValue();
+ }
+
+ @Override
+ public int getNumWords() {
+ return 1;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!super.equals(obj)) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ FuzzyItem other = (FuzzyItem) obj;
+ if (term == null) {
+ if (other.term != null) {
+ return false;
+ }
+ } else if (!term.equals(other.term)) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + ((term == null) ? 0 : term.hashCode());
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append("FuzzyItem [term=").append(term).append("]");
+ return builder.toString();
+ }
+
+ @Override
+ protected void encodeThis(ByteBuffer buffer) {
+ super.encodeThis(buffer);
+ putString(getIndexedString(), buffer);
+ }
+}
+
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
index 2e0c3cf8593..02b208b6ce1 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
@@ -56,7 +56,8 @@ public abstract class Item implements Cloneable {
NEAREST_NEIGHBOR(26),
GEO_LOCATION_TERM(27),
TRUE(28),
- FALSE(29);
+ FALSE(29),
+ FUZZY(30);
public final int code;
diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
index 1805a11ff5e..320148ec01d 100644
--- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
+++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
@@ -16,6 +16,7 @@ import com.yahoo.prelude.query.CompositeItem;
import com.yahoo.prelude.query.DotProductItem;
import com.yahoo.prelude.query.EquivItem;
import com.yahoo.prelude.query.ExactStringItem;
+import com.yahoo.prelude.query.FuzzyItem;
import com.yahoo.prelude.query.IntItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.Limit;
@@ -82,6 +83,7 @@ import static com.yahoo.search.yql.YqlParser.DISTANCE_THRESHOLD;
import static com.yahoo.search.yql.YqlParser.DOT_PRODUCT;
import static com.yahoo.search.yql.YqlParser.EQUIV;
import static com.yahoo.search.yql.YqlParser.FILTER;
+import static com.yahoo.search.yql.YqlParser.FUZZY;
import static com.yahoo.search.yql.YqlParser.GEO_LOCATION;
import static com.yahoo.search.yql.YqlParser.HIT_LIMIT;
import static com.yahoo.search.yql.YqlParser.HNSW_EXPLORE_ADDITIONAL_HITS;
@@ -926,6 +928,8 @@ public class SelectParser implements Parser {
return instantiateONearItem(field, key, value);
case EQUIV:
return instantiateEquivItem(field, key, value);
+ case FUZZY:
+ return instantiateFuzzyItem(field, key, value);
case ALTERNATIVES:
return instantiateWordAlternativesItem(field, key, value);
default:
@@ -1155,6 +1159,15 @@ public class SelectParser implements Parser {
return leafStyleSettings(getAnnotations(value), equiv);
}
+ private Item instantiateFuzzyItem(String field, String key, Inspector value) {
+ HashMap<Integer, Inspector> children = childMap(value);
+ Preconditions.checkArgument(children.size() == 1, "Expected 1 argument, got %s.", children.size());
+ String wordData = children.get(0).asString();
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData);
+
+ return leafStyleSettings(getAnnotations(value), fuzzy);
+ }
+
private Item instantiateWordAlternativesItem(String field, String key, Inspector value) {
HashMap<Integer, Inspector> children = childMap(value);
Preconditions.checkArgument(children.size() >= 1, "Expected 1 or more arguments, got %s.", children.size());
diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
index cc441eb0c3d..e778798b0e5 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
@@ -16,6 +16,7 @@ import static com.yahoo.search.yql.YqlParser.DOT_PRODUCT;
import static com.yahoo.search.yql.YqlParser.END_ANCHOR;
import static com.yahoo.search.yql.YqlParser.EQUIV;
import static com.yahoo.search.yql.YqlParser.FILTER;
+import static com.yahoo.search.yql.YqlParser.FUZZY;
import static com.yahoo.search.yql.YqlParser.GEO_LOCATION;
import static com.yahoo.search.yql.YqlParser.HIT_LIMIT;
import static com.yahoo.search.yql.YqlParser.IMPLICIT_TRANSFORMS;
@@ -70,6 +71,7 @@ import com.yahoo.prelude.query.BoolItem;
import com.yahoo.prelude.query.DotProductItem;
import com.yahoo.prelude.query.EquivItem;
import com.yahoo.prelude.query.FalseItem;
+import com.yahoo.prelude.query.FuzzyItem;
import com.yahoo.prelude.query.ExactStringItem;
import com.yahoo.prelude.query.IndexedItem;
import com.yahoo.prelude.query.IntItem;
@@ -517,6 +519,32 @@ public class VespaSerializer {
}
}
+ private static class FuzzySerializer extends Serializer<FuzzyItem> {
+
+ @Override
+ void onExit(StringBuilder destination, FuzzyItem item) { }
+
+ @Override
+ boolean serialize(StringBuilder destination, FuzzyItem fuzzy) {
+ String annotations = leafAnnotations(fuzzy);
+ destination.append(normalizeIndexName(fuzzy.getIndexName())).append(" contains ");
+
+ if (annotations.length() > 0) {
+ destination.append('(').append(annotations);
+ }
+
+ destination.append(FUZZY).append('(');
+ destination.append('"');
+ escape(fuzzy.getIndexedString(), destination).append('"');
+ destination.append(')');
+
+ if (annotations.length() > 0) {
+ destination.append(')');
+ }
+ return false;
+ }
+ }
+
private static class ONearSerializer extends Serializer<ONearItem> {
@Override
@@ -1239,6 +1267,7 @@ public class VespaSerializer {
dispatchBuilder.put(WordItem.class, new WordSerializer());
dispatchBuilder.put(RegExpItem.class, new RegExpSerializer());
dispatchBuilder.put(UriItem.class, new UriSerializer());
+ dispatchBuilder.put(FuzzyItem.class, new FuzzySerializer());
dispatch = ImmutableMap.copyOf(dispatchBuilder);
}
diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
index 26508fec3c4..06ee0e706f3 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
@@ -34,6 +34,7 @@ import com.yahoo.prelude.query.CompositeItem;
import com.yahoo.prelude.query.DotProductItem;
import com.yahoo.prelude.query.EquivItem;
import com.yahoo.prelude.query.FalseItem;
+import com.yahoo.prelude.query.FuzzyItem;
import com.yahoo.prelude.query.ExactStringItem;
import com.yahoo.prelude.query.IntItem;
import com.yahoo.prelude.query.Item;
@@ -192,6 +193,7 @@ public class YqlParser implements Parser {
public static final String WEAK_AND = "weakAnd";
public static final String WEIGHT = "weight";
public static final String WEIGHTED_SET = "weightedSet";
+ public static final String FUZZY = "fuzzy";
private final IndexFacts indexFacts;
private final List<ConnectedItem> connectedItems = new ArrayList<>();
@@ -1171,7 +1173,7 @@ public class YqlParser implements Parser {
assertHasOperator(ast, ExpressionOperator.CONTAINS);
String field = getIndex(ast.getArgument(0));
if (userQuery != null && indexFactsSession.getIndex(field).isAttribute()) {
- userQuery.trace("Field '" + field + "' is an attribute, 'contains' will only match exactly", 2);
+ userQuery.trace("Field '" + field + "' is an attribute, 'contains' will only match exactly (unless fuzzy is used)", 2);
}
return instantiateLeafItem(field, ast.<OperatorNode<ExpressionOperator>> getArgument(1));
}
@@ -1298,11 +1300,23 @@ public class YqlParser implements Parser {
return instantiateWordAlternativesItem(field, ast);
case URI:
return instantiateUriItem(field, ast);
+ case FUZZY:
+ return instantiateFuzzyItem(field, ast);
default:
- throw newUnexpectedArgumentException(names.get(0), EQUIV, NEAR, ONEAR, PHRASE, SAME_ELEMENT, URI);
+ throw newUnexpectedArgumentException(names.get(0), EQUIV, NEAR, ONEAR, PHRASE, SAME_ELEMENT, URI, FUZZY);
}
}
+ private Item instantiateFuzzyItem(String field, OperatorNode<ExpressionOperator> ast) {
+ List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1);
+ Preconditions.checkArgument(args.size() == 1, "Expected 1 argument, got %s.", args.size());
+
+ String wordData = getStringContents(args.get(0));
+
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData);
+ return leafStyleSettings(ast, fuzzy);
+ }
+
private Item instantiateEquivItem(String field, OperatorNode<ExpressionOperator> ast) {
List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1);
Preconditions.checkArgument(args.size() >= 2, "Expected 2 or more arguments, got %s.", args.size());
diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
index 7057f996041..1269c2a5aef 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
@@ -443,4 +443,10 @@ public class VespaSerializerTestCase {
+ "alternatives({\"trees\": 1.0, \"tree\": 0.7}))"
+ ")");
}
+
+ @Test
+ public void testFuzzy() {
+ parseAndConfirm("foo contains fuzzy(\"a\")");
+ }
+
}
diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
index 15713dc1f97..f40e212adde 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
@@ -11,6 +11,7 @@ import com.yahoo.prelude.SearchDefinition;
import com.yahoo.prelude.query.AndItem;
import com.yahoo.prelude.query.BoolItem;
import com.yahoo.prelude.query.ExactStringItem;
+import com.yahoo.prelude.query.FuzzyItem;
import com.yahoo.prelude.query.IndexedItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.MarkerWordItem;
@@ -382,6 +383,15 @@ public class YqlParserTestCase {
}
@Test
+ public void testFuzzy() {
+ QueryTree x = parse("select foo from bar where baz contains fuzzy(\"a b\")");
+ Item root = x.getRoot();
+ assertSame(FuzzyItem.class, root.getClass());
+ assertEquals("baz", ((FuzzyItem) root).getIndexName());
+ assertEquals("a b", ((FuzzyItem) root).stringValue());
+ }
+
+ @Test
public void testStemming() {
assertTrue(getRootWord("select foo from bar where baz contains " +
"([ {stem: false} ]\"colors\")").isStemmed());
diff --git a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
index 0dcfb8392ef..9b867be1484 100644
--- a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
+++ b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
@@ -6,6 +6,7 @@ import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.yahoo.prelude.query.AndItem;
import com.yahoo.prelude.query.ExactStringItem;
+import com.yahoo.prelude.query.FuzzyItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.PhraseItem;
import com.yahoo.prelude.query.PrefixItem;
@@ -674,6 +675,15 @@ public class SelectTestCase {
checkWordAlternativesContent(alternatives);
}
+ @Test
+ public void testFuzzy() {
+ QueryTree x = parseWhere("{ \"contains\": [\"description\", { \"fuzzy\": [\"a b\"] }] }");
+ Item root = x.getRoot();
+ assertSame(FuzzyItem.class, root.getClass());
+ assertEquals("description", ((FuzzyItem) root).getIndexName());
+ assertEquals("a b", ((FuzzyItem) root).stringValue());
+ }
+
//------------------------------------------------------------------- grouping tests
@Test
diff --git a/docproc/abi-spec.json b/docproc/abi-spec.json
index e2bbfd8ec10..5ad2c6d454d 100644
--- a/docproc/abi-spec.json
+++ b/docproc/abi-spec.json
@@ -251,10 +251,10 @@
"methods": [
"public void <init>()",
"public abstract com.yahoo.docproc.DocumentProcessor$Progress process(com.yahoo.docproc.Processing)",
- "public java.lang.String toString()",
"public void setFieldMap(java.util.Map)",
"public java.util.Map getFieldMap()",
- "public java.util.Map getDocMap(java.lang.String)"
+ "public java.util.Map getDocMap(java.lang.String)",
+ "public java.lang.String toString()"
],
"fields": []
},
diff --git a/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp b/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp
index 4bcf87c14a6..9c67b3b3ef7 100644
--- a/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp
+++ b/documentapi/src/vespa/documentapi/messagebus/policies/mirror_with_all.cpp
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <mirror_with_all.h>
+#include "mirror_with_all.h"
+
#include <vespa/slobrok/sbmirror.h>
#include <vespa/fnet/frt/supervisor.h>
#include <vespa/fnet/transport.h>
diff --git a/searchcore/src/tests/proton/matching/query_test.cpp b/searchcore/src/tests/proton/matching/query_test.cpp
index a09437be46b..fd3dd0da518 100644
--- a/searchcore/src/tests/proton/matching/query_test.cpp
+++ b/searchcore/src/tests/proton/matching/query_test.cpp
@@ -278,6 +278,7 @@ public:
void visit(ProtonSubstringTerm &n) override { checkNode(n, 0, true); }
void visit(ProtonSuffixTerm &n) override { checkNode(n, 2, false); }
void visit(ProtonPhrase &n) override { checkNode(n, 0, true); }
+ void visit(ProtonFuzzyTerm &n) override { checkNode(n, 1, false); }
void visit(ProtonWeightedSetTerm &) override {}
void visit(ProtonDotProduct &) override {}
void visit(ProtonWandTerm &) override {}
@@ -435,6 +436,7 @@ public:
void visit(ProtonPredicateQuery &) override {}
void visit(ProtonRegExpTerm &) override {}
void visit(ProtonNearestNeighborTerm &) override {}
+ void visit(ProtonFuzzyTerm &) override {}
};
void Test::requireThatTermDataIsFilledIn() {
diff --git a/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp b/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp
index ffd08b3bbef..78486f54704 100644
--- a/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp
+++ b/searchcore/src/tests/proton/matching/unpacking_iterators_optimizer/unpacking_iterators_optimizer_test.cpp
@@ -69,6 +69,7 @@ struct DumpQuery : QueryVisitor {
void visit(NearestNeighborTerm &) override {}
void visit(TrueQueryNode &) override {}
void visit(FalseQueryNode &) override {}
+ void visit(FuzzyTerm &) override {}
};
std::string dump_query(Node &root) {
diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp
index 9be5920c0ed..92d7e385293 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp
@@ -165,6 +165,7 @@ protected:
void visit(ProtonFalse &) override {
_result = std::make_unique<EmptyBlueprint>();
}
+ void visit(ProtonFuzzyTerm &n) override { buildTerm(n); }
public:
BlueprintBuilderVisitor(const IRequestContext & requestContext, ISearchContext &context) :
diff --git a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h
index e7817dcecd2..d8060832d5d 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/querynodes.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/querynodes.h
@@ -137,6 +137,7 @@ typedef ProtonTerm<search::query::WandTerm> ProtonWandTerm;
typedef ProtonTerm<search::query::PredicateQuery> ProtonPredicateQuery;
typedef ProtonTerm<search::query::RegExpTerm> ProtonRegExpTerm;
typedef ProtonTerm<search::query::NearestNeighborTerm> ProtonNearestNeighborTerm;
+typedef ProtonTerm<search::query::FuzzyTerm> ProtonFuzzyTerm;
struct ProtonNodeTypes {
typedef ProtonAnd And;
@@ -164,6 +165,7 @@ struct ProtonNodeTypes {
typedef ProtonNearestNeighborTerm NearestNeighborTerm;
typedef ProtonTrue TrueQueryNode;
typedef ProtonFalse FalseQueryNode;
+ typedef ProtonFuzzyTerm FuzzyTerm;
};
}
diff --git a/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp b/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp
index 34cb5369c1e..574df384a63 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/same_element_builder.cpp
@@ -69,6 +69,7 @@ public:
void visit(ProtonNearestNeighborTerm &) override {}
void visit(ProtonTrue &) override {}
void visit(ProtonFalse &) override {}
+ void visit(ProtonFuzzyTerm &n) override { visitTerm(n); }
};
} // namespace proton::matching::<unnamed>
diff --git a/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp b/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp
index c2abeaa36ee..a32107836ca 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/termdatafromnode.cpp
@@ -45,6 +45,7 @@ struct TermDataFromTermVisitor
void visit(ProtonPredicateQuery &) override { }
void visit(ProtonRegExpTerm &n) override { visitTerm(n); }
void visit(ProtonNearestNeighborTerm &n) override { visitTerm(n); }
+ void visit(ProtonFuzzyTerm &n) override { visitTerm(n); }
};
} // namespace
diff --git a/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp b/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp
index 72a53a1a3d2..72e0153b5c2 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/unpacking_iterators_optimizer.cpp
@@ -59,6 +59,7 @@ struct TermExpander : QueryVisitor {
void visit(NearestNeighborTerm &) override {}
void visit(TrueQueryNode &) override {}
void visit(FalseQueryNode &) override {}
+ void visit(FuzzyTerm &) override {}
void flush(Intermediate &parent) {
for (Node::UP &term: terms) {
diff --git a/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp b/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp
index 5592215f9e2..d69f7d1b0a4 100644
--- a/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp
+++ b/searchcorespi/src/vespa/searchcorespi/index/indexcollection.cpp
@@ -205,6 +205,7 @@ private:
void visit(PredicateQuery &n) override { visitTerm(n); }
void visit(RegExpTerm &n) override { visitTerm(n); }
void visit(NearestNeighborTerm &n) override { visitTerm(n); }
+ void visit(FuzzyTerm &n) override { visitTerm(n); }
public:
CreateBlueprintVisitor(const IIndexCollection &indexes,
diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
index 4f037415b35..65de302ae04 100644
--- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
+++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
@@ -242,6 +242,12 @@ private:
void testPrefixSearch(const AttributePtr & ptr);
void testPrefixSearch();
+ // test fuzzy search
+ void performFuzzySearch(const StringAttribute & vec, const vespalib::string & term,
+ const DocSet & expected, TermType termType);
+ void testFuzzySearch(const AttributePtr & ptr);
+ void testFuzzySearch();
+
// test that search is working after clear doc
template <typename VectorType, typename ValueType>
void requireThatSearchIsWorkingAfterClearDoc(const vespalib::string & name, const Config & cfg,
@@ -402,6 +408,7 @@ SearchContextTest::buildTermQuery(std::vector<char> & buffer, const vespalib::st
switch (termType) {
case TermType::PREFIXTERM: buffer[p++] = ParseItem::ITEM_PREFIXTERM; break;
case TermType::REGEXP: buffer[p++] = ParseItem::ITEM_REGEXP; break;
+ case TermType::FUZZYTERM: buffer[p++] = ParseItem::ITEM_FUZZY; break;
default:
buffer[p++] = ParseItem::ITEM_TERM;
break;
@@ -1498,6 +1505,70 @@ SearchContextTest::testPrefixSearch()
}
}
+//-----------------------------------------------------------------------------
+// Test fuzzy search
+//-----------------------------------------------------------------------------
+
+void
+SearchContextTest::performFuzzySearch(const StringAttribute & vec, const vespalib::string & term,
+ const DocSet & expected, TermType termType)
+{
+ performSearch(vec, term, expected, termType);
+}
+
+void
+SearchContextTest::testFuzzySearch(const AttributePtr & ptr)
+{
+ LOG(info, "testFuzzySearch: vector '%s'", ptr->getName().c_str());
+
+ auto & vec = dynamic_cast<StringAttribute &>(*ptr.get());
+
+ uint32_t numDocs = 2;
+ addDocs(*ptr.get(), numDocs);
+
+ const char * strings [] = {"fuzzysearch", "FUZZYSEARCH"};
+ const char * terms[][2] = {
+ {"fuzzysearch", "FUZZYSEARCH"},
+ {"fuzzysearck", "FUZZYSEARCK"},
+ {"fuzzysekkkk", "FUZZYSEKKKK"}
+ };
+
+ for (uint32_t doc = 1; doc < numDocs + 1; ++doc) {
+ ASSERT_TRUE(doc < vec.getNumDocs());
+ EXPECT_TRUE(vec.update(doc, strings[doc - 1]));
+ }
+
+ ptr->commit(true);
+
+ std::vector<DocSet> expected;
+ DocSet empty;
+ {
+ uint32_t docs[] = {1, 2};
+ expected.emplace_back(docs, docs + 2); // normal search
+ }
+ {
+ uint32_t docs[] = {1, 2};
+ expected.emplace_back(docs, docs + 2); // fuzzy search
+ }
+
+ expected.emplace_back(); // results
+
+ for (uint32_t i = 0; i < 3; ++i) {
+ for (uint32_t j = 0; j < 2; ++j) {
+ performFuzzySearch(vec, terms[i][j], expected[i], TermType::FUZZYTERM);
+ }
+ }
+}
+
+void
+SearchContextTest::testFuzzySearch()
+{
+ for (const auto & cfg : _stringCfg) {
+ testFuzzySearch(AttributeFactory::createAttribute(cfg.first, cfg.second));
+ }
+}
+
+
template <typename VectorType, typename ValueType>
void
SearchContextTest::requireThatSearchIsWorkingAfterClearDoc(const vespalib::string & name,
@@ -2028,6 +2099,7 @@ SearchContextTest::Main()
testPrefixSearch();
testSearchIteratorConformance();
testSearchIteratorUnpacking();
+ testFuzzySearch();
TEST_DO(requireThatSearchIsWorkingAfterClearDoc());
TEST_DO(requireThatSearchIsWorkingAfterLoadAndClearDoc());
TEST_DO(requireThatSearchIsWorkingAfterUpdates());
diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
index 8a17114057c..d8619386892 100644
--- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
+++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
@@ -386,8 +386,8 @@ testSingleValue(Attribute & svsa, Config &cfg)
TEST("testSingleValue")
{
EXPECT_EQUAL(24u, sizeof(AttributeVector::SearchContext));
- EXPECT_EQUAL(24u, sizeof(StringSearchHelper));
- EXPECT_EQUAL(56u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext));
+ EXPECT_EQUAL(56u, sizeof(StringSearchHelper));
+ EXPECT_EQUAL(88u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext));
{
Config cfg(BasicType::STRING, CollectionType::SINGLE);
SingleValueStringAttribute svsa("svsa", cfg);
@@ -494,4 +494,20 @@ TEST("test cased regex match") {
EXPECT_FALSE(helper.isMatch("xY"));
}
+TEST("test fuzzy match") {
+ QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::FUZZYTERM);
+ StringSearchHelper helper(xyz, false);
+ EXPECT_FALSE(helper.isCased());
+ EXPECT_FALSE(helper.isPrefix());
+ EXPECT_FALSE(helper.isRegex());
+ EXPECT_TRUE(helper.isFuzzy());
+ EXPECT_TRUE(helper.isMatch("xyz"));
+ EXPECT_TRUE(helper.isMatch("xyza"));
+ EXPECT_TRUE(helper.isMatch("xyv"));
+ EXPECT_TRUE(helper.isMatch("xy"));
+ EXPECT_TRUE(helper.isMatch("x"));
+ EXPECT_TRUE(helper.isMatch("xvv"));
+ EXPECT_FALSE(helper.isMatch("vvv"));
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp
index 35280fb0bd8..0e8155e23c3 100644
--- a/searchlib/src/tests/query/customtypevisitor_test.cpp
+++ b/searchlib/src/tests/query/customtypevisitor_test.cpp
@@ -36,6 +36,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {};
struct MyStringTerm : InitTerm<StringTerm> {};
struct MySubstrTerm : InitTerm<SubstringTerm> {};
struct MySuffixTerm : InitTerm<SuffixTerm> {};
+struct MyFuzzyTerm : InitTerm<FuzzyTerm> {};
struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} };
struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} };
struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} };
@@ -65,6 +66,7 @@ struct MyQueryNodeTypes {
typedef MyStringTerm StringTerm;
typedef MySubstrTerm SubstringTerm;
typedef MySuffixTerm SuffixTerm;
+ typedef MyFuzzyTerm FuzzyTerm;
typedef MyWeakAnd WeakAnd;
typedef MyWeightedSetTerm WeightedSetTerm;
typedef MyDotProduct DotProduct;
@@ -112,6 +114,7 @@ public:
void visit(MyNearestNeighborTerm &) override { setVisited<MyNearestNeighborTerm>(); }
void visit(MyTrue &) override { setVisited<MyTrue>(); }
void visit(MyFalse &) override { setVisited<MyFalse>(); }
+ void visit(MyFuzzyTerm &) override { setVisited<MyFuzzyTerm>(); }
};
template <class T>
@@ -148,6 +151,7 @@ TEST("customtypevisitor_test") {
requireThatNodeIsVisited<MyNearestNeighborTerm>();
requireThatNodeIsVisited<MyTrue>();
requireThatNodeIsVisited<MyFalse>();
+ requireThatNodeIsVisited<MyFuzzyTerm>();
}
} // namespace
diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp
index 9f73c1ff585..f770213e8e5 100644
--- a/searchlib/src/tests/query/query_visitor_test.cpp
+++ b/searchlib/src/tests/query/query_visitor_test.cpp
@@ -49,6 +49,7 @@ public:
void visit(NearestNeighborTerm &) override { isVisited<NearestNeighborTerm>() = true; }
void visit(TrueQueryNode &) override { isVisited<TrueQueryNode>() = true; }
void visit(FalseQueryNode &) override { isVisited<FalseQueryNode>() = true; }
+ void visit(FuzzyTerm &) override { isVisited<FuzzyTerm>() = true; }
};
template <class T>
@@ -85,6 +86,7 @@ TEST("requireThatAllNodesCanBeVisited") {
checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25));
checkVisit<TrueQueryNode>(new SimpleTrue());
checkVisit<FalseQueryNode>(new SimpleFalse());
+ checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0)));
}
} // namespace
diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp
index 93cfad27742..2ea566027c4 100644
--- a/searchlib/src/tests/query/querybuilder_test.cpp
+++ b/searchlib/src/tests/query/querybuilder_test.cpp
@@ -47,7 +47,7 @@ PredicateQueryTerm::UP getPredicateQueryTerm() {
template <class NodeTypes>
Node::UP createQueryTree() {
QueryBuilder<NodeTypes> builder;
- builder.addAnd(12);
+ builder.addAnd(13);
{
builder.addRank(2);
{
@@ -115,6 +115,7 @@ Node::UP createQueryTree() {
builder.add_true_node();
builder.add_false_node();
}
+ builder.addFuzzyTerm(str[5], view[5], id[5], weight[5]);
}
Node::UP node = builder.build();
ASSERT_TRUE(node.get());
@@ -179,10 +180,11 @@ void checkQueryTreeTypes(Node *node) {
typedef typename NodeTypes::RegExpTerm RegExpTerm;
typedef typename NodeTypes::TrueQueryNode TrueNode;
typedef typename NodeTypes::FalseQueryNode FalseNode;
+ typedef typename NodeTypes::FuzzyTerm FuzzyTerm;
ASSERT_TRUE(node);
auto* and_node = as_node<And>(node);
- EXPECT_EQUAL(12u, and_node->getChildren().size());
+ EXPECT_EQUAL(13u, and_node->getChildren().size());
auto* rank = as_node<Rank>(and_node->getChildren()[0]);
EXPECT_EQUAL(2u, rank->getChildren().size());
@@ -306,6 +308,9 @@ void checkQueryTreeTypes(Node *node) {
auto* false_node = as_node<FalseNode>(and_not->getChildren()[1]);
EXPECT_TRUE(true_node);
EXPECT_TRUE(false_node);
+
+ auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]);
+ EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5]));
}
struct AbstractTypes {
@@ -332,6 +337,7 @@ struct AbstractTypes {
typedef search::query::RegExpTerm RegExpTerm;
typedef search::query::TrueQueryNode TrueQueryNode;
typedef search::query::FalseQueryNode FalseQueryNode;
+ typedef search::query::FuzzyTerm FuzzyTerm;
};
// Builds a tree with simplequery and checks that the results have the
@@ -427,6 +433,11 @@ struct MyNearestNeighborTerm : NearestNeighborTerm {
};
struct MyTrue : TrueQueryNode {};
struct MyFalse : FalseQueryNode {};
+struct MyFuzzyTerm : FuzzyTerm {
+ MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w)
+ : FuzzyTerm(t, f, i, w) {
+ }
+};
struct MyQueryNodeTypes {
typedef MyAnd And;
@@ -454,6 +465,7 @@ struct MyQueryNodeTypes {
typedef MyNearestNeighborTerm NearestNeighborTerm;
typedef MyTrue TrueQueryNode;
typedef MyFalse FalseQueryNode;
+ typedef MyFuzzyTerm FuzzyTerm;
};
TEST("require that Custom Query Trees Can Be Built") {
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
index 906400f50a5..f14966dbfc8 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
@@ -779,6 +779,8 @@ public:
n.get_distance_threshold(),
getRequestContext().get_attribute_blueprint_params().nearest_neighbor_brute_force_limit));
}
+
+ void visit(query::FuzzyTerm &n) override { visitTerm(n); }
};
template <typename WS>
diff --git a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
index ca440c2a249..c7e092799d1 100644
--- a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
@@ -125,6 +125,10 @@ StringTemplSearchContext(QueryTermSimpleUP qTerm, const AttrType & toBeSearched)
vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str());
lookupRange(comp, comp);
+ } else if (this->isFuzzy()) {
+ vespalib::string prefix(this->getFuzzyMatcher().getPrefix());
+ auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str());
+ lookupRange(comp, comp);
} else {
auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm());
lookupTerm(comp);
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
index 97c2c7d2b63..a0fbdf89f15 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
@@ -282,6 +282,10 @@ StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const Att
vespalib::string prefix(RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = _enumStore.make_folded_comparator_prefix(prefix.c_str());
this->lookupRange(comp, comp);
+ } else if (this->isFuzzy()) {
+ vespalib::string prefix(this->getFuzzyMatcher().getPrefix());
+ auto comp = _enumStore.make_folded_comparator_prefix(prefix.c_str());
+ this->lookupRange(comp, comp);
} else {
auto comp = _enumStore.make_folded_comparator(this->queryTerm()->getTerm());
this->lookupTerm(comp);
@@ -301,6 +305,8 @@ StringPostingSearchContext<BaseSC, AttrT, DataT>::useThis(const PostingListSearc
: false;
} else if ( this->isCased() ) {
return this->isMatch(_enumStore.get_value(it.getKey().load_acquire()));
+ } else if (this->isFuzzy()) {
+ return this->getFuzzyMatcher().isMatch(_enumStore.get_value(it.getKey().load_acquire()));
}
return true;
}
diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
index 730ad1107a7..7c8815af30b 100644
--- a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
@@ -61,6 +61,10 @@ SingleValueStringAttributeT<B>::StringTemplSearchContext::StringTemplSearchConte
vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str());
lookupRange(comp, comp);
+ } else if (this->isFuzzy()) {
+ vespalib::string prefix(this->getFuzzyMatcher().getPrefix());
+ auto comp = enumStore.make_folded_comparator_prefix(prefix.c_str());
+ lookupRange(comp, comp);
} else {
auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm());
lookupTerm(comp);
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
index 6062c4f2096..4baaaa45d3a 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
@@ -18,11 +18,13 @@ namespace search {
StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased)
: _regex(),
+ _fuzzyMatcher(),
_term(),
_termLen(),
_isPrefix(term.isPrefix()),
_isRegex(term.isRegex()),
- _isCased(cased)
+ _isCased(cased),
+ _isFuzzy(term.isFuzzy())
{
if (isRegex()) {
if (isCased()) {
@@ -33,6 +35,8 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased)
} else if (isCased()) {
_term._char = term.getTerm();
_termLen = term.getTermLen();
+ } else if (isFuzzy()) {
+ _fuzzyMatcher = vespalib::FuzzyMatcher::from_term(term.getTerm());
} else {
term.term(_term._ucs4);
}
@@ -54,6 +58,9 @@ StringSearchHelper::isMatch(const char *src) const {
int res = strncmp(_term._char, src, _termLen);
return (res == 0) && (src[_termLen] == 0 || isPrefix());
}
+ if (__builtin_expect(isFuzzy(), false)) {
+ return getFuzzyMatcher().isMatch(src);
+ }
vespalib::Utf8ReaderForZTS u8reader(src);
uint32_t j = 0;
uint32_t val;
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h
index 495427d3e45..6ba4c801590 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.h
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h
@@ -10,6 +10,7 @@
#include <vespa/vespalib/regex/regex.h>
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
#include <optional>
namespace search {
@@ -26,9 +27,12 @@ public:
bool isPrefix() const { return _isPrefix; }
bool isRegex() const { return _isRegex; }
bool isCased() const { return _isCased; }
+ bool isFuzzy() const { return _isFuzzy; }
const vespalib::Regex & getRegex() const { return _regex; }
+ const vespalib::FuzzyMatcher & getFuzzyMatcher() const { return _fuzzyMatcher; }
private:
vespalib::Regex _regex;
+ vespalib::FuzzyMatcher _fuzzyMatcher;
union {
const ucs4_t *_ucs4;
const char *_char;
@@ -37,6 +41,7 @@ private:
bool _isPrefix;
bool _isRegex;
bool _isCased;
+ bool _isFuzzy;
};
class ReaderBase;
@@ -126,7 +131,9 @@ protected:
bool isPrefix() const { return _helper.isPrefix(); }
bool isRegex() const { return _helper.isRegex(); }
bool isCased() const { return _helper.isCased(); }
+ bool isFuzzy() const { return _helper.isFuzzy(); }
const vespalib::Regex & getRegex() const { return _helper.getRegex(); }
+ const vespalib::FuzzyMatcher & getFuzzyMatcher() const { return _helper.getFuzzyMatcher(); }
class CollectHitCount {
public:
diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
index 7cde1102bc1..eb8054317dc 100644
--- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
@@ -409,6 +409,7 @@ public:
void visit(RegExpTerm &n) override { visitTerm(n); }
void visit(PredicateQuery &n) override { not_supported(n); }
void visit(NearestNeighborTerm &n) override { not_supported(n); }
+ void visit(FuzzyTerm &n) override { visitTerm(n); }
};
Blueprint::UP
diff --git a/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp
index 330320d5047..f8ad85859fa 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/memory_index.cpp
@@ -28,6 +28,7 @@ using index::IFieldLengthInspector;
using index::IndexBuilder;
using index::Schema;
using index::SchemaUtil;
+using query::FuzzyTerm;
using query::LocationTerm;
using query::NearestNeighborTerm;
using query::Node;
@@ -168,6 +169,7 @@ public:
void visit(SubstringTerm &n) override { visitTerm(n); }
void visit(SuffixTerm &n) override { visitTerm(n); }
void visit(RegExpTerm &n) override { visitTerm(n); }
+ void visit(FuzzyTerm &n) override { visitTerm(n); }
void visit(PredicateQuery &n) override { not_supported(n); }
void visit(NearestNeighborTerm &n) override { not_supported(n); }
diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h
index 34ea692c370..0d665d1f04d 100644
--- a/searchlib/src/vespa/searchlib/parsequery/parse.h
+++ b/searchlib/src/vespa/searchlib/parsequery/parse.h
@@ -56,8 +56,9 @@ public:
ITEM_GEO_LOCATION_TERM = 27,
ITEM_TRUE = 28,
ITEM_FALSE = 29,
- ITEM_MAX = 30, // Indicates how long tables must be.
- ITEM_UNDEF = 31,
+ ITEM_FUZZY = 30,
+ ITEM_MAX = 31, // Indicates how long tables must be.
+ ITEM_UNDEF = 32,
};
/** A tag identifying the origin of this query node.
diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp
index aa13c93810a..85b55284b35 100644
--- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp
+++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp
@@ -170,6 +170,7 @@ bool SimpleQueryStackDumpIterator::readNext() {
case ParseItem::ITEM_EXACTSTRINGTERM:
case ParseItem::ITEM_SUFFIXTERM:
case ParseItem::ITEM_REGEXP:
+ case ParseItem::ITEM_FUZZY:
_curr_index_name = read_stringref(p);
_curr_term = read_stringref(p);
_currArity = 0;
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h
index 433ab7d56dd..0d5dd116826 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h
@@ -22,7 +22,8 @@ public:
EXACTSTRINGTERM = 3,
SUFFIXTERM = 4,
REGEXP = 5,
- GEO_LOCATION = 6
+ GEO_LOCATION = 6,
+ FUZZYTERM = 7
};
template <typename N>
@@ -61,6 +62,7 @@ public:
bool isWord() const { return (_type == Type::WORD); }
bool isRegex() const { return (_type == Type::REGEXP); }
bool isGeoLoc() const { return (_type == Type::GEO_LOCATION); }
+ bool isFuzzy() const { return (_type == Type::FUZZYTERM); }
bool empty() const { return _term.empty(); }
virtual void visitMembers(vespalib::ObjectVisitor &visitor) const;
vespalib::string getClassName() const;
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 77fc97913a4..6f126c7a3eb 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -86,6 +86,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
case ParseItem::ITEM_SUFFIXTERM:
case ParseItem::ITEM_PURE_WEIGHTED_STRING:
case ParseItem::ITEM_PURE_WEIGHTED_LONG:
+ case ParseItem::ITEM_FUZZY:
{
vespalib::string index = queryRep.getIndexName();
if (index.empty()) {
@@ -116,6 +117,9 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
case ParseItem::ITEM_SUFFIXTERM:
sTerm = TermType::SUFFIXTERM;
break;
+ case ParseItem::ITEM_FUZZY:
+ sTerm = TermType::FUZZYTERM;
+ break;
default:
break;
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h b/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h
index 9f29c34aa05..abc48db9d87 100644
--- a/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h
+++ b/searchlib/src/vespa/searchlib/query/tree/customtypevisitor.h
@@ -52,6 +52,7 @@ public:
virtual void visit(typename NodeTypes::NearestNeighborTerm &) = 0;
virtual void visit(typename NodeTypes::TrueQueryNode &) = 0;
virtual void visit(typename NodeTypes::FalseQueryNode &) = 0;
+ virtual void visit(typename NodeTypes::FuzzyTerm &) = 0;
private:
// Route QueryVisit requests to the correct custom type.
@@ -81,6 +82,7 @@ private:
typedef typename NodeTypes::NearestNeighborTerm TNearestNeighborTerm;
typedef typename NodeTypes::TrueQueryNode TTrueQueryNode;
typedef typename NodeTypes::FalseQueryNode TFalseQueryNode;
+ typedef typename NodeTypes::FuzzyTerm TFuzzyTerm;
void visit(And &n) override { visit(static_cast<TAnd&>(n)); }
void visit(AndNot &n) override { visit(static_cast<TAndNot&>(n)); }
@@ -107,6 +109,7 @@ private:
void visit(NearestNeighborTerm &n) override { visit(static_cast<TNearestNeighborTerm&>(n)); }
void visit(TrueQueryNode &n) override { visit(static_cast<TTrueQueryNode&>(n)); }
void visit(FalseQueryNode &n) override { visit(static_cast<TFalseQueryNode&>(n)); }
+ void visit(FuzzyTerm &n) override { visit(static_cast<TFuzzyTerm &>(n)); }
};
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
index 9631e2afded..ee3a944cce1 100644
--- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
+++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
@@ -220,6 +220,12 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st
target_num_hits, allow_approximate, explore_additional_hits,
distance_threshold);
}
+template <class NodeTypes>
+typename NodeTypes::FuzzyTerm *
+createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight) {
+ return new typename NodeTypes::FuzzyTerm(term, view, id, weight);
+}
+
template <class NodeTypes>
class QueryBuilder : public QueryBuilderBase {
@@ -327,6 +333,10 @@ public:
adjustWeight(weight);
return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight));
}
+ typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight) {
+ adjustWeight(weight);
+ return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight));
+ }
typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name,
int32_t id, Weight weight, uint32_t target_num_hits,
bool allow_approximate, uint32_t explore_additional_hits,
diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
index 3fb72f93b23..ecaee350b21 100644
--- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
@@ -193,6 +193,12 @@ private:
void visit(FalseQueryNode &) override {
_builder.add_false_node();
}
+
+ void visit(FuzzyTerm &node) override {
+ replicate(node, _builder.addFuzzyTerm(
+ node.getTerm(), node.getView(),
+ node.getId(), node.getWeight()));
+ }
};
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h b/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h
index 02887975085..90faa25bd99 100644
--- a/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h
+++ b/searchlib/src/vespa/searchlib/query/tree/queryvisitor.h
@@ -29,6 +29,7 @@ class SameElement;
class NearestNeighborTerm;
class TrueQueryNode;
class FalseQueryNode;
+class FuzzyTerm;
struct QueryVisitor {
virtual ~QueryVisitor() {}
@@ -58,6 +59,7 @@ struct QueryVisitor {
virtual void visit(NearestNeighborTerm &) = 0;
virtual void visit(TrueQueryNode &) = 0;
virtual void visit(FalseQueryNode &) = 0;
+ virtual void visit(FuzzyTerm &) = 0;
};
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp b/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp
index cad97279b4c..e3cad4ed33a 100644
--- a/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.cpp
@@ -52,4 +52,6 @@ SimpleRegExpTerm::~SimpleRegExpTerm() = default;
SimpleNearestNeighborTerm::~SimpleNearestNeighborTerm() = default;
+SimpleFuzzyTerm::~SimpleFuzzyTerm() = default;
+
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
index 5047e072cb7..00dad2597ce 100644
--- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h
+++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
@@ -152,7 +152,13 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm {
{}
~SimpleNearestNeighborTerm() override;
};
-
+struct SimpleFuzzyTerm : FuzzyTerm {
+ SimpleFuzzyTerm(const Type &term, vespalib::stringref view,
+ int32_t id, Weight weight)
+ : FuzzyTerm(term, view, id, weight) {
+ }
+ ~SimpleFuzzyTerm() override;
+};
struct SimpleQueryNodeTypes {
using And = SimpleAnd;
@@ -180,6 +186,7 @@ struct SimpleQueryNodeTypes {
using PredicateQuery = SimplePredicateQuery;
using RegExpTerm = SimpleRegExpTerm;
using NearestNeighborTerm = SimpleNearestNeighborTerm;
+ using FuzzyTerm = SimpleFuzzyTerm;
};
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
index d45a72d316a..f36410d1845 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
@@ -278,6 +278,10 @@ class QueryNodeConverter : public QueryVisitor {
createTerm(node, ParseItem::ITEM_REGEXP);
}
+ void visit(FuzzyTerm &node) override {
+ createTerm(node, ParseItem::ITEM_FUZZY);
+ }
+
void visit(NearestNeighborTerm &node) override {
createTermNode(node, ParseItem::ITEM_NEAREST_NEIGHBOR);
appendString(node.get_query_tensor_name());
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
index 5a6f315205e..a5f3be3e618 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
@@ -197,6 +197,8 @@ private:
t = &builder.addPredicateQuery(queryStack.getPredicateQueryTerm(), view, id, weight);
} else if (type == ParseItem::ITEM_REGEXP) {
t = &builder.addRegExpTerm(term, view, id, weight);
+ } else if (type == ParseItem::ITEM_FUZZY) {
+ t = &builder.addFuzzyTerm(term, view, id, weight);
} else {
vespalib::Issue::report("query builder: Unable to create query tree from stack dump. node type = %d.", type);
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h b/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h
index fc3570f44d8..a6eae257afd 100644
--- a/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h
+++ b/searchlib/src/vespa/searchlib/query/tree/templatetermvisitor.h
@@ -32,6 +32,7 @@ class TemplateTermVisitor : public CustomTypeTermVisitor<NodeTypes> {
void visit(typename NodeTypes::PredicateQuery &n) override { myVisit(n); }
void visit(typename NodeTypes::RegExpTerm &n) override { myVisit(n); }
void visit(typename NodeTypes::NearestNeighborTerm &n) override { myVisit(n); }
+ void visit(typename NodeTypes::FuzzyTerm &n) override { myVisit(n); }
// Phrases are terms with children. This visitor will not visit
// the phrase's children, unless this member function is
diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp b/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp
index dcf0533ff7a..6e889e76f21 100644
--- a/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.cpp
@@ -24,6 +24,7 @@ RegExpTerm::~RegExpTerm() = default;
WeightedSetTerm::~WeightedSetTerm() = default;
DotProduct::~DotProduct() = default;
WandTerm::~WandTerm() = default;
+FuzzyTerm::~FuzzyTerm() = default;
namespace {
diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
index a728b674999..7aa867e25ed 100644
--- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h
+++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
@@ -115,6 +115,18 @@ public:
virtual ~RegExpTerm() = 0;
};
+//-----------------------------------------------------------------------------
+
+class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase>
+{
+public:
+ FuzzyTerm(const Type &term, vespalib::stringref view,
+ int32_t id, Weight weight)
+ : QueryNodeMixinType(term, view, id, weight)
+ {}
+ virtual ~FuzzyTerm() = 0;
+};
+
/**
* Term matching the K nearest neighbors in a multi-dimensional vector space.
*
diff --git a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h
index 86cde64a197..30c7e1722fb 100644
--- a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h
+++ b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h
@@ -73,6 +73,7 @@ public:
void visit(query::SuffixTerm &n) override = 0;
void visit(query::RegExpTerm &n) override = 0;
void visit(query::NearestNeighborTerm &n) override = 0;
+ void visit(query::FuzzyTerm &n) override = 0;
void visit(query::TrueQueryNode &) final override;
void visit(query::FalseQueryNode &) final override;
diff --git a/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp b/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp
index 519f6e81774..614f219cbcb 100644
--- a/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/fake_searchable.cpp
@@ -6,6 +6,7 @@
#include "create_blueprint_visitor_helper.h"
#include <vespa/vespalib/objects/visit.h>
+using search::query::FuzzyTerm;
using search::query::LocationTerm;
using search::query::NearestNeighborTerm;
using search::query::Node;
@@ -66,6 +67,7 @@ public:
void visit(PredicateQuery &n) override { visitTerm(n); }
void visit(RegExpTerm &n) override { visitTerm(n); }
void visit(NearestNeighborTerm &n) override { visitTerm(n); }
+ void visit(FuzzyTerm &n) override { visitTerm(n); }
};
template <class Map>
diff --git a/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp b/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp
index 08c0280ee68..63bf16e6016 100644
--- a/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/termasstring.cpp
@@ -17,6 +17,7 @@ using search::query::AndNot;
using search::query::DotProduct;
using search::query::Equiv;
using search::query::FalseQueryNode;
+using search::query::FuzzyTerm;
using search::query::LocationTerm;
using search::query::Near;
using search::query::NearestNeighborTerm;
@@ -105,6 +106,7 @@ struct TermAsStringVisitor : public QueryVisitor {
void visit(SubstringTerm &n) override {visitTerm(n); }
void visit(SuffixTerm &n) override {visitTerm(n); }
void visit(RegExpTerm &n) override {visitTerm(n); }
+ void visit(FuzzyTerm &n) override { visitTerm(n); }
void visit(PredicateQuery &) override {illegalVisit(); }
void visit(NearestNeighborTerm &) override { illegalVisit(); }
void visit(TrueQueryNode &) override { illegalVisit(); }
diff --git a/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp b/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp
index f3706bfc7c1..7f9557232b0 100644
--- a/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp
+++ b/searchsummary/src/tests/extractkeywords/simplequerystackitem.cpp
@@ -158,6 +158,7 @@ SimpleQueryStackItem::AppendBuffer(RawBuf *buf) const
case ITEM_EXACTSTRINGTERM:
case ITEM_SUFFIXTERM:
case ITEM_REGEXP:
+ case ITEM_FUZZY:
buf->appendCompressedPositiveNumber(indexLen);
buf->append(_indexName.c_str(), indexLen);
buf->appendCompressedPositiveNumber(termLen);
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index 29a0c1e7bb8..c3c7fee2cef 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -62,6 +62,7 @@ vespa_define_module(
src/tests/explore_modern_cpp
src/tests/false
src/tests/fiddle
+ src/tests/fuzzy
src/tests/gencnt
src/tests/guard
src/tests/host_name
@@ -169,6 +170,7 @@ vespa_define_module(
src/vespa/vespalib/data
src/vespa/vespalib/data/slime
src/vespa/vespalib/datastore
+ src/vespa/vespalib/fuzzy
src/vespa/vespalib/geo
src/vespa/vespalib/hwaccelrated
src/vespa/vespalib/io
diff --git a/vespalib/src/tests/fuzzy/.gitignore b/vespalib/src/tests/fuzzy/.gitignore
new file mode 100644
index 00000000000..cd5ca9b3b88
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+fuzzy_test
+vespalib_fuzzy_test_app
diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt
new file mode 100644
index 00000000000..2a415a9ad62
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_fuzzy_matcher_test_app TEST
+ SOURCES
+ fuzzy_matcher_test.cpp
+ DEPENDS
+ vespalib
+ GTest::GTest
+ )
+vespa_add_test(NAME vespalib_fuzzy_matcher_test_app COMMAND vespalib_fuzzy_matcher_test_app)
+
+vespa_add_executable(vespalib_levenstein_distance_test_app TEST
+ SOURCES
+ levenstein_distance_test.cpp
+ DEPENDS
+ vespalib
+ GTest::GTest
+ )
+vespa_add_test(NAME vespalib_levenstein_distance_test_app COMMAND vespalib_levenstein_distance_test_app) \ No newline at end of file
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
new file mode 100644
index 00000000000..60a4eab3f57
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -0,0 +1,42 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
+#include <vespa/vespalib/text/lowercase.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib;
+
+FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) {
+ return {LowerCase::convert_to_ucs4(term), threshold, prefix_size};
+}
+
+TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
+ FuzzyMatcher fuzzy = from_term("abc", 2, 0);
+ EXPECT_TRUE(fuzzy.isMatch("abc"));
+ EXPECT_TRUE(fuzzy.isMatch("ABC"));
+ EXPECT_TRUE(fuzzy.isMatch("ab1"));
+ EXPECT_TRUE(fuzzy.isMatch("a12"));
+ EXPECT_FALSE(fuzzy.isMatch("123"));
+}
+
+TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
+ FuzzyMatcher fuzzy = from_term("abcdef", 2, 2);
+ EXPECT_TRUE(fuzzy.isMatch("abcdef"));
+ EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
+ EXPECT_TRUE(fuzzy.isMatch("abcde1"));
+ EXPECT_TRUE(fuzzy.isMatch("abcd12"));
+ EXPECT_FALSE(fuzzy.isMatch("abc123"));
+ EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced
+}
+
+TEST(FuzzyMatcherTest, get_prefix_is_empty) {
+ FuzzyMatcher fuzzy = from_term("whatever", 2, 0);
+ EXPECT_EQ(fuzzy.getPrefix(), "");
+}
+
+TEST(FuzzyMatcherTest, get_prefix_non_empty) {
+ FuzzyMatcher fuzzy = from_term("abcd", 2, 2);
+ EXPECT_EQ(fuzzy.getPrefix(), "ab");
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp
new file mode 100644
index 00000000000..efdcc82fce1
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/fuzzy/levenstein_distance.h>
+#include <vespa/vespalib/text/lowercase.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+ std::vector<uint32_t> leftCodepoints = vespalib::LowerCase::convert_to_ucs4(left);
+ std::vector<uint32_t> rightCodepoints = vespalib::LowerCase::convert_to_ucs4(right);
+
+ std::optional<uint32_t> leftRight = vespalib::LevensteinDistance::calculate(leftCodepoints,rightCodepoints, threshold);
+ std::optional<uint32_t> rightLeft = vespalib::LevensteinDistance::calculate(rightCodepoints,leftCodepoints, threshold);
+
+ EXPECT_EQ(leftRight, rightLeft); // should be independent whether left or right strings are swapped
+
+ return leftRight;
+}
+
+TEST(LevensteinDistance, calculate_edgecases) {
+ EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0});
+ EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "1bc", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "a1c", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "ab", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "abcd", 2), std::optional{1});
+ EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("cd", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ad", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "a12", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+ EXPECT_EQ(calculate("a", "", 2), std::optional{1});
+ EXPECT_EQ(calculate("ab", "", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "", 2), std::nullopt);
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
+
diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt
index dea0f509bad..1fffddf2f2f 100644
--- a/vespalib/src/vespa/vespalib/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(vespalib
$<TARGET_OBJECTS:vespalib_vespalib_data>
$<TARGET_OBJECTS:vespalib_vespalib_data_slime>
$<TARGET_OBJECTS:vespalib_vespalib_datastore>
+ $<TARGET_OBJECTS:vespalib_vespalib_fuzzy>
$<TARGET_OBJECTS:vespalib_vespalib_geo>
$<TARGET_OBJECTS:vespalib_vespalib_hwaccelrated>
$<TARGET_OBJECTS:vespalib_vespalib_io>
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
new file mode 100644
index 00000000000..0f7583da1d7
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(vespalib_vespalib_fuzzy OBJECT
+ SOURCES
+ fuzzy_matcher.cpp
+ levenstein_distance.cpp
+ DEPENDS
+ )
+
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
new file mode 100644
index 00000000000..495e08c42e3
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -0,0 +1,29 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "fuzzy_matcher.h"
+#include "levenstein_distance.h"
+
+#include <vespa/vespalib/text/lowercase.h>
+#include <vespa/vespalib/text/utf8.h>
+
+vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) {
+ return FuzzyMatcher(LowerCase::convert_to_ucs4(term));
+}
+
+bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
+ std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target);
+
+ return LevensteinDistance::calculate(
+ _folded_term_codepoints,
+ targetCodepoints,
+ _max_edit_distance).has_value();
+}
+
+vespalib::string vespalib::FuzzyMatcher::getPrefix() const {
+ vespalib::string prefix;
+ Utf8Writer writer(prefix);
+ for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) {
+ writer.putChar(_folded_term_codepoints.at(i));
+ }
+ return prefix;
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
new file mode 100644
index 00000000000..bfcf98d79fd
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <string_view>
+#include <vector>
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace vespalib {
+
+/**
+ * Fuzzy matching between a lowercased instances of query and document terms based on Levenstein distance.
+ * Class has two main parameters:
+ * - prefix size, i.e the size of the prefix that is considered frozen.
+ * - max edit distance, i.e an upper bound for edit distance for it to be a match between terms
+ * Note that prefix size is not impacting matching logic,
+ * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic.
+ * But there is no code to actually enforcing it.
+ */
+class FuzzyMatcher {
+private:
+ static constexpr uint8_t DefaultPrefixSize = 0u;
+ static constexpr uint8_t DefaultMaxEditDistance = 2u;
+
+ std::vector<uint32_t> _folded_term_codepoints;
+
+ uint8_t _max_edit_distance; // max edit distance
+ uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
+
+public:
+ FuzzyMatcher():
+ _folded_term_codepoints(),
+ _max_edit_distance(DefaultMaxEditDistance),
+ _prefix_size(DefaultPrefixSize)
+ {}
+
+ FuzzyMatcher(std::vector<uint32_t> codepoints):
+ _folded_term_codepoints(std::move(codepoints)),
+ _max_edit_distance(DefaultMaxEditDistance),
+ _prefix_size(DefaultPrefixSize)
+ {}
+
+ FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size):
+ _folded_term_codepoints(std::move(codepoints)),
+ _max_edit_distance(max_edit_distance),
+ _prefix_size(prefix_size)
+ {}
+
+ [[nodiscard]] bool isMatch(std::string_view target) const;
+
+ [[nodiscard]] vespalib::string getPrefix() const;
+
+ ///
+
+ static FuzzyMatcher from_term(std::string_view term);
+
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp
new file mode 100644
index 00000000000..5ee0f2d4a2d
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp
@@ -0,0 +1,86 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Levenstein distance algorithm is based off Java implementation from apache commons-text library licensed under the Apache 2.0 license.
+
+#include "levenstein_distance.h"
+
+#include <limits>
+
+std::optional<uint32_t> vespalib::LevensteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) {
+ uint32_t n = left.size();
+ uint32_t m = right.size();
+
+ if (n > m) {
+ return calculate(right, left, threshold);
+ }
+
+ // if one string is empty, the edit distance is necessarily the length
+ // of the other
+ if (n == 0) {
+ return m <= threshold ? std::optional(m) : std::nullopt;
+ }
+ if (m == 0) {
+ return n <= threshold ? std::optional(n) : std::nullopt;
+ }
+
+
+ // the edit distance cannot be less than the length difference
+ if (m - n > threshold) {
+ return std::nullopt;
+ }
+
+ std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally
+ std::vector<uint32_t> d(n+1); // cost array, horizontally
+
+ const uint32_t boundary = std::min(n, threshold) + 1;
+
+ for (uint32_t i = 0; i < boundary; ++i) {
+ p[i] = i;
+ }
+ // these fills ensure that the value above the rightmost entry of our
+ // stripe will be ignored in following loop iterations
+ for (uint32_t i = boundary; i < p.size(); ++i) {
+ p[i] = std::numeric_limits<uint32_t>::max();
+ }
+ for (uint32_t i = 0; i < d.size(); ++i) {
+ d[i] = std::numeric_limits<uint32_t>::max();
+ }
+
+ // iterates through t
+ for (uint32_t j = 1; j <= m; ++j) {
+ uint32_t rightJ = right[j - 1]; // jth character of right
+ d[0] = j;
+
+ int32_t min = std::max(1, static_cast<int32_t>(j) - static_cast<int32_t>(threshold));
+
+ uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ?
+ n : std::min(n, j + threshold);
+
+ // ignore entry left of leftmost
+ if (min > 1) {
+ d[min - 1] = std::numeric_limits<uint32_t>::max();
+ }
+
+ uint32_t lowerBound = std::numeric_limits<uint32_t>::max();
+
+ for (uint32_t i = min; i <= max; ++i) {
+ if (left[i - 1] == rightJ) {
+ // diagonally left and up
+ d[i] = p[i - 1];
+ } else {
+ // 1 + minimum of cell to the left, to the top, diagonally
+ // left and up
+
+ d[i] = 1 + std::min(std::min(d[i - 1], p[i]), p[i - 1]);
+ }
+ lowerBound = std::min(lowerBound, d[i]);
+ }
+ if (lowerBound > threshold) {
+ return std::nullopt;
+ }
+ std::swap(p, d);
+ }
+ if (p[n] <= threshold) {
+ return {p[n]};
+ }
+ return std::nullopt;
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h
new file mode 100644
index 00000000000..bce7599fc23
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h
@@ -0,0 +1,25 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <optional>
+#include <cstdint>
+#include <vector>
+
+namespace vespalib {
+
+/**
+ * LevensteinDistance::calculate implements basic Levenstein distance algorithm
+ * with early stopping if the distance is already too high.
+ * If the distance is above threshold method would return empty optional,
+ * if the distance is below or equal to it, the distance will be wrapped in optional.
+ * The types it's built upon are uint32 and it used to compare codepoints from terms,
+ * but in future code can be generalized.
+ *
+ * Algorithm is based off Java implementation from commons-text library
+ */
+class LevensteinDistance {
+public:
+ static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold);
+};
+
+}