summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2022-04-11 11:26:02 +0200
committerGitHub <noreply@github.com>2022-04-11 11:26:02 +0200
commit65dc21685f2286a30c82c7d14c9fe5fe5c42d412 (patch)
treec28d2a5dc1bff4dab1051c163042b84899d2bb2c
parent23841f2517967c1a59cf9826f1de953c5caa7199 (diff)
parent16aaf73dc37c63fa92a3298e6c9f8fa6ed32422a (diff)
Merge pull request #21972 from alexeyche/alexeyche/fuzzy-query-annotations
Propagating annotations for fuzzy query [WIP]
-rw-r--r--container-search/abi-spec.json13
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java64
-rw-r--r--container-search/src/main/java/com/yahoo/search/query/SelectParser.java11
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java29
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/YqlParser.java21
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java5
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java17
-rw-r--r--searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp9
-rw-r--r--searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp4
-rw-r--r--searchlib/src/tests/query/customtypevisitor_test.cpp2
-rw-r--r--searchlib/src/tests/query/query_visitor_test.cpp2
-rw-r--r--searchlib/src/tests/query/querybuilder_test.cpp32
-rw-r--r--searchlib/src/tests/query/streaming_query_test.cpp6
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp5
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h5
-rw-r--r--searchlib/src/vespa/searchlib/query/query_term_simple.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/query/query_term_simple.h7
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/querynode.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.h2
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/querybuilder.h10
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/queryreplicator.h3
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/simplequery.h5
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h4
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/termnodes.h16
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp44
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp39
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h46
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp3
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h4
31 files changed, 350 insertions, 82 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json
index fafb79ae6f6..303e0f65b0c 100644
--- a/container-search/abi-spec.json
+++ b/container-search/abi-spec.json
@@ -548,7 +548,11 @@
"public"
],
"methods": [
- "public void <init>(java.lang.String, boolean, java.lang.String)",
+ "public void <init>(java.lang.String, boolean, java.lang.String, int, int)",
+ "public void setMaxEditDistance(int)",
+ "public void setPrefixLength(int)",
+ "public int getPrefixLength()",
+ "public int getMaxEditDistance()",
"public void setValue(java.lang.String)",
"public java.lang.String getRawWord()",
"public boolean isWords()",
@@ -560,10 +564,13 @@
"public int getNumWords()",
"public boolean equals(java.lang.Object)",
"public int hashCode()",
- "public java.lang.String toString()",
+ "protected void appendHeadingString(java.lang.StringBuilder)",
"protected void encodeThis(java.nio.ByteBuffer)"
],
- "fields": []
+ "fields": [
+ "public static int DEFAULT_MAX_EDIT_DISTANCE",
+ "public static int DEFAULT_PREFIX_LENGTH"
+ ]
},
"com.yahoo.prelude.query.GeoLocationItem": {
"superClass": "com.yahoo.prelude.query.TermItem",
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
index 74c31a5d1f0..b26205b74e9 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
@@ -1,7 +1,10 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.query;
+import com.yahoo.compress.IntegerCompressor;
+
import java.nio.ByteBuffer;
+import java.util.Objects;
/**
* Fuzzy search term
@@ -11,9 +14,37 @@ import java.nio.ByteBuffer;
public class FuzzyItem extends TermItem {
private String term;
- public FuzzyItem(String indexName, boolean isFromQuery, String term) {
+ private int maxEditDistance;
+ private int prefixLength;
+
+ public static int DEFAULT_MAX_EDIT_DISTANCE = 2;
+ public static int DEFAULT_PREFIX_LENGTH = 0;
+
+ public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) {
super(indexName, isFromQuery, null);
setValue(term);
+ setMaxEditDistance(maxEditDistance);
+ setPrefixLength(prefixLength);
+ }
+
+ public void setMaxEditDistance(int maxEditDistance) {
+ if (maxEditDistance < 0)
+ throw new IllegalArgumentException("Can not use negative maxEditDistance " + maxEditDistance);
+ this.maxEditDistance = maxEditDistance;
+ }
+
+ public void setPrefixLength(int prefixLength) {
+ if (prefixLength < 0)
+ throw new IllegalArgumentException("Can not use negative prefixLength " + prefixLength);
+ this.prefixLength = prefixLength;
+ }
+
+ public int getPrefixLength() {
+ return this.prefixLength;
+ }
+
+ public int getMaxEditDistance() {
+ return this.maxEditDistance;
}
@Override
@@ -73,35 +104,36 @@ public class FuzzyItem extends TermItem {
return false;
}
FuzzyItem other = (FuzzyItem) obj;
- if (term == null) {
- if (other.term != null) {
- return false;
- }
- } else if (!term.equals(other.term)) {
- return false;
- }
+ if (!this.term.equals(other.term)) return false;
+ if (this.maxEditDistance != other.maxEditDistance) return false;
+ if (this.prefixLength != other.prefixLength) return false;
return true;
}
@Override
public int hashCode() {
- final int prime = 31;
- int result = super.hashCode();
- result = prime * result + ((term == null) ? 0 : term.hashCode());
- return result;
+ return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength);
}
@Override
- public String toString() {
- StringBuilder builder = new StringBuilder();
- builder.append("FuzzyItem [term=").append(term).append("]");
- return builder.toString();
+ protected void appendHeadingString(StringBuilder buffer) {
+ buffer.append(getName());
+ buffer.append("(");
+ buffer.append(this.term);
+ buffer.append(",");
+ buffer.append(this.maxEditDistance);
+ buffer.append(",");
+ buffer.append(this.prefixLength);
+ buffer.append(")");
+ buffer.append(" ");
}
@Override
protected void encodeThis(ByteBuffer buffer) {
super.encodeThis(buffer);
putString(getIndexedString(), buffer);
+ IntegerCompressor.putCompressedPositiveNumber(this.maxEditDistance, buffer);
+ IntegerCompressor.putCompressedPositiveNumber(this.prefixLength, buffer);
}
}
diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
index 320148ec01d..ccf24a13f34 100644
--- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
+++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
@@ -61,6 +61,8 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
+import static com.yahoo.search.yql.YqlParser.MAX_EDIT_DISTANCE;
+import static com.yahoo.search.yql.YqlParser.PREFIX_LENGTH;
import static com.yahoo.slime.Type.ARRAY;
import static com.yahoo.slime.Type.DOUBLE;
import static com.yahoo.slime.Type.LONG;
@@ -1161,9 +1163,16 @@ public class SelectParser implements Parser {
private Item instantiateFuzzyItem(String field, String key, Inspector value) {
HashMap<Integer, Inspector> children = childMap(value);
+ HashMap<String, Inspector> annotations = getAnnotationMap(value);
+
Preconditions.checkArgument(children.size() == 1, "Expected 1 argument, got %s.", children.size());
+
String wordData = children.get(0).asString();
- FuzzyItem fuzzy = new FuzzyItem(field, true, wordData);
+
+ Integer maxEditDistance = getIntegerAnnotation(MAX_EDIT_DISTANCE, annotations, FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE);
+ Integer prefixLength = getIntegerAnnotation(PREFIX_LENGTH, annotations, FuzzyItem.DEFAULT_PREFIX_LENGTH);
+
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength);
return leafStyleSettings(getAnnotations(value), fuzzy);
}
diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
index e778798b0e5..194863e3129 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
@@ -21,6 +21,7 @@ import static com.yahoo.search.yql.YqlParser.GEO_LOCATION;
import static com.yahoo.search.yql.YqlParser.HIT_LIMIT;
import static com.yahoo.search.yql.YqlParser.IMPLICIT_TRANSFORMS;
import static com.yahoo.search.yql.YqlParser.LABEL;
+import static com.yahoo.search.yql.YqlParser.MAX_EDIT_DISTANCE;
import static com.yahoo.search.yql.YqlParser.NEAR;
import static com.yahoo.search.yql.YqlParser.NEAREST_NEIGHBOR;
import static com.yahoo.search.yql.YqlParser.NORMALIZE_CASE;
@@ -31,6 +32,7 @@ import static com.yahoo.search.yql.YqlParser.ORIGIN_OFFSET;
import static com.yahoo.search.yql.YqlParser.ORIGIN_ORIGINAL;
import static com.yahoo.search.yql.YqlParser.PHRASE;
import static com.yahoo.search.yql.YqlParser.PREFIX;
+import static com.yahoo.search.yql.YqlParser.PREFIX_LENGTH;
import static com.yahoo.search.yql.YqlParser.RANGE;
import static com.yahoo.search.yql.YqlParser.RANK;
import static com.yahoo.search.yql.YqlParser.RANKED;
@@ -526,7 +528,8 @@ public class VespaSerializer {
@Override
boolean serialize(StringBuilder destination, FuzzyItem fuzzy) {
- String annotations = leafAnnotations(fuzzy);
+ String annotations = fuzzyAnnotations(fuzzy);
+
destination.append(normalizeIndexName(fuzzy.getIndexName())).append(" contains ");
if (annotations.length() > 0) {
@@ -543,6 +546,30 @@ public class VespaSerializer {
}
return false;
}
+
+ static String fuzzyAnnotations(FuzzyItem fuzzyItem) {
+ boolean isMaxEditDistanceSet = fuzzyItem.getMaxEditDistance() != FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE;
+ boolean isPrefixLengthSet = fuzzyItem.getPrefixLength() != FuzzyItem.DEFAULT_PREFIX_LENGTH;
+ boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet;
+
+ StringBuilder builder = new StringBuilder();
+ if (anyAnnotationSet) {
+ builder.append("[{");
+ }
+ if (isMaxEditDistanceSet) {
+ builder.append(MAX_EDIT_DISTANCE + ":").append(fuzzyItem.getMaxEditDistance());
+ }
+ if (isMaxEditDistanceSet && isPrefixLengthSet) {
+ builder.append(",");
+ }
+ if (isPrefixLengthSet) {
+ builder.append(PREFIX_LENGTH + ":").append(fuzzyItem.getPrefixLength());
+ }
+ if (anyAnnotationSet) {
+ builder.append("}]");
+ }
+ return builder.toString();
+ }
}
private static class ONearSerializer extends Serializer<ONearItem> {
diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
index 06ee0e706f3..19d4e6d41ba 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
@@ -126,6 +126,8 @@ public class YqlParser implements Parser {
private static final String RANKED_DESCRIPTION = "setting for whether to use term for ranking";
private static final String STEM_DESCRIPTION = "setting for whether to use stem if field implies it";
private static final String USE_POSITION_DATA_DESCRIPTION = "setting for whether to use position data for ranking this item";
+ private static final String MAX_EDIT_DISTANCE_DESCRIPTION = "setting for an inclusive upper bound for a fuzzy edit-distance search";
+ private static final String PREFIX_LENGTH_DESCRIPTION = "setting for a prefix length that is considered frozen for a fuzzy search";
private static final String USER_INPUT_ALLOW_EMPTY = "allowEmpty";
private static final String USER_INPUT_DEFAULT_INDEX = "defaultIndex";
private static final String USER_INPUT_GRAMMAR = "grammar";
@@ -194,6 +196,9 @@ public class YqlParser implements Parser {
public static final String WEIGHT = "weight";
public static final String WEIGHTED_SET = "weightedSet";
public static final String FUZZY = "fuzzy";
+ public static final String MAX_EDIT_DISTANCE = "maxEditDistance";
+ public static final String PREFIX_LENGTH = "prefixLength";
+
private final IndexFacts indexFacts;
private final List<ConnectedItem> connectedItems = new ArrayList<>();
@@ -1313,7 +1318,21 @@ public class YqlParser implements Parser {
String wordData = getStringContents(args.get(0));
- FuzzyItem fuzzy = new FuzzyItem(field, true, wordData);
+ Integer maxEditDistance = getAnnotation(
+ ast,
+ MAX_EDIT_DISTANCE,
+ Integer.class,
+ FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE,
+ MAX_EDIT_DISTANCE_DESCRIPTION);
+
+ Integer prefixLength = getAnnotation(
+ ast,
+ PREFIX_LENGTH,
+ Integer.class,
+ FuzzyItem.DEFAULT_PREFIX_LENGTH,
+ PREFIX_LENGTH_DESCRIPTION);
+
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength);
return leafStyleSettings(ast, fuzzy);
}
diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
index 1269c2a5aef..8a90d224003 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
@@ -449,4 +449,9 @@ public class VespaSerializerTestCase {
parseAndConfirm("foo contains fuzzy(\"a\")");
}
+ @Test
+ public void testFuzzyAnnotations() {
+ parseAndConfirm("foo contains ([{maxEditDistance:3,prefixLength:5}]fuzzy(\"a\"))");
+ }
+
}
diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
index f40e212adde..3a6641b0d7a 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
@@ -47,9 +47,7 @@ import com.yahoo.search.searchchain.Execution;
import org.junit.Test;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
-import java.util.Collections;
import java.util.HashSet;
import java.util.List;
@@ -389,6 +387,21 @@ public class YqlParserTestCase {
assertSame(FuzzyItem.class, root.getClass());
assertEquals("baz", ((FuzzyItem) root).getIndexName());
assertEquals("a b", ((FuzzyItem) root).stringValue());
+ assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, ((FuzzyItem) root).getMaxEditDistance());
+ assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, ((FuzzyItem) root).getPrefixLength());
+ }
+
+ @Test
+ public void testFuzzyAnnotations() {
+ QueryTree x = parse(
+ "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10}fuzzy(\"a b\"))"
+ );
+ Item root = x.getRoot();
+ assertSame(FuzzyItem.class, root.getClass());
+ assertEquals("baz", ((FuzzyItem) root).getIndexName());
+ assertEquals("a b", ((FuzzyItem) root).stringValue());
+ assertEquals(3, ((FuzzyItem) root).getMaxEditDistance());
+ assertEquals(10, ((FuzzyItem) root).getPrefixLength());
}
@Test
diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
index aa64e944baa..947e4aa30c2 100644
--- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
+++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
@@ -402,7 +402,8 @@ SearchContextTest::buildTermQuery(std::vector<char> & buffer, const vespalib::st
{
uint32_t indexLen = index.size();
uint32_t termLen = term.size();
- uint32_t queryPacketSize = 1 + 2 * 4 + indexLen + termLen;
+ uint32_t fuzzyParametersSize = (termType == TermType::FUZZYTERM) ? 8 : 0;
+ uint32_t queryPacketSize = 1 + 2 * 4 + indexLen + termLen + fuzzyParametersSize;
uint32_t p = 0;
buffer.resize(queryPacketSize);
switch (termType) {
@@ -419,6 +420,12 @@ SearchContextTest::buildTermQuery(std::vector<char> & buffer, const vespalib::st
p += vespalib::compress::Integer::compressPositive(termLen, &buffer[p]);
memcpy(&buffer[p], term.c_str(), termLen);
p += termLen;
+
+ if (termType == TermType::FUZZYTERM) {
+ p += vespalib::compress::Integer::compressPositive(2, &buffer[p]); // max edit distance
+ p += vespalib::compress::Integer::compressPositive(0, &buffer[p]); // prefix length
+ }
+
buffer.resize(p);
}
diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
index 1f570e0a381..4deb287df0e 100644
--- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
+++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
@@ -388,8 +388,8 @@ testSingleValue(Attribute & svsa, Config &cfg)
TEST("testSingleValue")
{
EXPECT_EQUAL(24u, sizeof(SearchContext));
- EXPECT_EQUAL(56u, sizeof(StringSearchHelper));
- EXPECT_EQUAL(104u, sizeof(attribute::SingleStringEnumSearchContext));
+ EXPECT_EQUAL(88u, sizeof(StringSearchHelper));
+ EXPECT_EQUAL(136u, sizeof(attribute::SingleStringEnumSearchContext));
{
Config cfg(BasicType::STRING, CollectionType::SINGLE);
SingleValueStringAttribute svsa("svsa", cfg);
diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp
index 0e8155e23c3..b1e979b5bcb 100644
--- a/searchlib/src/tests/query/customtypevisitor_test.cpp
+++ b/searchlib/src/tests/query/customtypevisitor_test.cpp
@@ -36,7 +36,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {};
struct MyStringTerm : InitTerm<StringTerm> {};
struct MySubstrTerm : InitTerm<SubstringTerm> {};
struct MySuffixTerm : InitTerm<SuffixTerm> {};
-struct MyFuzzyTerm : InitTerm<FuzzyTerm> {};
+struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0) {} };
struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} };
struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} };
struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} };
diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp
index f770213e8e5..8e8be997be0 100644
--- a/searchlib/src/tests/query/query_visitor_test.cpp
+++ b/searchlib/src/tests/query/query_visitor_test.cpp
@@ -86,7 +86,7 @@ TEST("requireThatAllNodesCanBeVisited") {
checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25));
checkVisit<TrueQueryNode>(new SimpleTrue());
checkVisit<FalseQueryNode>(new SimpleFalse());
- checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0)));
+ checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0));
}
} // namespace
diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp
index 2ea566027c4..5b410879fa0 100644
--- a/searchlib/src/tests/query/querybuilder_test.cpp
+++ b/searchlib/src/tests/query/querybuilder_test.cpp
@@ -7,6 +7,8 @@
#include <vespa/searchlib/query/tree/querybuilder.h>
#include <vespa/searchlib/query/tree/simplequery.h>
#include <vespa/searchlib/query/tree/stackdumpcreator.h>
+#include <vespa/searchlib/query/query_term_decoder.h>
+#include <vespa/searchlib/query/query_term_simple.h>
#include <vespa/vespalib/testkit/test_kit.h>
#include <vespa/log/log.h>
@@ -115,7 +117,7 @@ Node::UP createQueryTree() {
builder.add_true_node();
builder.add_false_node();
}
- builder.addFuzzyTerm(str[5], view[5], id[5], weight[5]);
+ builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1);
}
Node::UP node = builder.build();
ASSERT_TRUE(node.get());
@@ -311,6 +313,8 @@ void checkQueryTreeTypes(Node *node) {
auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]);
EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5]));
+ EXPECT_EQUAL(3u, fuzzy_term->getMaxEditDistance());
+ EXPECT_EQUAL(1u, fuzzy_term->getPrefixLength());
}
struct AbstractTypes {
@@ -434,8 +438,9 @@ struct MyNearestNeighborTerm : NearestNeighborTerm {
struct MyTrue : TrueQueryNode {};
struct MyFalse : FalseQueryNode {};
struct MyFuzzyTerm : FuzzyTerm {
- MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w)
- : FuzzyTerm(t, f, i, w) {
+ MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w,
+ uint32_t m, uint32_t p)
+ : FuzzyTerm(t, f, i, w, m, p) {
}
};
@@ -578,6 +583,7 @@ TEST("require that Query Tree Creator Can Replicate Queries") {
TEST("require that Query Tree Creator Can Create Queries From Stack") {
Node::UP node = createQueryTree<MyQueryNodeTypes>();
string stackDump = StackDumpCreator::create(*node);
+
SimpleQueryStackDumpIterator iterator(stackDump);
Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator);
@@ -618,6 +624,26 @@ TEST("require that All Range Syntaxes Work") {
EXPECT_TRUE(range2 == range_term->getTerm());
}
+TEST("require that fuzzy node can be created") {
+ QueryBuilder<SimpleQueryNodeTypes> builder;
+ builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1);
+ Node::UP node = builder.build();
+
+ string stackDump = StackDumpCreator::create(*node);
+ {
+ SimpleQueryStackDumpIterator iterator(stackDump);
+ Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator);
+ FuzzyTerm *fuzzy_node = as_node<FuzzyTerm>(new_node.get());
+ EXPECT_EQUAL(3u, fuzzy_node->getMaxEditDistance());
+ EXPECT_EQUAL(1u, fuzzy_node->getPrefixLength());
+ }
+ {
+ search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump);
+ EXPECT_EQUAL(3u, queryTermSimple->getFuzzyMaxEditDistance());
+ EXPECT_EQUAL(1u, queryTermSimple->getFuzzyPrefixLength());
+ }
+}
+
TEST("require that empty intermediate node can be added") {
QueryBuilder<SimpleQueryNodeTypes> builder;
builder.addAnd(0);
diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp
index ec292eda8eb..4b5433dd370 100644
--- a/searchlib/src/tests/query/streaming_query_test.cpp
+++ b/searchlib/src/tests/query/streaming_query_test.cpp
@@ -805,9 +805,9 @@ TEST("testSameElementEvaluate") {
}
TEST("Control the size of query terms") {
- EXPECT_EQUAL(104u, sizeof(QueryTermSimple));
- EXPECT_EQUAL(120u, sizeof(QueryTermUCS4));
- EXPECT_EQUAL(264u, sizeof(QueryTerm));
+ EXPECT_EQUAL(112u, sizeof(QueryTermSimple));
+ EXPECT_EQUAL(128u, sizeof(QueryTermUCS4));
+ EXPECT_EQUAL(272u, sizeof(QueryTerm));
}
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index d876d80e73f..cd233b75438 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -27,7 +27,10 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased)
_term._char = term.getTerm();
_termLen = term.getTermLen();
} else if (isFuzzy()) {
- _fuzzyMatcher = vespalib::FuzzyMatcher::from_term(term.getTerm());
+ _fuzzyMatcher = vespalib::FuzzyMatcher::from_term(
+ term.getTerm(),
+ term.getFuzzyMaxEditDistance(),
+ term.getFuzzyPrefixLength());
} else {
term.term(_term._ucs4);
}
diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp
index 85b55284b35..f625a2b7fd2 100644
--- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp
+++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.cpp
@@ -170,7 +170,6 @@ bool SimpleQueryStackDumpIterator::readNext() {
case ParseItem::ITEM_EXACTSTRINGTERM:
case ParseItem::ITEM_SUFFIXTERM:
case ParseItem::ITEM_REGEXP:
- case ParseItem::ITEM_FUZZY:
_curr_index_name = read_stringref(p);
_curr_term = read_stringref(p);
_currArity = 0;
@@ -188,6 +187,9 @@ bool SimpleQueryStackDumpIterator::readNext() {
case ParseItem::ITEM_NEAREST_NEIGHBOR:
if ( ! readNN(p)) return false;
break;
+ case ParseItem::ITEM_FUZZY:
+ if (!readFuzzy(p)) return false;
+ break;
case ParseItem::ITEM_TRUE:
case ParseItem::ITEM_FALSE:
// no content
@@ -256,4 +258,14 @@ SimpleQueryStackDumpIterator::readComplexTerm(const char *& p) {
return true;
}
+bool
+SimpleQueryStackDumpIterator::readFuzzy(const char *&p) {
+ _curr_index_name = read_stringref(p);
+ _curr_term = read_stringref(p); // fuzzy term
+ _extraIntArg1 = readCompressedPositiveInt(p); // maxEditDistance
+ _extraIntArg2 = readCompressedPositiveInt(p); // prefixLength
+ _currArity = 0;
+ return true;
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
index 5cec16000b7..ca3e3f6be10 100644
--- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
+++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
@@ -54,6 +54,7 @@ private:
VESPA_DLL_LOCAL bool readPredicate(const char *&p);
VESPA_DLL_LOCAL bool readNN(const char *&p);
VESPA_DLL_LOCAL bool readComplexTerm(const char *& p);
+ VESPA_DLL_LOCAL bool readFuzzy(const char *&p);
VESPA_DLL_LOCAL bool readNext();
public:
/**
@@ -115,6 +116,10 @@ public:
bool getAllowApproximate() const { return (_extraIntArg2 != 0); }
uint32_t getExploreAdditionalHits() const { return _extraIntArg3; }
+ // fuzzy match arguments
+ uint32_t getFuzzyMaxEditDistance() const { return _extraIntArg1; }
+ uint32_t getFuzzyPrefixLength() const { return _extraIntArg2; }
+
query::PredicateQueryTerm::UP getPredicateQueryTerm() { return std::move(_predicate_query_term); }
vespalib::stringref getIndexName() const { return _curr_index_name; }
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
index e0fcc8b690c..17e50216a23 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
@@ -210,7 +210,9 @@ QueryTermSimple::QueryTermSimple(const string & term_, Type type)
_diversityCutoffStrict(false),
_valid(true),
_term(term_),
- _diversityAttribute()
+ _diversityAttribute(),
+ _fuzzyMaxEditDistance(2),
+ _fuzzyPrefixLength(0)
{
if (isFullRange(_term)) {
stringref rest(_term.c_str() + 1, _term.size() - 2);
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h
index 0d5dd116826..b84813cd12e 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h
@@ -52,6 +52,8 @@ public:
size_t getDiversityCutoffGroups() const { return _diversityCutoffGroups; }
bool getDiversityCutoffStrict() const { return _diversityCutoffStrict; }
vespalib::stringref getDiversityAttribute() const { return _diversityAttribute; }
+ size_t getFuzzyMaxEditDistance() const { return _fuzzyMaxEditDistance; }
+ size_t getFuzzyPrefixLength() const { return _fuzzyPrefixLength; }
bool getAsIntegerTerm(int64_t & lower, int64_t & upper) const;
bool getAsDoubleTerm(double & lower, double & upper) const;
const char * getTerm() const { return _term.c_str(); }
@@ -68,6 +70,7 @@ public:
vespalib::string getClassName() const;
bool isValid() const { return _valid; }
const string & getTermString() const { return _term; }
+
private:
bool getRangeInternal(int64_t & low, int64_t & high) const;
template <typename N>
@@ -84,6 +87,10 @@ private:
stringref _diversityAttribute;
template <typename T, typename D>
bool getAsNumericTerm(T & lower, T & upper, D d) const;
+
+protected:
+ uint32_t _fuzzyMaxEditDistance; // set in QueryTerm
+ uint32_t _fuzzyPrefixLength;
};
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 6f126c7a3eb..6d59886a4f5 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -140,6 +140,10 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
auto qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm);
qt->setWeight(queryRep.GetWeight());
qt->setUniqueId(queryRep.getUniqueId());
+ if (qt->isFuzzy()) {
+ qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance());
+ qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength());
+ }
if (qt->encoding().isBase10Integer() ||
! qt->encoding().isFloat() ||
! factory.getRewriteFloatTerms() ||
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 25f2b598413..af32458c5da 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -85,6 +85,8 @@ public:
void visitMembers(vespalib::ObjectVisitor &visitor) const override;
void setIndex(const string & index_) override { _index = index_; }
const string & getIndex() const override { return _index; }
+ void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; }
+ void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; }
protected:
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;
diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
index ee3a944cce1..2273dadbfcf 100644
--- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
+++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
@@ -222,8 +222,9 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st
}
template <class NodeTypes>
typename NodeTypes::FuzzyTerm *
-createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight) {
- return new typename NodeTypes::FuzzyTerm(term, view, id, weight);
+createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight,
+ uint32_t maxEditDistance, uint32_t prefixLength) {
+ return new typename NodeTypes::FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength);
}
@@ -333,9 +334,10 @@ public:
adjustWeight(weight);
return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight));
}
- typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight) {
+ typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight,
+ uint32_t maxEditDistance, uint32_t prefixLength) {
adjustWeight(weight);
- return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight));
+ return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, maxEditDistance, prefixLength));
}
typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name,
int32_t id, Weight weight, uint32_t target_num_hits,
diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
index ecaee350b21..52cfbc5effb 100644
--- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
@@ -197,7 +197,8 @@ private:
void visit(FuzzyTerm &node) override {
replicate(node, _builder.addFuzzyTerm(
node.getTerm(), node.getView(),
- node.getId(), node.getWeight()));
+ node.getId(), node.getWeight(),
+ node.getMaxEditDistance(), node.getPrefixLength()));
}
};
diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
index 00dad2597ce..05e731c0c5c 100644
--- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h
+++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
@@ -154,8 +154,9 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm {
};
struct SimpleFuzzyTerm : FuzzyTerm {
SimpleFuzzyTerm(const Type &term, vespalib::stringref view,
- int32_t id, Weight weight)
- : FuzzyTerm(term, view, id, weight) {
+ int32_t id, Weight weight,
+ uint32_t maxEditDistance, uint32_t prefixLength)
+ : FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength) {
}
~SimpleFuzzyTerm() override;
};
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
index f36410d1845..5ed33ec18a1 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
@@ -280,6 +280,8 @@ class QueryNodeConverter : public QueryVisitor {
void visit(FuzzyTerm &node) override {
createTerm(node, ParseItem::ITEM_FUZZY);
+ appendCompressedPositiveNumber(node.getMaxEditDistance());
+ appendCompressedPositiveNumber(node.getPrefixLength());
}
void visit(NearestNeighborTerm &node) override {
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
index a5f3be3e618..4103c3b1766 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
@@ -198,7 +198,9 @@ private:
} else if (type == ParseItem::ITEM_REGEXP) {
t = &builder.addRegExpTerm(term, view, id, weight);
} else if (type == ParseItem::ITEM_FUZZY) {
- t = &builder.addFuzzyTerm(term, view, id, weight);
+ uint32_t maxEditDistance = queryStack.getFuzzyMaxEditDistance();
+ uint32_t prefixLength = queryStack.getFuzzyPrefixLength();
+ t = &builder.addFuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength);
} else {
vespalib::Issue::report("query builder: Unable to create query tree from stack dump. node type = %d.", type);
}
diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
index 7aa867e25ed..5604a3afb52 100644
--- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h
+++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
@@ -117,13 +117,21 @@ public:
//-----------------------------------------------------------------------------
-class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase>
-{
+class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> {
+private:
+ uint32_t _maxEditDistance;
+ uint32_t _prefixLength;
public:
FuzzyTerm(const Type &term, vespalib::stringref view,
- int32_t id, Weight weight)
- : QueryNodeMixinType(term, view, id, weight)
+ int32_t id, Weight weight, uint32_t maxEditDistance, uint32_t prefixLength)
+ : QueryNodeMixinType(term, view, id, weight),
+ _maxEditDistance(maxEditDistance),
+ _prefixLength(prefixLength)
{}
+
+ uint32_t getMaxEditDistance() const { return _maxEditDistance; }
+ uint32_t getPrefixLength() const { return _prefixLength; }
+
virtual ~FuzzyTerm() = 0;
};
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
index 60a4eab3f57..1395915fa5f 100644
--- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -6,12 +6,34 @@
using namespace vespalib;
-FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) {
- return {LowerCase::convert_to_ucs4(term), threshold, prefix_size};
+
+template <typename T>
+void assert_span(std::span<const T> left, std::vector<T> right) {
+ EXPECT_TRUE(std::equal(left.begin(), left.end(), right.begin(), right.end()));
+}
+
+TEST(FuzzyMatcherTest, get_prefix_edge_cases) {
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 0), {});
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 1), {1 });
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 2), {1, 2});
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 3), {1, 2, 3});
+ assert_span(FuzzyMatcher::get_prefix({1, 2, 3}, 10),{1, 2, 3});
+ assert_span(FuzzyMatcher::get_prefix({}, 0), {});
+ assert_span(FuzzyMatcher::get_prefix({}, 10), {});
+}
+
+TEST(FuzzyMatcherTest, get_suffix_edge_cases) {
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 0), {1, 2, 3});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 1), {2, 3});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 2), {3});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 3), {});
+ assert_span(FuzzyMatcher::get_suffix({1, 2, 3}, 10), {});
+ assert_span(FuzzyMatcher::get_suffix({}, 0), {});
+ assert_span(FuzzyMatcher::get_suffix({}, 10),{});
}
TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
- FuzzyMatcher fuzzy = from_term("abc", 2, 0);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abc", 2, 0);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("ABC"));
EXPECT_TRUE(fuzzy.isMatch("ab1"));
@@ -20,22 +42,30 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
}
TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
- FuzzyMatcher fuzzy = from_term("abcdef", 2, 2);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcdef", 2, 2);
EXPECT_TRUE(fuzzy.isMatch("abcdef"));
EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
EXPECT_TRUE(fuzzy.isMatch("abcde1"));
EXPECT_TRUE(fuzzy.isMatch("abcd12"));
EXPECT_FALSE(fuzzy.isMatch("abc123"));
- EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced
+ EXPECT_FALSE(fuzzy.isMatch("12cdef"));
}
TEST(FuzzyMatcherTest, get_prefix_is_empty) {
- FuzzyMatcher fuzzy = from_term("whatever", 2, 0);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("whatever", 2, 0);
EXPECT_EQ(fuzzy.getPrefix(), "");
}
+TEST(FuzzyMatcherTest, term_is_empty) {
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("", 2, 0);
+ EXPECT_TRUE(fuzzy.isMatch(""));
+ EXPECT_TRUE(fuzzy.isMatch("a"));
+ EXPECT_TRUE(fuzzy.isMatch("aa"));
+ EXPECT_FALSE(fuzzy.isMatch("aaa"));
+}
+
TEST(FuzzyMatcherTest, get_prefix_non_empty) {
- FuzzyMatcher fuzzy = from_term("abcd", 2, 2);
+ FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcd", 2, 2);
EXPECT_EQ(fuzzy.getPrefix(), "ab");
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
index e170f83c541..21fdb86a48d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -6,24 +6,51 @@
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
-vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) {
- return FuzzyMatcher(LowerCase::convert_to_ucs4(term));
+vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength) {
+ return FuzzyMatcher(LowerCase::convert_to_ucs4(term), maxEditDistance, prefixLength);
+}
+
+std::span<const uint32_t> vespalib::FuzzyMatcher::get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) {
+ if (prefixLength == 0 || termCodepoints.empty()) {
+ return {};
+ } else {
+ uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size()));
+ return {termCodepoints.begin(), termCodepoints.begin() + actualPrefixLength};
+ }
+}
+
+std::span<const uint32_t> vespalib::FuzzyMatcher::get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) {
+ if (termCodepoints.empty()) {
+ return {};
+ } else {
+ uint32_t actualPrefixLength = std::min(prefixLength, static_cast<uint32_t>(termCodepoints.size()));
+ return {termCodepoints.begin() + actualPrefixLength, termCodepoints.end()};
+ }
}
bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target);
+ if (_prefix_size > 0) { // prefix comparison is meaningless if it's empty
+ std::span<const uint32_t> targetPrefix = get_prefix(targetCodepoints, _prefix_size);
+ // if prefix does not match, early stop
+ if (!std::equal(_folded_term_codepoints_prefix.begin(), _folded_term_codepoints_prefix.end(),
+ targetPrefix.begin(), targetPrefix.end())) {
+ return false;
+ }
+ }
+
return LevenshteinDistance::calculate(
- _folded_term_codepoints,
- targetCodepoints,
+ _folded_term_codepoints_suffix,
+ get_suffix(targetCodepoints, _prefix_size),
_max_edit_distance).has_value();
}
vespalib::string vespalib::FuzzyMatcher::getPrefix() const {
vespalib::string prefix;
Utf8Writer writer(prefix);
- for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) {
- writer.putChar(_folded_term_codepoints.at(i));
+ for (const uint32_t& code: _folded_term_codepoints_prefix) {
+ writer.putChar(code);
}
return prefix;
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
index d4ee0e6d40f..12cc039706f 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -3,6 +3,7 @@
#include <string_view>
#include <vector>
+#include <span>
#include <vespa/vespalib/stllike/string.h>
@@ -13,37 +14,46 @@ namespace vespalib {
* Class has two main parameters:
* - prefix size, i.e the size of the prefix that is considered frozen.
* - max edit distance, i.e an upper bound for edit distance for it to be a match between terms
- * Note that prefix size is not impacting matching logic,
- * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic.
- * But there is no code to actually enforcing it.
+ * Prefix size dictates of how match of a prefix is frozen,
+ * i.e. if prefixes between the document and the query do not match (after lowercase)
+ * matcher would return false early, without fuzzy match.
*/
class FuzzyMatcher {
private:
- static constexpr uint8_t DefaultPrefixSize = 0u;
- static constexpr uint8_t DefaultMaxEditDistance = 2u;
+ static constexpr uint32_t DefaultPrefixSize = 0u;
+ static constexpr uint32_t DefaultMaxEditDistance = 2u;
+
+ uint32_t _max_edit_distance; // max edit distance
+ uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
std::vector<uint32_t> _folded_term_codepoints;
- uint8_t _max_edit_distance; // max edit distance
- uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
+ std::span<const uint32_t> _folded_term_codepoints_prefix;
+ std::span<const uint32_t> _folded_term_codepoints_suffix;
public:
FuzzyMatcher():
- _folded_term_codepoints(),
_max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize)
+ _prefix_size(DefaultPrefixSize),
+ _folded_term_codepoints(),
+ _folded_term_codepoints_prefix(),
+ _folded_term_codepoints_suffix()
{}
- FuzzyMatcher(std::vector<uint32_t> codepoints):
- _folded_term_codepoints(std::move(codepoints)),
+ FuzzyMatcher(const std::vector<uint32_t>&& codepoints):
_max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize)
+ _prefix_size(DefaultPrefixSize),
+ _folded_term_codepoints(codepoints),
+ _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
+ _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
{}
- FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size):
- _folded_term_codepoints(std::move(codepoints)),
+ FuzzyMatcher(const std::vector<uint32_t>&& codepoints, uint32_t max_edit_distance, uint32_t prefix_size):
_max_edit_distance(max_edit_distance),
- _prefix_size(prefix_size)
+ _prefix_size(prefix_size),
+ _folded_term_codepoints(codepoints),
+ _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
+ _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
{}
[[nodiscard]] bool isMatch(std::string_view target) const;
@@ -52,7 +62,11 @@ public:
///
- static FuzzyMatcher from_term(std::string_view term);
+ static FuzzyMatcher from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength);
+
+ static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
+
+ static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
};
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
index 86ced3e52db..5f97a652f52 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.cpp
@@ -4,9 +4,10 @@
#include "levenshtein_distance.h"
#include <limits>
+#include <vector>
std::optional<uint32_t>
-vespalib::LevenshteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold)
+vespalib::LevenshteinDistance::calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold)
{
uint32_t n = left.size();
uint32_t m = right.size();
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
index 311211aba7a..c1a3435db44 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_distance.h
@@ -3,7 +3,7 @@
#include <optional>
#include <cstdint>
-#include <vector>
+#include <span>
namespace vespalib {
@@ -19,7 +19,7 @@ namespace vespalib {
*/
class LevenshteinDistance {
public:
- static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold);
+ static std::optional<uint32_t> calculate(std::span<const uint32_t> left, std::span<const uint32_t> right, uint32_t threshold);
};
}