aboutsummaryrefslogtreecommitdiffstats
path: root/config-model
diff options
context:
space:
mode:
authorGeir Storli <geirst@vespa.ai>2024-05-08 14:16:06 +0200
committerGitHub <noreply@github.com>2024-05-08 14:16:06 +0200
commit545ac5436db577b10c401af4389497746983a5d5 (patch)
tree34e8009330926281fb2bc99fc0a34c7d7cbd7641 /config-model
parent64a8b6931a655b52c91c55bcc61cf5abbe847464 (diff)
parent548ee1a5950a972ec5ef6a3be3d18f68035322a5 (diff)
Merge pull request #31139 from vespa-engine/toregge/enable-setting-max-token-length-in-field-match
Enable setting max-token-length in field match.
Diffstat (limited to 'config-model')
-rw-r--r--config-model/src/main/java/com/yahoo/schema/document/Matching.java4
-rw-r--r--config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java1
-rw-r--r--config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java3
-rw-r--r--config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java15
-rw-r--r--config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java4
-rw-r--r--config-model/src/main/javacc/SchemaParser.jj14
-rw-r--r--config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java17
-rw-r--r--config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java19
8 files changed, 73 insertions, 4 deletions
diff --git a/config-model/src/main/java/com/yahoo/schema/document/Matching.java b/config-model/src/main/java/com/yahoo/schema/document/Matching.java
index 9d68553fa80..33256fa8586 100644
--- a/config-model/src/main/java/com/yahoo/schema/document/Matching.java
+++ b/config-model/src/main/java/com/yahoo/schema/document/Matching.java
@@ -33,6 +33,8 @@ public class Matching implements Cloneable, Serializable {
private Integer maxLength;
/** Maximum number of occurrences for each term */
private Integer maxTermOccurrences;
+ /** Maximum number of characters in a token. */
+ private Integer maxTokenLength;
private String exactMatchTerminator = null;
@@ -61,6 +63,8 @@ public class Matching implements Cloneable, Serializable {
public Matching maxLength(int maxLength) { this.maxLength = maxLength; return this; }
public Integer maxTermOccurrences() { return maxTermOccurrences; }
public Matching maxTermOccurrences(int maxTermOccurrences) { this.maxTermOccurrences = maxTermOccurrences; return this; }
+ public Integer maxTokenLength() { return maxTokenLength; }
+ public Matching maxTokenLength(int maxTokenLength) { this.maxTokenLength = maxTokenLength; return this; }
public boolean isTypeUserSet() { return typeUserSet; }
public MatchAlgorithm getAlgorithm() { return algorithm; }
diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java
index 7659a1e6562..173eebe2a94 100644
--- a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java
+++ b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java
@@ -44,6 +44,7 @@ public class ConvertParsedFields {
parsed.getGramSize().ifPresent(gramSize -> field.getMatching().setGramSize(gramSize));
parsed.getMaxLength().ifPresent(maxLength -> field.getMatching().maxLength(maxLength));
parsed.getMaxTermOccurrences().ifPresent(maxTermOccurrences -> field.getMatching().maxTermOccurrences(maxTermOccurrences));
+ parsed.getMaxTokenLength().ifPresent(maxTokenLength -> field.getMatching().maxTokenLength(maxTokenLength));
parsed.getMatchAlgorithm().ifPresent
(matchingAlgorithm -> field.setMatchingAlgorithm(matchingAlgorithm));
parsed.getExactTerminator().ifPresent
diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java b/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java
index c7d1a215ce3..bac2c894283 100644
--- a/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java
+++ b/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java
@@ -23,6 +23,7 @@ public class ParsedMatchSettings {
private Integer gramSize = null;
private Integer maxLength = null;
private Integer maxTermOccurrences = null;
+ private Integer maxTokenLength = null;
Optional<MatchType> getMatchType() { return Optional.ofNullable(matchType); }
Optional<Case> getMatchCase() { return Optional.ofNullable(matchCase); }
@@ -31,6 +32,7 @@ public class ParsedMatchSettings {
Optional<Integer> getGramSize() { return Optional.ofNullable(gramSize); }
Optional<Integer> getMaxLength() { return Optional.ofNullable(maxLength); }
Optional<Integer> getMaxTermOccurrences() { return Optional.ofNullable(maxTermOccurrences); }
+ Optional<Integer> getMaxTokenLength() { return Optional.ofNullable(maxTokenLength); }
// TODO - consider allowing each set only once:
void setType(MatchType value) { this.matchType = value; }
@@ -40,5 +42,6 @@ public class ParsedMatchSettings {
void setGramSize(int value) { this.gramSize = value; }
void setMaxLength(int value) { this.maxLength = value; }
void setMaxTermOccurrences(int value) { this.maxTermOccurrences = value; }
+ void setMaxTokenLength(int value) { this.maxTokenLength = value; }
}
diff --git a/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java b/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java
index 056c37a9830..4313ceb4be1 100644
--- a/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java
+++ b/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java
@@ -16,6 +16,7 @@ import com.yahoo.vespa.indexinglanguage.expressions.ForEachExpression;
import com.yahoo.vespa.indexinglanguage.expressions.IndexExpression;
import com.yahoo.vespa.indexinglanguage.expressions.OutputExpression;
import com.yahoo.vespa.indexinglanguage.expressions.ScriptExpression;
+import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
import com.yahoo.vespa.model.container.search.QueryProfiles;
/**
@@ -75,7 +76,11 @@ public class ExactMatch extends Processor {
}
ScriptExpression script = field.getIndexingScript();
if (new ExpressionSearcher<>(IndexExpression.class).containedIn(script)) {
- field.setIndexingScript(schema.getName(), (ScriptExpression)new MyProvider(schema).convert(field.getIndexingScript()));
+ var maxTokenLength = field.getMatching().maxTokenLength();
+ if (maxTokenLength == null) {
+ maxTokenLength = AnnotatorConfig.getDefaultMaxTokenLength();
+ }
+ field.setIndexingScript(schema.getName(), (ScriptExpression)new MyProvider(schema, maxTokenLength).convert(field.getIndexingScript()));
}
}
@@ -85,8 +90,12 @@ public class ExactMatch extends Processor {
private static class MyProvider extends TypedTransformProvider {
- MyProvider(Schema schema) {
+ private int maxTokenLength;
+
+ MyProvider(Schema schema, int maxTokenLength)
+ {
super(ExactExpression.class, schema);
+ this.maxTokenLength = maxTokenLength;
}
@Override
@@ -96,7 +105,7 @@ public class ExactMatch extends Processor {
@Override
protected Expression newTransform(DataType fieldType) {
- Expression exp = new ExactExpression();
+ Expression exp = new ExactExpression(maxTokenLength);
if (fieldType instanceof CollectionDataType) {
exp = new ForEachExpression(exp);
}
diff --git a/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java b/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java
index 1ff019038fc..3f23cbc9b2d 100644
--- a/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java
+++ b/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java
@@ -70,6 +70,10 @@ public class TextMatch extends Processor {
if (maxTermOccurrences != null) {
ret.setMaxTermOccurrences(maxTermOccurrences);
}
+ var maxTokenLength = fieldMatching.maxTokenLength();
+ if (maxTokenLength != null) {
+ ret.setMaxTokenLength(maxTokenLength);
+ }
}
return ret;
}
diff --git a/config-model/src/main/javacc/SchemaParser.jj b/config-model/src/main/javacc/SchemaParser.jj
index b40f2d0796d..1365c133932 100644
--- a/config-model/src/main/javacc/SchemaParser.jj
+++ b/config-model/src/main/javacc/SchemaParser.jj
@@ -183,6 +183,7 @@ TOKEN :
| < GRAM_SIZE: "gram-size" >
| < MAX_LENGTH: "max-length" >
| < MAX_OCCURRENCES: "max-occurrences" >
+| < MAX_TOKEN_LENGTH: "max-token-length" >
| < PREFIX: "prefix" >
| < SUBSTRING: "substring" >
| < SUFFIX: "suffix" >
@@ -1368,7 +1369,8 @@ void matchType(ParsedMatchSettings matchInfo) : { }
*/
void matchItem(ParsedMatchSettings matchInfo) : { }
{
- ( matchType(matchInfo) | exactTerminator(matchInfo) | gramSize(matchInfo) | matchSize(matchInfo) | maxTermOccurrences(matchInfo))
+ ( matchType(matchInfo) | exactTerminator(matchInfo) | gramSize(matchInfo) | matchSize(matchInfo) |
+ maxTermOccurrences(matchInfo) | maxTokenLength(matchInfo) )
}
void exactTerminator(ParsedMatchSettings matchInfo) :
@@ -1413,6 +1415,16 @@ void maxTermOccurrences(ParsedMatchSettings matchInfo) :
}
}
+void maxTokenLength(ParsedMatchSettings matchInfo) :
+{
+ int maxTokenLength;
+}
+{
+ <MAX_TOKEN_LENGTH> <COLON> maxTokenLength = integer() {
+ matchInfo.setMaxTokenLength(maxTokenLength);
+ }
+}
+
/**
* Consumes a rank statement of a field element.
*
diff --git a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java
index 34ca6c30a61..4186e352388 100644
--- a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java
+++ b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java
@@ -170,6 +170,23 @@ public class SchemaParserTestCase {
assertEquals(11, field.matchSettings().getMaxTermOccurrences().get());
}
+ @Test
+ void maxTokenLengthCanBeParsed() throws Exception {
+ String input = joinLines
+ ("schema foo {",
+ " document foo {",
+ " field bar type string {",
+ " indexing: summary | index",
+ " match { max-token-length: 11 }",
+ " }",
+ " }",
+ "}");
+ ParsedSchema schema = parseString(input);
+ var field = schema.getDocument().getFields().get(0);
+ assertEquals("bar", field.name());
+ assertEquals(11, field.matchSettings().getMaxTokenLength().get());
+ }
+
void checkFileParses(String fileName) throws Exception {
var schema = parseFile(fileName);
assertNotNull(schema);
diff --git a/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java b/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java
index de99d46b9ca..355a810f5ff 100644
--- a/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java
+++ b/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java
@@ -10,6 +10,7 @@ import com.yahoo.schema.Schema;
import com.yahoo.schema.ApplicationBuilder;
import com.yahoo.schema.AbstractSchemaTestCase;
import com.yahoo.schema.document.BooleanIndexDefinition;
+import com.yahoo.schema.document.MatchType;
import com.yahoo.schema.document.SDDocumentType;
import com.yahoo.schema.document.SDField;
import com.yahoo.vespa.documentmodel.SummaryField;
@@ -155,6 +156,24 @@ public class IndexingScriptRewriterTestCase extends AbstractSchemaTestCase {
field);
}
+ @Test
+ void requireThatMaxTokenLengthIsPropagated() {
+ var field = new SDField("test", DataType.STRING);
+ field.getMatching().maxTokenLength(10);
+ field.parseIndexingScript("test", "{ summary | index }");
+ assertIndexingScript("{ input test | tokenize normalize stem:\"BEST\" max-token-length:10 | summary test | index test; }",
+ field);
+ }
+
+ @Test
+ void requireThatMaxTokenLengthIsPropagatedForWordMatch() {
+ var field = new SDField("test", DataType.STRING);
+ field.getMatching().maxTokenLength(10).setType(MatchType.WORD);
+ field.parseIndexingScript("test", "{ summary | index }");
+ assertIndexingScript("{ input test | exact max-token-length:10 | summary test | index test; }",
+ field);
+ }
+
private static void assertIndexingScript(String expectedScript, SDField unprocessedField) {
assertEquals(expectedScript,
processField(unprocessedField).toString());