diff options
22 files changed, 205 insertions, 41 deletions
diff --git a/config-model/src/main/java/com/yahoo/schema/document/Matching.java b/config-model/src/main/java/com/yahoo/schema/document/Matching.java index 9d68553fa80..33256fa8586 100644 --- a/config-model/src/main/java/com/yahoo/schema/document/Matching.java +++ b/config-model/src/main/java/com/yahoo/schema/document/Matching.java @@ -33,6 +33,8 @@ public class Matching implements Cloneable, Serializable { private Integer maxLength; /** Maximum number of occurrences for each term */ private Integer maxTermOccurrences; + /** Maximum number of characters in a token. */ + private Integer maxTokenLength; private String exactMatchTerminator = null; @@ -61,6 +63,8 @@ public class Matching implements Cloneable, Serializable { public Matching maxLength(int maxLength) { this.maxLength = maxLength; return this; } public Integer maxTermOccurrences() { return maxTermOccurrences; } public Matching maxTermOccurrences(int maxTermOccurrences) { this.maxTermOccurrences = maxTermOccurrences; return this; } + public Integer maxTokenLength() { return maxTokenLength; } + public Matching maxTokenLength(int maxTokenLength) { this.maxTokenLength = maxTokenLength; return this; } public boolean isTypeUserSet() { return typeUserSet; } public MatchAlgorithm getAlgorithm() { return algorithm; } diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java index 7659a1e6562..173eebe2a94 100644 --- a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java +++ b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java @@ -44,6 +44,7 @@ public class ConvertParsedFields { parsed.getGramSize().ifPresent(gramSize -> field.getMatching().setGramSize(gramSize)); parsed.getMaxLength().ifPresent(maxLength -> field.getMatching().maxLength(maxLength)); parsed.getMaxTermOccurrences().ifPresent(maxTermOccurrences -> field.getMatching().maxTermOccurrences(maxTermOccurrences)); + parsed.getMaxTokenLength().ifPresent(maxTokenLength -> field.getMatching().maxTokenLength(maxTokenLength)); parsed.getMatchAlgorithm().ifPresent (matchingAlgorithm -> field.setMatchingAlgorithm(matchingAlgorithm)); parsed.getExactTerminator().ifPresent diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java b/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java index c7d1a215ce3..bac2c894283 100644 --- a/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java +++ b/config-model/src/main/java/com/yahoo/schema/parser/ParsedMatchSettings.java @@ -23,6 +23,7 @@ public class ParsedMatchSettings { private Integer gramSize = null; private Integer maxLength = null; private Integer maxTermOccurrences = null; + private Integer maxTokenLength = null; Optional<MatchType> getMatchType() { return Optional.ofNullable(matchType); } Optional<Case> getMatchCase() { return Optional.ofNullable(matchCase); } @@ -31,6 +32,7 @@ public class ParsedMatchSettings { Optional<Integer> getGramSize() { return Optional.ofNullable(gramSize); } Optional<Integer> getMaxLength() { return Optional.ofNullable(maxLength); } Optional<Integer> getMaxTermOccurrences() { return Optional.ofNullable(maxTermOccurrences); } + Optional<Integer> getMaxTokenLength() { return Optional.ofNullable(maxTokenLength); } // TODO - consider allowing each set only once: void setType(MatchType value) { this.matchType = value; } @@ -40,5 +42,6 @@ public class ParsedMatchSettings { void setGramSize(int value) { this.gramSize = value; } void setMaxLength(int value) { this.maxLength = value; } void setMaxTermOccurrences(int value) { this.maxTermOccurrences = value; } + void setMaxTokenLength(int value) { this.maxTokenLength = value; } } diff --git a/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java b/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java index 056c37a9830..4313ceb4be1 100644 --- a/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java +++ b/config-model/src/main/java/com/yahoo/schema/processing/ExactMatch.java @@ -16,6 +16,7 @@ import com.yahoo.vespa.indexinglanguage.expressions.ForEachExpression; import com.yahoo.vespa.indexinglanguage.expressions.IndexExpression; import com.yahoo.vespa.indexinglanguage.expressions.OutputExpression; import com.yahoo.vespa.indexinglanguage.expressions.ScriptExpression; +import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.vespa.model.container.search.QueryProfiles; /** @@ -75,7 +76,11 @@ public class ExactMatch extends Processor { } ScriptExpression script = field.getIndexingScript(); if (new ExpressionSearcher<>(IndexExpression.class).containedIn(script)) { - field.setIndexingScript(schema.getName(), (ScriptExpression)new MyProvider(schema).convert(field.getIndexingScript())); + var maxTokenLength = field.getMatching().maxTokenLength(); + if (maxTokenLength == null) { + maxTokenLength = AnnotatorConfig.getDefaultMaxTokenLength(); + } + field.setIndexingScript(schema.getName(), (ScriptExpression)new MyProvider(schema, maxTokenLength).convert(field.getIndexingScript())); } } @@ -85,8 +90,12 @@ public class ExactMatch extends Processor { private static class MyProvider extends TypedTransformProvider { - MyProvider(Schema schema) { + private int maxTokenLength; + + MyProvider(Schema schema, int maxTokenLength) + { super(ExactExpression.class, schema); + this.maxTokenLength = maxTokenLength; } @Override @@ -96,7 +105,7 @@ public class ExactMatch extends Processor { @Override protected Expression newTransform(DataType fieldType) { - Expression exp = new ExactExpression(); + Expression exp = new ExactExpression(maxTokenLength); if (fieldType instanceof CollectionDataType) { exp = new ForEachExpression(exp); } diff --git a/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java b/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java index 1ff019038fc..3f23cbc9b2d 100644 --- a/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java +++ b/config-model/src/main/java/com/yahoo/schema/processing/TextMatch.java @@ -70,6 +70,10 @@ public class TextMatch extends Processor { if (maxTermOccurrences != null) { ret.setMaxTermOccurrences(maxTermOccurrences); } + var maxTokenLength = fieldMatching.maxTokenLength(); + if (maxTokenLength != null) { + ret.setMaxTokenLength(maxTokenLength); + } } return ret; } diff --git a/config-model/src/main/javacc/SchemaParser.jj b/config-model/src/main/javacc/SchemaParser.jj index b40f2d0796d..1365c133932 100644 --- a/config-model/src/main/javacc/SchemaParser.jj +++ b/config-model/src/main/javacc/SchemaParser.jj @@ -183,6 +183,7 @@ TOKEN : | < GRAM_SIZE: "gram-size" > | < MAX_LENGTH: "max-length" > | < MAX_OCCURRENCES: "max-occurrences" > +| < MAX_TOKEN_LENGTH: "max-token-length" > | < PREFIX: "prefix" > | < SUBSTRING: "substring" > | < SUFFIX: "suffix" > @@ -1368,7 +1369,8 @@ void matchType(ParsedMatchSettings matchInfo) : { } */ void matchItem(ParsedMatchSettings matchInfo) : { } { - ( matchType(matchInfo) | exactTerminator(matchInfo) | gramSize(matchInfo) | matchSize(matchInfo) | maxTermOccurrences(matchInfo)) + ( matchType(matchInfo) | exactTerminator(matchInfo) | gramSize(matchInfo) | matchSize(matchInfo) | + maxTermOccurrences(matchInfo) | maxTokenLength(matchInfo) ) } void exactTerminator(ParsedMatchSettings matchInfo) : @@ -1413,6 +1415,16 @@ void maxTermOccurrences(ParsedMatchSettings matchInfo) : } } +void maxTokenLength(ParsedMatchSettings matchInfo) : +{ + int maxTokenLength; +} +{ + <MAX_TOKEN_LENGTH> <COLON> maxTokenLength = integer() { + matchInfo.setMaxTokenLength(maxTokenLength); + } +} + /** * Consumes a rank statement of a field element. * diff --git a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java index 34ca6c30a61..4186e352388 100644 --- a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java +++ b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java @@ -170,6 +170,23 @@ public class SchemaParserTestCase { assertEquals(11, field.matchSettings().getMaxTermOccurrences().get()); } + @Test + void maxTokenLengthCanBeParsed() throws Exception { + String input = joinLines + ("schema foo {", + " document foo {", + " field bar type string {", + " indexing: summary | index", + " match { max-token-length: 11 }", + " }", + " }", + "}"); + ParsedSchema schema = parseString(input); + var field = schema.getDocument().getFields().get(0); + assertEquals("bar", field.name()); + assertEquals(11, field.matchSettings().getMaxTokenLength().get()); + } + void checkFileParses(String fileName) throws Exception { var schema = parseFile(fileName); assertNotNull(schema); diff --git a/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java b/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java index de99d46b9ca..355a810f5ff 100644 --- a/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java +++ b/config-model/src/test/java/com/yahoo/schema/processing/IndexingScriptRewriterTestCase.java @@ -10,6 +10,7 @@ import com.yahoo.schema.Schema; import com.yahoo.schema.ApplicationBuilder; import com.yahoo.schema.AbstractSchemaTestCase; import com.yahoo.schema.document.BooleanIndexDefinition; +import com.yahoo.schema.document.MatchType; import com.yahoo.schema.document.SDDocumentType; import com.yahoo.schema.document.SDField; import com.yahoo.vespa.documentmodel.SummaryField; @@ -155,6 +156,24 @@ public class IndexingScriptRewriterTestCase extends AbstractSchemaTestCase { field); } + @Test + void requireThatMaxTokenLengthIsPropagated() { + var field = new SDField("test", DataType.STRING); + field.getMatching().maxTokenLength(10); + field.parseIndexingScript("test", "{ summary | index }"); + assertIndexingScript("{ input test | tokenize normalize stem:\"BEST\" max-token-length:10 | summary test | index test; }", + field); + } + + @Test + void requireThatMaxTokenLengthIsPropagatedForWordMatch() { + var field = new SDField("test", DataType.STRING); + field.getMatching().maxTokenLength(10).setType(MatchType.WORD); + field.parseIndexingScript("test", "{ summary | index }"); + assertIndexingScript("{ input test | exact max-token-length:10 | summary test | index test; }", + field); + } + private static void assertIndexingScript(String expectedScript, SDField unprocessedField) { assertEquals(expectedScript, processField(unprocessedField).toString()); diff --git a/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/StreamingSearcherTestCase.java b/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/StreamingSearcherTestCase.java index cd9ef708920..25b54267242 100644 --- a/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/StreamingSearcherTestCase.java +++ b/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/StreamingSearcherTestCase.java @@ -167,6 +167,7 @@ public class StreamingSearcherTestCase { Query[] queries = new Query[4]; // Increase coverage for (int i = 0; i<queries.length; i++) { Query query = new Query(queryString); + query.setTimeout(1000); if (i == 0) { } else if (i == 1) { query.getPresentation().setSummary("summary"); diff --git a/dependency-versions/pom.xml b/dependency-versions/pom.xml index f9041eb3ccc..13c01a42f06 100644 --- a/dependency-versions/pom.xml +++ b/dependency-versions/pom.xml @@ -68,7 +68,7 @@ <assertj.vespa.version>3.25.3</assertj.vespa.version> <!-- Athenz dependencies. Make sure these dependencies match those in Vespa's internal repositories --> - <aws-sdk.vespa.version>1.12.716</aws-sdk.vespa.version> + <aws-sdk.vespa.version>1.12.717</aws-sdk.vespa.version> <athenz.vespa.version>1.11.57</athenz.vespa.version> <!-- Athenz END --> @@ -117,7 +117,7 @@ <junit.vespa.version>5.10.2</junit.vespa.version> <junit.platform.vespa.version>1.10.2</junit.platform.vespa.version> <junit4.vespa.version>4.13.2</junit4.vespa.version> - <kherud.llama.vespa.version>3.0.1</kherud.llama.vespa.version> + <kherud.llama.vespa.version>3.0.2</kherud.llama.vespa.version> <luben.zstd.vespa.version>1.5.6-3</luben.zstd.vespa.version> <lucene.vespa.version>9.10.0</lucene.vespa.version> <maven-archiver.vespa.version>3.6.2</maven-archiver.vespa.version> diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java index 855430f45fc..7481363b737 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java @@ -12,6 +12,9 @@ import com.yahoo.document.annotation.SpanTrees; import com.yahoo.document.datatypes.IntegerFieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.process.TokenType; +import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; + +import java.util.OptionalInt; import static com.yahoo.language.LinguisticsCase.toLowerCase; @@ -20,8 +23,19 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase; */ public final class ExactExpression extends Expression { - public ExactExpression() { + private int maxTokenLength; + + private ExactExpression(OptionalInt maxTokenLength) { super(DataType.STRING); + this.maxTokenLength = maxTokenLength.isPresent() ? maxTokenLength.getAsInt() : AnnotatorConfig.getDefaultMaxTokenLength(); + } + + public ExactExpression() { + this(OptionalInt.empty());; + } + + public ExactExpression(int maxTokenLength) { + this(OptionalInt.of(maxTokenLength)); } @Override @@ -36,6 +50,12 @@ public final class ExactExpression extends Expression { String next = toLowerCase(prev); SpanTree tree = output.getSpanTree(SpanTrees.LINGUISTICS); + if (next.length() > maxTokenLength) { + if (tree != null) { + output.removeSpanTree(SpanTrees.LINGUISTICS); + } + return; + } SpanList root; if (tree == null) { root = new SpanList(); @@ -64,8 +84,14 @@ public final class ExactExpression extends Expression { } @Override - public String toString() { - return "exact"; + public String toString() + { + StringBuilder ret = new StringBuilder(); + ret.append("exact"); + if (maxTokenLength != AnnotatorConfig.getDefaultMaxTokenLength()) { + ret.append(" max-token-length:" + maxTokenLength); + } + return ret.toString(); } @Override diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java index 849bc075a64..a3c404e50c3 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java @@ -69,6 +69,9 @@ public final class TokenizeExpression extends Expression { if (config.hasNonDefaultMaxTokenizeLength()) { ret.append(" max-length:" + config.getMaxTokenizeLength()); } + if (config.hasNonDefaultMaxTokenLength()) { + ret.append(" max-token-length:" + config.getMaxTokenLength()); + } if (config.hasNonDefaultMaxTermOccurrences()) { ret.append(" max-occurrences:" + config.getMaxTermOccurrences()); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 4e5ef0d90df..6522e284fc8 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -14,14 +14,17 @@ public class AnnotatorConfig implements Cloneable { private StemMode stemMode; private boolean removeAccents; private int maxTermOccurrences; + private int maxTokenLength; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; + private static final int DEFAULT_MAX_TOKEN_LENGTH; private static final int DEFAULT_MAX_TOKENIZE_LENGTH; static { IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder()); DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences(); + DEFAULT_MAX_TOKEN_LENGTH = defaults.maxtokenlength(); DEFAULT_MAX_TOKENIZE_LENGTH = defaults.fieldmatchmaxlength(); } @@ -30,6 +33,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = StemMode.NONE; removeAccents = false; maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -38,6 +42,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; maxTermOccurrences = rhs.maxTermOccurrences; + maxTokenLength = rhs.maxTokenLength; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -82,6 +87,17 @@ public class AnnotatorConfig implements Cloneable { return this; } + public AnnotatorConfig setMaxTokenLength(int maxTokenLength) { + this.maxTokenLength = maxTokenLength; + return this; + } + + public int getMaxTokenLength() { + return maxTokenLength; + } + + public static int getDefaultMaxTokenLength() { return DEFAULT_MAX_TOKEN_LENGTH; } + public AnnotatorConfig setMaxTokenizeLength(int maxTokenizeLength) { this.maxTokenizeLength = maxTokenizeLength; return this; @@ -91,6 +107,10 @@ public class AnnotatorConfig implements Cloneable { return maxTokenizeLength; } + public boolean hasNonDefaultMaxTokenLength() { + return maxTokenLength != DEFAULT_MAX_TOKEN_LENGTH; + } + public boolean hasNonDefaultMaxTokenizeLength() { return maxTokenizeLength != DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -116,6 +136,9 @@ public class AnnotatorConfig implements Cloneable { if (maxTermOccurrences != rhs.maxTermOccurrences) { return false; } + if (maxTokenLength != rhs.maxTokenLength) { + return false; + } if (maxTokenizeLength != rhs.maxTokenizeLength) { return false; } @@ -125,7 +148,7 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenLength + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 86d4e91a567..913b874c6f6 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -78,7 +78,8 @@ public class LinguisticsAnnotator { TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) - addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences); + addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences, + config.getMaxTokenLength()); if (tree.numAnnotations() == 0) return false; text.setSpanTree(tree); @@ -100,17 +101,22 @@ public class LinguisticsAnnotator { return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term)); } - private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) { + private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences, + int maxTokenLength) { + if (term.length() > maxTokenLength) { + return; + } if (termOccurrences.termCountBelowLimit(term)) { here.annotate(termAnnotation(term, orig)); } } - private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, TermOccurrences termOccurrences) { + private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, + TermOccurrences termOccurrences, int maxTokenLength) { if ( ! token.isSpecialToken()) { if (token.getNumComponents() > 0) { for (int i = 0; i < token.getNumComponents(); ++i) { - addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences); + addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences, maxTokenLength); } return; } @@ -130,18 +136,21 @@ public class LinguisticsAnnotator { String lowercasedOrig = toLowerCase(token.getOrig()); String term = token.getTokenString(); if (term != null) { - addAnnotation(where, term, token.getOrig(), termOccurrences); + addAnnotation(where, term, token.getOrig(), termOccurrences, maxTokenLength); if ( ! term.equals(lowercasedOrig)) - addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences); + addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences, maxTokenLength); } for (int i = 0; i < token.getNumStems(); i++) { String stem = token.getStem(i); if (! (stem.equals(lowercasedOrig) || stem.equals(term))) - addAnnotation(where, stem, token.getOrig(), termOccurrences); + addAnnotation(where, stem, token.getOrig(), termOccurrences, maxTokenLength); } } else { String term = token.getTokenString(); if (term == null || term.trim().isEmpty()) return; + if (term.length() > maxTokenLength) { + return; + } if (termOccurrences.termCountBelowLimit(term)) { parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig())); } diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index 77591d3e54e..29ca5270db8 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -174,6 +174,7 @@ TOKEN : <LOWER_CASE: "lowercase"> | <MAX_LENGTH: "max-length"> | <MAX_OCCURRENCES: "max-occurrences"> | + <MAX_TOKEN_LENGTH: "max-token-length"> | <NGRAM: "ngram"> | <NORMALIZE: "normalize"> | <NOW: "now"> | @@ -407,10 +408,13 @@ Expression embedExp() : { return new EmbedExpression(embedders, embedderId, embedderArguments); } } -Expression exactExp() : { } +Expression exactExp() : { - ( <EXACT> ) - { return new ExactExpression(); } + int maxTokenLength = annotatorCfg.getMaxTokenLength(); +} +{ + ( <EXACT> [ <MAX_TOKEN_LENGTH> <COLON> maxTokenLength = integer() ] ) + { return new ExactExpression(maxTokenLength); } } Expression flattenExp() : { } @@ -686,11 +690,13 @@ AnnotatorConfig tokenizeCfg() : String str = "SHORTEST"; Integer maxLength; Integer maxTermOccurrences; + Integer maxTokenLength; } { ( <STEM> ( <COLON> str = string() ) ? { val.setStemMode(str); } | <MAX_LENGTH> <COLON> maxLength = integer() { val.setMaxTokenizeLength(maxLength); } | <MAX_OCCURRENCES> <COLON> maxTermOccurrences = integer() { val.setMaxTermOccurrences(maxTermOccurrences); } | + <MAX_TOKEN_LENGTH> <COLON> maxTokenLength = integer() { val.setMaxTokenLength(maxTokenLength); } | <NORMALIZE> { val.setRemoveAccents(true); } )+ { return val; } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java index 403d1820f70..b338c45f7a4 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java @@ -63,6 +63,15 @@ public class ExactTestCase { } @Test + public void requireThatLongStringsAreNotAnnotated() { + ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter()); + ctx.setValue(new StringFieldValue("foo")); + new ExactExpression(2).execute(ctx); + + assertNull(((StringFieldValue)ctx.getValue()).getSpanTree(SpanTrees.LINGUISTICS)); + } + + @Test public void requireThatEmptyStringsAreNotAnnotated() { ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter()); ctx.setValue(new StringFieldValue("")); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java index 01ffbe359f3..7ed3ab410a3 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java @@ -62,4 +62,15 @@ public class TokenizeTestCase { assertTrue(val instanceof StringFieldValue); assertNotNull(((StringFieldValue)val).getSpanTree(SpanTrees.LINGUISTICS)); } + + @Test + public void requireThatLongWordIsDropped() { + ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter()); + ctx.setValue(new StringFieldValue("foo")); + new TokenizeExpression(new SimpleLinguistics(), new AnnotatorConfig().setMaxTokenLength(2)).execute(ctx); + + FieldValue val = ctx.getValue(); + assertTrue(val instanceof StringFieldValue); + assertNull(((StringFieldValue)val).getSpanTree(SpanTrees.LINGUISTICS)); + } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java index 0d34d2841fd..c3131e28906 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java @@ -27,6 +27,8 @@ public class AnnotatorConfigTestCase { assertTrue(config.getRemoveAccents()); config.setRemoveAccents(false); assertFalse(config.getRemoveAccents()); + config.setMaxTokenLength(10); + assertEquals(10, config.getMaxTokenLength()); } @Test @@ -35,11 +37,13 @@ public class AnnotatorConfigTestCase { config.setLanguage(Language.ARABIC); config.setStemMode(StemMode.SHORTEST); config.setRemoveAccents(!config.getRemoveAccents()); + config.setMaxTokenLength(11); AnnotatorConfig other = new AnnotatorConfig(config); assertEquals(config.getLanguage(), other.getLanguage()); assertEquals(config.getStemMode(), other.getStemMode()); assertEquals(config.getRemoveAccents(), other.getRemoveAccents()); + assertEquals(config.getMaxTokenLength(), other.getMaxTokenLength()); } @Test @@ -49,6 +53,7 @@ public class AnnotatorConfigTestCase { assertFalse(config.equals(newConfig(Language.SPANISH, StemMode.SHORTEST, false))); assertFalse(config.equals(newConfig(Language.DUTCH, StemMode.SHORTEST, false))); assertFalse(config.equals(newConfig(Language.DUTCH, StemMode.NONE, false))); + assertNotEquals(config, newConfig(Language.DUTCH, StemMode.NONE, true).setMaxTokenLength(10)); assertEquals(config, newConfig(Language.DUTCH, StemMode.NONE, true)); assertEquals(config.hashCode(), newConfig(Language.DUTCH, StemMode.NONE, true).hashCode()); } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java index a7ed7ae3e72..1b7c6973f1e 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java @@ -27,6 +27,7 @@ public class ExpressionTestCase { assertExpression(ClearStateExpression.class, "clear_state"); assertExpression(EchoExpression.class, "echo"); assertExpression(ExactExpression.class, "exact"); + assertExpression(ExactExpression.class, "exact max-token-length: 10", Optional.of("exact max-token-length:10")); assertExpression(FlattenExpression.class, "flatten"); assertExpression(ForEachExpression.class, "for_each { 1 }"); assertExpression(GetFieldExpression.class, "get_field field1"); @@ -73,6 +74,7 @@ public class ExpressionTestCase { assertExpression(TokenizeExpression.class, "tokenize stem:\"ALL\""); assertExpression(TokenizeExpression.class, "tokenize normalize"); assertExpression(TokenizeExpression.class, "tokenize max-occurrences: 15", Optional.of("tokenize max-occurrences:15")); + assertExpression(TokenizeExpression.class, "tokenize max-token-length: 15", Optional.of("tokenize max-token-length:15")); assertExpression(ToLongExpression.class, "to_long"); assertExpression(ToPositionExpression.class, "to_pos"); assertExpression(ToStringExpression.class, "to_string"); diff --git a/vespa-athenz/src/main/java/com/yahoo/vespa/athenz/utils/SiaUtils.java b/vespa-athenz/src/main/java/com/yahoo/vespa/athenz/utils/SiaUtils.java index af0da93edc3..56e64b2261d 100644 --- a/vespa-athenz/src/main/java/com/yahoo/vespa/athenz/utils/SiaUtils.java +++ b/vespa-athenz/src/main/java/com/yahoo/vespa/athenz/utils/SiaUtils.java @@ -1,10 +1,10 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.athenz.utils; -import com.yahoo.vespa.athenz.api.AthenzIdentity; -import com.yahoo.vespa.athenz.api.AthenzService; import com.yahoo.security.KeyUtils; import com.yahoo.security.X509CertificateUtils; +import com.yahoo.vespa.athenz.api.AthenzIdentity; +import com.yahoo.vespa.athenz.api.AthenzService; import java.io.IOException; import java.io.UncheckedIOException; @@ -132,7 +132,7 @@ public class SiaUtils { try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(keysDirectory)) { return StreamSupport.stream(directoryStream.spliterator(), false) .map(path -> path.getFileName().toString()) - .filter(fileName -> fileName.endsWith(keyFileSuffix)) + .filter(fileName -> fileName.endsWith(keyFileSuffix) && ! fileName.contains(":role.")) .map(fileName -> fileName.substring(0, fileName.length() - keyFileSuffix.length())) .map(AthenzService::new) .collect(toList()); diff --git a/vespa-athenz/src/test/java/com/yahoo/vespa/athenz/utils/SiaUtilsTest.java b/vespa-athenz/src/test/java/com/yahoo/vespa/athenz/utils/SiaUtilsTest.java index 9ff59236c0c..8274fe7f7a6 100644 --- a/vespa-athenz/src/test/java/com/yahoo/vespa/athenz/utils/SiaUtilsTest.java +++ b/vespa-athenz/src/test/java/com/yahoo/vespa/athenz/utils/SiaUtilsTest.java @@ -32,6 +32,7 @@ public class SiaUtilsTest { Files.createFile(SiaUtils.getPrivateKeyFile(siaRoot, fooService)); AthenzService barService = new AthenzService("my.domain.bar"); Files.createFile(SiaUtils.getPrivateKeyFile(siaRoot, barService)); + Files.createFile(siaRoot.resolve("keys/my.domain.foo:role.my-role.key.pem")); List<AthenzIdentity> siaIdentities = SiaUtils.findSiaServices(siaRoot); assertEquals(2, siaIdentities.size()); diff --git a/vespalog/src/logger/runserver.cpp b/vespalog/src/logger/runserver.cpp index 9a0a499cd54..4e0141f88dc 100644 --- a/vespalog/src/logger/runserver.cpp +++ b/vespalog/src/logger/runserver.cpp @@ -6,7 +6,7 @@ #include <cerrno> #include <unistd.h> #include <csignal> - +#include <poll.h> #include <sys/select.h> #include <sys/types.h> #include <sys/wait.h> @@ -18,6 +18,7 @@ #include "llreader.h" #include <vespa/log/log.h> #include <chrono> +#include <array> LOG_SETUP("runserver"); @@ -179,8 +180,6 @@ int loop(const char *svc, char * const * run) pstdout[0], pstdout[1], pstderr[0], pstderr[1]); - int high = 1 + pstdout[0] + pstderr[0]; - pid_t child = fork(); if (child == 0) { @@ -237,24 +236,24 @@ int loop(const char *svc, char * const * run) bool outeof = false; bool erreof = false; - + constexpr int stdout_idx = 0, stderr_idx = 1; + std::array<pollfd, 2> fds{}; int wstat = 0; while (child || !outeof || !erreof) { - struct timeval timeout; - - timeout.tv_sec = 0; - timeout.tv_usec = 100000; // == 100 ms == 1/10 s - - fd_set pipes; - - FD_ZERO(&pipes); - if (!outeof) FD_SET(pstdout[0], &pipes); - if (!erreof) FD_SET(pstderr[0], &pipes); - - int n = select(high, &pipes, NULL, NULL, &timeout); + // Entries with negative fds are entirely ignored by the kernel. + fds[stdout_idx].fd = !outeof ? pstdout[0] : -1; + fds[stdout_idx].events = POLLIN; + fds[stdout_idx].revents = 0; + fds[stderr_idx].fd = !erreof ? pstderr[0] : -2; + fds[stderr_idx].events = POLLIN; + fds[stderr_idx].revents = 0; + + constexpr int poll_timeout_ms = 100; + int n = poll(fds.data(), fds.size(), poll_timeout_ms); if (n > 0) { - if (FD_ISSET(pstdout[0], &pipes)) { + constexpr short ev_mask = POLLIN | POLLERR | POLLHUP; + if ((fds[stdout_idx].revents & ev_mask) != 0) { LOG(debug, "out reader has input"); if (outReader.blockRead()) { while (outReader.hasInput()) { @@ -267,7 +266,7 @@ int loop(const char *svc, char * const * run) close(pstdout[0]); } } - if (FD_ISSET(pstderr[0], &pipes)) { + if ((fds[stderr_idx].revents & ev_mask) != 0) { LOG(debug, "err reader has input"); if (errReader.blockRead()) { while (errReader.hasInput()) { |