diff options
Diffstat (limited to 'indexinglanguage')
9 files changed, 108 insertions, 14 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java index 855430f45fc..7481363b737 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java @@ -12,6 +12,9 @@ import com.yahoo.document.annotation.SpanTrees; import com.yahoo.document.datatypes.IntegerFieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.process.TokenType; +import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; + +import java.util.OptionalInt; import static com.yahoo.language.LinguisticsCase.toLowerCase; @@ -20,8 +23,19 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase; */ public final class ExactExpression extends Expression { - public ExactExpression() { + private int maxTokenLength; + + private ExactExpression(OptionalInt maxTokenLength) { super(DataType.STRING); + this.maxTokenLength = maxTokenLength.isPresent() ? maxTokenLength.getAsInt() : AnnotatorConfig.getDefaultMaxTokenLength(); + } + + public ExactExpression() { + this(OptionalInt.empty());; + } + + public ExactExpression(int maxTokenLength) { + this(OptionalInt.of(maxTokenLength)); } @Override @@ -36,6 +50,12 @@ public final class ExactExpression extends Expression { String next = toLowerCase(prev); SpanTree tree = output.getSpanTree(SpanTrees.LINGUISTICS); + if (next.length() > maxTokenLength) { + if (tree != null) { + output.removeSpanTree(SpanTrees.LINGUISTICS); + } + return; + } SpanList root; if (tree == null) { root = new SpanList(); @@ -64,8 +84,14 @@ public final class ExactExpression extends Expression { } @Override - public String toString() { - return "exact"; + public String toString() + { + StringBuilder ret = new StringBuilder(); + ret.append("exact"); + if (maxTokenLength != AnnotatorConfig.getDefaultMaxTokenLength()) { + ret.append(" max-token-length:" + maxTokenLength); + } + return ret.toString(); } @Override diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java index 849bc075a64..a3c404e50c3 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java @@ -69,6 +69,9 @@ public final class TokenizeExpression extends Expression { if (config.hasNonDefaultMaxTokenizeLength()) { ret.append(" max-length:" + config.getMaxTokenizeLength()); } + if (config.hasNonDefaultMaxTokenLength()) { + ret.append(" max-token-length:" + config.getMaxTokenLength()); + } if (config.hasNonDefaultMaxTermOccurrences()) { ret.append(" max-occurrences:" + config.getMaxTermOccurrences()); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 4e5ef0d90df..6522e284fc8 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -14,14 +14,17 @@ public class AnnotatorConfig implements Cloneable { private StemMode stemMode; private boolean removeAccents; private int maxTermOccurrences; + private int maxTokenLength; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; + private static final int DEFAULT_MAX_TOKEN_LENGTH; private static final int DEFAULT_MAX_TOKENIZE_LENGTH; static { IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder()); DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences(); + DEFAULT_MAX_TOKEN_LENGTH = defaults.maxtokenlength(); DEFAULT_MAX_TOKENIZE_LENGTH = defaults.fieldmatchmaxlength(); } @@ -30,6 +33,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = StemMode.NONE; removeAccents = false; maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -38,6 +42,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; maxTermOccurrences = rhs.maxTermOccurrences; + maxTokenLength = rhs.maxTokenLength; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -82,6 +87,17 @@ public class AnnotatorConfig implements Cloneable { return this; } + public AnnotatorConfig setMaxTokenLength(int maxTokenLength) { + this.maxTokenLength = maxTokenLength; + return this; + } + + public int getMaxTokenLength() { + return maxTokenLength; + } + + public static int getDefaultMaxTokenLength() { return DEFAULT_MAX_TOKEN_LENGTH; } + public AnnotatorConfig setMaxTokenizeLength(int maxTokenizeLength) { this.maxTokenizeLength = maxTokenizeLength; return this; @@ -91,6 +107,10 @@ public class AnnotatorConfig implements Cloneable { return maxTokenizeLength; } + public boolean hasNonDefaultMaxTokenLength() { + return maxTokenLength != DEFAULT_MAX_TOKEN_LENGTH; + } + public boolean hasNonDefaultMaxTokenizeLength() { return maxTokenizeLength != DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -116,6 +136,9 @@ public class AnnotatorConfig implements Cloneable { if (maxTermOccurrences != rhs.maxTermOccurrences) { return false; } + if (maxTokenLength != rhs.maxTokenLength) { + return false; + } if (maxTokenizeLength != rhs.maxTokenizeLength) { return false; } @@ -125,7 +148,7 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenLength + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 86d4e91a567..913b874c6f6 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -78,7 +78,8 @@ public class LinguisticsAnnotator { TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) - addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences); + addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences, + config.getMaxTokenLength()); if (tree.numAnnotations() == 0) return false; text.setSpanTree(tree); @@ -100,17 +101,22 @@ public class LinguisticsAnnotator { return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term)); } - private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) { + private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences, + int maxTokenLength) { + if (term.length() > maxTokenLength) { + return; + } if (termOccurrences.termCountBelowLimit(term)) { here.annotate(termAnnotation(term, orig)); } } - private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, TermOccurrences termOccurrences) { + private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, + TermOccurrences termOccurrences, int maxTokenLength) { if ( ! token.isSpecialToken()) { if (token.getNumComponents() > 0) { for (int i = 0; i < token.getNumComponents(); ++i) { - addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences); + addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences, maxTokenLength); } return; } @@ -130,18 +136,21 @@ public class LinguisticsAnnotator { String lowercasedOrig = toLowerCase(token.getOrig()); String term = token.getTokenString(); if (term != null) { - addAnnotation(where, term, token.getOrig(), termOccurrences); + addAnnotation(where, term, token.getOrig(), termOccurrences, maxTokenLength); if ( ! term.equals(lowercasedOrig)) - addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences); + addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences, maxTokenLength); } for (int i = 0; i < token.getNumStems(); i++) { String stem = token.getStem(i); if (! (stem.equals(lowercasedOrig) || stem.equals(term))) - addAnnotation(where, stem, token.getOrig(), termOccurrences); + addAnnotation(where, stem, token.getOrig(), termOccurrences, maxTokenLength); } } else { String term = token.getTokenString(); if (term == null || term.trim().isEmpty()) return; + if (term.length() > maxTokenLength) { + return; + } if (termOccurrences.termCountBelowLimit(term)) { parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig())); } diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index 77591d3e54e..29ca5270db8 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -174,6 +174,7 @@ TOKEN : <LOWER_CASE: "lowercase"> | <MAX_LENGTH: "max-length"> | <MAX_OCCURRENCES: "max-occurrences"> | + <MAX_TOKEN_LENGTH: "max-token-length"> | <NGRAM: "ngram"> | <NORMALIZE: "normalize"> | <NOW: "now"> | @@ -407,10 +408,13 @@ Expression embedExp() : { return new EmbedExpression(embedders, embedderId, embedderArguments); } } -Expression exactExp() : { } +Expression exactExp() : { - ( <EXACT> ) - { return new ExactExpression(); } + int maxTokenLength = annotatorCfg.getMaxTokenLength(); +} +{ + ( <EXACT> [ <MAX_TOKEN_LENGTH> <COLON> maxTokenLength = integer() ] ) + { return new ExactExpression(maxTokenLength); } } Expression flattenExp() : { } @@ -686,11 +690,13 @@ AnnotatorConfig tokenizeCfg() : String str = "SHORTEST"; Integer maxLength; Integer maxTermOccurrences; + Integer maxTokenLength; } { ( <STEM> ( <COLON> str = string() ) ? { val.setStemMode(str); } | <MAX_LENGTH> <COLON> maxLength = integer() { val.setMaxTokenizeLength(maxLength); } | <MAX_OCCURRENCES> <COLON> maxTermOccurrences = integer() { val.setMaxTermOccurrences(maxTermOccurrences); } | + <MAX_TOKEN_LENGTH> <COLON> maxTokenLength = integer() { val.setMaxTokenLength(maxTokenLength); } | <NORMALIZE> { val.setRemoveAccents(true); } )+ { return val; } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java index 403d1820f70..b338c45f7a4 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/ExactTestCase.java @@ -63,6 +63,15 @@ public class ExactTestCase { } @Test + public void requireThatLongStringsAreNotAnnotated() { + ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter()); + ctx.setValue(new StringFieldValue("foo")); + new ExactExpression(2).execute(ctx); + + assertNull(((StringFieldValue)ctx.getValue()).getSpanTree(SpanTrees.LINGUISTICS)); + } + + @Test public void requireThatEmptyStringsAreNotAnnotated() { ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter()); ctx.setValue(new StringFieldValue("")); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java index 01ffbe359f3..7ed3ab410a3 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeTestCase.java @@ -62,4 +62,15 @@ public class TokenizeTestCase { assertTrue(val instanceof StringFieldValue); assertNotNull(((StringFieldValue)val).getSpanTree(SpanTrees.LINGUISTICS)); } + + @Test + public void requireThatLongWordIsDropped() { + ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter()); + ctx.setValue(new StringFieldValue("foo")); + new TokenizeExpression(new SimpleLinguistics(), new AnnotatorConfig().setMaxTokenLength(2)).execute(ctx); + + FieldValue val = ctx.getValue(); + assertTrue(val instanceof StringFieldValue); + assertNull(((StringFieldValue)val).getSpanTree(SpanTrees.LINGUISTICS)); + } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java index 0d34d2841fd..c3131e28906 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfigTestCase.java @@ -27,6 +27,8 @@ public class AnnotatorConfigTestCase { assertTrue(config.getRemoveAccents()); config.setRemoveAccents(false); assertFalse(config.getRemoveAccents()); + config.setMaxTokenLength(10); + assertEquals(10, config.getMaxTokenLength()); } @Test @@ -35,11 +37,13 @@ public class AnnotatorConfigTestCase { config.setLanguage(Language.ARABIC); config.setStemMode(StemMode.SHORTEST); config.setRemoveAccents(!config.getRemoveAccents()); + config.setMaxTokenLength(11); AnnotatorConfig other = new AnnotatorConfig(config); assertEquals(config.getLanguage(), other.getLanguage()); assertEquals(config.getStemMode(), other.getStemMode()); assertEquals(config.getRemoveAccents(), other.getRemoveAccents()); + assertEquals(config.getMaxTokenLength(), other.getMaxTokenLength()); } @Test @@ -49,6 +53,7 @@ public class AnnotatorConfigTestCase { assertFalse(config.equals(newConfig(Language.SPANISH, StemMode.SHORTEST, false))); assertFalse(config.equals(newConfig(Language.DUTCH, StemMode.SHORTEST, false))); assertFalse(config.equals(newConfig(Language.DUTCH, StemMode.NONE, false))); + assertNotEquals(config, newConfig(Language.DUTCH, StemMode.NONE, true).setMaxTokenLength(10)); assertEquals(config, newConfig(Language.DUTCH, StemMode.NONE, true)); assertEquals(config.hashCode(), newConfig(Language.DUTCH, StemMode.NONE, true).hashCode()); } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java index a7ed7ae3e72..1b7c6973f1e 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java @@ -27,6 +27,7 @@ public class ExpressionTestCase { assertExpression(ClearStateExpression.class, "clear_state"); assertExpression(EchoExpression.class, "echo"); assertExpression(ExactExpression.class, "exact"); + assertExpression(ExactExpression.class, "exact max-token-length: 10", Optional.of("exact max-token-length:10")); assertExpression(FlattenExpression.class, "flatten"); assertExpression(ForEachExpression.class, "for_each { 1 }"); assertExpression(GetFieldExpression.class, "get_field field1"); @@ -73,6 +74,7 @@ public class ExpressionTestCase { assertExpression(TokenizeExpression.class, "tokenize stem:\"ALL\""); assertExpression(TokenizeExpression.class, "tokenize normalize"); assertExpression(TokenizeExpression.class, "tokenize max-occurrences: 15", Optional.of("tokenize max-occurrences:15")); + assertExpression(TokenizeExpression.class, "tokenize max-token-length: 15", Optional.of("tokenize max-token-length:15")); assertExpression(ToLongExpression.class, "to_long"); assertExpression(ToPositionExpression.class, "to_pos"); assertExpression(ToStringExpression.class, "to_string"); |