diff options
| | | |
|---|---|---|
| author | Henning Baldersheim <balder@yahoo-inc.com> | 2017-08-07 18:14:53 +0200 |
| committer | Henning Baldersheim <balder@yahoo-inc.com> | 2017-08-07 18:15:39 +0200 |
| commit | 92c0fcfe10c06163968dcfa2ff07993c2f2f74f5 (patch) | |
| tree | 03b8a6e54809c2b2d0fe603f426d8ddaf3aaf63e /indexinglanguage | |
| parent | a69f61901d6a243eec05d7a8d60eecbf28d70931 (diff) | |
Add capping of fields before tokenizing
Diffstat (limited to 'indexinglanguage')
5 files changed, 57 insertions, 3 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java index 661cc6c9c3e..b3cee971258 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java @@ -33,7 +33,8 @@ public class TokenizeExpression extends Expression { @Override protected void doExecute(ExecutionContext context) { - StringFieldValue output = ((StringFieldValue)context.getValue()).clone(); + StringFieldValue input = (StringFieldValue)context.getValue(); + StringFieldValue output = input.clone(); context.setValue(output); AnnotatorConfig cfg = new AnnotatorConfig(config); @@ -70,6 +71,9 @@ public class TokenizeExpression extends Expression { if (config.getStemMode() != StemMode.NONE) { ret.append(" stem:\""+config.getStemMode()+"\""); } + if (config.hasNonDefaultMaxTokenLength()) { + ret.append(" max-length:" + config.getMaxTokenizeLength()); + } return ret.toString(); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index ccc1f293112..990a8a513f2 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -14,8 +14,10 @@ public class AnnotatorConfig implements Cloneable { private StemMode stemMode; private boolean removeAccents; private int maxTermOccurences; + private int maxTokenizeLength = MAX_TOKENIZE_LENGTH; public static final int DEFAULT_MAX_TERM_OCCURRENCES; + private static final int MAX_TOKENIZE_LENGTH = 1000000; static { IlscriptsConfig defaults = new 
IlscriptsConfig(new IlscriptsConfig.Builder()); @@ -34,6 +36,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; maxTermOccurences = rhs.maxTermOccurences; + maxTokenizeLength = rhs.maxTokenizeLength; } public Language getLanguage() { @@ -77,6 +80,19 @@ public class AnnotatorConfig implements Cloneable { return this; } + public AnnotatorConfig setMaxTokenLength(int maxTokenizeLength) { + this.maxTokenizeLength = maxTokenizeLength; + return this; + } + + public int getMaxTokenizeLength() { + return maxTokenizeLength; + } + + public boolean hasNonDefaultMaxTokenLength() { + return maxTokenizeLength != MAX_TOKENIZE_LENGTH; + } + @Override public boolean equals(Object obj) { if (!(obj instanceof AnnotatorConfig)) { @@ -95,12 +111,15 @@ public class AnnotatorConfig implements Cloneable { if (maxTermOccurences != rhs.maxTermOccurences) { return false; } + if (maxTokenizeLength != rhs.maxTokenizeLength) { + return false; + } return true; } @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index b320bce6dbf..3adffa30725 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -64,7 +64,10 @@ public class LinguisticsAnnotator { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. 
Tokenizer tokenizer = factory.getTokenizer(); - Iterable<Token> tokens = tokenizer.tokenize(text.getString(), config.getLanguage(), config.getStemMode(), + String input = (text.getString().length() <= config.getMaxTokenizeLength()) + ? text.getString() + : text.getString().substring(0, config.getMaxTokenizeLength()); + Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index f1abb76c645..d564443bb48 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -164,6 +164,7 @@ TOKEN : <INPUT: "input"> | <JOIN: "join"> | <LOWER_CASE: "lowercase"> | + <MAX_LENGTH: "max-length"> | <NGRAM: "ngram"> | <NORMALIZE: "normalize"> | <NOW: "now"> | @@ -615,9 +616,11 @@ AnnotatorConfig tokenizeCfg() : { AnnotatorConfig val = new AnnotatorConfig(annotatorCfg); String str = "SHORTEST"; + Integer maxLength; } { ( <STEM> ( <COLON> str = string() ) ? 
{ val.setStemMode(str); } | + <MAX_LENGTH> <COLON> maxLength = integer() { val.setMaxTokenLength(maxLength); } | <NORMALIZE> { val.setRemoveAccents(true); } )+ { return val; } } @@ -723,6 +726,7 @@ String identifier() : <INPUT> | <JOIN> | <LOWER_CASE> | + <MAX_LENGTH> | <NGRAM> | <NORMALIZE> | <NOW> | diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index 5805d56aa57..2d18d410e66 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -12,6 +12,7 @@ import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.TokenType; import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.language.simple.SimpleToken; import org.junit.Test; @@ -167,6 +168,29 @@ public class LinguisticsAnnotatorTestCase { } @Test + public void requireThatTokenizeCappingWorks() { + String shortString = "short string"; + SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS); + spanTree.setStringFieldValue(new StringFieldValue(shortString)); + spanTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM)); + spanTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM)); + + StringFieldValue shortValue = new StringFieldValue(shortString); + + Linguistics linguistics = new SimpleLinguistics(); + + LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12)); + + assertTrue(annotator.annotate(shortValue)); + assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS)); + assertEquals(shortString, 
shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); + + StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string"); + assertTrue(annotator.annotate(cappedValue)); + assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); + } + + @Test public void requireThatMaxTermOccurencesIsHonored() { final String inputTerm = "foo"; final String stemmedInputTerm = "bar"; // completely different from |