diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2017-08-07 18:14:53 +0200 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2017-08-07 18:15:39 +0200 |
commit | 92c0fcfe10c06163968dcfa2ff07993c2f2f74f5 (patch) | |
tree | 03b8a6e54809c2b2d0fe603f426d8ddaf3aaf63e /indexinglanguage/src/main | |
parent | a69f61901d6a243eec05d7a8d60eecbf28d70931 (diff) |
Add capping of fields before tokenizing
Diffstat (limited to 'indexinglanguage/src/main')
4 files changed, 33 insertions, 3 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java index 661cc6c9c3e..b3cee971258 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java @@ -33,7 +33,8 @@ public class TokenizeExpression extends Expression { @Override protected void doExecute(ExecutionContext context) { - StringFieldValue output = ((StringFieldValue)context.getValue()).clone(); + StringFieldValue input = (StringFieldValue)context.getValue(); + StringFieldValue output = input.clone(); context.setValue(output); AnnotatorConfig cfg = new AnnotatorConfig(config); @@ -70,6 +71,9 @@ public class TokenizeExpression extends Expression { if (config.getStemMode() != StemMode.NONE) { ret.append(" stem:\""+config.getStemMode()+"\""); } + if (config.hasNonDefaultMaxTokenLength()) { + ret.append(" max-length:" + config.getMaxTokenizeLength()); + } return ret.toString(); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index ccc1f293112..990a8a513f2 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -14,8 +14,10 @@ public class AnnotatorConfig implements Cloneable { private StemMode stemMode; private boolean removeAccents; private int maxTermOccurences; + private int maxTokenizeLength = MAX_TOKENIZE_LENGTH; public static final int DEFAULT_MAX_TERM_OCCURRENCES; + private static final int MAX_TOKENIZE_LENGTH = 1000000; static { IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder()); @@ -34,6 +36,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; maxTermOccurences = rhs.maxTermOccurences; + maxTokenizeLength = rhs.maxTokenizeLength; } public Language getLanguage() { @@ -77,6 +80,19 @@ public class AnnotatorConfig implements Cloneable { return this; } + public AnnotatorConfig setMaxTokenLength(int maxTokenizeLength) { + this.maxTokenizeLength = maxTokenizeLength; + return this; + } + + public int getMaxTokenizeLength() { + return maxTokenizeLength; + } + + public boolean hasNonDefaultMaxTokenLength() { + return maxTokenizeLength != MAX_TOKENIZE_LENGTH; + } + @Override public boolean equals(Object obj) { if (!(obj instanceof AnnotatorConfig)) { @@ -95,12 +111,15 @@ public class AnnotatorConfig implements Cloneable { if (maxTermOccurences != rhs.maxTermOccurences) { return false; } + if (maxTokenizeLength != rhs.maxTokenizeLength) { + return false; + } return true; } @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index b320bce6dbf..3adffa30725 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -64,7 +64,10 @@ public class LinguisticsAnnotator { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. Tokenizer tokenizer = factory.getTokenizer(); - Iterable<Token> tokens = tokenizer.tokenize(text.getString(), config.getLanguage(), config.getStemMode(), + String input = (text.getString().length() <= config.getMaxTokenizeLength()) + ? text.getString() + : text.getString().substring(0, config.getMaxTokenizeLength()); + Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index f1abb76c645..d564443bb48 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -164,6 +164,7 @@ TOKEN : <INPUT: "input"> | <JOIN: "join"> | <LOWER_CASE: "lowercase"> | + <MAX_LENGTH: "max-length"> | <NGRAM: "ngram"> | <NORMALIZE: "normalize"> | <NOW: "now"> | @@ -615,9 +616,11 @@ AnnotatorConfig tokenizeCfg() : { AnnotatorConfig val = new AnnotatorConfig(annotatorCfg); String str = "SHORTEST"; + Integer maxLength; } { ( <STEM> ( <COLON> str = string() ) ? { val.setStemMode(str); } | + <MAX_LENGTH> <COLON> maxLength = integer() { val.setMaxTokenLength(maxLength); } | <NORMALIZE> { val.setRemoveAccents(true); } )+ { return val; } } @@ -723,6 +726,7 @@ String identifier() : <INPUT> | <JOIN> | <LOWER_CASE> | + <MAX_LENGTH> | <NGRAM> | <NORMALIZE> | <NOW> | |