diff options
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics')
2 files changed, 41 insertions, 9 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 7b6f350d831..6522e284fc8 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -14,14 +14,17 @@ public class AnnotatorConfig implements Cloneable { private StemMode stemMode; private boolean removeAccents; private int maxTermOccurrences; + private int maxTokenLength; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; + private static final int DEFAULT_MAX_TOKEN_LENGTH; private static final int DEFAULT_MAX_TOKENIZE_LENGTH; static { IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder()); DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences(); + DEFAULT_MAX_TOKEN_LENGTH = defaults.maxtokenlength(); DEFAULT_MAX_TOKENIZE_LENGTH = defaults.fieldmatchmaxlength(); } @@ -30,6 +33,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = StemMode.NONE; removeAccents = false; maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -38,6 +42,7 @@ public class AnnotatorConfig implements Cloneable { stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; maxTermOccurrences = rhs.maxTermOccurrences; + maxTokenLength = rhs.maxTokenLength; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -82,7 +87,18 @@ public class AnnotatorConfig implements Cloneable { return this; } - public AnnotatorConfig setMaxTokenLength(int maxTokenizeLength) { + public AnnotatorConfig setMaxTokenLength(int maxTokenLength) { + this.maxTokenLength = maxTokenLength; + return this; + } + + public int getMaxTokenLength() { + return maxTokenLength; + } + + public static int getDefaultMaxTokenLength() { return DEFAULT_MAX_TOKEN_LENGTH; } + + public AnnotatorConfig setMaxTokenizeLength(int maxTokenizeLength) { this.maxTokenizeLength = maxTokenizeLength; return this; } @@ -92,6 +108,10 @@ public class AnnotatorConfig implements Cloneable { } public boolean hasNonDefaultMaxTokenLength() { + return maxTokenLength != DEFAULT_MAX_TOKEN_LENGTH; + } + + public boolean hasNonDefaultMaxTokenizeLength() { return maxTokenizeLength != DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -116,6 +136,9 @@ public class AnnotatorConfig implements Cloneable { if (maxTermOccurrences != rhs.maxTermOccurrences) { return false; } + if (maxTokenLength != rhs.maxTokenLength) { + return false; + } if (maxTokenizeLength != rhs.maxTokenizeLength) { return false; } @@ -125,7 +148,7 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenLength + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 86d4e91a567..913b874c6f6 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -78,7 +78,8 @@ public class LinguisticsAnnotator { TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) - addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences); + addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences, + config.getMaxTokenLength()); if (tree.numAnnotations() == 0) return false; text.setSpanTree(tree); @@ -100,17 +101,22 @@ public class LinguisticsAnnotator { return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term)); } - private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) { + private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences, + int maxTokenLength) { + if (term.length() > maxTokenLength) { + return; + } if (termOccurrences.termCountBelowLimit(term)) { here.annotate(termAnnotation(term, orig)); } } - private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, TermOccurrences termOccurrences) { + private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, + TermOccurrences termOccurrences, int maxTokenLength) { if ( ! token.isSpecialToken()) { if (token.getNumComponents() > 0) { for (int i = 0; i < token.getNumComponents(); ++i) { - addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences); + addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences, maxTokenLength); } return; } @@ -130,18 +136,21 @@ public class LinguisticsAnnotator { String lowercasedOrig = toLowerCase(token.getOrig()); String term = token.getTokenString(); if (term != null) { - addAnnotation(where, term, token.getOrig(), termOccurrences); + addAnnotation(where, term, token.getOrig(), termOccurrences, maxTokenLength); if ( ! term.equals(lowercasedOrig)) - addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences); + addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences, maxTokenLength); } for (int i = 0; i < token.getNumStems(); i++) { String stem = token.getStem(i); if (! (stem.equals(lowercasedOrig) || stem.equals(term))) - addAnnotation(where, stem, token.getOrig(), termOccurrences); + addAnnotation(where, stem, token.getOrig(), termOccurrences, maxTokenLength); } } else { String term = token.getTokenString(); if (term == null || term.trim().isEmpty()) return; + if (term.length() > maxTokenLength) { + return; + } if (termOccurrences.termCountBelowLimit(term)) { parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig())); } |