2 files changed, 41 insertions, 9 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 7b6f350d831..6522e284fc8 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -14,14 +14,17 @@ public class AnnotatorConfig implements Cloneable {
     private StemMode stemMode;
     private boolean removeAccents;
     private int maxTermOccurrences;
+    private int maxTokenLength; // longest term length LinguisticsAnnotator still annotates
     private int maxTokenizeLength;
 
     public static final int DEFAULT_MAX_TERM_OCCURRENCES;
+    private static final int DEFAULT_MAX_TOKEN_LENGTH; // assigned once in the static initializer
     private static final int DEFAULT_MAX_TOKENIZE_LENGTH;
 
     static {
         IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder());
         DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences();
+        DEFAULT_MAX_TOKEN_LENGTH = defaults.maxtokenlength(); // read from a default-built IlscriptsConfig
         DEFAULT_MAX_TOKENIZE_LENGTH = defaults.fieldmatchmaxlength();
     }
 
@@ -30,6 +33,7 @@ public class AnnotatorConfig implements Cloneable {
         stemMode = StemMode.NONE;
         removeAccents = false;
         maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
+        maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
         maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
     }
 
@@ -38,6 +42,7 @@ public class AnnotatorConfig implements Cloneable {
         stemMode = rhs.stemMode;
         removeAccents = rhs.removeAccents;
         maxTermOccurrences = rhs.maxTermOccurrences;
+        maxTokenLength = rhs.maxTokenLength;
         maxTokenizeLength = rhs.maxTokenizeLength;
     }
 
@@ -82,7 +87,18 @@ public class AnnotatorConfig implements Cloneable {
         return this;
     }
 
-    public AnnotatorConfig setMaxTokenLength(int maxTokenizeLength) {
+    public AnnotatorConfig setMaxTokenLength(int maxTokenLength) {
+        this.maxTokenLength = maxTokenLength; // per-term limit; the tokenize limit is set by setMaxTokenizeLength
+        return this; // returns this config so calls can be chained
+    }
+
+    public int getMaxTokenLength() {
+        return maxTokenLength; // LinguisticsAnnotator compares term.length() against this value
+    }
+
+    public static int getDefaultMaxTokenLength() { return DEFAULT_MAX_TOKEN_LENGTH; } // default maxTokenLength
+
+    public AnnotatorConfig setMaxTokenizeLength(int maxTokenizeLength) { // distinct from maxTokenLength
         this.maxTokenizeLength = maxTokenizeLength;
         return this;
     }
@@ -92,6 +108,10 @@ public class AnnotatorConfig implements Cloneable {
     }
 
     public boolean hasNonDefaultMaxTokenLength() {
+        return maxTokenLength != DEFAULT_MAX_TOKEN_LENGTH; // true when the per-term limit differs from the default
+    }
+
+    public boolean hasNonDefaultMaxTokenizeLength() { // same check, for the tokenize limit
         return maxTokenizeLength != DEFAULT_MAX_TOKENIZE_LENGTH;
     }
 
@@ -116,6 +136,9 @@ public class AnnotatorConfig implements Cloneable {
         if (maxTermOccurrences != rhs.maxTermOccurrences) {
             return false;
         }
+        if (maxTokenLength != rhs.maxTokenLength) {
+            return false;
+        }
         if (maxTokenizeLength != rhs.maxTokenizeLength) {
             return false;
         }
@@ -125,7 +148,7 @@ public class AnnotatorConfig implements Cloneable {
     @Override
     public int hashCode() {
         return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
-               Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
+               Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenLength + maxTokenizeLength; // sums both limits
     }
 
 }
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 86d4e91a567..913b874c6f6 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -78,7 +78,8 @@ public class LinguisticsAnnotator {
         TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
         SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
         for (Token token : tokens)
-            addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences);
+            addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences,
+                    config.getMaxTokenLength());
 
         if (tree.numAnnotations() == 0) return false;
         text.setSpanTree(tree);
@@ -100,17 +101,22 @@ public class LinguisticsAnnotator {
             return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term));
     }
 
-    private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) {
+    private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences,
+                                      int maxTokenLength) {
+        if (term.length() > maxTokenLength) { // over-long terms are skipped before termOccurrences is consulted
+            return; // nothing is annotated for this term
+        }
         if (termOccurrences.termCountBelowLimit(term)) {
             here.annotate(termAnnotation(term, orig));
         }
     }
 
-    private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, TermOccurrences termOccurrences) {
+    private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode,
+                                          TermOccurrences termOccurrences, int maxTokenLength) {
         if ( ! token.isSpecialToken()) {
             if (token.getNumComponents() > 0) {
                 for (int i = 0; i < token.getNumComponents(); ++i) {
-                    addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences);
+                    addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences, maxTokenLength);
                 }
                 return;
             }
@@ -130,18 +136,21 @@ public class LinguisticsAnnotator {
             String lowercasedOrig = toLowerCase(token.getOrig());
             String term = token.getTokenString();
             if (term != null) {
-                addAnnotation(where, term, token.getOrig(), termOccurrences);
+                addAnnotation(where, term, token.getOrig(), termOccurrences, maxTokenLength);
                 if ( ! term.equals(lowercasedOrig))
-                    addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences);
+                    addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences, maxTokenLength);
             }
             for (int i = 0; i < token.getNumStems(); i++) {
                 String stem = token.getStem(i);
                 if (! (stem.equals(lowercasedOrig) || stem.equals(term)))
-                    addAnnotation(where, stem, token.getOrig(), termOccurrences);
+                    addAnnotation(where, stem, token.getOrig(), termOccurrences, maxTokenLength);
             }
         } else {
             String term = token.getTokenString();
             if (term == null || term.trim().isEmpty()) return;
+            if (term.length() > maxTokenLength) {
+                return;
+            }
             if (termOccurrences.termCountBelowLimit(term))  {
                 parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig()));
             }