Add capping of fields before tokenizing

author: Henning Baldersheim <balder@yahoo-inc.com> 2017-08-07 18:14:53 +0200
committer: Henning Baldersheim <balder@yahoo-inc.com> 2017-08-07 18:15:39 +0200
commit: 92c0fcfe10c06163968dcfa2ff07993c2f2f74f5 (patch)
tree: 03b8a6e54809c2b2d0fe603f426d8ddaf3aaf63e /indexinglanguage/src/test
parent: a69f61901d6a243eec05d7a8d60eecbf28d70931 (diff)
1 file changed, 24 insertions, 0 deletions
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 5805d56aa57..2d18d410e66 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -12,6 +12,7 @@ import com.yahoo.language.process.StemMode;
 import com.yahoo.language.process.Token;
 import com.yahoo.language.process.TokenType;
 import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleLinguistics;
 import com.yahoo.language.simple.SimpleToken;
 
 import org.junit.Test;
@@ -167,6 +168,29 @@ public class LinguisticsAnnotatorTestCase {
     }
 
     @Test
+    public void requireThatTokenizeCappingWorks() { // exercises LinguisticsAnnotator with AnnotatorConfig.setMaxTokenLength(12)
+        String shortString = "short string"; // 12 characters, the same number passed to setMaxTokenLength below
+        SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS); // expected tree, built by hand
+        spanTree.setStringFieldValue(new StringFieldValue(shortString));
+        spanTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM)); // "short" -- assumes span(from, length); TODO confirm
+        spanTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM)); // "string" -- same assumption
+        // Separate field value with the same text; this is the one handed to the annotator.
+        StringFieldValue shortValue = new StringFieldValue(shortString);
+
+        Linguistics linguistics = new SimpleLinguistics();
+        // NOTE(review): per the commit subject, 12 presumably caps the field text before tokenizing -- confirm against AnnotatorConfig.
+        LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12));
+        // Case 1: 12-character input. The produced tree and its text must equal the hand-built expectation.
+        assertTrue(annotator.annotate(shortValue));
+        assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS));
+        assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+        // Case 2: input longer than 12 characters. annotate() must return true and the tree must still hold the full text.
+        StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string");
+        assertTrue(annotator.annotate(cappedValue)); // NOTE(review): the annotations produced for this input are not asserted, only the text
+        assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+    }
+
+    @Test
     public void requireThatMaxTermOccurencesIsHonored() {
         final String inputTerm = "foo";
         final String stemmedInputTerm = "bar"; // completely different from
author	Henning Baldersheim <balder@yahoo-inc.com>	2017-08-07 18:14:53 +0200
committer	Henning Baldersheim <balder@yahoo-inc.com>	2017-08-07 18:15:39 +0200
commit	92c0fcfe10c06163968dcfa2ff07993c2f2f74f5 (patch)
tree	03b8a6e54809c2b2d0fe603f426d8ddaf3aaf63e /indexinglanguage/src/test
parent	a69f61901d6a243eec05d7a8d60eecbf28d70931 (diff)