aboutsummaryrefslogtreecommitdiffstats
path: root/indexinglanguage/src/test
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2017-08-07 18:14:53 +0200
committer Henning Baldersheim <balder@yahoo-inc.com> 2017-08-07 18:15:39 +0200
commit 92c0fcfe10c06163968dcfa2ff07993c2f2f74f5 (patch)
tree 03b8a6e54809c2b2d0fe603f426d8ddaf3aaf63e /indexinglanguage/src/test
parent a69f61901d6a243eec05d7a8d60eecbf28d70931 (diff)
Add capping of fields before tokenizing
Diffstat (limited to 'indexinglanguage/src/test')
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java 24
1 files changed, 24 insertions, 0 deletions
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 5805d56aa57..2d18d410e66 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -12,6 +12,7 @@ import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.language.simple.SimpleToken;
import org.junit.Test;
@@ -167,6 +168,29 @@ public class LinguisticsAnnotatorTestCase {
}
@Test
+ public void requireThatTokenizeCappingWorks() { // annotates one input whose length equals the configured limit and one that exceeds it
+ String shortString = "short string"; // 12 characters, the same number passed to setMaxTokenLength below
+ SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS); // expected tree for the short input
+ spanTree.setStringFieldValue(new StringFieldValue(shortString));
+ spanTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM)); // covers "short" -- presumably span(offset, length); confirm against SpanList
+ spanTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM)); // covers "string"
+ // The value handed to the annotator is a separate StringFieldValue instance from the one attached to the expected tree.
+ StringFieldValue shortValue = new StringFieldValue(shortString);
+
+ Linguistics linguistics = new SimpleLinguistics();
+ // NOTE(review): setMaxTokenLength(12) is presumably the cap applied to the field before tokenizing -- confirm in AnnotatorConfig.
+ LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12));
+ // Input at the limit: annotate returns true and the produced tree equals the two-TERM tree built above.
+ assertTrue(annotator.annotate(shortValue));
+ assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS));
+ assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+ // Input beyond the limit: only the return value and the tree's string are checked; NOTE(review): no assertion on which spans were annotated.
+ StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string");
+ assertTrue(annotator.annotate(cappedValue));
+ assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); // the tree still exposes the full, untruncated string
+ }
+
+ @Test
public void requireThatMaxTermOccurencesIsHonored() {
final String inputTerm = "foo";
final String stemmedInputTerm = "bar"; // completely different from