diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2017-08-07 18:14:53 +0200 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2017-08-07 18:15:39 +0200 |
commit | 92c0fcfe10c06163968dcfa2ff07993c2f2f74f5 (patch) | |
tree | 03b8a6e54809c2b2d0fe603f426d8ddaf3aaf63e /indexinglanguage/src/test | |
parent | a69f61901d6a243eec05d7a8d60eecbf28d70931 (diff) |
Add capping of fields before tokenizing
Diffstat (limited to 'indexinglanguage/src/test')
-rw-r--r-- | indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java | 24 |
1 files changed, 24 insertions, 0 deletions
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 5805d56aa57..2d18d410e66 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -12,6 +12,7 @@ import com.yahoo.language.process.StemMode;
 import com.yahoo.language.process.Token;
 import com.yahoo.language.process.TokenType;
 import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleLinguistics;
 import com.yahoo.language.simple.SimpleToken;
 
 import org.junit.Test;
@@ -167,6 +168,29 @@ public class LinguisticsAnnotatorTestCase {
     }
 
     @Test
+    public void requireThatTokenizeCappingWorks() {
+        String shortString = "short string";
+        SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS);
+        spanTree.setStringFieldValue(new StringFieldValue(shortString));
+        spanTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM));
+        spanTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM));
+
+        StringFieldValue shortValue = new StringFieldValue(shortString);
+
+        Linguistics linguistics = new SimpleLinguistics();
+
+        LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12));
+
+        assertTrue(annotator.annotate(shortValue));
+        assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS));
+        assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+
+        StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string");
+        assertTrue(annotator.annotate(cappedValue));
+        assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+    }
+
+    @Test
     public void requireThatMaxTermOccurencesIsHonored() {
         final String inputTerm = "foo";
         final String stemmedInputTerm = "bar"; // completely different from |