diff options
Diffstat (limited to 'linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java')
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java new file mode 100644 index 00000000000..3cb82572976 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java @@ -0,0 +1,66 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public abstract class AbstractTokenizerTestCase { + + private boolean accentDrop = false; + private Language language = Language.ENGLISH; + private Linguistics linguistics; + private StemMode stemMode = StemMode.NONE; + + public void assertTokenStrings(String input, List<String> expectedTokenStrings) { + List<String> actual = new ArrayList<>(); + for (Token token : tokenize(input)) { + findTokenStrings(token, actual); + } + assertEquals(expectedTokenStrings, actual); + } + + public List<String> findTokenStrings(Token token, List<String> out) { + int numComponents = token.getNumComponents(); + if (token.isSpecialToken() || numComponents == 0) { + out.add(token.getTokenString()); + } else { + for (int i = 0; i < numComponents; ++i) { + findTokenStrings(token.getComponent(i), out); + } + } + return out; + } + + public Iterable<Token> tokenize(String input) { + return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop); + } + + public AbstractTokenizerTestCase setAccentDrop(boolean accentDrop) { + this.accentDrop = accentDrop; + return this; + } + + public AbstractTokenizerTestCase setLanguage(Language language) { + this.language = language; + return this; + } + + public AbstractTokenizerTestCase setLinguistics(Linguistics linguistics) { + this.linguistics = linguistics; + return this; + } + + public AbstractTokenizerTestCase setStemMode(StemMode stemMode) { + this.stemMode = stemMode; + return this; + } + +} |