diff options
Diffstat (limited to 'lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java')
-rw-r--r-- | lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java | 139 |
1 files changed, 139 insertions, 0 deletions
package com.yahoo.language.lucene;

import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.config.FileReference;
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import org.junit.Test;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link LuceneTokenizer}: default per-language analysis
 * (English, Lithuanian) and custom analyzer chains configured through
 * {@link LuceneAnalysisConfig} token filters.
 */
public class LuceneTokenizerTest {

    /**
     * Builds a minimal config whose configDir points at the working directory.
     * Shared by every test to avoid repeating the mock-file-reference chain.
     */
    private static LuceneAnalysisConfig.Builder configBuilder() {
        return new LuceneAnalysisConfig.Builder()
                .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")));
    }

    /** Creates a token-filter builder with the given Lucene filter name. */
    private static LuceneAnalysisConfig.Analysis.TokenFilters.Builder filter(String name) {
        return new LuceneAnalysisConfig.Analysis.TokenFilters.Builder().name(name);
    }

    /** Builds a config that applies the given token filters for English. */
    private static LuceneAnalysisConfig englishAnalysisConfig(
            LuceneAnalysisConfig.Analysis.TokenFilters.Builder... filters) {
        return configBuilder()
                .analysis(Map.of(Language.ENGLISH.languageCode(),
                        new LuceneAnalysisConfig.Analysis.Builder()
                                .tokenFilters(List.of(filters))))
                .build();
    }

    /** Collects the token strings of all tokens, preserving iteration order. */
    private static List<String> tokenStrings(Iterable<Token> tokens) {
        List<String> strings = new ArrayList<>();
        tokens.forEach(token -> strings.add(token.getTokenString()));
        return strings;
    }

    @Test
    public void testTokenizer() {
        var tokenizer = new LuceneTokenizer(configBuilder().build());
        // Default English analysis: stopwords removed, lowercased, stemmed.
        Iterable<Token> tokens = tokenizer
                .tokenize("This is my Text", Language.ENGLISH, StemMode.ALL, true);
        assertEquals(List.of("my", "text"), tokenStrings(tokens));
    }

    @Test
    public void testLithuanianTokenizer() {
        var tokenizer = new LuceneTokenizer(configBuilder().build());
        // Lithuanian analysis stems to word roots and keeps numerals.
        Iterable<Token> tokens = tokenizer
                .tokenize("Žalgirio mūšio data yra 1410 metai", Language.LITHUANIAN, StemMode.ALL, true);
        assertEquals(List.of("žalgir", "mūš", "dat", "1410", "met"), tokenStrings(tokens));
    }

    @Test
    public void testAnalyzerConfiguration() {
        // Custom chain: minimal English stemming followed by uppercasing.
        LuceneAnalysisConfig enConfig = englishAnalysisConfig(
                filter("englishMinimalStem"),
                filter("uppercase"));
        LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
        Iterable<Token> tokens = linguistics
                .getTokenizer()
                .tokenize("Dogs and cats", Language.ENGLISH, StemMode.ALL, false);
        assertEquals(List.of("DOG", "AND", "CAT"), tokenStrings(tokens));
    }

    @Test
    public void testEnglishStemmerAnalyzerConfiguration() {
        // Minimal stemming only: case is preserved, no stopword removal.
        LuceneAnalysisConfig enConfig = englishAnalysisConfig(
                filter("englishMinimalStem"));
        LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
        Iterable<Token> tokens = linguistics
                .getTokenizer()
                .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
        assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens));
    }

    @Test
    public void testStemmerWithStopWords() {
        // Stemming plus a stopword filter backed by stopwords.txt ("and" is dropped).
        LuceneAnalysisConfig enConfig = englishAnalysisConfig(
                filter("englishMinimalStem"),
                filter("stop").conf("words", "stopwords.txt"));
        LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
        Iterable<Token> tokens = linguistics
                .getTokenizer()
                .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
        assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens));
    }
}