diff options
author | Dainius Jocas <dainius.jocas@gmail.com> | 2023-07-31 13:27:43 +0300 |
---|---|---|
committer | Dainius Jocas <dainius.jocas@gmail.com> | 2023-07-31 13:27:43 +0300 |
commit | 5a60f6f3ae8e99f1f3de10e22a1f055d03fb37db (patch) | |
tree | 0f7cc48efba4b6661036a509269868d7354d6af2 /lucene-linguistics/src/test | |
parent | d488a7482e93ae233be571d61946caa796aba588 (diff) |
integrate Lucene Linguistics into the vespa project
Diffstat (limited to 'lucene-linguistics/src/test')
-rw-r--r-- | lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java | 139 | ||||
-rw-r--r-- | lucene-linguistics/src/test/resources/stopwords.txt | 1 |
2 files changed, 140 insertions, 0 deletions
```diff
diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
new file mode 100644
index 00000000000..568f295b39d
--- /dev/null
+++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
@@ -0,0 +1,139 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.config.FileReference;
+import com.yahoo.language.Language;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class LuceneTokenizerTest {
+
+    @Test
+    public void testTokenizer() {
+        String text = "This is my Text";
+        var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig
+                .Builder()
+                .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+                .build());
+        Iterable<Token> tokens = tokenizer
+                .tokenize(text, Language.ENGLISH, StemMode.ALL, true);
+        assertEquals(List.of("my", "text"), tokenStrings(tokens));
+    }
+
+    @Test
+    public void testLithuanianTokenizer() {
+        String text = "Žalgirio mūšio data yra 1410 metai";
+        var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig
+                .Builder()
+                .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+                .build());
+        Iterable<Token> tokens = tokenizer
+                .tokenize(text, Language.LITHUANIAN, StemMode.ALL, true);
+        assertEquals(List.of("žalgir", "mūš", "dat", "1410", "met"), tokenStrings(tokens));
+    }
+
+    private void assertToken(String tokenString, Iterator<Token> tokens) {
+        Token t = tokens.next();
+        assertEquals(tokenString, t.getTokenString());
+    }
+
+    private List<Token> iterableToList(Iterable<Token> tokens) {
+        List<Token> tokenList = new ArrayList<>();
+        tokens.forEach(tokenList::add);
+        return tokenList;
+    }
+
+    private List<String> tokenStrings(Iterable<Token> tokens) {
+        List<String> tokenList = new ArrayList<>();
+        tokens.forEach(token -> {
+            tokenList.add(token.getTokenString());
+        });
+        return tokenList;
+    }
+
+    @Test
+    public void testAnalyzerConfiguration() {
+        String languageCode = Language.ENGLISH.languageCode();
+        LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+                .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+                .analysis(
+                        Map.of(languageCode,
+                                new LuceneAnalysisConfig
+                                        .Analysis
+                                        .Builder()
+                                        .tokenFilters(List.of(
+                                                new LuceneAnalysisConfig
+                                                        .Analysis
+                                                        .TokenFilters
+                                                        .Builder()
+                                                        .name("englishMinimalStem"),
+                                                new LuceneAnalysisConfig
+                                                        .Analysis
+                                                        .TokenFilters
+                                                        .Builder()
+                                                        .name("uppercase"))))
+                ).build();
+        LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+        Iterable<Token> tokens = linguistics
+                .getTokenizer()
+                .tokenize("Dogs and cats", Language.ENGLISH, StemMode.ALL, false);
+        assertEquals(List.of("DOG", "AND", "CAT"), tokenStrings(tokens));
+    }
+
+    @Test
+    public void testEnglishStemmerAnalyzerConfiguration() {
+        String languageCode = Language.ENGLISH.languageCode();
+        LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+                .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+                .analysis(
+                        Map.of(languageCode,
+                                new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
+                                        new LuceneAnalysisConfig
+                                                .Analysis
+                                                .TokenFilters
+                                                .Builder()
+                                                .name("englishMinimalStem"))))
+                ).build();
+        LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+        Iterable<Token> tokens = linguistics
+                .getTokenizer()
+                .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
+        assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens));
+    }
+
+    @Test
+    public void testStemmerWithStopWords() {
+        String languageCode = Language.ENGLISH.languageCode();
+        LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+                .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+                .analysis(
+                        Map.of(languageCode,
+                                new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
+                                        new LuceneAnalysisConfig
+                                                .Analysis
+                                                .TokenFilters
+                                                .Builder()
+                                                .name("englishMinimalStem"),
+                                        new LuceneAnalysisConfig
+                                                .Analysis
+                                                .TokenFilters
+                                                .Builder()
+                                                .name("stop")
+                                                .conf("words", "stopwords.txt"))))
+                ).build();
+        LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+        Iterable<Token> tokens = linguistics
+                .getTokenizer()
+                .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
+        assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens));
+    }
+}
diff --git a/lucene-linguistics/src/test/resources/stopwords.txt b/lucene-linguistics/src/test/resources/stopwords.txt
new file mode 100644
index 00000000000..e8c07838bf5
--- /dev/null
+++ b/lucene-linguistics/src/test/resources/stopwords.txt
@@ -0,0 +1 @@
+and
```