From 8beac01933b5121187d1cf6dd97cef0b34d1afd2 Mon Sep 17 00:00:00 2001 From: Dainius Jocas Date: Thu, 14 Sep 2023 18:14:57 +0300 Subject: LuceneLinguistics optional configDir --- .../yahoo/language/lucene/LuceneTokenizerTest.java | 40 ++++++++++++++++------ 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'lucene-linguistics/src/test/java/com/yahoo/language') diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java index 21d3a7bd33d..35373479bff 100644 --- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java +++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java @@ -10,10 +10,7 @@ import com.yahoo.language.process.Token; import org.junit.Test; import java.io.File; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; +import java.util.*; import static org.junit.Assert.assertEquals; @@ -47,10 +44,12 @@ public class LuceneTokenizerTest { } private Linguistics luceneLinguistics() { - return new LuceneLinguistics(new LuceneAnalysisConfig.Builder() - .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) - .build(), - new ComponentRegistry<>()); + return new LuceneLinguistics( + new LuceneAnalysisConfig.Builder() + .configDir(Optional.of(FileReference + .mockFileReferenceForUnitTesting(new File(".")))) + .build(), + new ComponentRegistry<>()); } private void assertToken(String tokenString, Iterator tokens) { @@ -76,7 +75,7 @@ public class LuceneTokenizerTest { public void testAnalyzerConfiguration() { String languageCode = Language.ENGLISH.languageCode(); LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() - .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .configDir(Optional.of(FileReference.mockFileReferenceForUnitTesting(new File(".")))) .analysis( Map.of(languageCode, new LuceneAnalysisConfig @@ -105,7 +104,7 @@ public class LuceneTokenizerTest { public void testEnglishStemmerAnalyzerConfiguration() { String languageCode = Language.ENGLISH.languageCode(); LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() - .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .configDir(Optional.of(FileReference.mockFileReferenceForUnitTesting(new File(".")))) .analysis( Map.of(languageCode, new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of( @@ -126,7 +125,7 @@ public class LuceneTokenizerTest { public void testStemmerWithStopWords() { String languageCode = Language.ENGLISH.languageCode(); LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() - .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .configDir(Optional.of(FileReference.mockFileReferenceForUnitTesting(new File(".")))) .analysis( Map.of(languageCode, new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of( @@ -149,4 +148,23 @@ public class LuceneTokenizerTest { assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens)); } + @Test + public void testOptionalPath() { + String languageCode = Language.ENGLISH.languageCode(); + LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() + .analysis( + Map.of(languageCode, + new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of( + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("englishMinimalStem")))) + ).build(); + LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>()); + Iterable tokens = linguistics + .getTokenizer() + .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); + assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens)); + } } -- cgit v1.2.3