author    Dainius Jocas <dainius.jocas@gmail.com>  2023-09-14 18:14:57 +0300
committer Dainius Jocas <dainius.jocas@gmail.com>  2023-09-14 18:14:57 +0300
commit    8beac01933b5121187d1cf6dd97cef0b34d1afd2 (patch)
tree      158ff555917aa58deedaf583aacf4a833e431ace /lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
parent    3a539203f2cd93d248f46ec4e75922879699e55b (diff)
LuceneLinguistics optional configDir
Diffstat (limited to 'lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java')
-rw-r--r--  lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java  40
1 file changed, 29 insertions(+), 11 deletions(-)
diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
index 21d3a7bd33d..35373479bff 100644
--- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
+++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
@@ -10,10 +10,7 @@ import com.yahoo.language.process.Token;
import org.junit.Test;
import java.io.File;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import static org.junit.Assert.assertEquals;
@@ -47,10 +44,12 @@ public class LuceneTokenizerTest {
}
private Linguistics luceneLinguistics() {
- return new LuceneLinguistics(new LuceneAnalysisConfig.Builder()
- .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
- .build(),
- new ComponentRegistry<>());
+ return new LuceneLinguistics(
+ new LuceneAnalysisConfig.Builder()
+ .configDir(Optional.of(FileReference
+ .mockFileReferenceForUnitTesting(new File("."))))
+ .build(),
+ new ComponentRegistry<>());
}
private void assertToken(String tokenString, Iterator<Token> tokens) {
@@ -76,7 +75,7 @@ public class LuceneTokenizerTest {
public void testAnalyzerConfiguration() {
String languageCode = Language.ENGLISH.languageCode();
LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
- .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .configDir(Optional.of(FileReference.mockFileReferenceForUnitTesting(new File("."))))
.analysis(
Map.of(languageCode,
new LuceneAnalysisConfig
@@ -105,7 +104,7 @@ public class LuceneTokenizerTest {
public void testEnglishStemmerAnalyzerConfiguration() {
String languageCode = Language.ENGLISH.languageCode();
LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
- .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .configDir(Optional.of(FileReference.mockFileReferenceForUnitTesting(new File("."))))
.analysis(
Map.of(languageCode,
new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
@@ -126,7 +125,7 @@ public class LuceneTokenizerTest {
public void testStemmerWithStopWords() {
String languageCode = Language.ENGLISH.languageCode();
LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
- .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .configDir(Optional.of(FileReference.mockFileReferenceForUnitTesting(new File("."))))
.analysis(
Map.of(languageCode,
new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
@@ -149,4 +148,23 @@ public class LuceneTokenizerTest {
assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens));
}
+ @Test
+ public void testOptionalPath() {
+ String languageCode = Language.ENGLISH.languageCode();
+ LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+ .analysis(
+ Map.of(languageCode,
+ new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
+ new LuceneAnalysisConfig
+ .Analysis
+ .TokenFilters
+ .Builder()
+ .name("englishMinimalStem"))))
+ ).build();
+ LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+ Iterable<Token> tokens = linguistics
+ .getTokenizer()
+ .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
+ assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens));
+ }
}
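
In plain Java terms, the change is that configDir on LuceneAnalysisConfig.Builder now takes an Optional<FileReference> and can be omitted entirely, which the new testOptionalPath test exercises. Below is a minimal sketch of the two ways to build the config after this commit. The builder calls mirror those in the diff; the class name ConfigDirSketch and the package names for ComponentRegistry and FileReference are assumptions based on Vespa's usual layout, not something shown in this diff.

    import java.io.File;
    import java.util.Optional;

    import com.yahoo.component.provider.ComponentRegistry;   // assumed package
    import com.yahoo.config.FileReference;                    // assumed package
    import com.yahoo.language.lucene.LuceneAnalysisConfig;
    import com.yahoo.language.lucene.LuceneLinguistics;

    // Hypothetical helper illustrating the builder usage visible in the diff.
    public class ConfigDirSketch {

        // With an explicit config directory: configDir now expects Optional<FileReference>.
        static LuceneLinguistics withConfigDir() {
            LuceneAnalysisConfig config = new LuceneAnalysisConfig.Builder()
                    .configDir(Optional.of(FileReference.mockFileReferenceForUnitTesting(new File("."))))
                    .build();
            return new LuceneLinguistics(config, new ComponentRegistry<>());
        }

        // Without a config directory: the configDir call is simply left out,
        // as the new testOptionalPath test does.
        static LuceneLinguistics withoutConfigDir() {
            LuceneAnalysisConfig config = new LuceneAnalysisConfig.Builder().build();
            return new LuceneLinguistics(config, new ComponentRegistry<>());
        }
    }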