diff options
author | Dainius Jocas <dainius.jocas@gmail.com> | 2023-09-14 22:10:58 +0300 |
---|---|---|
committer | Dainius Jocas <dainius.jocas@gmail.com> | 2023-09-14 22:10:58 +0300 |
commit | f089bfed28febfa82bfe0ad06e269de317d7a4e0 (patch) | |
tree | ec7350ede88959265d032529466e46a6274a7bca /lucene-linguistics | |
parent | 8beac01933b5121187d1cf6dd97cef0b34d1afd2 (diff) |
test: load stopwords file from the classpath
Diffstat (limited to 'lucene-linguistics')
-rw-r--r-- | lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java | 26 | ||||
-rw-r--r-- | lucene-linguistics/src/test/resources/classpath-stopwords.txt | 1 |
2 files changed, 27 insertions, 0 deletions
```diff
diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
index 35373479bff..20a7351d0a0 100644
--- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
+++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
@@ -167,4 +167,30 @@ public class LuceneTokenizerTest {
                 .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
         assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens));
     }
+
+    @Test
+    public void testOptionalPathWithClasspathResources() {
+        String languageCode = Language.ENGLISH.languageCode();
+        LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+                .analysis(
+                        Map.of(languageCode,
+                                new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
+                                        new LuceneAnalysisConfig
+                                                .Analysis
+                                                .TokenFilters
+                                                .Builder()
+                                                .name("englishMinimalStem"),
+                                        new LuceneAnalysisConfig
+                                                .Analysis
+                                                .TokenFilters
+                                                .Builder()
+                                                .name("stop")
+                                                .conf("words", "classpath-stopwords.txt"))))
+                ).build();
+        LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+        Iterable<Token> tokens = linguistics
+                .getTokenizer()
+                .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
+        assertEquals(List.of("and", "Cat"), tokenStrings(tokens));
+    }
 }
diff --git a/lucene-linguistics/src/test/resources/classpath-stopwords.txt b/lucene-linguistics/src/test/resources/classpath-stopwords.txt
new file mode 100644
index 00000000000..4dda64888cb
--- /dev/null
+++ b/lucene-linguistics/src/test/resources/classpath-stopwords.txt
@@ -0,0 +1 @@
+Dog
```