From f089bfed28febfa82bfe0ad06e269de317d7a4e0 Mon Sep 17 00:00:00 2001 From: Dainius Jocas Date: Thu, 14 Sep 2023 22:10:58 +0300 Subject: test: load stopwords file from the classpath --- .../yahoo/language/lucene/LuceneTokenizerTest.java | 26 ++++++++++++++++++++++ .../src/test/resources/classpath-stopwords.txt | 1 + 2 files changed, 27 insertions(+) create mode 100644 lucene-linguistics/src/test/resources/classpath-stopwords.txt (limited to 'lucene-linguistics/src/test') diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java index 35373479bff..20a7351d0a0 100644 --- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java +++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java @@ -167,4 +167,30 @@ public class LuceneTokenizerTest { .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens)); } + + @Test + public void testOptionalPathWithClasspathResources() { + String languageCode = Language.ENGLISH.languageCode(); + LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() + .analysis( + Map.of(languageCode, + new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of( + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("englishMinimalStem"), + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("stop") + .conf("words", "classpath-stopwords.txt")))) + ).build(); + LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>()); + Iterable<Token> tokens = linguistics + .getTokenizer() + .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); + assertEquals(List.of("and", "Cat"), tokenStrings(tokens)); + } } diff --git a/lucene-linguistics/src/test/resources/classpath-stopwords.txt 
b/lucene-linguistics/src/test/resources/classpath-stopwords.txt new file mode 100644 index 00000000000..4dda64888cb --- /dev/null +++ b/lucene-linguistics/src/test/resources/classpath-stopwords.txt @@ -0,0 +1 @@ +Dog -- cgit v1.2.3