diff options
author | Dainius Jocas <dainius.jocas@gmail.com> | 2023-09-27 12:12:35 +0300 |
---|---|---|
committer | Dainius Jocas <dainius.jocas@gmail.com> | 2023-09-27 12:13:54 +0300 |
commit | fda0d74dc1c5e833f01d96197bd1dac40ced7ad7 (patch) | |
tree | 58c51d9589292d1f0a6dc30fc2b050f72c5e6216 /lucene-linguistics | |
parent | 65c585ffcc50626b171b65eb6b2a0027c8798eff (diff) |
LuceneLinguistics: optionally consider StemMode as analysis key
Diffstat (limited to 'lucene-linguistics')
-rw-r--r-- | lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java | 45 | ||||
-rw-r--r-- | lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java | 29 |
2 files changed, 64 insertions, 10 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java index 67a430a28dc..92ea77cdc13 100644 --- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java @@ -11,9 +11,9 @@ import org.apache.lucene.analysis.custom.CustomAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import java.io.IOException; -import java.nio.file.Path; import java.util.HashMap; import java.util.Map; +import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; import java.util.logging.Logger; @@ -60,13 +60,15 @@ class AnalyzerFactory { } private Analyzer createAnalyzer(AnalyzerKey analyzerKey) { - if (null != config.analysis(analyzerKey.languageCode())) { + LuceneAnalysisConfig.Analysis analysis = analysisConfig(analyzerKey); + if (null != analysis) { log.config("Creating analyzer for " + analyzerKey + " from config"); - return createAnalyzer(analyzerKey, config.analysis(analyzerKey.languageCode())); + return createAnalyzer(analyzerKey, analysis); } - if (null != analyzerComponents.getComponent(analyzerKey.languageCode())) { + Analyzer analyzerFromComponents = fromComponents(analyzerKey); + if (null != analyzerFromComponents) { log.config("Using analyzer for " + analyzerKey + " from components"); - return analyzerComponents.getComponent(analyzerKey.languageCode()); + return analyzerFromComponents; } if (null != defaultAnalyzers.get(analyzerKey.language())) { log.config("Using Analyzer for " + analyzerKey + " from a list of default language analyzers"); @@ -77,6 +79,24 @@ class AnalyzerFactory { return defaultAnalyzer; } + /** + * First, checks if more specific (language + stemMode) analysis is configured. + * Second, checks if analysis is configured only for a languageCode. 
+ */ + private LuceneAnalysisConfig.Analysis analysisConfig(AnalyzerKey analyzerKey) { + LuceneAnalysisConfig.Analysis analysis = config.analysis(analyzerKey.languageCodeAndStemMode()); + return (null != analysis) ? analysis : config.analysis(analyzerKey.languageCode()); + } + + /** + * First, checks if a component is configured for a languageCode + StemMode. + * Second, checks if Analyzer is configured only for a languageCode. + */ + private Analyzer fromComponents(AnalyzerKey analyzerKey) { + Analyzer analyzer = analyzerComponents.getComponent(analyzerKey.languageCodeAndStemMode()); + return (null != analyzer) ? analyzer : analyzerComponents.getComponent(analyzerKey.languageCode()); + } + private Analyzer createAnalyzer(AnalyzerKey analyzerKey, LuceneAnalysisConfig.Analysis analysis) { try { CustomAnalyzer.Builder builder = config.configDir() @@ -143,9 +163,14 @@ class AnalyzerFactory { private record AnalyzerKey(Language language, StemMode stemMode, boolean removeAccents) { - // TODO: Identity here is determined by language only. - // Would it make sense to combine language + stemMode + removeAccents to make - // a composite key so we can have more variations possible? + /** + * Combines the languageCode and the stemMode. + * It allows to specify up to 6 (5 StemModes and only language code) analyzers per language. + * The `/` is used so that it doesn't conflict with ComponentRegistry keys. + */ + public String languageCodeAndStemMode() { + return language.languageCode() + "/" + stemMode.toString(); + } public String languageCode() { return language.languageCode(); @@ -155,12 +180,12 @@ class AnalyzerFactory { public boolean equals(Object o) { if (o == this) return true; if ( ! 
(o instanceof AnalyzerKey other)) return false; - return other.language == this.language; + return other.language == this.language && other.stemMode == this.stemMode; } @Override public int hashCode() { - return language.hashCode(); + return Objects.hash(language, stemMode); } } diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java index 92c369bc60c..fc29fcc0071 100644 --- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java +++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java @@ -197,4 +197,33 @@ public class LuceneTokenizerTest { .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); assertEquals(List.of("and", "Cat"), tokenStrings(tokens)); } + + @Test + public void compositeConfigKey() { + String reversingAnalyzerKey = Language.ENGLISH.languageCode() + + "/" + + StemMode.ALL; + LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() + .analysis( + Map.of(reversingAnalyzerKey, + new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of( + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("reverseString")))) + ).build(); + LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>()); + // Matching StemMode + Iterable<Token> tokens = linguistics + .getTokenizer() + .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); + assertEquals(List.of("sgoD", "dna", "staC"), tokenStrings(tokens)); + // StemMode is different + Iterable<Token> stemModeTokens = linguistics + .getTokenizer() + .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.BEST, false); + assertEquals(List.of("dog", "cat"), tokenStrings(stemModeTokens)); + + } } |