summaryrefslogtreecommitdiffstats
path: root/lucene-linguistics
diff options
context:
space:
mode:
authorDainius Jocas <dainius.jocas@gmail.com>2023-09-27 12:12:35 +0300
committerDainius Jocas <dainius.jocas@gmail.com>2023-09-27 12:13:54 +0300
commitfda0d74dc1c5e833f01d96197bd1dac40ced7ad7 (patch)
tree58c51d9589292d1f0a6dc30fc2b050f72c5e6216 /lucene-linguistics
parent65c585ffcc50626b171b65eb6b2a0027c8798eff (diff)
LuceneLinguistics: optionally consider StemMode as analysis key
Diffstat (limited to 'lucene-linguistics')
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java45
-rw-r--r--lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java29
2 files changed, 64 insertions, 10 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
index 67a430a28dc..92ea77cdc13 100644
--- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
@@ -11,9 +11,9 @@ import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
-import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
+import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Logger;
@@ -60,13 +60,15 @@ class AnalyzerFactory {
}
private Analyzer createAnalyzer(AnalyzerKey analyzerKey) {
- if (null != config.analysis(analyzerKey.languageCode())) {
+ LuceneAnalysisConfig.Analysis analysis = analysisConfig(analyzerKey);
+ if (null != analysis) {
log.config("Creating analyzer for " + analyzerKey + " from config");
- return createAnalyzer(analyzerKey, config.analysis(analyzerKey.languageCode()));
+ return createAnalyzer(analyzerKey, analysis);
}
- if (null != analyzerComponents.getComponent(analyzerKey.languageCode())) {
+ Analyzer analyzerFromComponents = fromComponents(analyzerKey);
+ if (null != analyzerFromComponents) {
log.config("Using analyzer for " + analyzerKey + " from components");
- return analyzerComponents.getComponent(analyzerKey.languageCode());
+ return analyzerFromComponents;
}
if (null != defaultAnalyzers.get(analyzerKey.language())) {
log.config("Using Analyzer for " + analyzerKey + " from a list of default language analyzers");
@@ -77,6 +79,24 @@ class AnalyzerFactory {
return defaultAnalyzer;
}
+ /**
+ * First, checks whether a more specific (languageCode + StemMode) analysis is configured.
+ * If there is none, falls back to the analysis configured for the languageCode alone.
+ */
+ private LuceneAnalysisConfig.Analysis analysisConfig(AnalyzerKey analyzerKey) {
+ LuceneAnalysisConfig.Analysis analysis = config.analysis(analyzerKey.languageCodeAndStemMode());
+ return (null != analysis) ? analysis : config.analysis(analyzerKey.languageCode());
+ }
+
+ /**
+ * First, checks whether an Analyzer component is registered for the languageCode + StemMode key.
+ * If there is none, falls back to the component registered for the languageCode alone.
+ */
+ private Analyzer fromComponents(AnalyzerKey analyzerKey) {
+ Analyzer analyzer = analyzerComponents.getComponent(analyzerKey.languageCodeAndStemMode());
+ return (null != analyzer) ? analyzer : analyzerComponents.getComponent(analyzerKey.languageCode());
+ }
+
private Analyzer createAnalyzer(AnalyzerKey analyzerKey, LuceneAnalysisConfig.Analysis analysis) {
try {
CustomAnalyzer.Builder builder = config.configDir()
@@ -143,9 +163,14 @@ class AnalyzerFactory {
private record AnalyzerKey(Language language, StemMode stemMode, boolean removeAccents) {
- // TODO: Identity here is determined by language only.
- // Would it make sense to combine language + stemMode + removeAccents to make
- // a composite key so we can have more variations possible?
+ /**
+ * Combines the languageCode and the stemMode.
+ * This allows up to 6 analyzers to be specified per language: one for each of the 5 StemModes, plus one keyed by the language code alone.
+ * The `/` is used so that it doesn't conflict with ComponentRegistry keys.
+ */
+ public String languageCodeAndStemMode() {
+ return language.languageCode() + "/" + stemMode.toString();
+ }
public String languageCode() {
return language.languageCode();
@@ -155,12 +180,12 @@ class AnalyzerFactory {
public boolean equals(Object o) {
if (o == this) return true;
if ( ! (o instanceof AnalyzerKey other)) return false;
- return other.language == this.language;
+ return other.language == this.language && other.stemMode == this.stemMode;
}
@Override
public int hashCode() {
- return language.hashCode();
+ return Objects.hash(language, stemMode);
}
}
diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
index 92c369bc60c..fc29fcc0071 100644
--- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
+++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
@@ -197,4 +197,33 @@ public class LuceneTokenizerTest {
.tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
assertEquals(List.of("and", "Cat"), tokenStrings(tokens));
}
+
+ @Test
+ public void compositeConfigKey() {
+ String reversingAnalyzerKey = Language.ENGLISH.languageCode()
+ + "/"
+ + StemMode.ALL;
+ LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+ .analysis(
+ Map.of(reversingAnalyzerKey,
+ new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
+ new LuceneAnalysisConfig
+ .Analysis
+ .TokenFilters
+ .Builder()
+ .name("reverseString"))))
+ ).build();
+ LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+ // Matching StemMode
+ Iterable<Token> tokens = linguistics
+ .getTokenizer()
+ .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
+ assertEquals(List.of("sgoD", "dna", "staC"), tokenStrings(tokens));
+ // StemMode is different
+ Iterable<Token> stemModeTokens = linguistics
+ .getTokenizer()
+ .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.BEST, false);
+ assertEquals(List.of("dog", "cat"), tokenStrings(stemModeTokens));
+
+ }
}