diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-07-31 23:31:50 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-07-31 23:31:50 +0200 |
commit | 74ff5cf2fc8fb1a58edb4f822b25bdafd3ee7145 (patch) | |
tree | a73c5de71f06510b25ed7155e29d7444192a4ecd /lucene-linguistics | |
parent | 5d26801bc63c35705e708d3cc7086f0b0103e909 (diff) |
Make AnalyzerFactory thread safe
Diffstat (limited to 'lucene-linguistics')
-rw-r--r-- | lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java | 76 |
1 file changed, 44 insertions, 32 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java index 45cd4d4f186..71e31de34b3 100644 --- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java @@ -14,6 +14,7 @@ import java.io.IOException; import java.nio.file.Path; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import java.util.logging.Logger; /** @@ -33,7 +34,7 @@ class AnalyzerFactory { // Registry of analyzers per language // The idea is to create analyzers ONLY WHEN they are needed // Analyzers are thread safe so no need to recreate them for every document - private final Map<String, Analyzer> languageAnalyzers = new HashMap<>(); + private final Map<AnalyzerKey, Analyzer> languageAnalyzers = new ConcurrentHashMap<>(); private final Analyzer defaultAnalyzer = new StandardAnalyzer(); @@ -58,43 +59,30 @@ class AnalyzerFactory { * Default analyzer is the `StandardAnalyzer`. 
*/ public Analyzer getAnalyzer(Language language, StemMode stemMode, boolean removeAccents) { - String analyzerKey = generateKey(language, stemMode, removeAccents); + return languageAnalyzers.computeIfAbsent(new AnalyzerKey(language, stemMode, removeAccents), + this::createAnalyzer); + } - // If analyzer for language is already known - if (null != languageAnalyzers.get(analyzerKey)) { - return languageAnalyzers.get(analyzerKey); - } - if (null != config.analysis(analyzerKey)) { - return setAndReturn(analyzerKey, setUpAnalyzer(analyzerKey)); + private Analyzer createAnalyzer(AnalyzerKey analyzerKey) { + if (null != config.analysis(analyzerKey.languageCode())) { + log.config("Creating analyzer for " + analyzerKey + " from config"); + return createAnalyzer(analyzerKey, config.analysis(analyzerKey.languageCode())); } - if (null != analyzerComponents.getComponent(analyzerKey)) { - log.config("Analyzer for language=" + analyzerKey + " is from components."); - return setAndReturn(analyzerKey, analyzerComponents.getComponent(analyzerKey)); + if (null != analyzerComponents.getComponent(analyzerKey.languageCode())) { + log.config("Using analyzer for " + analyzerKey + " from components"); + return analyzerComponents.getComponent(analyzerKey.languageCode()); } - if (null != defaultAnalyzers.get(language)) { - log.config("Analyzer for language=" + analyzerKey + " is from a list of default language analyzers."); - return setAndReturn(analyzerKey, defaultAnalyzers.get(language)); + if (null != defaultAnalyzers.get(analyzerKey.language())) { + log.config("Using Analyzer for " + analyzerKey + " from a list of default language analyzers"); + return defaultAnalyzers.get(analyzerKey.language()); } // set the default analyzer for the language - log.config("StandardAnalyzer is used for language=" + analyzerKey); - return setAndReturn(analyzerKey, defaultAnalyzer); - } - - private Analyzer setAndReturn(String analyzerKey, Analyzer analyzer) { - languageAnalyzers.put(analyzerKey, 
analyzer); - return analyzer; + log.config("StandardAnalyzer is used for " + analyzerKey); + return defaultAnalyzer; } - // TODO: Would it make sense to combine language + stemMode + removeAccents to make - // a composite key so we can have more variations possible? - private String generateKey(Language language, StemMode stemMode, boolean removeAccents) { - return language.languageCode(); - } - - private Analyzer setUpAnalyzer(String analyzerKey) { + private Analyzer createAnalyzer(AnalyzerKey analyzerKey, LuceneAnalysisConfig.Analysis analysis) { try { - LuceneAnalysisConfig.Analysis analysis = config.analysis(analyzerKey); - log.config("Creating analyzer for: '" + analyzerKey + "' with config: " + analysis); CustomAnalyzer.Builder builder = CustomAnalyzer.builder(configDir); builder = withTokenizer(builder, analysis); builder = addCharFilters(builder, analysis); @@ -104,8 +92,8 @@ class AnalyzerFactory { // Failing to set up the Analyzer, should blow up during testing and VAP should not be deployed. // Most likely cause for problems is that a specified resource is not available in VAP. // Unit tests should catch such problems and prevent the VAP being deployed. - throw new RuntimeException("Failed to build analyzer: '" + analyzerKey + - "', with configuration: '" + config.analysis(analyzerKey), e); + throw new RuntimeException("Failed to build analyzer " + analyzerKey + + ", with configuration " + analysis, e); } } @@ -153,4 +141,28 @@ class AnalyzerFactory { return new HashMap<>(map); } + private record AnalyzerKey(Language language, StemMode stemMode, boolean removeAccents) { + + // TODO: Identity here is determined by language only. + // Would it make sense to combine language + stemMode + removeAccents to make + // a composite key so we can have more variations possible? + + public String languageCode() { + return language.languageCode(); + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if ( ! 
(o instanceof AnalyzerKey other)) return false; + return other.language == this.language; + } + + @Override + public int hashCode() { + return language.hashCode(); + } + + } + } |