diff options
Diffstat (limited to 'lucene-linguistics/src/main')
6 files changed, 438 insertions, 0 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
new file mode 100644
index 00000000000..b7d3a618954
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
@@ -0,0 +1,160 @@
package com.yahoo.language.lucene;

import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Logger;

/**
 * Creates a Lucene {@link Analyzer} per language key, preferring (in order):
 * explicit Vespa configuration, injected Analyzer components, the bundled
 * per-language defaults, and finally {@link StandardAnalyzer}.
 * Created analyzers are cached and reused; Lucene analyzers are thread safe.
 */
public class AnalyzerFactory {
    private static final Logger log = Logger.getLogger(AnalyzerFactory.class.getName());

    private final static String STANDARD_TOKENIZER = "standard";

    private final LuceneAnalysisConfig config;

    // Root config directory for all analysis components
    private final Path configDir;

    // Registry of analyzers per language.
    // The idea is to create analyzers ONLY WHEN they are needed.
    // Analyzers are thread safe so no need to recreate them for every document.
    // ConcurrentHashMap because tokenization may run on several threads at once;
    // a plain HashMap with a get-then-put sequence is racy under that use.
    private final Map<String, Analyzer> languageAnalyzers = new ConcurrentHashMap<>();

    private final Analyzer defaultAnalyzer = new StandardAnalyzer();

    private final ComponentRegistry<Analyzer> analyzerComponents;
    private final DefaultAnalyzers defaultAnalyzers;

    public AnalyzerFactory(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
        this.config = config;
        this.configDir = config.configDir();
        this.analyzerComponents = analyzers;
        this.defaultAnalyzers = DefaultAnalyzers.getInstance();
        log.info("Available in classpath char filters: " + CharFilterFactory.availableCharFilters());
        log.info("Available in classpath tokenizers: " + TokenizerFactory.availableTokenizers());
        log.info("Available in classpath token filters: " + TokenFilterFactory.availableTokenFilters());
    }

    /**
     * Retrieves an analyzer for the given parameters, creating and caching it on first use.
     * Sets up the analyzer if config is provided.
     * Default analyzer is the `StandardAnalyzer`.
     * @param language the text language
     * @param stemMode requested stem mode (not yet part of the cache key, see generateKey)
     * @param removeAccents whether accents should be removed (not yet part of the cache key)
     * @return a reusable, thread-safe Analyzer; never null
     */
    public Analyzer getAnalyzer(Language language, StemMode stemMode, boolean removeAccents) {
        String analyzerKey = generateKey(language, stemMode, removeAccents);
        // computeIfAbsent makes the check-then-create atomic, so each analyzer
        // variant is built at most once even under concurrent access.
        return languageAnalyzers.computeIfAbsent(analyzerKey, key -> createAnalyzer(language, key));
    }

    /** Resolves an analyzer for a key in priority order: config, component, per-language default, standard. */
    private Analyzer createAnalyzer(Language language, String analyzerKey) {
        if (null != config.analysis(analyzerKey)) {
            return setUpAnalyzer(analyzerKey);
        }
        if (null != analyzerComponents.getComponent(analyzerKey)) {
            log.info("Analyzer for language=" + analyzerKey + " is from components.");
            return analyzerComponents.getComponent(analyzerKey);
        }
        if (null != defaultAnalyzers.get(language)) {
            log.info("Analyzer for language=" + analyzerKey + " is from a list of default language analyzers.");
            return defaultAnalyzers.get(language);
        }
        // set the default analyzer for the language
        log.info("StandardAnalyzer is used for language=" + analyzerKey);
        return defaultAnalyzer;
    }

    // TODO: Would it make sense to combine language + stemMode + removeAccents to make
    // a composite key so we can have more variations possible?
    private String generateKey(Language language, StemMode stemMode, boolean removeAccents) {
        return language.languageCode();
    }

    /** Builds a CustomAnalyzer from the configured tokenizer, char filters and token filters. */
    private Analyzer setUpAnalyzer(String analyzerKey) {
        try {
            LuceneAnalysisConfig.Analysis analysis = config.analysis(analyzerKey);
            log.info("Creating analyzer for: '" + analyzerKey + "' with config: " + analysis);
            CustomAnalyzer.Builder builder = CustomAnalyzer.builder(configDir);
            builder = withTokenizer(builder, analysis);
            builder = addCharFilters(builder, analysis);
            builder = addTokenFilters(builder, analysis);
            return builder.build();
        } catch (Exception e) {
            // Failing to set up the Analyzer, should blow up during testing and VAP should not be deployed.
            // Most likely cause for problems is that a specified resource is not available in VAP.
            // Unit tests should catch such problems and prevent the VAP being deployed.
            log.severe("Failed to build analyzer: '"
                    + analyzerKey
                    + "', with configuration: '"
                    + config.analysis(analyzerKey)
                    + "' with exception: '"
                    + e.getMessage() + "'" );
            throw new RuntimeException(e);
        }
    }

    private CustomAnalyzer.Builder withTokenizer(CustomAnalyzer.Builder builder,
                                                 LuceneAnalysisConfig.Analysis analysis) throws IOException {
        if (null == analysis) {
            // By default we use the "standard" tokenizer
            return builder.withTokenizer(STANDARD_TOKENIZER, new HashMap<>());
        }
        String tokenizerName = analysis.tokenizer().name();
        Map<String, String> conf = analysis.tokenizer().conf();
        return builder.withTokenizer(tokenizerName, toModifiable(conf));
    }

    private CustomAnalyzer.Builder addCharFilters(CustomAnalyzer.Builder builder,
                                                  LuceneAnalysisConfig.Analysis analysis) throws IOException {
        if (null == analysis) {
            // by default there are no char filters
            return builder;
        }
        for (LuceneAnalysisConfig.Analysis.CharFilters charFilter : analysis.charFilters()) {
            builder.addCharFilter(charFilter.name(), toModifiable(charFilter.conf()));
        }
        return builder;
    }

    private CustomAnalyzer.Builder addTokenFilters(CustomAnalyzer.Builder builder,
                                                   LuceneAnalysisConfig.Analysis analysis) throws IOException {
        if (null == analysis) {
            // by default no token filters are added
            return builder;
        }
        for (LuceneAnalysisConfig.Analysis.TokenFilters tokenFilter : analysis.tokenFilters()) {
            builder.addTokenFilter(tokenFilter.name(), toModifiable(tokenFilter.conf()));
        }
        return builder;
    }

    /**
     * A config map coming from the Vespa ConfigInstance is immutable while CustomAnalyzer builders
     * mutate the map to mark that a param was consumed. Immutable maps can't be mutated!
     * To overcome this conflict we wrap the ConfigInstance map in a new HashMap.
     * @param map a possibly immutable configuration map
     * @return a mutable copy of the map
     */
    private Map<String, String> toModifiable(Map<String, String> map) {
        return new HashMap<>(map);
    }
}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
new file mode 100644
index 00000000000..955e18474f7
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
@@ -0,0 +1,110 @@
package com.yahoo.language.lucene;

import com.yahoo.language.Language;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import
org.apache.lucene.analysis.eu.BasqueAnalyzer; +import org.apache.lucene.analysis.fa.PersianAnalyzer; +import org.apache.lucene.analysis.fi.FinnishAnalyzer; +import org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.ga.IrishAnalyzer; +import org.apache.lucene.analysis.gl.GalicianAnalyzer; +import org.apache.lucene.analysis.hi.HindiAnalyzer; +import org.apache.lucene.analysis.hu.HungarianAnalyzer; +import org.apache.lucene.analysis.hy.ArmenianAnalyzer; +import org.apache.lucene.analysis.id.IndonesianAnalyzer; +import org.apache.lucene.analysis.it.ItalianAnalyzer; +import org.apache.lucene.analysis.lt.LithuanianAnalyzer; +import org.apache.lucene.analysis.lv.LatvianAnalyzer; +import org.apache.lucene.analysis.ne.NepaliAnalyzer; +import org.apache.lucene.analysis.nl.DutchAnalyzer; +import org.apache.lucene.analysis.no.NorwegianAnalyzer; +import org.apache.lucene.analysis.pt.PortugueseAnalyzer; +import org.apache.lucene.analysis.ro.RomanianAnalyzer; +import org.apache.lucene.analysis.ru.RussianAnalyzer; +import org.apache.lucene.analysis.sr.SerbianAnalyzer; +import org.apache.lucene.analysis.sv.SwedishAnalyzer; +import org.apache.lucene.analysis.ta.TamilAnalyzer; +import org.apache.lucene.analysis.te.TeluguAnalyzer; +import org.apache.lucene.analysis.th.ThaiAnalyzer; +import org.apache.lucene.analysis.tr.TurkishAnalyzer; + +import java.util.Map; + +import static java.util.Map.entry; + +public class DefaultAnalyzers { + + private static DefaultAnalyzers INSTANCE; + private final Map<Language, Analyzer> analyzerClasses; + + private DefaultAnalyzers() { + analyzerClasses = Map.ofEntries( + entry(Language.ARABIC, new ArabicAnalyzer()), + entry(Language.BULGARIAN, new BulgarianAnalyzer()), + entry(Language.BENGALI, new BengaliAnalyzer()), + // analyzerClasses.put(Language.BRASILIAN, new BrazilianAnalyzer()) + entry(Language.CATALAN, new CatalanAnalyzer()), + // cjk analyzer? 
+ entry(Language.KURDISH, new SoraniAnalyzer()), + entry(Language.CZECH, new CzechAnalyzer()), + entry(Language.DANISH, new DanishAnalyzer()), + entry(Language.GERMAN, new GermanAnalyzer()), + entry(Language.GREEK, new GreekAnalyzer()), + entry(Language.ENGLISH, new EnglishAnalyzer()), + entry(Language.SPANISH, new SpanishAnalyzer()), + entry(Language.ESTONIAN, new EstonianAnalyzer()), + entry(Language.BASQUE, new BasqueAnalyzer()), + entry(Language.PERSIAN, new PersianAnalyzer()), + entry(Language.FINNISH, new FinnishAnalyzer()), + entry(Language.FRENCH, new FrenchAnalyzer()), + entry(Language.IRISH, new IrishAnalyzer()), + entry(Language.GALICIAN, new GalicianAnalyzer()), + entry(Language.HINDI, new HindiAnalyzer()), + entry(Language.HUNGARIAN, new HungarianAnalyzer()), + entry(Language.ARMENIAN, new ArmenianAnalyzer()), + entry(Language.INDONESIAN, new IndonesianAnalyzer()), + entry(Language.ITALIAN, new ItalianAnalyzer()), + entry(Language.LITHUANIAN, new LithuanianAnalyzer()), + entry(Language.LATVIAN, new LatvianAnalyzer()), + entry(Language.NEPALI, new NepaliAnalyzer()), + entry(Language.DUTCH, new DutchAnalyzer()), + entry(Language.NORWEGIAN_BOKMAL, new NorwegianAnalyzer()), + entry(Language.PORTUGUESE, new PortugueseAnalyzer()), + entry(Language.ROMANIAN, new RomanianAnalyzer()), + entry(Language.RUSSIAN, new RussianAnalyzer()), + entry(Language.SERBIAN, new SerbianAnalyzer()), + entry(Language.SWEDISH, new SwedishAnalyzer()), + entry(Language.TAMIL, new TamilAnalyzer()), + entry(Language.TELUGU, new TeluguAnalyzer()), + entry(Language.THAI, new ThaiAnalyzer()), + entry(Language.TURKISH, new TurkishAnalyzer()) + ); + } + + public static DefaultAnalyzers getInstance() { + if (INSTANCE == null) { + INSTANCE = new DefaultAnalyzers(); + } + return INSTANCE; + } + + public Analyzer get(Language language) { + return analyzerClasses.get(language); + } + + public Analyzer get(String languageCode) { + return 
analyzerClasses.get(Language.fromLanguageTag(languageCode)); + } +} diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java new file mode 100644 index 00000000000..b5c5ba47ab6 --- /dev/null +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java @@ -0,0 +1,82 @@ +package com.yahoo.language.lucene; + +import com.google.inject.Inject; +import com.yahoo.component.provider.ComponentRegistry; +import com.yahoo.language.Linguistics; +import com.yahoo.language.process.*; +import com.yahoo.language.simple.SimpleLinguistics; +import org.apache.lucene.analysis.Analyzer; + +import java.util.ArrayList; +import java.util.logging.Logger; + +/** + * Factory of Lucene based linguistics processor. + * As described in the Linguistics docstring + * > the tokenizer should typically stem, transform and normalize + * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOP. + * + * TODO: docs for all available analysis components. + * TODO: some registry for available language Analyzers. 
+ */ +public class LuceneLinguistics extends SimpleLinguistics { + + private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName()); + private final Normalizer normalizer; + private final Transformer transformer; + private final Tokenizer tokenizer; + private final Stemmer stemmer; + private final Segmenter segmenter; + private final LuceneAnalysisConfig config; + + @Inject + public LuceneLinguistics(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) { + log.info("Creating LuceneLinguistics with: " + config); + this.config = config; + this.tokenizer = new LuceneTokenizer(config, analyzers); + // NOOP stemmer + this.stemmer = (word, stemMode, language) -> { + ArrayList<StemList> stemLists = new ArrayList<>(); + StemList stems = new StemList(); + stems.add(word); + stemLists.add(stems); + return stemLists; + }; + // Segmenter that just wraps a tokenizer + this.segmenter = (string, language) -> { + ArrayList<String> segments = new ArrayList<>(); + Iterable<Token> tokens = tokenizer.tokenize(string, language, StemMode.NONE, false); + tokens.forEach(token -> segments.add(token.getTokenString())); + return segments; + }; + // NOOP normalizer + this.normalizer = (string) -> string; + // NOOP transformer + this.transformer = (string, language) -> string; + } + + @Override + public Stemmer getStemmer() { return stemmer; } + + @Override + public Tokenizer getTokenizer() { return tokenizer; } + + @Override + public Normalizer getNormalizer() { return normalizer; } + + @Override + public Transformer getTransformer() { return transformer; } + + @Override + public Segmenter getSegmenter() { return segmenter; } + + public LuceneAnalysisConfig getConfig() { + return config; + } + + @Override + public boolean equals(Linguistics other) { + return (other instanceof LuceneLinguistics) + // Config actually determines if Linguistics are equal + && config.equals(((LuceneLinguistics) other).getConfig()); } +} diff --git 
a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java new file mode 100644 index 00000000000..0cde849fd6e --- /dev/null +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java @@ -0,0 +1,68 @@ +package com.yahoo.language.lucene; + +import com.yahoo.component.provider.ComponentRegistry; +import com.yahoo.language.Language; +import com.yahoo.language.process.*; +import com.yahoo.language.simple.SimpleToken; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +public class LuceneTokenizer implements Tokenizer { + + private static final Logger log = Logger.getLogger(LuceneTokenizer.class.getName()); + + // Dummy value, just to stuff the Lucene interface. 
+ private final static String FIELD_NAME = "F"; + + private final AnalyzerFactory analyzerFactory; + + public LuceneTokenizer(LuceneAnalysisConfig config) { + this(config, new ComponentRegistry<>()); + } + public LuceneTokenizer(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) { + this.analyzerFactory = new AnalyzerFactory(config, analyzers); + } + + @Override + public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { + if (input.isEmpty()) return List.of(); + + List<Token> tokens = textToTokens(input, analyzerFactory.getAnalyzer(language, stemMode, removeAccents)); + log.log(Level.FINEST, "Tokenized '" + language + "' text='" + input + "' into: n=" + tokens.size() + ", tokens=" + tokens); + return tokens; + } + + private List<Token> textToTokens(String text, Analyzer analyzer) { + List<Token> tokens = new ArrayList<>(); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); + + CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); + try { + tokenStream.reset(); + while (tokenStream.incrementToken()) { + // TODO: is SimpleToken good enough? Maybe a custom implementation. + // TODO: what to do with cases when multiple tokens are inserted into the position? 
+ String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()); + String tokenString = charTermAttribute.toString(); + tokens.add(new SimpleToken(originalString, tokenString) + .setType(TokenType.ALPHABETIC) + .setOffset(offsetAttribute.startOffset()) + .setScript(TokenScript.UNKNOWN)); + } + tokenStream.end(); + tokenStream.close(); + } catch (IOException e) { + throw new RuntimeException("Failed to analyze: " + text, e); + } + return tokens; + } +} diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java new file mode 100644 index 00000000000..14330723224 --- /dev/null +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java @@ -0,0 +1,4 @@ +@ExportPackage +package com.yahoo.language.lucene; + +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def b/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def new file mode 100644 index 00000000000..e4b5037dcbe --- /dev/null +++ b/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def @@ -0,0 +1,14 @@ +package=com.yahoo.language.lucene + +# The schema ("type") for an application specified config type +# See +# - https://docs.vespa.ai/en/reference/config-files.html + +configDir path +analysis{}.tokenizer.name string default=standard +analysis{}.tokenizer.conf{} string + +analysis{}.charFilters[].name string +analysis{}.charFilters[].conf{} string +analysis{}.tokenFilters[].name string +analysis{}.tokenFilters[].conf{} string |