summaryrefslogtreecommitdiffstats
path: root/lucene-linguistics/src/main/java
diff options
context:
space:
mode:
Diffstat (limited to 'lucene-linguistics/src/main/java')
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java160
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java110
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java82
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java68
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java4
5 files changed, 424 insertions, 0 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
new file mode 100644
index 00000000000..b7d3a618954
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
@@ -0,0 +1,160 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.language.Language;
+import com.yahoo.language.process.StemMode;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharFilterFactory;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenizerFactory;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.logging.Logger;
+
+public class AnalyzerFactory {
+ private static final Logger log = Logger.getLogger(AnalyzerFactory.class.getName());
+
+ private final LuceneAnalysisConfig config;
+
+ // Root config directory for all analysis components
+ private final Path configDir;
+
+ // Registry of analyzers per language
+ // The idea is to create analyzers ONLY WHEN they are needed
+ // Analyzers are thread safe so no need to recreate them for every document
+ private final Map<String, Analyzer> languageAnalyzers = new HashMap<>();
+
+ private final Analyzer defaultAnalyzer = new StandardAnalyzer();
+
+ private final static String STANDARD_TOKENIZER = "standard";
+
+ private final ComponentRegistry<Analyzer> analyzerComponents;
+ private final DefaultAnalyzers defaultAnalyzers;
+
+ public AnalyzerFactory(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
+ this.config = config;
+ this.configDir = config.configDir();
+ this.analyzerComponents = analyzers;
+ this.defaultAnalyzers = DefaultAnalyzers.getInstance();
+ log.info("Available in classpath char filters: " + CharFilterFactory.availableCharFilters());
+ log.info("Available in classpath tokenizers: " + TokenizerFactory.availableTokenizers());
+ log.info("Available in classpath token filters: " + TokenFilterFactory.availableTokenFilters());
+ }
+
+ /**
+ * Retrieves an analyzer with a given params.
+ * Sets up the analyzer if config is provided.
+ * Default analyzer is the `StandardAnalyzer`.
+ * @param language
+ * @param stemMode
+ * @param removeAccents
+ * @return
+ */
+ public Analyzer getAnalyzer(Language language, StemMode stemMode, boolean removeAccents) {
+ String analyzerKey = generateKey(language, stemMode, removeAccents);
+
+ // If analyzer for language is already known
+ if (null != languageAnalyzers.get(analyzerKey)) {
+ return languageAnalyzers.get(analyzerKey);
+ }
+ if (null != config.analysis(analyzerKey)) {
+ return setAndReturn(analyzerKey, setUpAnalyzer(analyzerKey));
+ }
+ if (null != analyzerComponents.getComponent(analyzerKey)) {
+ log.info("Analyzer for language=" + analyzerKey + " is from components.");
+ return setAndReturn(analyzerKey, analyzerComponents.getComponent(analyzerKey));
+ }
+ if (null != defaultAnalyzers.get(language)) {
+ log.info("Analyzer for language=" + analyzerKey + " is from a list of default language analyzers.");
+ return setAndReturn(analyzerKey, defaultAnalyzers.get(language));
+ }
+ // set the default analyzer for the language
+ log.info("StandardAnalyzer is used for language=" + analyzerKey);
+ return setAndReturn(analyzerKey, defaultAnalyzer);
+ }
+
+ private Analyzer setAndReturn(String analyzerKey, Analyzer analyzer) {
+ languageAnalyzers.put(analyzerKey, analyzer);
+ return analyzer;
+ }
+
+ // TODO: Would it make sense to combine language + stemMode + removeAccents to make
+ // a composite key so we can have more variations possible?
+ private String generateKey(Language language, StemMode stemMode, boolean removeAccents) {
+ return language.languageCode();
+ }
+
+ private Analyzer setUpAnalyzer(String analyzerKey) {
+ try {
+ LuceneAnalysisConfig.Analysis analysis = config.analysis(analyzerKey);
+ log.info("Creating analyzer for: '" + analyzerKey + "' with config: " + analysis);
+ CustomAnalyzer.Builder builder = CustomAnalyzer.builder(configDir);
+ builder = withTokenizer(builder, analysis);
+ builder = addCharFilters(builder, analysis);
+ builder = addTokenFilters(builder, analysis);
+ return builder.build();
+ } catch (Exception e) {
+ // Failing to set up the Analyzer, should blow up during testing and VAP should not be deployed.
+ // Most likely cause for problems is that a specified resource is not available in VAP.
+ // Unit tests should catch such problems and prevent the VAP being deployed.
+ log.severe("Failed to build analyzer: '"
+ + analyzerKey
+ + "', with configuration: '"
+ + config.analysis(analyzerKey)
+ + "' with exception: '"
+ + e.getMessage() + "'" );
+ throw new RuntimeException(e);
+ }
+ }
+
+ private CustomAnalyzer.Builder withTokenizer(CustomAnalyzer.Builder builder,
+ LuceneAnalysisConfig.Analysis analysis) throws IOException {
+ if (null == analysis) {
+ // By default we use the "standard" tokenizer
+ return builder.withTokenizer(STANDARD_TOKENIZER, new HashMap<>());
+ }
+ String tokenizerName = analysis.tokenizer().name();
+ Map<String, String> conf = analysis.tokenizer().conf();
+ return builder.withTokenizer(tokenizerName, toModifiable(conf));
+ }
+
+ private CustomAnalyzer.Builder addCharFilters(CustomAnalyzer.Builder builder,
+ LuceneAnalysisConfig.Analysis analysis) throws IOException {
+ if (null == analysis) {
+ // by default there are no char filters
+ return builder;
+ }
+ for (LuceneAnalysisConfig.Analysis.CharFilters charFilter : analysis.charFilters()) {
+ builder.addCharFilter(charFilter.name(), toModifiable(charFilter.conf()));
+ }
+ return builder;
+ }
+
+ private CustomAnalyzer.Builder addTokenFilters(CustomAnalyzer.Builder builder,
+ LuceneAnalysisConfig.Analysis analysis) throws IOException {
+ if (null == analysis) {
+ // by default no token filters are added
+ return builder;
+ }
+ for (LuceneAnalysisConfig.Analysis.TokenFilters tokenFilter : analysis.tokenFilters()) {
+ builder.addTokenFilter(tokenFilter.name(), toModifiable(tokenFilter.conf()));
+ }
+ return builder;
+ }
+
+ /**
+ * A config map coming from the Vespa ConfigInstance is immutable while CustomAnalyzer builders
+ * mutates the map to mark that a param was consumed. Immutable maps can't be mutated!
+ * To overcome this conflict we can wrap the ConfigInstance map in a new HashMap.
+ * @param map
+ * @return Mutable Map
+ */
+ private Map<String, String> toModifiable(Map<String, String> map) {
+ return new HashMap<>(map);
+ }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
new file mode 100644
index 00000000000..955e18474f7
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
@@ -0,0 +1,110 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.language.Language;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
+import org.apache.lucene.analysis.ca.CatalanAnalyzer;
+import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
+import org.apache.lucene.analysis.cz.CzechAnalyzer;
+import org.apache.lucene.analysis.da.DanishAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.el.GreekAnalyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.es.SpanishAnalyzer;
+import org.apache.lucene.analysis.et.EstonianAnalyzer;
+import org.apache.lucene.analysis.eu.BasqueAnalyzer;
+import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.fi.FinnishAnalyzer;
+import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.ga.IrishAnalyzer;
+import org.apache.lucene.analysis.gl.GalicianAnalyzer;
+import org.apache.lucene.analysis.hi.HindiAnalyzer;
+import org.apache.lucene.analysis.hu.HungarianAnalyzer;
+import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
+import org.apache.lucene.analysis.id.IndonesianAnalyzer;
+import org.apache.lucene.analysis.it.ItalianAnalyzer;
+import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
+import org.apache.lucene.analysis.lv.LatvianAnalyzer;
+import org.apache.lucene.analysis.ne.NepaliAnalyzer;
+import org.apache.lucene.analysis.nl.DutchAnalyzer;
+import org.apache.lucene.analysis.no.NorwegianAnalyzer;
+import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
+import org.apache.lucene.analysis.ro.RomanianAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.sr.SerbianAnalyzer;
+import org.apache.lucene.analysis.sv.SwedishAnalyzer;
+import org.apache.lucene.analysis.ta.TamilAnalyzer;
+import org.apache.lucene.analysis.te.TeluguAnalyzer;
+import org.apache.lucene.analysis.th.ThaiAnalyzer;
+import org.apache.lucene.analysis.tr.TurkishAnalyzer;
+
+import java.util.Map;
+
+import static java.util.Map.entry;
+
+/**
+ * Singleton registry of Lucene's per-language default analyzers,
+ * keyed by Vespa {@link Language}.
+ *
+ * <p>Thread-safety: the instance is eagerly initialized in a
+ * {@code static final} field, which the JVM guarantees to be safe to read
+ * from any thread — unlike the previous unsynchronized lazy
+ * check-then-act initialization, which could race and create multiple
+ * instances. The backing map is immutable ({@link Map#ofEntries}).</p>
+ */
+public class DefaultAnalyzers {
+
+    // Eager initialization: thread safe without synchronization or volatile.
+    private static final DefaultAnalyzers INSTANCE = new DefaultAnalyzers();
+
+    private final Map<Language, Analyzer> analyzerClasses;
+
+    private DefaultAnalyzers() {
+        analyzerClasses = Map.ofEntries(
+                entry(Language.ARABIC, new ArabicAnalyzer()),
+                entry(Language.BULGARIAN, new BulgarianAnalyzer()),
+                entry(Language.BENGALI, new BengaliAnalyzer()),
+                // analyzerClasses.put(Language.BRASILIAN, new BrazilianAnalyzer())
+                entry(Language.CATALAN, new CatalanAnalyzer()),
+                // cjk analyzer?
+                entry(Language.KURDISH, new SoraniAnalyzer()),
+                entry(Language.CZECH, new CzechAnalyzer()),
+                entry(Language.DANISH, new DanishAnalyzer()),
+                entry(Language.GERMAN, new GermanAnalyzer()),
+                entry(Language.GREEK, new GreekAnalyzer()),
+                entry(Language.ENGLISH, new EnglishAnalyzer()),
+                entry(Language.SPANISH, new SpanishAnalyzer()),
+                entry(Language.ESTONIAN, new EstonianAnalyzer()),
+                entry(Language.BASQUE, new BasqueAnalyzer()),
+                entry(Language.PERSIAN, new PersianAnalyzer()),
+                entry(Language.FINNISH, new FinnishAnalyzer()),
+                entry(Language.FRENCH, new FrenchAnalyzer()),
+                entry(Language.IRISH, new IrishAnalyzer()),
+                entry(Language.GALICIAN, new GalicianAnalyzer()),
+                entry(Language.HINDI, new HindiAnalyzer()),
+                entry(Language.HUNGARIAN, new HungarianAnalyzer()),
+                entry(Language.ARMENIAN, new ArmenianAnalyzer()),
+                entry(Language.INDONESIAN, new IndonesianAnalyzer()),
+                entry(Language.ITALIAN, new ItalianAnalyzer()),
+                entry(Language.LITHUANIAN, new LithuanianAnalyzer()),
+                entry(Language.LATVIAN, new LatvianAnalyzer()),
+                entry(Language.NEPALI, new NepaliAnalyzer()),
+                entry(Language.DUTCH, new DutchAnalyzer()),
+                entry(Language.NORWEGIAN_BOKMAL, new NorwegianAnalyzer()),
+                entry(Language.PORTUGUESE, new PortugueseAnalyzer()),
+                entry(Language.ROMANIAN, new RomanianAnalyzer()),
+                entry(Language.RUSSIAN, new RussianAnalyzer()),
+                entry(Language.SERBIAN, new SerbianAnalyzer()),
+                entry(Language.SWEDISH, new SwedishAnalyzer()),
+                entry(Language.TAMIL, new TamilAnalyzer()),
+                entry(Language.TELUGU, new TeluguAnalyzer()),
+                entry(Language.THAI, new ThaiAnalyzer()),
+                entry(Language.TURKISH, new TurkishAnalyzer())
+        );
+    }
+
+    /** Returns the shared instance. */
+    public static DefaultAnalyzers getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Returns the default analyzer for the language, or {@code null} if none is registered.
+     */
+    public Analyzer get(Language language) {
+        return analyzerClasses.get(language);
+    }
+
+    /**
+     * Returns the default analyzer for the language tag, or {@code null} if none is registered.
+     */
+    public Analyzer get(String languageCode) {
+        return analyzerClasses.get(Language.fromLanguageTag(languageCode));
+    }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
new file mode 100644
index 00000000000..b5c5ba47ab6
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
@@ -0,0 +1,82 @@
+package com.yahoo.language.lucene;
+
+import com.google.inject.Inject;
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.*;
+import com.yahoo.language.simple.SimpleLinguistics;
+import org.apache.lucene.analysis.Analyzer;
+
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+/**
+ * Factory of Lucene based linguistics processor.
+ * As described in the Linguistics docstring
+ * > the tokenizer should typically stem, transform and normalize
+ * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOP.
+ *
+ * TODO: docs for all available analysis components.
+ * TODO: some registry for available language Analyzers.
+ */
+/**
+ * Factory of Lucene based linguistics processor.
+ * As described in the Linguistics docstring
+ * > the tokenizer should typically stem, transform and normalize
+ * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOP.
+ *
+ * TODO: docs for all available analysis components.
+ * TODO: some registry for available language Analyzers.
+ */
+public class LuceneLinguistics extends SimpleLinguistics {
+
+    private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName());
+
+    private final LuceneAnalysisConfig config;
+    private final Tokenizer tokenizer;
+    private final Stemmer stemmer;
+    private final Segmenter segmenter;
+    private final Normalizer normalizer;
+    private final Transformer transformer;
+
+    @Inject
+    public LuceneLinguistics(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
+        log.info("Creating LuceneLinguistics with: " + config);
+        this.config = config;
+        this.tokenizer = new LuceneTokenizer(config, analyzers);
+        // NOOP stemmer: returns the word unchanged, wrapped as a single stem.
+        this.stemmer = (word, mode, lang) -> {
+            StemList singleStem = new StemList();
+            singleStem.add(word);
+            ArrayList<StemList> result = new ArrayList<>();
+            result.add(singleStem);
+            return result;
+        };
+        // Segmenter: delegates to the Lucene tokenizer and collects token strings.
+        this.segmenter = (text, lang) -> {
+            ArrayList<String> segments = new ArrayList<>();
+            for (Token token : tokenizer.tokenize(text, lang, StemMode.NONE, false)) {
+                segments.add(token.getTokenString());
+            }
+            return segments;
+        };
+        // NOOP normalizer
+        this.normalizer = (text) -> text;
+        // NOOP transformer
+        this.transformer = (text, lang) -> text;
+    }
+
+    @Override
+    public Stemmer getStemmer() { return stemmer; }
+
+    @Override
+    public Tokenizer getTokenizer() { return tokenizer; }
+
+    @Override
+    public Normalizer getNormalizer() { return normalizer; }
+
+    @Override
+    public Transformer getTransformer() { return transformer; }
+
+    @Override
+    public Segmenter getSegmenter() { return segmenter; }
+
+    public LuceneAnalysisConfig getConfig() {
+        return config;
+    }
+
+    /** Two LuceneLinguistics are equal iff their configs are equal. */
+    @Override
+    public boolean equals(Linguistics other) {
+        if (!(other instanceof LuceneLinguistics)) return false;
+        // Config actually determines if Linguistics are equal
+        return config.equals(((LuceneLinguistics) other).getConfig());
+    }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
new file mode 100644
index 00000000000..0cde849fd6e
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
@@ -0,0 +1,68 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.language.Language;
+import com.yahoo.language.process.*;
+import com.yahoo.language.simple.SimpleToken;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Vespa {@link Tokenizer} backed by Lucene analyzers.
+ *
+ * <p>Delegates analyzer selection to {@link AnalyzerFactory} and converts
+ * the resulting Lucene token stream into Vespa {@link Token}s.</p>
+ */
+public class LuceneTokenizer implements Tokenizer {
+
+    private static final Logger log = Logger.getLogger(LuceneTokenizer.class.getName());
+
+    // Dummy value, just to stuff the Lucene interface.
+    private final static String FIELD_NAME = "F";
+
+    private final AnalyzerFactory analyzerFactory;
+
+    public LuceneTokenizer(LuceneAnalysisConfig config) {
+        this(config, new ComponentRegistry<>());
+    }
+
+    public LuceneTokenizer(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
+        this.analyzerFactory = new AnalyzerFactory(config, analyzers);
+    }
+
+    /**
+     * Tokenizes the input with the analyzer selected for the given language.
+     *
+     * @return the tokens, or an empty list for empty input
+     */
+    @Override
+    public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+        if (input.isEmpty()) return List.of();
+
+        List<Token> tokens = textToTokens(input, analyzerFactory.getAnalyzer(language, stemMode, removeAccents));
+        log.log(Level.FINEST, "Tokenized '" + language + "' text='" + input + "' into: n=" + tokens.size() + ", tokens=" + tokens);
+        return tokens;
+    }
+
+    /** Runs the Lucene analysis chain over the text and converts each emitted term to a Vespa token. */
+    private List<Token> textToTokens(String text, Analyzer analyzer) {
+        List<Token> tokens = new ArrayList<>();
+        // try-with-resources: TokenStream is Closeable, and the previous code
+        // leaked the stream when reset()/incrementToken() threw before close().
+        try (TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text)) {
+            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                // TODO: is SimpleToken good enough? Maybe a custom implementation.
+                // TODO: what to do with cases when multiple tokens are inserted into the position?
+                String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset());
+                String tokenString = charTermAttribute.toString();
+                tokens.add(new SimpleToken(originalString, tokenString)
+                        .setType(TokenType.ALPHABETIC)
+                        .setOffset(offsetAttribute.startOffset())
+                        .setScript(TokenScript.UNKNOWN));
+            }
+            tokenStream.end();
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to analyze: " + text, e);
+        }
+        return tokens;
+    }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java
new file mode 100644
index 00000000000..14330723224
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java
@@ -0,0 +1,4 @@
+/**
+ * Lucene-based linguistics implementation for Vespa.
+ * {@code @ExportPackage} makes this package visible outside the OSGi bundle.
+ */
+@ExportPackage
+package com.yahoo.language.lucene;
+
+import com.yahoo.osgi.annotation.ExportPackage;