From 5a60f6f3ae8e99f1f3de10e22a1f055d03fb37db Mon Sep 17 00:00:00 2001 From: Dainius Jocas Date: Mon, 31 Jul 2023 13:27:43 +0300 Subject: integrate Lucene Linguistics into the vespa project --- lucene-linguistics/README.md | 93 ++++++++++++ lucene-linguistics/abi-spec.json | 1 + lucene-linguistics/pom.xml | 107 ++++++++++++++ .../com/yahoo/language/lucene/AnalyzerFactory.java | 160 +++++++++++++++++++++ .../yahoo/language/lucene/DefaultAnalyzers.java | 110 ++++++++++++++ .../yahoo/language/lucene/LuceneLinguistics.java | 82 +++++++++++ .../com/yahoo/language/lucene/LuceneTokenizer.java | 68 +++++++++ .../com/yahoo/language/lucene/package-info.java | 4 + .../configdefinitions/lucene-analysis.def | 14 ++ .../yahoo/language/lucene/LuceneTokenizerTest.java | 139 ++++++++++++++++++ .../src/test/resources/stopwords.txt | 1 + 11 files changed, 779 insertions(+) create mode 100644 lucene-linguistics/README.md create mode 100644 lucene-linguistics/abi-spec.json create mode 100644 lucene-linguistics/pom.xml create mode 100644 lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java create mode 100644 lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java create mode 100644 lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java create mode 100644 lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java create mode 100644 lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java create mode 100644 lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def create mode 100644 lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java create mode 100644 lucene-linguistics/src/test/resources/stopwords.txt (limited to 'lucene-linguistics') diff --git a/lucene-linguistics/README.md b/lucene-linguistics/README.md new file mode 100644 index 00000000000..6329811e458 --- /dev/null +++ b/lucene-linguistics/README.md @@ -0,0 +1,93 
@@ +# Vespa Lucene Linguistics + +Linguistics implementation based on Apache Lucene. +Features: +- a list of default analyzers per language; +- building custom analyzers through the configuration of the linguistics component; +- building custom analyzers in Java code and declaring them as `components`. + +## Development + +Build: +```shell +mvn clean test -U package +``` + +To compile configuration classes so that IntelliJ IDEA doesn't complain: +- right-click on `pom.xml` +- then `Maven` +- then `Generate Sources and Update Folders` + +## Usage + +Add a `<component>` to `services.xml` of your application package, e.g.: +```xml +<component id="com.yahoo.language.lucene.LuceneLinguistics" bundle="lucene-linguistics"> + <config name="com.yahoo.language.lucene.lucene-analysis"> + <configDir>linguistics</configDir> + <analysis> + <item key="en"> + <tokenizer> + <name>standard</name> + </tokenizer> + <tokenFilters> + <item> + <name>reverseString</name> + </item> + </tokenFilters> + </item> + </analysis> + </config> +</component> +``` +Put it into the `container` clusters that have `<document-processing/>` and/or `<search/>` specified. + +And then package and deploy, e.g.: +```shell +(mvn clean -DskipTests=true -U package && vespa deploy -w 100) +``` + +### Configuration of Lucene Analyzers + +Read the Lucene docs of subclasses of: +- [TokenizerFactory](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/TokenizerFactory.html), e.g. [StandardTokenizerFactory](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/standard/StandardTokenizerFactory.html) +- [CharFilterFactory](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/CharFilterFactory.html), e.g. [PatternReplaceCharFilterFactory](https://lucene.apache.org/core/8_1_1/analyzers-common/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.html) +- [TokenFilterFactory](https://lucene.apache.org/core/8_1_1/analyzers-common/org/apache/lucene/analysis/util/TokenFilterFactory.html), e.g. [ReverseStringFilterFactory](https://lucene.apache.org/core/8_1_1/analyzers-common/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.html) + +E.g.
tokenizer `StandardTokenizerFactory` has this config [snippet](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/standard/StandardTokenizerFactory.html): +```xml +<fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/> + </analyzer> +</fieldType> +``` + +Then go to the [source code](https://github.com/apache/lucene/blob/17c13a76c87c6246f32dd7a78a26db04401ddb6e/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java#L36) of the class on GitHub. +Copy the value of the `public static final String NAME` into the `<name>` element and observe the names used for configuring the tokenizer (in this case only `maxTokenLength`). +```xml +<tokenizer> + <name>standard</name> + <conf> + <item key="maxTokenLength">255</item> + </conf> +</tokenizer> +``` + +The `AnalyzerFactory` constructor logs the available analysis components. + +The analysis components are discovered through the Java Service Provider Interface (SPI). +To add more analysis components, it should be enough to put a Lucene analyzer dependency into your application package `pom.xml` +or register services and create classes directly in the application package. + +### Resource files + +The resource files are relative to the component config `configDir`. + +## Inspiration + +These projects: +- [vespa-chinese-linguistics](https://github.com/vespa-engine/sample-apps/blob/master/examples/vespa-chinese-linguistics/src/main/java/com/qihoo/language/JiebaLinguistics.java)
+- [OpenNlp Linguistics](https://github.com/vespa-engine/vespa/blob/50d7555bfe7bdaec86f8b31c4d316c9ba66bb976/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java) +- [vespa-kuromoji-linguistics](https://github.com/yahoojapan/vespa-kuromoji-linguistics/tree/main) +- [Clojure library](https://github.com/dainiusjocas/lucene-text-analysis) to work with Lucene analyzers diff --git a/lucene-linguistics/abi-spec.json b/lucene-linguistics/abi-spec.json new file mode 100644 index 00000000000..6f31cf5a2e6 --- /dev/null +++ b/lucene-linguistics/abi-spec.json @@ -0,0 +1 @@ +{ } \ No newline at end of file diff --git a/lucene-linguistics/pom.xml b/lucene-linguistics/pom.xml new file mode 100644 index 00000000000..49cd5338e8d --- /dev/null +++ b/lucene-linguistics/pom.xml @@ -0,0 +1,107 @@ + + + 4.0.0 + + com.yahoo.vespa + parent + 8-SNAPSHOT + ../parent/pom.xml + + + lucene-linguistics + container-plugin + 8-SNAPSHOT + + + UTF-8 + + + + + org.apache.lucene + lucene-core + + + org.apache.lucene + lucene-analysis-common + + + com.yahoo.vespa + component + ${project.version} + provided + + + com.yahoo.vespa + config-bundle + ${project.version} + provided + + + com.yahoo.vespa + configdefinitions + ${project.version} + + + com.yahoo.vespa + annotations + ${project.version} + provided + + + com.yahoo.vespa + vespajlib + ${project.version} + + + com.yahoo.vespa + linguistics + ${project.version} + + + com.google.inject + guice + provided + + + junit + junit + test + + + + + + + com.yahoo.vespa + bundle-plugin + true + + CORE + true + + + + org.apache.maven.plugins + maven-compiler-plugin + + + com.yahoo.vespa + abi-check-plugin + + + com.yahoo.vespa + config-class-plugin + + + + config-gen + + + + + + + diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java new file mode 100644 index 00000000000..b7d3a618954 --- /dev/null +++ 
b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java @@ -0,0 +1,160 @@ +package com.yahoo.language.lucene; + +import com.yahoo.component.provider.ComponentRegistry; +import com.yahoo.language.Language; +import com.yahoo.language.process.StemMode; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharFilterFactory; +import org.apache.lucene.analysis.TokenFilterFactory; +import org.apache.lucene.analysis.TokenizerFactory; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; +import java.util.logging.Logger; + +public class AnalyzerFactory { + private static final Logger log = Logger.getLogger(AnalyzerFactory.class.getName()); + + private final LuceneAnalysisConfig config; + + // Root config directory for all analysis components + private final Path configDir; + + // Registry of analyzers per language + // The idea is to create analyzers ONLY WHEN they are needed + // Analyzers are thread safe so no need to recreate them for every document + private final Map languageAnalyzers = new HashMap<>(); + + private final Analyzer defaultAnalyzer = new StandardAnalyzer(); + + private final static String STANDARD_TOKENIZER = "standard"; + + private final ComponentRegistry analyzerComponents; + private final DefaultAnalyzers defaultAnalyzers; + + public AnalyzerFactory(LuceneAnalysisConfig config, ComponentRegistry analyzers) { + this.config = config; + this.configDir = config.configDir(); + this.analyzerComponents = analyzers; + this.defaultAnalyzers = DefaultAnalyzers.getInstance(); + log.info("Available in classpath char filters: " + CharFilterFactory.availableCharFilters()); + log.info("Available in classpath tokenizers: " + TokenizerFactory.availableTokenizers()); + log.info("Available in classpath token filters: " + 
TokenFilterFactory.availableTokenFilters()); + } + + /** + * Retrieves an analyzer with a given params. + * Sets up the analyzer if config is provided. + * Default analyzer is the `StandardAnalyzer`. + * @param language + * @param stemMode + * @param removeAccents + * @return + */ + public Analyzer getAnalyzer(Language language, StemMode stemMode, boolean removeAccents) { + String analyzerKey = generateKey(language, stemMode, removeAccents); + + // If analyzer for language is already known + if (null != languageAnalyzers.get(analyzerKey)) { + return languageAnalyzers.get(analyzerKey); + } + if (null != config.analysis(analyzerKey)) { + return setAndReturn(analyzerKey, setUpAnalyzer(analyzerKey)); + } + if (null != analyzerComponents.getComponent(analyzerKey)) { + log.info("Analyzer for language=" + analyzerKey + " is from components."); + return setAndReturn(analyzerKey, analyzerComponents.getComponent(analyzerKey)); + } + if (null != defaultAnalyzers.get(language)) { + log.info("Analyzer for language=" + analyzerKey + " is from a list of default language analyzers."); + return setAndReturn(analyzerKey, defaultAnalyzers.get(language)); + } + // set the default analyzer for the language + log.info("StandardAnalyzer is used for language=" + analyzerKey); + return setAndReturn(analyzerKey, defaultAnalyzer); + } + + private Analyzer setAndReturn(String analyzerKey, Analyzer analyzer) { + languageAnalyzers.put(analyzerKey, analyzer); + return analyzer; + } + + // TODO: Would it make sense to combine language + stemMode + removeAccents to make + // a composite key so we can have more variations possible? 
+ private String generateKey(Language language, StemMode stemMode, boolean removeAccents) { + return language.languageCode(); + } + + private Analyzer setUpAnalyzer(String analyzerKey) { + try { + LuceneAnalysisConfig.Analysis analysis = config.analysis(analyzerKey); + log.info("Creating analyzer for: '" + analyzerKey + "' with config: " + analysis); + CustomAnalyzer.Builder builder = CustomAnalyzer.builder(configDir); + builder = withTokenizer(builder, analysis); + builder = addCharFilters(builder, analysis); + builder = addTokenFilters(builder, analysis); + return builder.build(); + } catch (Exception e) { + // Failing to set up the Analyzer, should blow up during testing and VAP should not be deployed. + // Most likely cause for problems is that a specified resource is not available in VAP. + // Unit tests should catch such problems and prevent the VAP being deployed. + log.severe("Failed to build analyzer: '" + + analyzerKey + + "', with configuration: '" + + config.analysis(analyzerKey) + + "' with exception: '" + + e.getMessage() + "'" ); + throw new RuntimeException(e); + } + } + + private CustomAnalyzer.Builder withTokenizer(CustomAnalyzer.Builder builder, + LuceneAnalysisConfig.Analysis analysis) throws IOException { + if (null == analysis) { + // By default we use the "standard" tokenizer + return builder.withTokenizer(STANDARD_TOKENIZER, new HashMap<>()); + } + String tokenizerName = analysis.tokenizer().name(); + Map conf = analysis.tokenizer().conf(); + return builder.withTokenizer(tokenizerName, toModifiable(conf)); + } + + private CustomAnalyzer.Builder addCharFilters(CustomAnalyzer.Builder builder, + LuceneAnalysisConfig.Analysis analysis) throws IOException { + if (null == analysis) { + // by default there are no char filters + return builder; + } + for (LuceneAnalysisConfig.Analysis.CharFilters charFilter : analysis.charFilters()) { + builder.addCharFilter(charFilter.name(), toModifiable(charFilter.conf())); + } + return builder; + } + + private 
CustomAnalyzer.Builder addTokenFilters(CustomAnalyzer.Builder builder, + LuceneAnalysisConfig.Analysis analysis) throws IOException { + if (null == analysis) { + // by default no token filters are added + return builder; + } + for (LuceneAnalysisConfig.Analysis.TokenFilters tokenFilter : analysis.tokenFilters()) { + builder.addTokenFilter(tokenFilter.name(), toModifiable(tokenFilter.conf())); + } + return builder; + } + + /** + * A config map coming from the Vespa ConfigInstance is immutable while CustomAnalyzer builders + * mutates the map to mark that a param was consumed. Immutable maps can't be mutated! + * To overcome this conflict we can wrap the ConfigInstance map in a new HashMap. + * @param map + * @return Mutable Map + */ + private Map toModifiable(Map map) { + return new HashMap<>(map); + } +} diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java new file mode 100644 index 00000000000..955e18474f7 --- /dev/null +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java @@ -0,0 +1,110 @@ +package com.yahoo.language.lucene; + +import com.yahoo.language.Language; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.ar.ArabicAnalyzer; +import org.apache.lucene.analysis.bg.BulgarianAnalyzer; +import org.apache.lucene.analysis.bn.BengaliAnalyzer; +import org.apache.lucene.analysis.ca.CatalanAnalyzer; +import org.apache.lucene.analysis.ckb.SoraniAnalyzer; +import org.apache.lucene.analysis.cz.CzechAnalyzer; +import org.apache.lucene.analysis.da.DanishAnalyzer; +import org.apache.lucene.analysis.de.GermanAnalyzer; +import org.apache.lucene.analysis.el.GreekAnalyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.es.SpanishAnalyzer; +import org.apache.lucene.analysis.et.EstonianAnalyzer; +import org.apache.lucene.analysis.eu.BasqueAnalyzer; 
+import org.apache.lucene.analysis.fa.PersianAnalyzer; +import org.apache.lucene.analysis.fi.FinnishAnalyzer; +import org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.ga.IrishAnalyzer; +import org.apache.lucene.analysis.gl.GalicianAnalyzer; +import org.apache.lucene.analysis.hi.HindiAnalyzer; +import org.apache.lucene.analysis.hu.HungarianAnalyzer; +import org.apache.lucene.analysis.hy.ArmenianAnalyzer; +import org.apache.lucene.analysis.id.IndonesianAnalyzer; +import org.apache.lucene.analysis.it.ItalianAnalyzer; +import org.apache.lucene.analysis.lt.LithuanianAnalyzer; +import org.apache.lucene.analysis.lv.LatvianAnalyzer; +import org.apache.lucene.analysis.ne.NepaliAnalyzer; +import org.apache.lucene.analysis.nl.DutchAnalyzer; +import org.apache.lucene.analysis.no.NorwegianAnalyzer; +import org.apache.lucene.analysis.pt.PortugueseAnalyzer; +import org.apache.lucene.analysis.ro.RomanianAnalyzer; +import org.apache.lucene.analysis.ru.RussianAnalyzer; +import org.apache.lucene.analysis.sr.SerbianAnalyzer; +import org.apache.lucene.analysis.sv.SwedishAnalyzer; +import org.apache.lucene.analysis.ta.TamilAnalyzer; +import org.apache.lucene.analysis.te.TeluguAnalyzer; +import org.apache.lucene.analysis.th.ThaiAnalyzer; +import org.apache.lucene.analysis.tr.TurkishAnalyzer; + +import java.util.Map; + +import static java.util.Map.entry; + +public class DefaultAnalyzers { + + private static DefaultAnalyzers INSTANCE; + private final Map analyzerClasses; + + private DefaultAnalyzers() { + analyzerClasses = Map.ofEntries( + entry(Language.ARABIC, new ArabicAnalyzer()), + entry(Language.BULGARIAN, new BulgarianAnalyzer()), + entry(Language.BENGALI, new BengaliAnalyzer()), + // analyzerClasses.put(Language.BRASILIAN, new BrazilianAnalyzer()) + entry(Language.CATALAN, new CatalanAnalyzer()), + // cjk analyzer? 
+ entry(Language.KURDISH, new SoraniAnalyzer()), + entry(Language.CZECH, new CzechAnalyzer()), + entry(Language.DANISH, new DanishAnalyzer()), + entry(Language.GERMAN, new GermanAnalyzer()), + entry(Language.GREEK, new GreekAnalyzer()), + entry(Language.ENGLISH, new EnglishAnalyzer()), + entry(Language.SPANISH, new SpanishAnalyzer()), + entry(Language.ESTONIAN, new EstonianAnalyzer()), + entry(Language.BASQUE, new BasqueAnalyzer()), + entry(Language.PERSIAN, new PersianAnalyzer()), + entry(Language.FINNISH, new FinnishAnalyzer()), + entry(Language.FRENCH, new FrenchAnalyzer()), + entry(Language.IRISH, new IrishAnalyzer()), + entry(Language.GALICIAN, new GalicianAnalyzer()), + entry(Language.HINDI, new HindiAnalyzer()), + entry(Language.HUNGARIAN, new HungarianAnalyzer()), + entry(Language.ARMENIAN, new ArmenianAnalyzer()), + entry(Language.INDONESIAN, new IndonesianAnalyzer()), + entry(Language.ITALIAN, new ItalianAnalyzer()), + entry(Language.LITHUANIAN, new LithuanianAnalyzer()), + entry(Language.LATVIAN, new LatvianAnalyzer()), + entry(Language.NEPALI, new NepaliAnalyzer()), + entry(Language.DUTCH, new DutchAnalyzer()), + entry(Language.NORWEGIAN_BOKMAL, new NorwegianAnalyzer()), + entry(Language.PORTUGUESE, new PortugueseAnalyzer()), + entry(Language.ROMANIAN, new RomanianAnalyzer()), + entry(Language.RUSSIAN, new RussianAnalyzer()), + entry(Language.SERBIAN, new SerbianAnalyzer()), + entry(Language.SWEDISH, new SwedishAnalyzer()), + entry(Language.TAMIL, new TamilAnalyzer()), + entry(Language.TELUGU, new TeluguAnalyzer()), + entry(Language.THAI, new ThaiAnalyzer()), + entry(Language.TURKISH, new TurkishAnalyzer()) + ); + } + + public static DefaultAnalyzers getInstance() { + if (INSTANCE == null) { + INSTANCE = new DefaultAnalyzers(); + } + return INSTANCE; + } + + public Analyzer get(Language language) { + return analyzerClasses.get(language); + } + + public Analyzer get(String languageCode) { + return 
analyzerClasses.get(Language.fromLanguageTag(languageCode)); + } +} diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java new file mode 100644 index 00000000000..b5c5ba47ab6 --- /dev/null +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java @@ -0,0 +1,82 @@ +package com.yahoo.language.lucene; + +import com.google.inject.Inject; +import com.yahoo.component.provider.ComponentRegistry; +import com.yahoo.language.Linguistics; +import com.yahoo.language.process.*; +import com.yahoo.language.simple.SimpleLinguistics; +import org.apache.lucene.analysis.Analyzer; + +import java.util.ArrayList; +import java.util.logging.Logger; + +/** + * Factory of Lucene based linguistics processor. + * As described in the Linguistics docstring + * > the tokenizer should typically stem, transform and normalize + * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOP. + * + * TODO: docs for all available analysis components. + * TODO: some registry for available language Analyzers. 
+ */ +public class LuceneLinguistics extends SimpleLinguistics { + + private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName()); + private final Normalizer normalizer; + private final Transformer transformer; + private final Tokenizer tokenizer; + private final Stemmer stemmer; + private final Segmenter segmenter; + private final LuceneAnalysisConfig config; + + @Inject + public LuceneLinguistics(LuceneAnalysisConfig config, ComponentRegistry analyzers) { + log.info("Creating LuceneLinguistics with: " + config); + this.config = config; + this.tokenizer = new LuceneTokenizer(config, analyzers); + // NOOP stemmer + this.stemmer = (word, stemMode, language) -> { + ArrayList stemLists = new ArrayList<>(); + StemList stems = new StemList(); + stems.add(word); + stemLists.add(stems); + return stemLists; + }; + // Segmenter that just wraps a tokenizer + this.segmenter = (string, language) -> { + ArrayList segments = new ArrayList<>(); + Iterable tokens = tokenizer.tokenize(string, language, StemMode.NONE, false); + tokens.forEach(token -> segments.add(token.getTokenString())); + return segments; + }; + // NOOP normalizer + this.normalizer = (string) -> string; + // NOOP transformer + this.transformer = (string, language) -> string; + } + + @Override + public Stemmer getStemmer() { return stemmer; } + + @Override + public Tokenizer getTokenizer() { return tokenizer; } + + @Override + public Normalizer getNormalizer() { return normalizer; } + + @Override + public Transformer getTransformer() { return transformer; } + + @Override + public Segmenter getSegmenter() { return segmenter; } + + public LuceneAnalysisConfig getConfig() { + return config; + } + + @Override + public boolean equals(Linguistics other) { + return (other instanceof LuceneLinguistics) + // Config actually determines if Linguistics are equal + && config.equals(((LuceneLinguistics) other).getConfig()); } +} diff --git 
a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java new file mode 100644 index 00000000000..0cde849fd6e --- /dev/null +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java @@ -0,0 +1,68 @@ +package com.yahoo.language.lucene; + +import com.yahoo.component.provider.ComponentRegistry; +import com.yahoo.language.Language; +import com.yahoo.language.process.*; +import com.yahoo.language.simple.SimpleToken; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +public class LuceneTokenizer implements Tokenizer { + + private static final Logger log = Logger.getLogger(LuceneTokenizer.class.getName()); + + // Dummy value, just to stuff the Lucene interface. 
+ private final static String FIELD_NAME = "F"; + + private final AnalyzerFactory analyzerFactory; + + public LuceneTokenizer(LuceneAnalysisConfig config) { + this(config, new ComponentRegistry<>()); + } + public LuceneTokenizer(LuceneAnalysisConfig config, ComponentRegistry analyzers) { + this.analyzerFactory = new AnalyzerFactory(config, analyzers); + } + + @Override + public Iterable tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { + if (input.isEmpty()) return List.of(); + + List tokens = textToTokens(input, analyzerFactory.getAnalyzer(language, stemMode, removeAccents)); + log.log(Level.FINEST, "Tokenized '" + language + "' text='" + input + "' into: n=" + tokens.size() + ", tokens=" + tokens); + return tokens; + } + + private List textToTokens(String text, Analyzer analyzer) { + List tokens = new ArrayList<>(); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); + + CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); + try { + tokenStream.reset(); + while (tokenStream.incrementToken()) { + // TODO: is SimpleToken good enough? Maybe a custom implementation. + // TODO: what to do with cases when multiple tokens are inserted into the position? 
+ String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()); + String tokenString = charTermAttribute.toString(); + tokens.add(new SimpleToken(originalString, tokenString) + .setType(TokenType.ALPHABETIC) + .setOffset(offsetAttribute.startOffset()) + .setScript(TokenScript.UNKNOWN)); + } + tokenStream.end(); + tokenStream.close(); + } catch (IOException e) { + throw new RuntimeException("Failed to analyze: " + text, e); + } + return tokens; + } +} diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java new file mode 100644 index 00000000000..14330723224 --- /dev/null +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java @@ -0,0 +1,4 @@ +@ExportPackage +package com.yahoo.language.lucene; + +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def b/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def new file mode 100644 index 00000000000..e4b5037dcbe --- /dev/null +++ b/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def @@ -0,0 +1,14 @@ +package=com.yahoo.language.lucene + +# The schema ("type") for an application specified config type +# See +# - https://docs.vespa.ai/en/reference/config-files.html + +configDir path +analysis{}.tokenizer.name string default=standard +analysis{}.tokenizer.conf{} string + +analysis{}.charFilters[].name string +analysis{}.charFilters[].conf{} string +analysis{}.tokenFilters[].name string +analysis{}.tokenFilters[].conf{} string diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java new file mode 100644 index 00000000000..568f295b39d --- /dev/null +++ 
b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java @@ -0,0 +1,139 @@ +package com.yahoo.language.lucene; + +import com.yahoo.component.provider.ComponentRegistry; +import com.yahoo.config.FileReference; +import com.yahoo.language.Language; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import org.junit.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class LuceneTokenizerTest { + + @Test + public void testTokenizer() { + String text = "This is my Text"; + var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig + .Builder() + .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .build()); + Iterable tokens = tokenizer + .tokenize(text, Language.ENGLISH, StemMode.ALL, true); + assertEquals(List.of("my", "text"), tokenStrings(tokens)); + } + + @Test + public void testLithuanianTokenizer() { + String text = "Žalgirio mūšio data yra 1410 metai"; + var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig + .Builder() + .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .build()); + Iterable tokens = tokenizer + .tokenize(text, Language.LITHUANIAN, StemMode.ALL, true); + assertEquals(List.of("žalgir", "mūš", "dat", "1410", "met"), tokenStrings(tokens)); + } + + private void assertToken(String tokenString, Iterator tokens) { + Token t = tokens.next(); + assertEquals(tokenString, t.getTokenString()); + } + + private List iterableToList(Iterable tokens) { + List tokenList = new ArrayList<>(); + tokens.forEach(tokenList::add); + return tokenList; + } + + private List tokenStrings(Iterable tokens) { + List tokenList = new ArrayList<>(); + tokens.forEach(token -> { + tokenList.add(token.getTokenString()); + }); + return tokenList; + } + + @Test + public void testAnalyzerConfiguration() { + String languageCode 
= Language.ENGLISH.languageCode(); + LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() + .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .analysis( + Map.of(languageCode, + new LuceneAnalysisConfig + .Analysis + .Builder() + .tokenFilters(List.of( + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("englishMinimalStem"), + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("uppercase")))) + ).build(); + LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>()); + Iterable tokens = linguistics + .getTokenizer() + .tokenize("Dogs and cats", Language.ENGLISH, StemMode.ALL, false); + assertEquals(List.of("DOG", "AND", "CAT"), tokenStrings(tokens)); + } + + @Test + public void testEnglishStemmerAnalyzerConfiguration() { + String languageCode = Language.ENGLISH.languageCode(); + LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() + .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .analysis( + Map.of(languageCode, + new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of( + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("englishMinimalStem")))) + ).build(); + LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>()); + Iterable tokens = linguistics + .getTokenizer() + .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); + assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens)); + } + + @Test + public void testStemmerWithStopWords() { + String languageCode = Language.ENGLISH.languageCode(); + LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder() + .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .analysis( + Map.of(languageCode, + new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of( + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + 
.name("englishMinimalStem"), + new LuceneAnalysisConfig + .Analysis + .TokenFilters + .Builder() + .name("stop") + .conf("words", "stopwords.txt")))) + ).build(); + LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>()); + Iterable tokens = linguistics + .getTokenizer() + .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); + assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens)); + } +} diff --git a/lucene-linguistics/src/test/resources/stopwords.txt b/lucene-linguistics/src/test/resources/stopwords.txt new file mode 100644 index 00000000000..e8c07838bf5 --- /dev/null +++ b/lucene-linguistics/src/test/resources/stopwords.txt @@ -0,0 +1 @@ +and -- cgit v1.2.3