summaryrefslogtreecommitdiffstats
path: root/lucene-linguistics
diff options
context:
space:
mode:
authorDainius Jocas <dainius.jocas@gmail.com>2023-07-31 13:27:43 +0300
committerDainius Jocas <dainius.jocas@gmail.com>2023-07-31 13:27:43 +0300
commit5a60f6f3ae8e99f1f3de10e22a1f055d03fb37db (patch)
tree0f7cc48efba4b6661036a509269868d7354d6af2 /lucene-linguistics
parentd488a7482e93ae233be571d61946caa796aba588 (diff)
integrate Lucene Linguistics into the vespa project
Diffstat (limited to 'lucene-linguistics')
-rw-r--r--lucene-linguistics/README.md93
-rw-r--r--lucene-linguistics/abi-spec.json1
-rw-r--r--lucene-linguistics/pom.xml107
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java160
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java110
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java82
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java68
-rw-r--r--lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java4
-rw-r--r--lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def14
-rw-r--r--lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java139
-rw-r--r--lucene-linguistics/src/test/resources/stopwords.txt1
11 files changed, 779 insertions, 0 deletions
diff --git a/lucene-linguistics/README.md b/lucene-linguistics/README.md
new file mode 100644
index 00000000000..6329811e458
--- /dev/null
+++ b/lucene-linguistics/README.md
@@ -0,0 +1,93 @@
+# Vespa Lucene Linguistics
+
+Linguistics implementation based on Apache Lucene.
+Features:
+- a list of default analyzers per language;
+- building custom analyzers through the configuration of the linguistics component;
+- building custom analyzers in Java code and declaring them as `components`.
+
+## Development
+
+Build:
+```shell
+mvn clean test -U package
+```
+
+To compile configuration classes so that Intellij doesn't complain:
+- right click on `pom.xml`
+- then `Maven`
+- then `Generate Sources and Update Folders`
+
+## Usage
+
+Add `<component>` to `services.xml` of your application package, e.g.:
+```xml
+<component id="com.yahoo.language.lucene.LuceneLinguistics" bundle="lucene-linguistics">
+ <config name="com.yahoo.language.lucene.lucene-analysis">
+ <configDir>linguistics</configDir>
+ <analysis>
+ <item key="en">
+ <tokenizer>
+ <name>standard</name>
+ </tokenizer>
+ <tokenFilters>
+ <item>
+ <name>reverseString</name>
+ </item>
+ </tokenFilters>
+ </item>
+ </analysis>
+ </config>
+</component>
+```
+into `container` clusters that have `<document-processing/>` and/or `<search/>` specified.
+
+And then package and deploy, e.g.:
+```shell
+(mvn clean -DskipTests=true -U package && vespa deploy -w 100)
+```
+
+### Configuration of Lucene Analyzers
+
+Read the Lucene docs of subclasses of:
+- [TokenizerFactory](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/TokenizerFactory.html), e.g. [StandardTokenizerFactory](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/standard/StandardTokenizerFactory.html)
+- [CharFilterFactory](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/CharFilterFactory.html), e.g. [PatternReplaceCharFilterFactory](https://lucene.apache.org/core/8_1_1/analyzers-common/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.html)
+- [TokenFilterFactory](https://lucene.apache.org/core/8_1_1/analyzers-common/org/apache/lucene/analysis/util/TokenFilterFactory.html), e.g. [ReverseStringFilterFactory](https://lucene.apache.org/core/8_1_1/analyzers-common/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.html)
+
+E.g. tokenizer `StandardTokenizerFactory` has this config [snippet](https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/analysis/standard/StandardTokenizerFactory.html):
+```xml
+ <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/>
+ </analyzer>
+ </fieldType>
+```
+
+Then go to the [source code](https://github.com/apache/lucene/blob/17c13a76c87c6246f32dd7a78a26db04401ddb6e/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java#L36) of the class on Github.
+Copy value of the `public static final String NAME` into the `<name>` and observe the names used for configuring the tokenizer (in this case only `maxTokenLength`).
+```xml
+<tokenizer>
+ <name>standard</name>
+ <config>
+ <item key="maxTokenLength">255</item>
+ </config>
+</tokenizer>
+```
+
+The `AnalyzerFactory` constructor logs the available analysis components.
+
+The analysis components are discovered through Java Service Provider Interface (SPI).
+To add more analysis components it should be enough to put a Lucene analyzer dependency into your application package `pom.xml`
+or register services and create classes directly in the application package.
+
+### Resource files
+
+The resource files are relative to the component config `configDir`.
+
+## Inspiration
+
+These projects:
+- [vespa-chinese-linguistics](https://github.com/vespa-engine/sample-apps/blob/master/examples/vespa-chinese-linguistics/src/main/java/com/qihoo/language/JiebaLinguistics.java).
+- [OpenNlp Linguistics](https://github.com/vespa-engine/vespa/blob/50d7555bfe7bdaec86f8b31c4d316c9ba66bb976/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java)
+- [vespa-kuromoji-linguistics](https://github.com/yahoojapan/vespa-kuromoji-linguistics/tree/main)
+- [Clojure library](https://github.com/dainiusjocas/lucene-text-analysis) to work with Lucene analyzers
diff --git a/lucene-linguistics/abi-spec.json b/lucene-linguistics/abi-spec.json
new file mode 100644
index 00000000000..6f31cf5a2e6
--- /dev/null
+++ b/lucene-linguistics/abi-spec.json
@@ -0,0 +1 @@
+{ } \ No newline at end of file
diff --git a/lucene-linguistics/pom.xml b/lucene-linguistics/pom.xml
new file mode 100644
index 00000000000..49cd5338e8d
--- /dev/null
+++ b/lucene-linguistics/pom.xml
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>parent</artifactId>
+ <version>8-SNAPSHOT</version>
+ <relativePath>../parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>lucene-linguistics</artifactId>
+ <packaging>container-plugin</packaging>
+ <version>8-SNAPSHOT</version>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analysis-common</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>component</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>config-bundle</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>configdefinitions</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>annotations</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>vespajlib</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>linguistics</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.inject</groupId>
+ <artifactId>guice</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <bundleType>CORE</bundleType>
+ <suppressWarningMissingImportPackages>true</suppressWarningMissingImportPackages>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>abi-check-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>config-class-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>config-gen</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
new file mode 100644
index 00000000000..b7d3a618954
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
@@ -0,0 +1,160 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.language.Language;
+import com.yahoo.language.process.StemMode;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharFilterFactory;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenizerFactory;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.logging.Logger;
+
+public class AnalyzerFactory {
+ private static final Logger log = Logger.getLogger(AnalyzerFactory.class.getName());
+
+ private final LuceneAnalysisConfig config;
+
+ // Root config directory for all analysis components
+ private final Path configDir;
+
+ // Registry of analyzers per language
+ // The idea is to create analyzers ONLY WHEN they are needed
+ // Analyzers are thread safe so no need to recreate them for every document
+ private final Map<String, Analyzer> languageAnalyzers = new HashMap<>();
+
+ private final Analyzer defaultAnalyzer = new StandardAnalyzer();
+
+ private final static String STANDARD_TOKENIZER = "standard";
+
+ private final ComponentRegistry<Analyzer> analyzerComponents;
+ private final DefaultAnalyzers defaultAnalyzers;
+
+ public AnalyzerFactory(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
+ this.config = config;
+ this.configDir = config.configDir();
+ this.analyzerComponents = analyzers;
+ this.defaultAnalyzers = DefaultAnalyzers.getInstance();
+ log.info("Available in classpath char filters: " + CharFilterFactory.availableCharFilters());
+ log.info("Available in classpath tokenizers: " + TokenizerFactory.availableTokenizers());
+ log.info("Available in classpath token filters: " + TokenFilterFactory.availableTokenFilters());
+ }
+
+ /**
+ * Retrieves an analyzer with a given params.
+ * Sets up the analyzer if config is provided.
+ * Default analyzer is the `StandardAnalyzer`.
+ * @param language
+ * @param stemMode
+ * @param removeAccents
+ * @return
+ */
+ public Analyzer getAnalyzer(Language language, StemMode stemMode, boolean removeAccents) {
+ String analyzerKey = generateKey(language, stemMode, removeAccents);
+
+ // If analyzer for language is already known
+ if (null != languageAnalyzers.get(analyzerKey)) {
+ return languageAnalyzers.get(analyzerKey);
+ }
+ if (null != config.analysis(analyzerKey)) {
+ return setAndReturn(analyzerKey, setUpAnalyzer(analyzerKey));
+ }
+ if (null != analyzerComponents.getComponent(analyzerKey)) {
+ log.info("Analyzer for language=" + analyzerKey + " is from components.");
+ return setAndReturn(analyzerKey, analyzerComponents.getComponent(analyzerKey));
+ }
+ if (null != defaultAnalyzers.get(language)) {
+ log.info("Analyzer for language=" + analyzerKey + " is from a list of default language analyzers.");
+ return setAndReturn(analyzerKey, defaultAnalyzers.get(language));
+ }
+ // set the default analyzer for the language
+ log.info("StandardAnalyzer is used for language=" + analyzerKey);
+ return setAndReturn(analyzerKey, defaultAnalyzer);
+ }
+
+ private Analyzer setAndReturn(String analyzerKey, Analyzer analyzer) {
+ languageAnalyzers.put(analyzerKey, analyzer);
+ return analyzer;
+ }
+
+ // TODO: Would it make sense to combine language + stemMode + removeAccents to make
+ // a composite key so we can have more variations possible?
+ private String generateKey(Language language, StemMode stemMode, boolean removeAccents) {
+ return language.languageCode();
+ }
+
+ private Analyzer setUpAnalyzer(String analyzerKey) {
+ try {
+ LuceneAnalysisConfig.Analysis analysis = config.analysis(analyzerKey);
+ log.info("Creating analyzer for: '" + analyzerKey + "' with config: " + analysis);
+ CustomAnalyzer.Builder builder = CustomAnalyzer.builder(configDir);
+ builder = withTokenizer(builder, analysis);
+ builder = addCharFilters(builder, analysis);
+ builder = addTokenFilters(builder, analysis);
+ return builder.build();
+ } catch (Exception e) {
+ // Failing to set up the Analyzer, should blow up during testing and VAP should not be deployed.
+ // Most likely cause for problems is that a specified resource is not available in VAP.
+ // Unit tests should catch such problems and prevent the VAP being deployed.
+ log.severe("Failed to build analyzer: '"
+ + analyzerKey
+ + "', with configuration: '"
+ + config.analysis(analyzerKey)
+ + "' with exception: '"
+ + e.getMessage() + "'" );
+ throw new RuntimeException(e);
+ }
+ }
+
+ private CustomAnalyzer.Builder withTokenizer(CustomAnalyzer.Builder builder,
+ LuceneAnalysisConfig.Analysis analysis) throws IOException {
+ if (null == analysis) {
+ // By default we use the "standard" tokenizer
+ return builder.withTokenizer(STANDARD_TOKENIZER, new HashMap<>());
+ }
+ String tokenizerName = analysis.tokenizer().name();
+ Map<String, String> conf = analysis.tokenizer().conf();
+ return builder.withTokenizer(tokenizerName, toModifiable(conf));
+ }
+
+ private CustomAnalyzer.Builder addCharFilters(CustomAnalyzer.Builder builder,
+ LuceneAnalysisConfig.Analysis analysis) throws IOException {
+ if (null == analysis) {
+ // by default there are no char filters
+ return builder;
+ }
+ for (LuceneAnalysisConfig.Analysis.CharFilters charFilter : analysis.charFilters()) {
+ builder.addCharFilter(charFilter.name(), toModifiable(charFilter.conf()));
+ }
+ return builder;
+ }
+
+ private CustomAnalyzer.Builder addTokenFilters(CustomAnalyzer.Builder builder,
+ LuceneAnalysisConfig.Analysis analysis) throws IOException {
+ if (null == analysis) {
+ // by default no token filters are added
+ return builder;
+ }
+ for (LuceneAnalysisConfig.Analysis.TokenFilters tokenFilter : analysis.tokenFilters()) {
+ builder.addTokenFilter(tokenFilter.name(), toModifiable(tokenFilter.conf()));
+ }
+ return builder;
+ }
+
+ /**
+ * A config map coming from the Vespa ConfigInstance is immutable while CustomAnalyzer builders
+ * mutates the map to mark that a param was consumed. Immutable maps can't be mutated!
+ * To overcome this conflict we can wrap the ConfigInstance map in a new HashMap.
+ * @param map
+ * @return Mutable Map
+ */
+ private Map<String, String> toModifiable(Map<String, String> map) {
+ return new HashMap<>(map);
+ }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
new file mode 100644
index 00000000000..955e18474f7
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
@@ -0,0 +1,110 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.language.Language;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
+import org.apache.lucene.analysis.ca.CatalanAnalyzer;
+import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
+import org.apache.lucene.analysis.cz.CzechAnalyzer;
+import org.apache.lucene.analysis.da.DanishAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.el.GreekAnalyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.es.SpanishAnalyzer;
+import org.apache.lucene.analysis.et.EstonianAnalyzer;
+import org.apache.lucene.analysis.eu.BasqueAnalyzer;
+import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.fi.FinnishAnalyzer;
+import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.ga.IrishAnalyzer;
+import org.apache.lucene.analysis.gl.GalicianAnalyzer;
+import org.apache.lucene.analysis.hi.HindiAnalyzer;
+import org.apache.lucene.analysis.hu.HungarianAnalyzer;
+import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
+import org.apache.lucene.analysis.id.IndonesianAnalyzer;
+import org.apache.lucene.analysis.it.ItalianAnalyzer;
+import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
+import org.apache.lucene.analysis.lv.LatvianAnalyzer;
+import org.apache.lucene.analysis.ne.NepaliAnalyzer;
+import org.apache.lucene.analysis.nl.DutchAnalyzer;
+import org.apache.lucene.analysis.no.NorwegianAnalyzer;
+import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
+import org.apache.lucene.analysis.ro.RomanianAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.sr.SerbianAnalyzer;
+import org.apache.lucene.analysis.sv.SwedishAnalyzer;
+import org.apache.lucene.analysis.ta.TamilAnalyzer;
+import org.apache.lucene.analysis.te.TeluguAnalyzer;
+import org.apache.lucene.analysis.th.ThaiAnalyzer;
+import org.apache.lucene.analysis.tr.TurkishAnalyzer;
+
+import java.util.Map;
+
+import static java.util.Map.entry;
+
+/**
+ * Holds one shared default Lucene {@link Analyzer} per supported {@link Language}.
+ * Lucene analyzers are thread safe, so a single instance per language is shared.
+ */
+public class DefaultAnalyzers {
+
+    // Eagerly created: construction is cheap, and a final field makes the
+    // singleton safe to obtain from multiple threads. (The previous lazy,
+    // unsynchronized getInstance() could create more than one instance when
+    // called concurrently.)
+    private static final DefaultAnalyzers INSTANCE = new DefaultAnalyzers();
+    private final Map<Language, Analyzer> analyzerClasses;
+
+    private DefaultAnalyzers() {
+        analyzerClasses = Map.ofEntries(
+            entry(Language.ARABIC, new ArabicAnalyzer()),
+            entry(Language.BULGARIAN, new BulgarianAnalyzer()),
+            entry(Language.BENGALI, new BengaliAnalyzer()),
+            // analyzerClasses.put(Language.BRASILIAN, new BrazilianAnalyzer())
+            entry(Language.CATALAN, new CatalanAnalyzer()),
+            // cjk analyzer?
+            entry(Language.KURDISH, new SoraniAnalyzer()),
+            entry(Language.CZECH, new CzechAnalyzer()),
+            entry(Language.DANISH, new DanishAnalyzer()),
+            entry(Language.GERMAN, new GermanAnalyzer()),
+            entry(Language.GREEK, new GreekAnalyzer()),
+            entry(Language.ENGLISH, new EnglishAnalyzer()),
+            entry(Language.SPANISH, new SpanishAnalyzer()),
+            entry(Language.ESTONIAN, new EstonianAnalyzer()),
+            entry(Language.BASQUE, new BasqueAnalyzer()),
+            entry(Language.PERSIAN, new PersianAnalyzer()),
+            entry(Language.FINNISH, new FinnishAnalyzer()),
+            entry(Language.FRENCH, new FrenchAnalyzer()),
+            entry(Language.IRISH, new IrishAnalyzer()),
+            entry(Language.GALICIAN, new GalicianAnalyzer()),
+            entry(Language.HINDI, new HindiAnalyzer()),
+            entry(Language.HUNGARIAN, new HungarianAnalyzer()),
+            entry(Language.ARMENIAN, new ArmenianAnalyzer()),
+            entry(Language.INDONESIAN, new IndonesianAnalyzer()),
+            entry(Language.ITALIAN, new ItalianAnalyzer()),
+            entry(Language.LITHUANIAN, new LithuanianAnalyzer()),
+            entry(Language.LATVIAN, new LatvianAnalyzer()),
+            entry(Language.NEPALI, new NepaliAnalyzer()),
+            entry(Language.DUTCH, new DutchAnalyzer()),
+            entry(Language.NORWEGIAN_BOKMAL, new NorwegianAnalyzer()),
+            entry(Language.PORTUGUESE, new PortugueseAnalyzer()),
+            entry(Language.ROMANIAN, new RomanianAnalyzer()),
+            entry(Language.RUSSIAN, new RussianAnalyzer()),
+            entry(Language.SERBIAN, new SerbianAnalyzer()),
+            entry(Language.SWEDISH, new SwedishAnalyzer()),
+            entry(Language.TAMIL, new TamilAnalyzer()),
+            entry(Language.TELUGU, new TeluguAnalyzer()),
+            entry(Language.THAI, new ThaiAnalyzer()),
+            entry(Language.TURKISH, new TurkishAnalyzer())
+        );
+    }
+
+    /** Returns the shared instance. */
+    public static DefaultAnalyzers getInstance() {
+        return INSTANCE;
+    }
+
+    /** Returns the default analyzer for the language, or null when none is defined. */
+    public Analyzer get(Language language) {
+        return analyzerClasses.get(language);
+    }
+
+    /** Returns the default analyzer for a language tag, or null when none is defined. */
+    public Analyzer get(String languageCode) {
+        return analyzerClasses.get(Language.fromLanguageTag(languageCode));
+    }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
new file mode 100644
index 00000000000..b5c5ba47ab6
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
@@ -0,0 +1,82 @@
+package com.yahoo.language.lucene;
+
+import com.google.inject.Inject;
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.*;
+import com.yahoo.language.simple.SimpleLinguistics;
+import org.apache.lucene.analysis.Analyzer;
+
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+/**
+ * Factory of Lucene based linguistics processor.
+ * As described in the Linguistics docstring
+ * > the tokenizer should typically stem, transform and normalize
+ * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOP.
+ *
+ * TODO: docs for all available analysis components.
+ * TODO: some registry for available language Analyzers.
+ */
+public class LuceneLinguistics extends SimpleLinguistics {
+
+ private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName());
+ private final Normalizer normalizer;
+ private final Transformer transformer;
+ private final Tokenizer tokenizer;
+ private final Stemmer stemmer;
+ private final Segmenter segmenter;
+ // Config is kept because it defines equality of two LuceneLinguistics (see equals)
+ private final LuceneAnalysisConfig config;
+
+ @Inject
+ public LuceneLinguistics(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
+ log.info("Creating LuceneLinguistics with: " + config);
+ this.config = config;
+ this.tokenizer = new LuceneTokenizer(config, analyzers);
+ // NOOP stemmer: returns the word as its own single stem. Actual stemming is
+ // expected to happen inside the Lucene analyzer chain of the tokenizer.
+ this.stemmer = (word, stemMode, language) -> {
+ ArrayList<StemList> stemLists = new ArrayList<>();
+ StemList stems = new StemList();
+ stems.add(word);
+ stemLists.add(stems);
+ return stemLists;
+ };
+ // Segmenter that just wraps a tokenizer (StemMode.NONE, no accent removal)
+ this.segmenter = (string, language) -> {
+ ArrayList<String> segments = new ArrayList<>();
+ Iterable<Token> tokens = tokenizer.tokenize(string, language, StemMode.NONE, false);
+ tokens.forEach(token -> segments.add(token.getTokenString()));
+ return segments;
+ };
+ // NOOP normalizer: returns the input unchanged
+ this.normalizer = (string) -> string;
+ // NOOP transformer: returns the input unchanged
+ this.transformer = (string, language) -> string;
+ }
+
+ @Override
+ public Stemmer getStemmer() { return stemmer; }
+
+ @Override
+ public Tokenizer getTokenizer() { return tokenizer; }
+
+ @Override
+ public Normalizer getNormalizer() { return normalizer; }
+
+ @Override
+ public Transformer getTransformer() { return transformer; }
+
+ @Override
+ public Segmenter getSegmenter() { return segmenter; }
+
+ /** Returns the config this linguistics component was created with. */
+ public LuceneAnalysisConfig getConfig() {
+ return config;
+ }
+
+ @Override
+ public boolean equals(Linguistics other) {
+ return (other instanceof LuceneLinguistics)
+ // Config actually determines if Linguistics are equal
+ && config.equals(((LuceneLinguistics) other).getConfig()); }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
new file mode 100644
index 00000000000..0cde849fd6e
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
@@ -0,0 +1,68 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.language.Language;
+import com.yahoo.language.process.*;
+import com.yahoo.language.simple.SimpleToken;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Tokenizer that delegates analysis to a per-language Lucene {@link Analyzer}
+ * resolved through {@link AnalyzerFactory}.
+ */
+public class LuceneTokenizer implements Tokenizer {
+
+    private static final Logger log = Logger.getLogger(LuceneTokenizer.class.getName());
+
+    // Dummy value, just to stuff the Lucene interface.
+    private final static String FIELD_NAME = "F";
+
+    private final AnalyzerFactory analyzerFactory;
+
+    public LuceneTokenizer(LuceneAnalysisConfig config) {
+        this(config, new ComponentRegistry<>());
+    }
+    public LuceneTokenizer(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
+        this.analyzerFactory = new AnalyzerFactory(config, analyzers);
+    }
+
+    /**
+     * Tokenizes the input with the analyzer resolved for the language.
+     * @return the tokens in order of appearance; empty for empty input
+     */
+    @Override
+    public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+        if (input.isEmpty()) return List.of();
+
+        List<Token> tokens = textToTokens(input, analyzerFactory.getAnalyzer(language, stemMode, removeAccents));
+        log.log(Level.FINEST, "Tokenized '" + language + "' text='" + input + "' into: n=" + tokens.size() + ", tokens=" + tokens);
+        return tokens;
+    }
+
+    private List<Token> textToTokens(String text, Analyzer analyzer) {
+        List<Token> tokens = new ArrayList<>();
+        // try-with-resources: the TokenStream must be closed even when reset() or
+        // incrementToken() throws; the previous close() after the loop leaked the
+        // stream on exceptions.
+        try (TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text)) {
+            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                // TODO: is SimpleToken good enough? Maybe a custom implementation.
+                // TODO: what to do with cases when multiple tokens are inserted into the position?
+                String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset());
+                String tokenString = charTermAttribute.toString();
+                tokens.add(new SimpleToken(originalString, tokenString)
+                        .setType(TokenType.ALPHABETIC)
+                        .setOffset(offsetAttribute.startOffset())
+                        .setScript(TokenScript.UNKNOWN));
+            }
+            tokenStream.end();
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to analyze: " + text, e);
+        }
+        return tokens;
+    }
+}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java
new file mode 100644
index 00000000000..14330723224
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/package-info.java
@@ -0,0 +1,4 @@
+@ExportPackage
+package com.yahoo.language.lucene;
+
+import com.yahoo.osgi.annotation.ExportPackage;
diff --git a/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def b/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def
new file mode 100644
index 00000000000..e4b5037dcbe
--- /dev/null
+++ b/lucene-linguistics/src/main/resources/configdefinitions/lucene-analysis.def
@@ -0,0 +1,14 @@
+package=com.yahoo.language.lucene
+
+# The schema ("type") for an application specified config type
+# See
+# - https://docs.vespa.ai/en/reference/config-files.html
+
+configDir path
+analysis{}.tokenizer.name string default=standard
+analysis{}.tokenizer.conf{} string
+
+analysis{}.charFilters[].name string
+analysis{}.charFilters[].conf{} string
+analysis{}.tokenFilters[].name string
+analysis{}.tokenFilters[].conf{} string
diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
new file mode 100644
index 00000000000..568f295b39d
--- /dev/null
+++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
@@ -0,0 +1,139 @@
+package com.yahoo.language.lucene;
+
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.config.FileReference;
+import com.yahoo.language.Language;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class LuceneTokenizerTest {
+
+ @Test
+ public void testTokenizer() {
+ String text = "This is my Text";
+ var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig
+ .Builder()
+ .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .build());
+ Iterable<Token> tokens = tokenizer
+ .tokenize(text, Language.ENGLISH, StemMode.ALL, true);
+ assertEquals(List.of("my", "text"), tokenStrings(tokens));
+ }
+
+ @Test
+ public void testLithuanianTokenizer() {
+ String text = "Žalgirio mūšio data yra 1410 metai";
+ var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig
+ .Builder()
+ .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .build());
+ Iterable<Token> tokens = tokenizer
+ .tokenize(text, Language.LITHUANIAN, StemMode.ALL, true);
+ assertEquals(List.of("žalgir", "mūš", "dat", "1410", "met"), tokenStrings(tokens));
+ }
+
+ private void assertToken(String tokenString, Iterator<Token> tokens) {
+ Token t = tokens.next();
+ assertEquals(tokenString, t.getTokenString());
+ }
+
+ private List<Token> iterableToList(Iterable<Token> tokens) {
+ List<Token> tokenList = new ArrayList<>();
+ tokens.forEach(tokenList::add);
+ return tokenList;
+ }
+
+ private List<String> tokenStrings(Iterable<Token> tokens) {
+ List<String> tokenList = new ArrayList<>();
+ tokens.forEach(token -> {
+ tokenList.add(token.getTokenString());
+ });
+ return tokenList;
+ }
+
+ @Test
+ public void testAnalyzerConfiguration() {
+ String languageCode = Language.ENGLISH.languageCode();
+ LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+ .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .analysis(
+ Map.of(languageCode,
+ new LuceneAnalysisConfig
+ .Analysis
+ .Builder()
+ .tokenFilters(List.of(
+ new LuceneAnalysisConfig
+ .Analysis
+ .TokenFilters
+ .Builder()
+ .name("englishMinimalStem"),
+ new LuceneAnalysisConfig
+ .Analysis
+ .TokenFilters
+ .Builder()
+ .name("uppercase"))))
+ ).build();
+ LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+ Iterable<Token> tokens = linguistics
+ .getTokenizer()
+ .tokenize("Dogs and cats", Language.ENGLISH, StemMode.ALL, false);
+ assertEquals(List.of("DOG", "AND", "CAT"), tokenStrings(tokens));
+ }
+
+ @Test
+ public void testEnglishStemmerAnalyzerConfiguration() {
+ String languageCode = Language.ENGLISH.languageCode();
+ LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+ .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .analysis(
+ Map.of(languageCode,
+ new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
+ new LuceneAnalysisConfig
+ .Analysis
+ .TokenFilters
+ .Builder()
+ .name("englishMinimalStem"))))
+ ).build();
+ LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+ Iterable<Token> tokens = linguistics
+ .getTokenizer()
+ .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
+ assertEquals(List.of("Dog", "and", "Cat"), tokenStrings(tokens));
+ }
+
+ @Test
+ public void testStemmerWithStopWords() {
+ String languageCode = Language.ENGLISH.languageCode();
+ LuceneAnalysisConfig enConfig = new LuceneAnalysisConfig.Builder()
+ .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .analysis(
+ Map.of(languageCode,
+ new LuceneAnalysisConfig.Analysis.Builder().tokenFilters(List.of(
+ new LuceneAnalysisConfig
+ .Analysis
+ .TokenFilters
+ .Builder()
+ .name("englishMinimalStem"),
+ new LuceneAnalysisConfig
+ .Analysis
+ .TokenFilters
+ .Builder()
+ .name("stop")
+ .conf("words", "stopwords.txt"))))
+ ).build();
+ LuceneLinguistics linguistics = new LuceneLinguistics(enConfig, new ComponentRegistry<>());
+ Iterable<Token> tokens = linguistics
+ .getTokenizer()
+ .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
+ assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens));
+ }
+}
diff --git a/lucene-linguistics/src/test/resources/stopwords.txt b/lucene-linguistics/src/test/resources/stopwords.txt
new file mode 100644
index 00000000000..e8c07838bf5
--- /dev/null
+++ b/lucene-linguistics/src/test/resources/stopwords.txt
@@ -0,0 +1 @@
+and