Diffstat (limited to 'lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java')
-rw-r--r--   lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java   82
1 file changed, 82 insertions, 0 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
new file mode 100644
index 00000000000..b5c5ba47ab6
--- /dev/null
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
@@ -0,0 +1,82 @@
+package com.yahoo.language.lucene;
+
+import com.google.inject.Inject;
+import com.yahoo.component.provider.ComponentRegistry;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.*;
+import com.yahoo.language.simple.SimpleLinguistics;
+import org.apache.lucene.analysis.Analyzer;
+
+import java.util.ArrayList;
+import java.util.logging.Logger;
+
+/**
+ * Factory of Lucene-based linguistics processors.
+ * As described in the Linguistics Javadoc:
+ * > the tokenizer should typically stem, transform and normalize
+ * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOPs.
+ *
+ * TODO: docs for all available analysis components.
+ * TODO: some registry for available language Analyzers.
+ */
+public class LuceneLinguistics extends SimpleLinguistics {
+
+    private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName());
+    private final Normalizer normalizer;
+    private final Transformer transformer;
+    private final Tokenizer tokenizer;
+    private final Stemmer stemmer;
+    private final Segmenter segmenter;
+    private final LuceneAnalysisConfig config;
+
+    @Inject
+    public LuceneLinguistics(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
+        log.info("Creating LuceneLinguistics with: " + config);
+        this.config = config;
+        this.tokenizer = new LuceneTokenizer(config, analyzers);
+        // NOOP stemmer: returns the word itself as its only stem
+        this.stemmer = (word, stemMode, language) -> {
+            ArrayList<StemList> stemLists = new ArrayList<>();
+            StemList stems = new StemList();
+            stems.add(word);
+            stemLists.add(stems);
+            return stemLists;
+        };
+        // Segmenter that just wraps the tokenizer
+        this.segmenter = (string, language) -> {
+            ArrayList<String> segments = new ArrayList<>();
+            Iterable<Token> tokens = tokenizer.tokenize(string, language, StemMode.NONE, false);
+            tokens.forEach(token -> segments.add(token.getTokenString()));
+            return segments;
+        };
+        // NOOP normalizer
+        this.normalizer = (string) -> string;
+        // NOOP transformer
+        this.transformer = (string, language) -> string;
+    }
+
+    @Override
+    public Stemmer getStemmer() { return stemmer; }
+
+    @Override
+    public Tokenizer getTokenizer() { return tokenizer; }
+
+    @Override
+    public Normalizer getNormalizer() { return normalizer; }
+
+    @Override
+    public Transformer getTransformer() { return transformer; }
+
+    @Override
+    public Segmenter getSegmenter() { return segmenter; }
+
+    public LuceneAnalysisConfig getConfig() {
+        return config;
+    }
+
+    @Override
+    public boolean equals(Linguistics other) {
+        return (other instanceof LuceneLinguistics)
+                // The config determines whether two Linguistics instances are equal
+                && config.equals(((LuceneLinguistics) other).getConfig()); }
+}
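
Below is a minimal sketch of how the component added in this diff might be constructed directly, for example in a unit test. The default-valued LuceneAnalysisConfig built via its generated Builder and the empty ComponentRegistry<Analyzer> are assumptions made for illustration, not part of this diff; in a Vespa container deployment both constructor arguments are injected.

// Sketch (under the assumptions above): building LuceneLinguistics outside the container.
// An empty ComponentRegistry<Analyzer> and a default LuceneAnalysisConfig are assumed to
// be sufficient for the default analysis chain.
import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.lucene.LuceneAnalysisConfig;
import com.yahoo.language.lucene.LuceneLinguistics;
import org.apache.lucene.analysis.Analyzer;

public class LuceneLinguisticsExample {

    public static void main(String[] args) {
        LuceneAnalysisConfig config = new LuceneAnalysisConfig(new LuceneAnalysisConfig.Builder());
        Linguistics linguistics = new LuceneLinguistics(config, new ComponentRegistry<Analyzer>());

        // The Segmenter in this class simply wraps the LuceneTokenizer,
        // so each printed segment is one token of the input string.
        for (String segment : linguistics.getSegmenter().segment("Hello Lucene linguistics", Language.ENGLISH)) {
            System.out.println(segment);
        }
    }
}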