diff options
Diffstat (limited to 'lucene-linguistics/src')
5 files changed, 21 insertions, 21 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java index b71d06a2c3f..f4d3b482363 100644 --- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java @@ -17,6 +17,8 @@ import java.util.Map; import java.util.logging.Logger; /** + * Analyzers for various languages. + * * @author dainiusjocas */ class AnalyzerFactory { @@ -54,10 +56,6 @@ class AnalyzerFactory { * Retrieves an analyzer with a given params. * Sets up the analyzer if config is provided. * Default analyzer is the `StandardAnalyzer`. - * @param language - * @param stemMode - * @param removeAccents - * @return */ public Analyzer getAnalyzer(Language language, StemMode stemMode, boolean removeAccents) { String analyzerKey = generateKey(language, stemMode, removeAccents); @@ -119,7 +117,7 @@ class AnalyzerFactory { } String tokenizerName = analysis.tokenizer().name(); Map<String, String> conf = analysis.tokenizer().conf(); - return builder.withTokenizer(tokenizerName, toModifiable(conf)); + return builder.withTokenizer(tokenizerName, asModifiable(conf)); } private CustomAnalyzer.Builder addCharFilters(CustomAnalyzer.Builder builder, @@ -129,7 +127,7 @@ class AnalyzerFactory { return builder; } for (LuceneAnalysisConfig.Analysis.CharFilters charFilter : analysis.charFilters()) { - builder.addCharFilter(charFilter.name(), toModifiable(charFilter.conf())); + builder.addCharFilter(charFilter.name(), asModifiable(charFilter.conf())); } return builder; } @@ -141,7 +139,7 @@ class AnalyzerFactory { return builder; } for (LuceneAnalysisConfig.Analysis.TokenFilters tokenFilter : analysis.tokenFilters()) { - builder.addTokenFilter(tokenFilter.name(), toModifiable(tokenFilter.conf())); + builder.addTokenFilter(tokenFilter.name(), asModifiable(tokenFilter.conf())); } return builder; } @@ -150,10 +148,9 @@ class AnalyzerFactory { * A config map coming from the Vespa ConfigInstance is immutable while CustomAnalyzer builders * mutates the map to mark that a param was consumed. Immutable maps can't be mutated! * To overcome this conflict we can wrap the ConfigInstance map in a new HashMap. - * @param map - * @return Mutable Map */ - private Map<String, String> toModifiable(Map<String, String> map) { + private Map<String, String> asModifiable(Map<String, String> map) { return new HashMap<>(map); } + } diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java index b7ec6f9a23d..858b71b7fae 100644 --- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java @@ -110,4 +110,5 @@ class DefaultAnalyzers { public Analyzer get(String languageCode) { return analyzerClasses.get(Language.fromLanguageTag(languageCode)); } + } diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java index c55ad8adecb..8b193c103d6 100644 --- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java @@ -3,28 +3,25 @@ package com.yahoo.language.lucene; import com.google.inject.Inject; import com.yahoo.component.provider.ComponentRegistry; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.*; +import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleLinguistics; import org.apache.lucene.analysis.Analyzer; -import java.util.ArrayList; import java.util.logging.Logger; /** - * Factory of Lucene based linguistics processor. + * Factory of Lucene based linguistics processors. * As described in the Linguistics docstring * > the tokenizer should typically stem, transform and normalize - * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOP. * * TODO: docs for all available analysis components. - * TODO: some registry for available language Analyzers. * * @author dainiusjocas */ public class LuceneLinguistics extends SimpleLinguistics { private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName()); - private final Tokenizer tokenizer; + private final LuceneTokenizer tokenizer; private final LuceneAnalysisConfig config; @Inject @@ -39,8 +36,8 @@ public class LuceneLinguistics extends SimpleLinguistics { @Override public boolean equals(Linguistics other) { - return (other instanceof LuceneLinguistics) - // Config actually determines if Linguistics are equal - && config.equals(((LuceneLinguistics) other).config); } + // Config actually determines if Linguistics are equal + return (other instanceof LuceneLinguistics) && config.equals(((LuceneLinguistics) other).config); + } } diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java index 87f34da745f..c1fa4da4989 100644 --- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java @@ -2,7 +2,11 @@ package com.yahoo.language.lucene; import com.yahoo.component.provider.ComponentRegistry; import com.yahoo.language.Language; -import com.yahoo.language.process.*; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenScript; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleToken; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -52,7 +56,6 @@ class LuceneTokenizer implements Tokenizer { try { tokenStream.reset(); while (tokenStream.incrementToken()) { - // TODO: is SimpleToken good enough? Maybe a custom implementation. // TODO: what to do with cases when multiple tokens are inserted into the position? String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()); String tokenString = charTermAttribute.toString(); @@ -68,4 +71,5 @@ class LuceneTokenizer implements Tokenizer { } return tokens; } + } diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java index 5fad34eaaae..21d3a7bd33d 100644 --- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java +++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java @@ -148,4 +148,5 @@ public class LuceneTokenizerTest { .tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false); assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens)); } + } |