summary | refs | log | tree | commit | diff | stats
path: root/lucene-linguistics
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@vespa.ai> 2023-07-31 22:52:22 +0200
committer: Jon Bratseth <bratseth@vespa.ai> 2023-07-31 22:52:22 +0200
commit: 8aac810a33de0e28947b9c48993d1a3833b5b73b (patch)
tree: 954c0b4ab1306d14ff9c29cae9c23f2aa27a4e49 /lucene-linguistics
parent: d1810e7868dc7f729745a98af66ec2b25951a8a6 (diff)
Cleanup: No functional changes
Diffstat (limited to 'lucene-linguistics')
-rw-r--r-- lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java | 17
-rw-r--r-- lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java | 1
-rw-r--r-- lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java | 15
-rw-r--r-- lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java | 8
-rw-r--r-- lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java | 1
5 files changed, 21 insertions, 21 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
index b71d06a2c3f..f4d3b482363 100644
--- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/AnalyzerFactory.java
@@ -17,6 +17,8 @@ import java.util.Map;
import java.util.logging.Logger;
/**
+ * Analyzers for various languages.
+ *
* @author dainiusjocas
*/
class AnalyzerFactory {
@@ -54,10 +56,6 @@ class AnalyzerFactory {
* Retrieves an analyzer with a given params.
* Sets up the analyzer if config is provided.
* Default analyzer is the `StandardAnalyzer`.
- * @param language
- * @param stemMode
- * @param removeAccents
- * @return
*/
public Analyzer getAnalyzer(Language language, StemMode stemMode, boolean removeAccents) {
String analyzerKey = generateKey(language, stemMode, removeAccents);
@@ -119,7 +117,7 @@ class AnalyzerFactory {
}
String tokenizerName = analysis.tokenizer().name();
Map<String, String> conf = analysis.tokenizer().conf();
- return builder.withTokenizer(tokenizerName, toModifiable(conf));
+ return builder.withTokenizer(tokenizerName, asModifiable(conf));
}
private CustomAnalyzer.Builder addCharFilters(CustomAnalyzer.Builder builder,
@@ -129,7 +127,7 @@ class AnalyzerFactory {
return builder;
}
for (LuceneAnalysisConfig.Analysis.CharFilters charFilter : analysis.charFilters()) {
- builder.addCharFilter(charFilter.name(), toModifiable(charFilter.conf()));
+ builder.addCharFilter(charFilter.name(), asModifiable(charFilter.conf()));
}
return builder;
}
@@ -141,7 +139,7 @@ class AnalyzerFactory {
return builder;
}
for (LuceneAnalysisConfig.Analysis.TokenFilters tokenFilter : analysis.tokenFilters()) {
- builder.addTokenFilter(tokenFilter.name(), toModifiable(tokenFilter.conf()));
+ builder.addTokenFilter(tokenFilter.name(), asModifiable(tokenFilter.conf()));
}
return builder;
}
@@ -150,10 +148,9 @@ class AnalyzerFactory {
* A config map coming from the Vespa ConfigInstance is immutable while CustomAnalyzer builders
* mutates the map to mark that a param was consumed. Immutable maps can't be mutated!
* To overcome this conflict we can wrap the ConfigInstance map in a new HashMap.
- * @param map
- * @return Mutable Map
*/
- private Map<String, String> toModifiable(Map<String, String> map) {
+ private Map<String, String> asModifiable(Map<String, String> map) {
return new HashMap<>(map);
}
+
}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
index b7ec6f9a23d..858b71b7fae 100644
--- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/DefaultAnalyzers.java
@@ -110,4 +110,5 @@ class DefaultAnalyzers {
public Analyzer get(String languageCode) {
return analyzerClasses.get(Language.fromLanguageTag(languageCode));
}
+
}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
index c55ad8adecb..8b193c103d6 100644
--- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
@@ -3,28 +3,25 @@ package com.yahoo.language.lucene;
import com.google.inject.Inject;
import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.language.Linguistics;
-import com.yahoo.language.process.*;
+import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleLinguistics;
import org.apache.lucene.analysis.Analyzer;
-import java.util.ArrayList;
import java.util.logging.Logger;
/**
- * Factory of Lucene based linguistics processor.
+ * Factory of Lucene based linguistics processors.
* As described in the Linguistics docstring
* > the tokenizer should typically stem, transform and normalize
- * The Stemmer, Transformer, Normalizer, and Segmenter implementations are mostly NOOP.
*
* TODO: docs for all available analysis components.
- * TODO: some registry for available language Analyzers.
*
* @author dainiusjocas
*/
public class LuceneLinguistics extends SimpleLinguistics {
private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName());
- private final Tokenizer tokenizer;
+ private final LuceneTokenizer tokenizer;
private final LuceneAnalysisConfig config;
@Inject
@@ -39,8 +36,8 @@ public class LuceneLinguistics extends SimpleLinguistics {
@Override
public boolean equals(Linguistics other) {
- return (other instanceof LuceneLinguistics)
- // Config actually determines if Linguistics are equal
- && config.equals(((LuceneLinguistics) other).config); }
+ // Config actually determines if Linguistics are equal
+ return (other instanceof LuceneLinguistics) && config.equals(((LuceneLinguistics) other).config);
+ }
}
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
index 87f34da745f..c1fa4da4989 100644
--- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
@@ -2,7 +2,11 @@ package com.yahoo.language.lucene;
import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.language.Language;
-import com.yahoo.language.process.*;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenScript;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleToken;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@@ -52,7 +56,6 @@ class LuceneTokenizer implements Tokenizer {
try {
tokenStream.reset();
while (tokenStream.incrementToken()) {
- // TODO: is SimpleToken good enough? Maybe a custom implementation.
// TODO: what to do with cases when multiple tokens are inserted into the position?
String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset());
String tokenString = charTermAttribute.toString();
@@ -68,4 +71,5 @@ class LuceneTokenizer implements Tokenizer {
}
return tokens;
}
+
}
diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
index 5fad34eaaae..21d3a7bd33d 100644
--- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
+++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
@@ -148,4 +148,5 @@ public class LuceneTokenizerTest {
.tokenize("Dogs and Cats", Language.ENGLISH, StemMode.ALL, false);
assertEquals(List.of("Dog", "Cat"), tokenStrings(tokens));
}
+
}