summary | refs | log | tree | commit | diff | stats
path: root/lucene-linguistics
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@vespa.ai>, 2023-07-31 14:04:02 +0200
committer: Jon Bratseth <bratseth@vespa.ai>, 2023-07-31 14:04:02 +0200
commit: 00040fc85d3cb5a4752e4d916926f21195d16520 (patch)
tree: 265f0725d183998ced1389002d624a0ad016a04e /lucene-linguistics
parent: 113e57691652232871a122b0c0a748a9692b60c6 (diff)
Fallback to default implementations
Diffstat (limited to 'lucene-linguistics')
-rw-r--r-- lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java | 35
-rw-r--r-- lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java | 29
2 files changed, 19 insertions, 45 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
index b5c5ba47ab6..37d2f6abdd7 100644
--- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
+++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java
@@ -22,11 +22,7 @@ import java.util.logging.Logger;
public class LuceneLinguistics extends SimpleLinguistics {
private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName());
- private final Normalizer normalizer;
- private final Transformer transformer;
private final Tokenizer tokenizer;
- private final Stemmer stemmer;
- private final Segmenter segmenter;
private final LuceneAnalysisConfig config;
@Inject
@@ -34,42 +30,11 @@ public class LuceneLinguistics extends SimpleLinguistics {
log.info("Creating LuceneLinguistics with: " + config);
this.config = config;
this.tokenizer = new LuceneTokenizer(config, analyzers);
- // NOOP stemmer
- this.stemmer = (word, stemMode, language) -> {
- ArrayList<StemList> stemLists = new ArrayList<>();
- StemList stems = new StemList();
- stems.add(word);
- stemLists.add(stems);
- return stemLists;
- };
- // Segmenter that just wraps a tokenizer
- this.segmenter = (string, language) -> {
- ArrayList<String> segments = new ArrayList<>();
- Iterable<Token> tokens = tokenizer.tokenize(string, language, StemMode.NONE, false);
- tokens.forEach(token -> segments.add(token.getTokenString()));
- return segments;
- };
- // NOOP normalizer
- this.normalizer = (string) -> string;
- // NOOP transformer
- this.transformer = (string, language) -> string;
}
@Override
- public Stemmer getStemmer() { return stemmer; }
-
- @Override
public Tokenizer getTokenizer() { return tokenizer; }
- @Override
- public Normalizer getNormalizer() { return normalizer; }
-
- @Override
- public Transformer getTransformer() { return transformer; }
-
- @Override
- public Segmenter getSegmenter() { return segmenter; }
-
public LuceneAnalysisConfig getConfig() {
return config;
}
diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
index 568f295b39d..2c569e730fa 100644
--- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
+++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java
@@ -3,6 +3,8 @@ package com.yahoo.language.lucene;
import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.config.FileReference;
import com.yahoo.language.Language;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.StemList;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import org.junit.Test;
@@ -20,11 +22,7 @@ public class LuceneTokenizerTest {
@Test
public void testTokenizer() {
String text = "This is my Text";
- var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig
- .Builder()
- .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
- .build());
- Iterable<Token> tokens = tokenizer
+ Iterable<Token> tokens = luceneLinguistics().getTokenizer()
.tokenize(text, Language.ENGLISH, StemMode.ALL, true);
assertEquals(List.of("my", "text"), tokenStrings(tokens));
}
@@ -32,15 +30,26 @@ public class LuceneTokenizerTest {
@Test
public void testLithuanianTokenizer() {
String text = "Žalgirio mūšio data yra 1410 metai";
- var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig
- .Builder()
- .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
- .build());
- Iterable<Token> tokens = tokenizer
+ Iterable<Token> tokens = luceneLinguistics().getTokenizer()
.tokenize(text, Language.LITHUANIAN, StemMode.ALL, true);
assertEquals(List.of("žalgir", "mūš", "dat", "1410", "met"), tokenStrings(tokens));
}
+ @Test
+ public void testStemming() {
+ String text = "mūšio";
+ List<StemList> tokens = luceneLinguistics().getStemmer().stem(text, StemMode.ALL, Language.LITHUANIAN);
+ assertEquals(1, tokens.size());
+ assertEquals("mūš", tokens.get(0).get(0));
+ }
+
+ private Linguistics luceneLinguistics() {
+ return new LuceneLinguistics(new LuceneAnalysisConfig.Builder()
+ .configDir(FileReference.mockFileReferenceForUnitTesting(new File(".")))
+ .build(),
+ new ComponentRegistry<>());
+ }
+
private void assertToken(String tokenString, Iterator<Token> tokens) {
Token t = tokens.next();
assertEquals(tokenString, t.getTokenString());