diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-07-31 14:04:02 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-07-31 14:04:02 +0200 |
commit | 00040fc85d3cb5a4752e4d916926f21195d16520 (patch) | |
tree | 265f0725d183998ced1389002d624a0ad016a04e /lucene-linguistics | |
parent | 113e57691652232871a122b0c0a748a9692b60c6 (diff) |
Fall back to default implementations
Diffstat (limited to 'lucene-linguistics')
-rw-r--r-- | lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java | 35 | ||||
-rw-r--r-- | lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java | 29 |
2 files changed, 19 insertions, 45 deletions
diff --git a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java index b5c5ba47ab6..37d2f6abdd7 100644 --- a/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java +++ b/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneLinguistics.java @@ -22,11 +22,7 @@ import java.util.logging.Logger; public class LuceneLinguistics extends SimpleLinguistics { private static final Logger log = Logger.getLogger(LuceneLinguistics.class.getName()); - private final Normalizer normalizer; - private final Transformer transformer; private final Tokenizer tokenizer; - private final Stemmer stemmer; - private final Segmenter segmenter; private final LuceneAnalysisConfig config; @Inject @@ -34,42 +30,11 @@ public class LuceneLinguistics extends SimpleLinguistics { log.info("Creating LuceneLinguistics with: " + config); this.config = config; this.tokenizer = new LuceneTokenizer(config, analyzers); - // NOOP stemmer - this.stemmer = (word, stemMode, language) -> { - ArrayList<StemList> stemLists = new ArrayList<>(); - StemList stems = new StemList(); - stems.add(word); - stemLists.add(stems); - return stemLists; - }; - // Segmenter that just wraps a tokenizer - this.segmenter = (string, language) -> { - ArrayList<String> segments = new ArrayList<>(); - Iterable<Token> tokens = tokenizer.tokenize(string, language, StemMode.NONE, false); - tokens.forEach(token -> segments.add(token.getTokenString())); - return segments; - }; - // NOOP normalizer - this.normalizer = (string) -> string; - // NOOP transformer - this.transformer = (string, language) -> string; } @Override - public Stemmer getStemmer() { return stemmer; } - - @Override public Tokenizer getTokenizer() { return tokenizer; } - @Override - public Normalizer getNormalizer() { return normalizer; } - - @Override - public Transformer getTransformer() { return transformer; } - - 
@Override - public Segmenter getSegmenter() { return segmenter; } - public LuceneAnalysisConfig getConfig() { return config; } diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java index 568f295b39d..2c569e730fa 100644 --- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java +++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java @@ -3,6 +3,8 @@ package com.yahoo.language.lucene; import com.yahoo.component.provider.ComponentRegistry; import com.yahoo.config.FileReference; import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.process.StemList; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import org.junit.Test; @@ -20,11 +22,7 @@ public class LuceneTokenizerTest { @Test public void testTokenizer() { String text = "This is my Text"; - var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig - .Builder() - .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) - .build()); - Iterable<Token> tokens = tokenizer + Iterable<Token> tokens = luceneLinguistics().getTokenizer() .tokenize(text, Language.ENGLISH, StemMode.ALL, true); assertEquals(List.of("my", "text"), tokenStrings(tokens)); } @@ -32,15 +30,26 @@ public class LuceneTokenizerTest { @Test public void testLithuanianTokenizer() { String text = "Žalgirio mūšio data yra 1410 metai"; - var tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig - .Builder() - .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) - .build()); - Iterable<Token> tokens = tokenizer + Iterable<Token> tokens = luceneLinguistics().getTokenizer() .tokenize(text, Language.LITHUANIAN, StemMode.ALL, true); assertEquals(List.of("žalgir", "mūš", "dat", "1410", "met"), tokenStrings(tokens)); } + @Test + public void 
testStemming() { + String text = "mūšio"; + List<StemList> tokens = luceneLinguistics().getStemmer().stem(text, StemMode.ALL, Language.LITHUANIAN); + assertEquals(1, tokens.size()); + assertEquals("mūš", tokens.get(0).get(0)); + } + + private Linguistics luceneLinguistics() { + return new LuceneLinguistics(new LuceneAnalysisConfig.Builder() + .configDir(FileReference.mockFileReferenceForUnitTesting(new File("."))) + .build(), + new ComponentRegistry<>()); + } + private void assertToken(String tokenString, Iterator<Token> tokens) { Token t = tokens.next(); assertEquals(tokenString, t.getTokenString()); |