diff options
Diffstat (limited to 'lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java')
-rw-r--r-- | lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java | 68 |
1 files changed, 68 insertions, 0 deletions
package com.yahoo.language.lucene;

import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.language.Language;
import com.yahoo.language.process.*;
import com.yahoo.language.simple.SimpleToken;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * A {@link Tokenizer} that delegates the actual text analysis to a Lucene {@link Analyzer}.
 *
 * <p>The analyzer used for a given call is looked up through an {@link AnalyzerFactory},
 * keyed by language, stem mode and accent removal flag.</p>
 */
public class LuceneTokenizer implements Tokenizer {

    private static final Logger log = Logger.getLogger(LuceneTokenizer.class.getName());

    // Dummy value, just to stuff the Lucene interface.
    private static final String FIELD_NAME = "F";

    private final AnalyzerFactory analyzerFactory;

    /**
     * Creates a tokenizer from configuration only, using an empty analyzer component registry.
     *
     * @param config the Lucene analysis configuration passed on to the analyzer factory
     */
    public LuceneTokenizer(LuceneAnalysisConfig config) {
        this(config, new ComponentRegistry<>());
    }

    /**
     * Creates a tokenizer from configuration and a registry of analyzer components.
     *
     * @param config    the Lucene analysis configuration passed on to the analyzer factory
     * @param analyzers the analyzer components passed on to the analyzer factory
     */
    public LuceneTokenizer(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
        this.analyzerFactory = new AnalyzerFactory(config, analyzers);
    }

    /**
     * Tokenizes the input with the analyzer the factory returns for the given parameters.
     *
     * @param input         the text to tokenize; an empty string yields an empty result
     * @param language      the language used to select the analyzer
     * @param stemMode      the stem mode used to select the analyzer
     * @param removeAccents the accent removal flag used to select the analyzer
     * @return the tokens emitted by the analyzer, in emission order
     * @throws RuntimeException if the analyzer fails with an {@link IOException}
     */
    @Override
    public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
        if (input.isEmpty()) return List.of();

        List<Token> tokens = textToTokens(input, analyzerFactory.getAnalyzer(language, stemMode, removeAccents));
        // Supplier overload: the message (including tokens.toString()) is only built when FINEST is enabled.
        log.log(Level.FINEST, () -> "Tokenized '" + language + "' text='" + input + "' into: n=" + tokens.size() + ", tokens=" + tokens);
        return tokens;
    }

    /**
     * Runs the analyzer over the text and converts every emitted term into a {@link Token}.
     *
     * <p>Each token carries the slice of the input covered by the term's offsets as its original
     * string, the analyzed term as its token string, and the term's start offset. Type and script
     * are always set to {@code ALPHABETIC} and {@code UNKNOWN} respectively.</p>
     *
     * @param text     the text to analyze
     * @param analyzer the analyzer to run
     * @return the produced tokens, in emission order
     * @throws RuntimeException if the token stream fails with an {@link IOException}
     */
    private List<Token> textToTokens(String text, Analyzer analyzer) {
        List<Token> tokens = new ArrayList<>();
        // try-with-resources guarantees close() on every path; an unclosed stream would
        // make the next tokenStream() call on a reused analyzer fail.
        try (TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text)) {
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // TODO: is SimpleToken good enough? Maybe a custom implementation.
                // TODO: what to do with cases when multiple tokens are inserted into the position?
                int startOffset = offsetAttribute.startOffset();
                String originalString = text.substring(startOffset, offsetAttribute.endOffset());
                String tokenString = charTermAttribute.toString();
                tokens.add(new SimpleToken(originalString, tokenString)
                        .setType(TokenType.ALPHABETIC)
                        .setOffset(startOffset)
                        .setScript(TokenScript.UNKNOWN));
            }
            tokenStream.end();
        } catch (IOException e) {
            throw new RuntimeException("Failed to analyze: " + text, e);
        }
        return tokens;
    }
}