path: root/lucene-linguistics/src/main/java/com/yahoo/language/lucene/LuceneTokenizer.java
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.lucene;

import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenScript;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleToken;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
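 * Tokenizer which delegates text analysis to a Lucene {@link Analyzer}, selected by an
 * {@link AnalyzerFactory} per language, stem mode and accent-removal setting.
 *
 * <p>A minimal usage sketch (illustrative only; it assumes the generated
 * {@code LuceneAnalysisConfig} can be constructed from an empty {@code Builder}):</p>
 *
 * <pre>{@code
 * Tokenizer tokenizer = new LuceneTokenizer(new LuceneAnalysisConfig(new LuceneAnalysisConfig.Builder()));
 * for (Token token : tokenizer.tokenize("Vespa is a search engine", Language.ENGLISH, StemMode.ALL, true)) {
 *     System.out.println(token.getTokenString() + " (orig: '" + token.getOrig() + "')");
 * }
 * }</pre>
 *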
 * @author dainiusjocas
 */
class LuceneTokenizer implements Tokenizer {

    private static final Logger log = Logger.getLogger(LuceneTokenizer.class.getName());

    // Dummy field name: the Lucene Analyzer API requires a field name, but it is not used here.
    private static final String FIELD_NAME = "F";

    private final AnalyzerFactory analyzerFactory;

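    /** Creates a tokenizer which uses no additional Analyzer components, only the given config. */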
    public LuceneTokenizer(LuceneAnalysisConfig config) {
        this(config, new ComponentRegistry<>());
    }
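
    /** Creates a tokenizer which can also use Analyzer components registered in the given registry. */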
    public LuceneTokenizer(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
        this.analyzerFactory = new AnalyzerFactory(config, analyzers);
    }

    @Override
    public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
        if (input.isEmpty()) return List.of();

        List<Token> tokens = textToTokens(input, analyzerFactory.getAnalyzer(language, stemMode, removeAccents));
        log.log(Level.FINEST, () -> "Tokenized '" + language + "' text='" + input + "' into: n=" + tokens.size() + ", tokens=" + tokens);
        return tokens;
    }

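    /**
     * Runs the given Analyzer over the text and converts the resulting Lucene token stream
     * into Vespa tokens, keeping each token's original substring and start offset.
     */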
    private List<Token> textToTokens(String text, Analyzer analyzer) {
        List<Token> tokens = new ArrayList<>();
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);

        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        // Try-with-resources ensures the token stream is closed even if analysis fails.
        try (tokenStream) {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // TODO: what to do with cases when multiple tokens are inserted into the position?
                String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset());
                String tokenString = charTermAttribute.toString();
                tokens.add(new SimpleToken(originalString, tokenString)
                        .setType(TokenType.ALPHABETIC)
                        .setOffset(offsetAttribute.startOffset())
                        .setScript(TokenScript.UNKNOWN));
            }
            tokenStream.end();
        } catch (IOException e) {
            throw new RuntimeException("Failed to analyze: " + text, e);
        }
        return tokens;
    }

}