1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
package com.yahoo.language.lucene;
import com.yahoo.component.provider.ComponentRegistry;
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenScript;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleToken;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* @author dainiusjocas
*/
/**
 * A {@link Tokenizer} that delegates analysis to a Lucene {@link Analyzer}
 * selected per language/stem-mode/accent configuration by an {@link AnalyzerFactory}.
 */
class LuceneTokenizer implements Tokenizer {

    private static final Logger log = Logger.getLogger(LuceneTokenizer.class.getName());

    // Dummy value, just to stuff the Lucene interface.
    private final static String FIELD_NAME = "F";

    private final AnalyzerFactory analyzerFactory;

    public LuceneTokenizer(LuceneAnalysisConfig config) {
        this(config, new ComponentRegistry<>());
    }

    public LuceneTokenizer(LuceneAnalysisConfig config, ComponentRegistry<Analyzer> analyzers) {
        this.analyzerFactory = new AnalyzerFactory(config, analyzers);
    }

    /**
     * Tokenizes the input with the Lucene analyzer configured for the given
     * language, stem mode and accent-removal setting.
     *
     * @return the tokens produced by the analyzer, or an empty list for empty input
     */
    @Override
    public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
        if (input.isEmpty()) return List.of();

        List<Token> tokens = textToTokens(input, analyzerFactory.getAnalyzer(language, stemMode, removeAccents));
        log.log(Level.FINEST, () -> "Tokenized '" + language + "' text='" + input + "' into: n=" + tokens.size() + ", tokens=" + tokens);
        return tokens;
    }

    /**
     * Runs the analyzer over the text and converts each emitted term into a
     * {@link SimpleToken} carrying the original surface form (from offsets),
     * the analyzed term, and the start offset.
     *
     * @throws RuntimeException wrapping any IOException from the token stream
     */
    private List<Token> textToTokens(String text, Analyzer analyzer) {
        List<Token> tokens = new ArrayList<>();
        // try-with-resources: the stream must be closed even when reset()/incrementToken()
        // throws, otherwise the analyzer's reuse strategy is left holding a dirty stream.
        try (TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text)) {
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // TODO: what to do with cases when multiple tokens are inserted into the position?
                String originalString = text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset());
                String tokenString = charTermAttribute.toString();
                tokens.add(new SimpleToken(originalString, tokenString)
                        .setType(TokenType.ALPHABETIC)
                        .setOffset(offsetAttribute.startOffset())
                        .setScript(TokenScript.UNKNOWN));
            }
            tokenStream.end();
        } catch (IOException e) {
            throw new RuntimeException("Failed to analyze: " + text, e);
        }
        return tokens;
    }

}
|