blob: 3f2b6a5825a3fe54ab8ab5fc8a28e56ff8b4aaab (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.indexinglanguage.expressions;
import com.yahoo.document.DataType;
import com.yahoo.document.DocumentType;
import com.yahoo.document.Field;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.LinguisticsContext;
import com.yahoo.language.process.StemMode;
import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;
/**
 * Indexing expression that copies the current string value of the execution context and hands
 * the copy to a {@link LinguisticsAnnotator} for annotation.
 *
 * @author Simon Thoresen Hult
 */
public final class TokenizeExpression extends Expression {

    private final Linguistics linguistics;
    private final AnnotatorConfig config;

    /** Starts out empty; replaced whenever {@link #setStatementOutput} is invoked. */
    private LinguisticsContext linguisticsContext = LinguisticsContext.empty();

    /**
     * Creates a tokenize expression.
     *
     * @param linguistics the linguistics implementation used for language resolution and annotation
     * @param config      the base annotator configuration; it is copied on each execution, so this
     *                    expression never calls a setter on the instance passed here
     */
    public TokenizeExpression(Linguistics linguistics, AnnotatorConfig config) {
        // NOTE(review): DataType.STRING is presumably the required input type - confirm against Expression.
        super(DataType.STRING);
        this.linguistics = linguistics;
        this.config = config;
    }

    /** Returns the linguistics instance given at construction. */
    public Linguistics getLinguistics() {
        return linguistics;
    }

    /** Returns the annotator configuration given at construction. */
    public AnnotatorConfig getConfig() {
        return config;
    }

    /**
     * Rebuilds the linguistics context from the name of the given document type (used as schema)
     * and the name of the given field.
     */
    @Override
    public void setStatementOutput(DocumentType documentType, Field field) {
        linguisticsContext = new LinguisticsContext.Builder()
                .schema(documentType.getName())
                .field(field.getName())
                .build();
    }

    /**
     * Replaces the context value with a clone of itself and lets a {@link LinguisticsAnnotator}
     * annotate that clone, using a per-execution copy of the configuration.
     */
    @Override
    protected void doExecute(ExecutionContext context) {
        StringFieldValue annotated = ((StringFieldValue) context.getValue()).clone();
        // The clone is installed in the context before the language is resolved; keep this order.
        context.setValue(annotated);

        AnnotatorConfig effectiveConfig = new AnnotatorConfig(config);
        Language language = context.resolveLanguage(linguistics);
        if (language != null) {
            effectiveConfig.setLanguage(language);
        }

        new LinguisticsAnnotator(linguistics, linguisticsContext, effectiveConfig).annotate(annotated, context);
    }

    /** Performs no verification. */
    @Override
    protected void doVerify(VerificationContext context) {
        // intentionally left blank
    }

    /** Always returns {@code null}. */
    @Override
    public DataType createdOutputType() {
        return null;
    }

    /**
     * Renders this expression as {@code tokenize}, followed by {@code normalize} when accents are
     * removed, {@code stem:"<mode>"} when the stem mode is not {@link StemMode#NONE}, and
     * {@code max-length:<n>} when a non-default maximum token length is configured.
     */
    @Override
    public String toString() {
        String text = "tokenize";
        if (config.getRemoveAccents()) {
            text += " normalize";
        }
        if (config.getStemMode() != StemMode.NONE) {
            text += " stem:\"" + config.getStemMode() + "\"";
        }
        if (config.hasNonDefaultMaxTokenLength()) {
            text += " max-length:" + config.getMaxTokenizeLength();
        }
        return text;
    }

    /** Two tokenize expressions are equal when their annotator configurations are equal. */
    @Override
    public boolean equals(Object obj) {
        return obj instanceof TokenizeExpression other && config.equals(other.config);
    }

    /** Combines the hash of this class with the hash of the annotator configuration. */
    @Override
    public int hashCode() {
        int classHash = getClass().hashCode();
        return classHash + config.hashCode();
    }

}
|