about summary refs log tree commit diff stats
path: root/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
diff options
context:
space:
mode:
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 15
1 files changed, 8 insertions, 7 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 879a6b2ce8e..18f09a72fc9 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase;
public class LinguisticsAnnotator {
private final Linguistics factory;
+ private final LinguisticsContext linguisticsContext;
private final AnnotatorConfig config;
private static class TermOccurrences {
@@ -56,8 +57,9 @@ public class LinguisticsAnnotator {
* @param factory the linguistics factory to use when annotating
* @param config the linguistics config to use
*/
- public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) {
+ public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) {
this.factory = factory;
+ this.linguisticsContext = context;
this.config = config;
}
@@ -70,15 +72,14 @@ public class LinguisticsAnnotator {
public boolean annotate(StringFieldValue text, ExecutionContext context) {
if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
- Tokenizer tokenizer = factory.getTokenizer();
+ Tokenizer tokenizer = factory.getTokenizer(linguisticsContext);
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
: text.getString().substring(0, config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input,
config.getLanguage(),
config.getStemMode(),
- config.getRemoveAccents(),
- new LinguisticsContext(context.getDocumentType().getName()));
+ config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
for (Token token : tokens)
@@ -93,9 +94,9 @@ public class LinguisticsAnnotator {
* Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
* original.
*
- * @param termToLowerCase The term to lower case.
- * @param origTerm The original term.
- * @return the created TERM annotation.
+ * @param termToLowerCase the term to lower case
+ * @param origTerm the original term
+ * @return the created TERM annotation
*/
public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
String annotationValue = toLowerCase(termToLowerCase);