diff options
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java')
-rw-r--r-- | indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 15 |
1 files changed, 8 insertions, 7 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 879a6b2ce8e..18f09a72fc9 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase; public class LinguisticsAnnotator { private final Linguistics factory; + private final LinguisticsContext linguisticsContext; private final AnnotatorConfig config; private static class TermOccurrences { @@ -56,8 +57,9 @@ public class LinguisticsAnnotator { * @param factory the linguistics factory to use when annotating * @param config the linguistics config to use */ - public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) { + public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) { this.factory = factory; + this.linguisticsContext = context; this.config = config; } @@ -70,15 +72,14 @@ public class LinguisticsAnnotator { public boolean annotate(StringFieldValue text, ExecutionContext context) { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. - Tokenizer tokenizer = factory.getTokenizer(); + Tokenizer tokenizer = factory.getTokenizer(linguisticsContext); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() : text.getString().substring(0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), - config.getRemoveAccents(), - new LinguisticsContext(context.getDocumentType().getName())); + config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) @@ -93,9 +94,9 @@ public class LinguisticsAnnotator { * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the * original. * - * @param termToLowerCase The term to lower case. - * @param origTerm The original term. - * @return the created TERM annotation. + * @param termToLowerCase the term to lower case + * @param origTerm the original term + * @return the created TERM annotation */ public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) { String annotationValue = toLowerCase(termToLowerCase); |