diff options
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics')
2 files changed, 15 insertions, 14 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 441ac711cc3..03efee5f271 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable { private Language language; private StemMode stemMode; private boolean removeAccents; - private int maxTermOccurences; + private int maxTermOccurrences; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; @@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable { language = Language.ENGLISH; stemMode = StemMode.NONE; removeAccents = false; - maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable { language = rhs.language; stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; - maxTermOccurences = rhs.maxTermOccurences; + maxTermOccurrences = rhs.maxTermOccurrences; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable { } public int getMaxTermOccurrences() { - return maxTermOccurences; + return maxTermOccurrences; } public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) { - this.maxTermOccurences = maxTermCount; + this.maxTermOccurrences = maxTermCount; return this; } @@ -110,7 +110,7 @@ public class AnnotatorConfig implements Cloneable { if (removeAccents != rhs.removeAccents) { return false; } - if (maxTermOccurences != rhs.maxTermOccurences) { + if (maxTermOccurrences != rhs.maxTermOccurrences) { return false; } if (maxTokenizeLength != rhs.maxTokenizeLength) { @@ -122,6 +122,6 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 879a6b2ce8e..18f09a72fc9 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase; public class LinguisticsAnnotator { private final Linguistics factory; + private final LinguisticsContext linguisticsContext; private final AnnotatorConfig config; private static class TermOccurrences { @@ -56,8 +57,9 @@ public class LinguisticsAnnotator { * @param factory the linguistics factory to use when annotating * @param config the linguistics config to use */ - public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) { + public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) { this.factory = factory; + this.linguisticsContext = context; this.config = config; } @@ -70,15 +72,14 @@ public class LinguisticsAnnotator { public boolean annotate(StringFieldValue text, ExecutionContext context) { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. - Tokenizer tokenizer = factory.getTokenizer(); + Tokenizer tokenizer = factory.getTokenizer(linguisticsContext); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() : text.getString().substring(0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), - config.getRemoveAccents(), - new LinguisticsContext(context.getDocumentType().getName())); + config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) @@ -93,9 +94,9 @@ public class LinguisticsAnnotator { * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the * original. * - * @param termToLowerCase The term to lower case. - * @param origTerm The original term. - * @return the created TERM annotation. + * @param termToLowerCase the term to lower case + * @param origTerm the original term + * @return the created TERM annotation */ public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) { String annotationValue = toLowerCase(termToLowerCase); |