about | summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com>, 2022-11-03 18:09:12 +0100
committer: Jon Bratseth <bratseth@gmail.com>, 2022-11-03 18:09:12 +0100
commit: 9e5a6fe3caf8ed4d7810202d843662ba8cac8bc0 (patch)
tree: 2418ace521d5dee02b56629004a27b21c2c67660 /indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
parent: bb132428fa56e52317fad756e8ca498a0f32db30 (diff)
Accept LinguisticContext [bratseth/linguistics-context-rebased]
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 15
1 files changed, 8 insertions, 7 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 879a6b2ce8e..18f09a72fc9 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase;
public class LinguisticsAnnotator {
private final Linguistics factory;
+ private final LinguisticsContext linguisticsContext;
private final AnnotatorConfig config;
private static class TermOccurrences {
@@ -56,8 +57,9 @@ public class LinguisticsAnnotator {
* @param factory the linguistics factory to use when annotating
* @param config the linguistics config to use
*/
- public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) {
+ public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) {
this.factory = factory;
+ this.linguisticsContext = context;
this.config = config;
}
@@ -70,15 +72,14 @@ public class LinguisticsAnnotator {
public boolean annotate(StringFieldValue text, ExecutionContext context) {
if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
- Tokenizer tokenizer = factory.getTokenizer();
+ Tokenizer tokenizer = factory.getTokenizer(linguisticsContext);
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
: text.getString().substring(0, config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input,
config.getLanguage(),
config.getStemMode(),
- config.getRemoveAccents(),
- new LinguisticsContext(context.getDocumentType().getName()));
+ config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
for (Token token : tokens)
@@ -93,9 +94,9 @@ public class LinguisticsAnnotator {
* Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
* original.
*
- * @param termToLowerCase The term to lower case.
- * @param origTerm The original term.
- * @return the created TERM annotation.
+ * @param termToLowerCase the term to lower case
+ * @param origTerm the original term
+ * @return the created TERM annotation
*/
public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
String annotationValue = toLowerCase(termToLowerCase);