diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-04-26 10:24:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2022-11-02 13:39:24 +0100 |
commit | 055e776ed2d47ea3bc95b493f075a0c3d26b9729 (patch) | |
tree | 5afdf905516125d94450af72522a16399cee50a0 /indexinglanguage | |
parent | 522935f83ecfa8ed6f0e55859ecb9330d012c73b (diff) |
Linguistics context WIP
Diffstat (limited to 'indexinglanguage')
7 files changed, 32 insertions, 4 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java
index 3c3f75a6693..0b4308d68a9 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java
@@ -66,6 +66,9 @@ public class FieldPathUpdateAdapter implements UpdateAdapter {
         return adapter.setOutputValue(exp, fieldName, fieldValue);
     }
 
+    @Override
+    public DocumentType getDocumentType() { return adapter.getDocumentType(); }
+
     @SuppressWarnings({ "unchecked", "rawtypes" })
     private void createUpdatesAt(List<FieldPathEntry> path, FieldValue value, int idx, DocumentUpdate out) {
         FieldPath updatePath = update.getFieldPath();
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java
index 4182c133000..0ab962cd908 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.indexinglanguage;
 
 import com.yahoo.document.DataType;
 import com.yahoo.document.Document;
+import com.yahoo.document.DocumentType;
 import com.yahoo.document.DocumentUpdate;
 import com.yahoo.document.Field;
 import com.yahoo.document.FieldPath;
@@ -94,6 +95,9 @@ public class FieldUpdateAdapter implements UpdateAdapter {
         return adapter.setOutputValue(exp, fieldName, fieldValue);
     }
 
+    @Override
+    public DocumentType getDocumentType() { return adapter.getDocumentType(); }
+
     public static FieldUpdateAdapter fromPartialUpdate(DocumentAdapter documentAdapter, ValueUpdate valueUpdate) {
         return new FieldUpdateAdapter(null, documentAdapter, new PartialBuilder(valueUpdate));
     }
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java
index 5406ca67c63..6bca95e3f47 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.indexinglanguage;
 
 import com.yahoo.document.DataType;
 import com.yahoo.document.Document;
+import com.yahoo.document.DocumentType;
 import com.yahoo.document.DocumentUpdate;
 import com.yahoo.document.FieldPath;
 import com.yahoo.document.datatypes.FieldValue;
@@ -66,4 +67,7 @@ public class IdentityFieldPathUpdateAdapter implements UpdateAdapter {
         fwdAdapter.tryOutputType(exp, fieldName, valueType);
     }
 
+    @Override
+    public DocumentType getDocumentType() { return fwdAdapter.getDocumentType(); }
+
 }
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java
index 389259cc811..650e7ee06ff 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java
@@ -2,6 +2,7 @@
 package com.yahoo.vespa.indexinglanguage.expressions;
 
 import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
 import com.yahoo.document.FieldPath;
 import com.yahoo.document.datatypes.FieldValue;
 import com.yahoo.language.Language;
@@ -21,6 +22,7 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl
     private FieldValue value;
     private Language language;
 
+    /** For testing only. */
     public ExecutionContext() {
         this(null);
     }
@@ -72,6 +74,12 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl
         return this;
     }
 
+    @Override
+    public DocumentType getDocumentType() {
+        if (adapter == null) return null; // Only happens in tests
+        return adapter.getDocumentType();
+    }
+
     public FieldValueAdapter getAdapter() {
         return adapter;
     }
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java
index 6f6074db2fd..d5b595490cb 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.vespa.indexinglanguage.expressions;
 
+import com.yahoo.document.DocumentType;
 import com.yahoo.document.FieldPath;
 import com.yahoo.document.datatypes.FieldValue;
 
@@ -9,7 +10,10 @@ import com.yahoo.document.datatypes.FieldValue;
  */
 public interface FieldValueAdapter extends FieldTypeAdapter {
 
+    DocumentType getDocumentType();
+
     FieldValue getInputValue(String fieldName);
+
     FieldValue getInputValue(FieldPath fieldPath);
 
     FieldValueAdapter setOutputValue(Expression exp, String fieldName, FieldValue fieldValue);
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
index 577bc1e8d28..91bd85420e0 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
@@ -43,7 +43,7 @@ public final class TokenizeExpression extends Expression {
             cfg.setLanguage(lang);
         }
         LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, cfg);
-        annotator.annotate(output);
+        annotator.annotate(output, context);
     }
 
     @Override
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 173df65a47e..879a6b2ce8e 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -9,9 +9,11 @@ import com.yahoo.document.annotation.SpanTree;
 import com.yahoo.document.annotation.SpanTrees;
 import com.yahoo.document.datatypes.StringFieldValue;
 import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.LinguisticsContext;
 import com.yahoo.language.process.StemMode;
 import com.yahoo.language.process.Token;
 import com.yahoo.language.process.Tokenizer;
+import com.yahoo.vespa.indexinglanguage.expressions.ExecutionContext;
 
 import java.util.HashMap;
 import java.util.Map;
@@ -65,15 +67,18 @@ public class LinguisticsAnnotator {
      * @param text the text to annotate
      * @return whether anything was annotated
      */
-    public boolean annotate(StringFieldValue text) {
+    public boolean annotate(StringFieldValue text, ExecutionContext context) {
         if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
 
         Tokenizer tokenizer = factory.getTokenizer();
         String input = (text.getString().length() <= config.getMaxTokenizeLength())
                 ? text.getString()
                 : text.getString().substring(0, config.getMaxTokenizeLength());
-        Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
-                                                    config.getRemoveAccents());
+        Iterable<Token> tokens = tokenizer.tokenize(input,
+                                                    config.getLanguage(),
+                                                    config.getStemMode(),
+                                                    config.getRemoveAccents(),
+                                                    new LinguisticsContext(context.getDocumentType().getName()));
         TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
         SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
         for (Token token : tokens)