diff options
Diffstat (limited to 'indexinglanguage/src')
11 files changed, 68 insertions, 61 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java index 0b4308d68a9..3c3f75a6693 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java @@ -66,9 +66,6 @@ public class FieldPathUpdateAdapter implements UpdateAdapter { return adapter.setOutputValue(exp, fieldName, fieldValue); } - @Override - public DocumentType getDocumentType() { return adapter.getDocumentType(); } - @SuppressWarnings({ "unchecked", "rawtypes" }) private void createUpdatesAt(List<FieldPathEntry> path, FieldValue value, int idx, DocumentUpdate out) { FieldPath updatePath = update.getFieldPath(); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java index 0ab962cd908..dac710d560b 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java @@ -95,9 +95,6 @@ public class FieldUpdateAdapter implements UpdateAdapter { return adapter.setOutputValue(exp, fieldName, fieldValue); } - @Override - public DocumentType getDocumentType() { return adapter.getDocumentType(); } - public static FieldUpdateAdapter fromPartialUpdate(DocumentAdapter documentAdapter, ValueUpdate valueUpdate) { return new FieldUpdateAdapter(null, documentAdapter, new PartialBuilder(valueUpdate)); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java index 6bca95e3f47..783346d2aa7 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java @@ -67,7 +67,4 @@ public class IdentityFieldPathUpdateAdapter implements UpdateAdapter { fwdAdapter.tryOutputType(exp, fieldName, valueType); } - @Override - public DocumentType getDocumentType() { return fwdAdapter.getDocumentType(); } - } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java index f36c44539c7..36d0c9212dc 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.indexinglanguage; import com.yahoo.document.DataType; import com.yahoo.document.Document; +import com.yahoo.document.DocumentType; import com.yahoo.document.Field; import com.yahoo.document.FieldPath; import com.yahoo.document.datatypes.FieldValue; diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java index 650e7ee06ff..4f4541ba5ee 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java @@ -11,6 +11,7 @@ import com.yahoo.language.detect.Detection; import java.util.HashMap; import java.util.Map; +import java.util.Objects; /** * @author Simon Thoresen Hult @@ -74,12 +75,6 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl return this; } - @Override - public DocumentType getDocumentType() { - if (adapter == null) return null; // Only happens in tests - return adapter.getDocumentType(); - } - public FieldValueAdapter getAdapter() { return adapter; } @@ -98,8 +93,7 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl } public ExecutionContext setLanguage(Language language) { - language.getClass(); - this.language = language; + this.language = Objects.requireNonNull(language); return this; } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java index d5b595490cb..1d07318c32d 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java @@ -10,8 +10,6 @@ import com.yahoo.document.datatypes.FieldValue; */ public interface FieldValueAdapter extends FieldTypeAdapter { - DocumentType getDocumentType(); - FieldValue getInputValue(String fieldName); FieldValue getInputValue(FieldPath fieldPath); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java index 91bd85420e0..3f2b6a5825a 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java @@ -2,9 +2,12 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; +import com.yahoo.document.DocumentType; +import com.yahoo.document.Field; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.Language; import com.yahoo.language.Linguistics; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.StemMode; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator; @@ -15,6 +18,7 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator; public final class TokenizeExpression extends Expression { private final Linguistics linguistics; + private LinguisticsContext linguisticsContext = LinguisticsContext.empty(); private final AnnotatorConfig config; public TokenizeExpression(Linguistics linguistics, AnnotatorConfig config) { @@ -32,17 +36,24 @@ public final class TokenizeExpression extends Expression { } @Override + public void setStatementOutput(DocumentType documentType, Field field) { + linguisticsContext = new LinguisticsContext.Builder().schema(documentType.getName()) + .field( field.getName()) + .build(); + } + + @Override protected void doExecute(ExecutionContext context) { StringFieldValue input = (StringFieldValue)context.getValue(); StringFieldValue output = input.clone(); context.setValue(output); - AnnotatorConfig cfg = new AnnotatorConfig(config); + AnnotatorConfig config = new AnnotatorConfig(this.config); Language lang = context.resolveLanguage(linguistics); if (lang != null) { - cfg.setLanguage(lang); + config.setLanguage(lang); } - LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, cfg); + LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, linguisticsContext, config); annotator.annotate(output, context); } @@ -74,13 +85,8 @@ public final class TokenizeExpression extends Expression { @Override public boolean equals(Object obj) { - if (!(obj instanceof TokenizeExpression)) { - return false; - } - TokenizeExpression rhs = (TokenizeExpression)obj; - if (!config.equals(rhs.config)) { - return false; - } + if ( ! (obj instanceof TokenizeExpression rhs)) return false; + if ( ! config.equals(rhs.config)) return false; return true; } @@ -88,4 +94,5 @@ public final class TokenizeExpression extends Expression { public int hashCode() { return getClass().hashCode() + config.hashCode(); } + } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 441ac711cc3..03efee5f271 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable { private Language language; private StemMode stemMode; private boolean removeAccents; - private int maxTermOccurences; + private int maxTermOccurrences; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; @@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable { language = Language.ENGLISH; stemMode = StemMode.NONE; removeAccents = false; - maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable { language = rhs.language; stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; - maxTermOccurences = rhs.maxTermOccurences; + maxTermOccurrences = rhs.maxTermOccurrences; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable { } public int getMaxTermOccurrences() { - return maxTermOccurences; + return maxTermOccurrences; } public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) { - this.maxTermOccurences = maxTermCount; + this.maxTermOccurrences = maxTermCount; return this; } @@ -110,7 +110,7 @@ public class AnnotatorConfig implements Cloneable { if (removeAccents != rhs.removeAccents) { return false; } - if (maxTermOccurences != rhs.maxTermOccurences) { + if (maxTermOccurrences != rhs.maxTermOccurrences) { return false; } if (maxTokenizeLength != rhs.maxTokenizeLength) { @@ -122,6 +122,6 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 879a6b2ce8e..18f09a72fc9 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase; public class LinguisticsAnnotator { private final Linguistics factory; + private final LinguisticsContext linguisticsContext; private final AnnotatorConfig config; private static class TermOccurrences { @@ -56,8 +57,9 @@ public class LinguisticsAnnotator { * @param factory the linguistics factory to use when annotating * @param config the linguistics config to use */ - public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) { + public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) { this.factory = factory; + this.linguisticsContext = context; this.config = config; } @@ -70,15 +72,14 @@ public class LinguisticsAnnotator { public boolean annotate(StringFieldValue text, ExecutionContext context) { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. - Tokenizer tokenizer = factory.getTokenizer(); + Tokenizer tokenizer = factory.getTokenizer(linguisticsContext); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() : text.getString().substring(0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), - config.getRemoveAccents(), - new LinguisticsContext(context.getDocumentType().getName())); + config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) @@ -93,9 +94,9 @@ public class LinguisticsAnnotator { * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the * original. * - * @param termToLowerCase The term to lower case. - * @param origTerm The original term. - * @return the created TERM annotation. + * @param termToLowerCase the term to lower case + * @param origTerm the original term + * @return the created TERM annotation */ public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) { String annotationValue = toLowerCase(termToLowerCase); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java index cfb82034545..f1fef2f0489 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.indexinglanguage; import com.yahoo.document.DataType; +import com.yahoo.document.DocumentType; import com.yahoo.document.Field; import com.yahoo.document.FieldPath; import com.yahoo.document.datatypes.FieldValue; diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index 335fccf597e..160d645d966 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -8,6 +8,7 @@ import com.yahoo.document.annotation.SpanTrees; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.Language; import com.yahoo.language.Linguistics; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.TokenType; @@ -15,8 +16,8 @@ import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.language.simple.SimpleToken; +import com.yahoo.vespa.indexinglanguage.expressions.ExecutionContext; import org.junit.Test; -import org.mockito.Mockito; import java.util.*; import java.util.stream.Collectors; @@ -156,9 +157,9 @@ public class LinguisticsAnnotatorTestCase { Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)), Collections.<String, String>emptyMap()); - new LinguisticsAnnotator(linguistics, CONFIG).annotate(val); + new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext()); - assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertTrue(new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext())); assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS)); } @@ -174,36 +175,33 @@ public class LinguisticsAnnotatorTestCase { Linguistics linguistics = new SimpleLinguistics(); - LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12)); + LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), new AnnotatorConfig().setMaxTokenLength(12)); - assertTrue(annotator.annotate(shortValue)); + assertTrue(annotator.annotate(shortValue, new ExecutionContext())); assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS)); assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string"); - assertTrue(annotator.annotate(cappedValue)); + assertTrue(annotator.annotate(cappedValue, new ExecutionContext())); assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); } @Test public void requireThatMaxTermOccurencesIsHonored() { final String inputTerm = "foo"; - final String stemmedInputTerm = "bar"; // completely different from - // inputTerm for safer test + final String stemmedInputTerm = "bar"; final String paddedInputTerm = inputTerm + " "; final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); - final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2; + final int inputTermOccurrence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2; for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) { expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length()) .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm))); } for (TokenType type : TokenType.values()) { - if (!type.isIndexable()) { - continue; - } + if ( ! type.isIndexable()) continue; StringBuilder input = new StringBuilder(); - Token[] tokens = new Token[inputTermOccurence]; - for (int i = 0; i < inputTermOccurence; ++i) { + Token[] tokens = new Token[inputTermOccurrence]; + for (int i = 0; i < inputTermOccurrence; ++i) { SimpleToken t = newToken(inputTerm, stemmedInputTerm, type); t.setOffset(i * paddedInputTerm.length()); tokens[i] = t; @@ -235,14 +233,30 @@ public class LinguisticsAnnotatorTestCase { private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) { StringFieldValue val = new StringFieldValue(str); - assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertEquals(expected != null, new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext())); assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS)); } private static Linguistics newLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) { - Linguistics linguistics = Mockito.mock(Linguistics.class); - Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms)); - return linguistics; + return new MyLinguistics(tokens, replacementTerms); + } + + private static class MyLinguistics extends SimpleLinguistics { + + private final List<? extends Token> tokens; + private final Map<String, String> replacementTerms; + + public MyLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) { + super(); + this.tokens = tokens; + this.replacementTerms = replacementTerms; + } + + @Override + public Tokenizer getTokenizer(LinguisticsContext context) { + return new MyTokenizer(tokens, replacementTerms); + } + } private static class MyTokenizer implements Tokenizer { |