diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-11-03 18:09:12 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2022-11-03 18:09:12 +0100 |
commit | 9e5a6fe3caf8ed4d7810202d843662ba8cac8bc0 (patch) | |
tree | 2418ace521d5dee02b56629004a27b21c2c67660 | |
parent | bb132428fa56e52317fad756e8ca498a0f32db30 (diff) |
Accept LinguisticContext
bratseth/linguistics-context-rebased
21 files changed, 195 insertions, 135 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java index 0b4308d68a9..3c3f75a6693 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java @@ -66,9 +66,6 @@ public class FieldPathUpdateAdapter implements UpdateAdapter { return adapter.setOutputValue(exp, fieldName, fieldValue); } - @Override - public DocumentType getDocumentType() { return adapter.getDocumentType(); } - @SuppressWarnings({ "unchecked", "rawtypes" }) private void createUpdatesAt(List<FieldPathEntry> path, FieldValue value, int idx, DocumentUpdate out) { FieldPath updatePath = update.getFieldPath(); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java index 0ab962cd908..dac710d560b 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java @@ -95,9 +95,6 @@ public class FieldUpdateAdapter implements UpdateAdapter { return adapter.setOutputValue(exp, fieldName, fieldValue); } - @Override - public DocumentType getDocumentType() { return adapter.getDocumentType(); } - public static FieldUpdateAdapter fromPartialUpdate(DocumentAdapter documentAdapter, ValueUpdate valueUpdate) { return new FieldUpdateAdapter(null, documentAdapter, new PartialBuilder(valueUpdate)); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java index 6bca95e3f47..783346d2aa7 100644 --- 
a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java @@ -67,7 +67,4 @@ public class IdentityFieldPathUpdateAdapter implements UpdateAdapter { fwdAdapter.tryOutputType(exp, fieldName, valueType); } - @Override - public DocumentType getDocumentType() { return fwdAdapter.getDocumentType(); } - } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java index f36c44539c7..36d0c9212dc 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.indexinglanguage; import com.yahoo.document.DataType; import com.yahoo.document.Document; +import com.yahoo.document.DocumentType; import com.yahoo.document.Field; import com.yahoo.document.FieldPath; import com.yahoo.document.datatypes.FieldValue; diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java index 650e7ee06ff..4f4541ba5ee 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java @@ -11,6 +11,7 @@ import com.yahoo.language.detect.Detection; import java.util.HashMap; import java.util.Map; +import java.util.Objects; /** * @author Simon Thoresen Hult @@ -74,12 +75,6 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl return this; } - @Override - public DocumentType getDocumentType() { - if (adapter == null) return null; 
// Only happens in tests - return adapter.getDocumentType(); - } - public FieldValueAdapter getAdapter() { return adapter; } @@ -98,8 +93,7 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl } public ExecutionContext setLanguage(Language language) { - language.getClass(); - this.language = language; + this.language = Objects.requireNonNull(language); return this; } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java index d5b595490cb..1d07318c32d 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java @@ -10,8 +10,6 @@ import com.yahoo.document.datatypes.FieldValue; */ public interface FieldValueAdapter extends FieldTypeAdapter { - DocumentType getDocumentType(); - FieldValue getInputValue(String fieldName); FieldValue getInputValue(FieldPath fieldPath); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java index 91bd85420e0..3f2b6a5825a 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java @@ -2,9 +2,12 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; +import com.yahoo.document.DocumentType; +import com.yahoo.document.Field; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.Language; import com.yahoo.language.Linguistics; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.StemMode; 
import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator; @@ -15,6 +18,7 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator; public final class TokenizeExpression extends Expression { private final Linguistics linguistics; + private LinguisticsContext linguisticsContext = LinguisticsContext.empty(); private final AnnotatorConfig config; public TokenizeExpression(Linguistics linguistics, AnnotatorConfig config) { @@ -32,17 +36,24 @@ public final class TokenizeExpression extends Expression { } @Override + public void setStatementOutput(DocumentType documentType, Field field) { + linguisticsContext = new LinguisticsContext.Builder().schema(documentType.getName()) + .field( field.getName()) + .build(); + } + + @Override protected void doExecute(ExecutionContext context) { StringFieldValue input = (StringFieldValue)context.getValue(); StringFieldValue output = input.clone(); context.setValue(output); - AnnotatorConfig cfg = new AnnotatorConfig(config); + AnnotatorConfig config = new AnnotatorConfig(this.config); Language lang = context.resolveLanguage(linguistics); if (lang != null) { - cfg.setLanguage(lang); + config.setLanguage(lang); } - LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, cfg); + LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, linguisticsContext, config); annotator.annotate(output, context); } @@ -74,13 +85,8 @@ public final class TokenizeExpression extends Expression { @Override public boolean equals(Object obj) { - if (!(obj instanceof TokenizeExpression)) { - return false; - } - TokenizeExpression rhs = (TokenizeExpression)obj; - if (!config.equals(rhs.config)) { - return false; - } + if ( ! (obj instanceof TokenizeExpression rhs)) return false; + if ( ! 
config.equals(rhs.config)) return false; return true; } @@ -88,4 +94,5 @@ public final class TokenizeExpression extends Expression { public int hashCode() { return getClass().hashCode() + config.hashCode(); } + } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 441ac711cc3..03efee5f271 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable { private Language language; private StemMode stemMode; private boolean removeAccents; - private int maxTermOccurences; + private int maxTermOccurrences; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; @@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable { language = Language.ENGLISH; stemMode = StemMode.NONE; removeAccents = false; - maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable { language = rhs.language; stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; - maxTermOccurences = rhs.maxTermOccurences; + maxTermOccurrences = rhs.maxTermOccurrences; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable { } public int getMaxTermOccurrences() { - return maxTermOccurences; + return maxTermOccurrences; } public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) { - this.maxTermOccurences = maxTermCount; + this.maxTermOccurrences = maxTermCount; return this; } @@ -110,7 +110,7 @@ public class AnnotatorConfig implements Cloneable { if (removeAccents != 
rhs.removeAccents) { return false; } - if (maxTermOccurences != rhs.maxTermOccurences) { + if (maxTermOccurrences != rhs.maxTermOccurrences) { return false; } if (maxTokenizeLength != rhs.maxTokenizeLength) { @@ -122,6 +122,6 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 879a6b2ce8e..18f09a72fc9 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase; public class LinguisticsAnnotator { private final Linguistics factory; + private final LinguisticsContext linguisticsContext; private final AnnotatorConfig config; private static class TermOccurrences { @@ -56,8 +57,9 @@ public class LinguisticsAnnotator { * @param factory the linguistics factory to use when annotating * @param config the linguistics config to use */ - public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) { + public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) { this.factory = factory; + this.linguisticsContext = context; this.config = config; } @@ -70,15 +72,14 @@ public class LinguisticsAnnotator { public boolean annotate(StringFieldValue text, ExecutionContext context) { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. 
- Tokenizer tokenizer = factory.getTokenizer(); + Tokenizer tokenizer = factory.getTokenizer(linguisticsContext); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() : text.getString().substring(0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), - config.getRemoveAccents(), - new LinguisticsContext(context.getDocumentType().getName())); + config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS); for (Token token : tokens) @@ -93,9 +94,9 @@ public class LinguisticsAnnotator { * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the * original. * - * @param termToLowerCase The term to lower case. - * @param origTerm The original term. - * @return the created TERM annotation. + * @param termToLowerCase the term to lower case + * @param origTerm the original term + * @return the created TERM annotation */ public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) { String annotationValue = toLowerCase(termToLowerCase); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java index cfb82034545..f1fef2f0489 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.indexinglanguage; import com.yahoo.document.DataType; +import com.yahoo.document.DocumentType; import com.yahoo.document.Field; import com.yahoo.document.FieldPath; import com.yahoo.document.datatypes.FieldValue; diff --git 
a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index 335fccf597e..160d645d966 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -8,6 +8,7 @@ import com.yahoo.document.annotation.SpanTrees; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.Language; import com.yahoo.language.Linguistics; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.TokenType; @@ -15,8 +16,8 @@ import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.language.simple.SimpleToken; +import com.yahoo.vespa.indexinglanguage.expressions.ExecutionContext; import org.junit.Test; -import org.mockito.Mockito; import java.util.*; import java.util.stream.Collectors; @@ -156,9 +157,9 @@ public class LinguisticsAnnotatorTestCase { Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)), Collections.<String, String>emptyMap()); - new LinguisticsAnnotator(linguistics, CONFIG).annotate(val); + new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext()); - assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertTrue(new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext())); assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS)); } @@ -174,36 +175,33 @@ public class LinguisticsAnnotatorTestCase { Linguistics linguistics = new SimpleLinguistics(); - 
LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12)); + LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), new AnnotatorConfig().setMaxTokenLength(12)); - assertTrue(annotator.annotate(shortValue)); + assertTrue(annotator.annotate(shortValue, new ExecutionContext())); assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS)); assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string"); - assertTrue(annotator.annotate(cappedValue)); + assertTrue(annotator.annotate(cappedValue, new ExecutionContext())); assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); } @Test public void requireThatMaxTermOccurencesIsHonored() { final String inputTerm = "foo"; - final String stemmedInputTerm = "bar"; // completely different from - // inputTerm for safer test + final String stemmedInputTerm = "bar"; final String paddedInputTerm = inputTerm + " "; final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); - final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2; + final int inputTermOccurrence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2; for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) { expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length()) .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm))); } for (TokenType type : TokenType.values()) { - if (!type.isIndexable()) { - continue; - } + if ( ! 
type.isIndexable()) continue; StringBuilder input = new StringBuilder(); - Token[] tokens = new Token[inputTermOccurence]; - for (int i = 0; i < inputTermOccurence; ++i) { + Token[] tokens = new Token[inputTermOccurrence]; + for (int i = 0; i < inputTermOccurrence; ++i) { SimpleToken t = newToken(inputTerm, stemmedInputTerm, type); t.setOffset(i * paddedInputTerm.length()); tokens[i] = t; @@ -235,14 +233,30 @@ public class LinguisticsAnnotatorTestCase { private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) { StringFieldValue val = new StringFieldValue(str); - assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertEquals(expected != null, new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext())); assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS)); } private static Linguistics newLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) { - Linguistics linguistics = Mockito.mock(Linguistics.class); - Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms)); - return linguistics; + return new MyLinguistics(tokens, replacementTerms); + } + + private static class MyLinguistics extends SimpleLinguistics { + + private final List<? extends Token> tokens; + private final Map<String, String> replacementTerms; + + public MyLinguistics(List<? 
extends Token> tokens, Map<String, String> replacementTerms) { + super(); + this.tokens = tokens; + this.replacementTerms = replacementTerms; + } + + @Override + public Tokenizer getTokenizer(LinguisticsContext context) { + return new MyTokenizer(tokens, replacementTerms); + } + } private static class MyTokenizer implements Tokenizer { diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index 1f01148b3b9..2b670e42e53 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -204,11 +204,14 @@ "abstract" ], "methods" : [ - "public abstract com.yahoo.language.process.Stemmer getStemmer()", - "public abstract com.yahoo.language.process.Tokenizer getTokenizer()", + "public com.yahoo.language.process.Stemmer getStemmer()", + "public com.yahoo.language.process.Stemmer getStemmer(com.yahoo.language.process.LinguisticsContext)", + "public com.yahoo.language.process.Tokenizer getTokenizer()", + "public com.yahoo.language.process.Tokenizer getTokenizer(com.yahoo.language.process.LinguisticsContext)", "public abstract com.yahoo.language.process.Normalizer getNormalizer()", - "public abstract com.yahoo.language.process.Transformer getTransformer()", - "public abstract com.yahoo.language.process.Segmenter getSegmenter()", + "public com.yahoo.language.process.Transformer getTransformer()", + "public com.yahoo.language.process.Segmenter getSegmenter()", + "public com.yahoo.language.process.Segmenter getSegmenter(com.yahoo.language.process.LinguisticsContext)", "public abstract com.yahoo.language.detect.Detector getDetector()", "public abstract com.yahoo.language.process.GramSplitter getGramSplitter()", "public abstract com.yahoo.language.process.CharacterClasses getCharacterClasses()", @@ -427,6 +430,33 @@ ], "fields" : [ ] }, + "com.yahoo.language.process.LinguisticsContext$Builder" : { + "superClass" : "java.lang.Object", + "interfaces" : [ ], + "attributes" : [ + "public" + ], + "methods" : [ + "public void <init>()", + "public 
com.yahoo.language.process.LinguisticsContext$Builder schema(java.lang.String)", + "public com.yahoo.language.process.LinguisticsContext$Builder field(java.lang.String)", + "public com.yahoo.language.process.LinguisticsContext build()" + ], + "fields" : [ ] + }, + "com.yahoo.language.process.LinguisticsContext" : { + "superClass" : "java.lang.Object", + "interfaces" : [ ], + "attributes" : [ + "public" + ], + "methods" : [ + "public java.util.Optional schema()", + "public java.util.Optional field()", + "public static com.yahoo.language.process.LinguisticsContext empty()" + ], + "fields" : [ ] + }, "com.yahoo.language.process.Normalizer" : { "superClass" : "java.lang.Object", "interfaces" : [ ], @@ -461,7 +491,7 @@ "abstract" ], "methods" : [ - "public abstract java.util.List segment(java.lang.String, com.yahoo.language.Language)" + "public java.util.List segment(java.lang.String, com.yahoo.language.Language)" ], "fields" : [ ] }, @@ -734,7 +764,7 @@ "abstract" ], "methods" : [ - "public abstract java.lang.Iterable tokenize(java.lang.String, com.yahoo.language.Language, com.yahoo.language.process.StemMode, boolean)" + "public java.lang.Iterable tokenize(java.lang.String, com.yahoo.language.Language, com.yahoo.language.process.StemMode, boolean)" ], "fields" : [ ] }, diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java index 6fa63e657bd..f14c8fc0407 100644 --- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java @@ -4,6 +4,7 @@ package com.yahoo.language; import com.yahoo.language.detect.Detector; import com.yahoo.language.process.CharacterClasses; import com.yahoo.language.process.GramSplitter; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.Segmenter; import com.yahoo.language.process.Stemmer; @@ -38,19 +39,36 @@ 
public interface Linguistics { CHARACTER_CLASSES } + /** Prefer getStemmer(LinguisticsContext) */ + // TODO: Deprecate this + default Stemmer getStemmer() { + return getStemmer(LinguisticsContext.empty()); + } + /** * Returns a thread-unsafe stemmer or lemmatizer. * This is used at query time to do stemming of search terms to indexes which contains text tokenized * with stemming turned on */ - Stemmer getStemmer(); + default Stemmer getStemmer(LinguisticsContext linguisticsContext) { + return getStemmer(); + } + + /** + * Prefer getTokenizer(LinguisticsContext). + */ + default Tokenizer getTokenizer() { + return getTokenizer(LinguisticsContext.empty()); + } /** * Returns a thread-unsafe tokenizer. * This is used at indexing time to produce an optionally stemmed and * transformed (accent normalized) stream of indexable tokens. */ - Tokenizer getTokenizer(); + default Tokenizer getTokenizer(LinguisticsContext context) { + return getTokenizer(); + } /** Returns a thread-unsafe normalizer. This is used at query time to cjk normalize query text. */ Normalizer getNormalizer(); @@ -60,14 +78,26 @@ public interface Linguistics { * This is used at query time to do stemming of search terms to indexes which contains text tokenized * with accent normalization turned on */ - Transformer getTransformer(); + default Transformer getTransformer() { + throw new UnsupportedOperationException("getTransformer() must be overridden"); + } + + /** + * Prefer getSegmenter(LinguisticsContext). + */ + // TODO: Deprecate this + default Segmenter getSegmenter() { + return getSegmenter(LinguisticsContext.empty()); + } /** * Returns a thread-unsafe segmenter. * This is used at query time to find the individual semantic components of search terms to indexes * tokenized with segmentation. */ - Segmenter getSegmenter(); + default Segmenter getSegmenter(LinguisticsContext context) { + return getSegmenter(); + } /** * Returns a thread-unsafe detector. 
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 1d96d8a0cdf..31ae396430d 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -4,6 +4,7 @@ package com.yahoo.language.opennlp; import com.yahoo.component.annotation.Inject; import com.yahoo.language.Linguistics; import com.yahoo.language.detect.Detector; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleLinguistics; @@ -23,7 +24,7 @@ public class OpenNlpLinguistics extends SimpleLinguistics { } @Override - public Tokenizer getTokenizer() { + public Tokenizer getTokenizer(LinguisticsContext context) { return new OpenNlpTokenizer(getNormalizer(), getTransformer()); } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 1e28b98f669..0c11c6d5ce9 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -48,8 +48,7 @@ public class OpenNlpTokenizer implements Tokenizer { } @Override - public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents, - LinguisticsContext context) { + public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { Stemmer stemmer = stemmerFor(language, stemMode); if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); diff --git a/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java b/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java index 
ddb30303b9d..bb556ec64da 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java +++ b/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java @@ -1,19 +1,53 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.process; +import java.util.Objects; +import java.util.Optional; + /** - * The context in which some text is linguistically processes + * The context in which some text is linguistically processed. * * @author bratseth */ public class LinguisticsContext { - private final String documentTypeName; + private static final LinguisticsContext empty = new LinguisticsContext(Optional.empty(), Optional.empty()); + + private final Optional<String> schema; + private final Optional<String> field; - public LinguisticsContext(String documentTypeName) { - this.documentTypeName = documentTypeName; + private LinguisticsContext(Optional<String> schema, Optional<String> field) { + this.schema = schema; + this.field = field; } - public String documentTypeName() { return documentTypeName; } + /** Returns the schema we are processing for, if determined. */ + public Optional<String> schema() { return schema; } + + /** Returns the field we are processing for, if determined. 
*/ + public Optional<String> field() { return field; } + + public static LinguisticsContext empty() { return empty; } + + public static class Builder { + + private String schema = null; + private String field = null; + + public Builder schema(String schema) { + this.schema = Objects.requireNonNull(schema); + return this; + } + + public Builder field(String field) { + this.field = Objects.requireNonNull(field); + return this; + } + + public LinguisticsContext build() { + return new LinguisticsContext(Optional.ofNullable(schema), Optional.ofNullable(field)); + } + + } } diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java index a1df10f481b..6bcafc4ffbd 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java @@ -25,27 +25,9 @@ public interface Segmenter { * @param language language of input text. * @return the list of segments. * @throws ProcessingException if an exception is encountered during processing - * @deprecated use the method with a context */ - @Deprecated // TODO: Remove on Vespa 8 default List<String> segment(String input, Language language) { return List.of(); } - /** - * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized - * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only - * contains word-characters, any punctuation and spacing tokens will be removed. - * - * This default implementation calls the method without a context. - * - * @param input the text to segment. - * @param language language of input text. - * @return the list of segments. 
- * @throws ProcessingException if an exception is encountered during processing - */ - default List<String> segment(String input, Language language, LinguisticsContext context) { - return segment(input, language); - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java index 7c1148dac50..de99d7226c3 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java +++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java @@ -17,9 +17,9 @@ public class SegmenterImpl implements Segmenter { } @Override - public List<String> segment(String input, Language language, LinguisticsContext context) { + public List<String> segment(String input, Language language) { List<String> segments = new ArrayList<>(); - for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false, context)) { + for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false)) { findSegments(token, segments); } if (segments.isEmpty()) { diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java index c9e432dd459..975a32c8852 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java @@ -22,28 +22,9 @@ public interface Tokenizer { * @param removeAccents if true accents and similar are removed from the returned tokens * @return the tokens of the input String. * @throws ProcessingException If the underlying library throws an Exception. 
- * @deprecated use tokenize with a context instead */ - @Deprecated // TODO: Remove on Vespa 8 default Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { return List.of(); } - /** - * Returns the tokens produced from an input string under the rules of the given Language and additional options. - * This dsefault implementation delegates to the tokenize method without context. - * - * @param input the string to tokenize. May be arbitrarily large. - * @param language the language of the input string. - * @param stemMode the stem mode applied on the returned tokens - * @param removeAccents if true accents and similar are removed from the returned tokens - * @param context the context of this processing - * @return the tokens of the input String. - * @throws ProcessingException If the underlying library throws an Exception. - */ - default Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents, - LinguisticsContext context) { - return tokenize(input, language, stemMode, removeAccents); - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index 42172be680b..e666df15c94 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -7,6 +7,7 @@ import com.yahoo.language.Linguistics; import com.yahoo.language.detect.Detector; import com.yahoo.language.process.CharacterClasses; import com.yahoo.language.process.GramSplitter; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.Segmenter; import com.yahoo.language.process.SegmenterImpl; @@ -45,10 +46,10 @@ public class SimpleLinguistics implements Linguistics { } @Override - public Stemmer getStemmer() { return new 
StemmerImpl(getTokenizer()); } + public Stemmer getStemmer(LinguisticsContext context) { return new StemmerImpl(getTokenizer(context)); } @Override - public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); } + public Tokenizer getTokenizer(LinguisticsContext context) { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); } @Override public Normalizer getNormalizer() { return normalizer; } @@ -57,7 +58,7 @@ public class SimpleLinguistics implements Linguistics { public Transformer getTransformer() { return transformer; } @Override - public Segmenter getSegmenter() { return new SegmenterImpl(getTokenizer()); } + public Segmenter getSegmenter(LinguisticsContext context) { return new SegmenterImpl(getTokenizer(context)); } @Override public Detector getDetector() { return detector; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 2728249333e..f0d91995b79 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -3,14 +3,18 @@ package com.yahoo.language.simple; import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; -import com.yahoo.language.process.*; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.SpecialTokenRegistry; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; import com.yahoo.language.simple.kstem.KStemmer; import java.util.ArrayList; import java.util.List; import java.util.function.Function; -import java.util.logging.Logger; -import java.util.logging.Level; /** * <p>A tokenizer which splits on whitespace, normalizes and 
transforms using the given implementations @@ -23,7 +27,6 @@ import java.util.logging.Level; */ public class SimpleTokenizer implements Tokenizer { - private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); private final static int SPACE_CODE = 32; private final Normalizer normalizer; @@ -90,21 +93,13 @@ public class SimpleTokenizer implements Tokenizer { } private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { - String original = token; - log.log(Level.FINEST, () -> "processToken '" + original + "'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); - if (stemMode != StemMode.NONE) { - String oldToken = token; + if (stemMode != StemMode.NONE) token = stemmer.stem(token); - String newToken = token; - log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'"); - } - String result = token; - log.log(Level.FINEST, () -> "processed token is: " + result); - return result; + return token; } } |