diff options
Diffstat (limited to 'linguistics/src/main')
9 files changed, 91 insertions, 68 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java index 6fa63e657bd..f14c8fc0407 100644 --- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java @@ -4,6 +4,7 @@ package com.yahoo.language; import com.yahoo.language.detect.Detector; import com.yahoo.language.process.CharacterClasses; import com.yahoo.language.process.GramSplitter; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.Segmenter; import com.yahoo.language.process.Stemmer; @@ -38,19 +39,36 @@ public interface Linguistics { CHARACTER_CLASSES } + /** Prefer getStemmer(LinguisticsContext) */ + // TODO: Deprecate this + default Stemmer getStemmer() { + return getStemmer(LinguisticsContext.empty()); + } + /** * Returns a thread-unsafe stemmer or lemmatizer. * This is used at query time to do stemming of search terms to indexes which contains text tokenized * with stemming turned on */ - Stemmer getStemmer(); + default Stemmer getStemmer(LinguisticsContext linguisticsContext) { + return getStemmer(); + } + + /** + * Prefer getTokenizer(LinguisticsContext). + */ + default Tokenizer getTokenizer() { + return getTokenizer(LinguisticsContext.empty()); + } /** * Returns a thread-unsafe tokenizer. * This is used at indexing time to produce an optionally stemmed and * transformed (accent normalized) stream of indexable tokens. */ - Tokenizer getTokenizer(); + default Tokenizer getTokenizer(LinguisticsContext context) { + return getTokenizer(); + } /** Returns a thread-unsafe normalizer. This is used at query time to cjk normalize query text. 
*/ Normalizer getNormalizer(); @@ -60,14 +78,26 @@ public interface Linguistics { * This is used at query time to do stemming of search terms to indexes which contains text tokenized * with accent normalization turned on */ - Transformer getTransformer(); + default Transformer getTransformer() { + return getTransformer(); + } + + /** + * Prefer getSegmenter(LinguisticsContext). + */ + // TODO: Deprecate this + default Segmenter getSegmenter() { + return getSegmenter(LinguisticsContext.empty()); + } /** * Returns a thread-unsafe segmenter. * This is used at query time to find the individual semantic components of search terms to indexes * tokenized with segmentation. */ - Segmenter getSegmenter(); + default Segmenter getSegmenter(LinguisticsContext context) { + return getSegmenter(); + } /** * Returns a thread-unsafe detector. diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 1d96d8a0cdf..31ae396430d 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -4,6 +4,7 @@ package com.yahoo.language.opennlp; import com.yahoo.component.annotation.Inject; import com.yahoo.language.Linguistics; import com.yahoo.language.detect.Detector; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleLinguistics; @@ -23,7 +24,7 @@ public class OpenNlpLinguistics extends SimpleLinguistics { } @Override - public Tokenizer getTokenizer() { + public Tokenizer getTokenizer(LinguisticsContext context) { return new OpenNlpTokenizer(getNormalizer(), getTransformer()); } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 1e28b98f669..0c11c6d5ce9 100644 
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -48,8 +48,7 @@ public class OpenNlpTokenizer implements Tokenizer { } @Override - public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents, - LinguisticsContext context) { + public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { Stemmer stemmer = stemmerFor(language, stemMode); if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); diff --git a/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java b/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java index ddb30303b9d..bb556ec64da 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java +++ b/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java @@ -1,19 +1,53 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.process; +import java.util.Objects; +import java.util.Optional; + /** - * The context in which some text is linguistically processes + * The context in which some text is linguistically processed. * * @author bratseth */ public class LinguisticsContext { - private final String documentTypeName; + private static final LinguisticsContext empty = new LinguisticsContext(null, null); + + private final Optional<String> schema; + private final Optional<String> field; - public LinguisticsContext(String documentTypeName) { - this.documentTypeName = documentTypeName; + private LinguisticsContext(Optional<String> schema, Optional<String> field) { + this.schema = schema; + this.field = field; } - public String documentTypeName() { return documentTypeName; } + /** Returns the schema we are processing for, if determined. 
*/ + public Optional<String> schema() { return schema; } + + /** Returns the field we are processing for, if determined. */ + public Optional<String> field() { return field; } + + public static LinguisticsContext empty() { return empty; } + + public static class Builder { + + private String schema = null; + private String field = null; + + public Builder schema(String schema) { + this.schema = Objects.requireNonNull(schema); + return this; + } + + public Builder field(String field) { + this.field = Objects.requireNonNull(field); + return this; + } + + public LinguisticsContext build() { + return new LinguisticsContext(Optional.ofNullable(schema), Optional.ofNullable(field)); + } + + } } diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java index a1df10f481b..6bcafc4ffbd 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java @@ -25,27 +25,9 @@ public interface Segmenter { * @param language language of input text. * @return the list of segments. * @throws ProcessingException if an exception is encountered during processing - * @deprecated use the method with a context */ - @Deprecated // TODO: Remove on Vespa 8 default List<String> segment(String input, Language language) { return List.of(); } - /** - * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized - * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only - * contains word-characters, any punctuation and spacing tokens will be removed. - * - * This default implementation calls the method without a context. - * - * @param input the text to segment. - * @param language language of input text. - * @return the list of segments. 
- * @throws ProcessingException if an exception is encountered during processing - */ - default List<String> segment(String input, Language language, LinguisticsContext context) { - return segment(input, language); - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java index 7c1148dac50..de99d7226c3 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java +++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java @@ -17,9 +17,9 @@ public class SegmenterImpl implements Segmenter { } @Override - public List<String> segment(String input, Language language, LinguisticsContext context) { + public List<String> segment(String input, Language language) { List<String> segments = new ArrayList<>(); - for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false, context)) { + for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false)) { findSegments(token, segments); } if (segments.isEmpty()) { diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java index c9e432dd459..975a32c8852 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java @@ -22,28 +22,9 @@ public interface Tokenizer { * @param removeAccents if true accents and similar are removed from the returned tokens * @return the tokens of the input String. * @throws ProcessingException If the underlying library throws an Exception. 
- * @deprecated use tokenize with a context instead */ - @Deprecated // TODO: Remove on Vespa 8 default Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { return List.of(); } - /** - * Returns the tokens produced from an input string under the rules of the given Language and additional options. - * This dsefault implementation delegates to the tokenize method without context. - * - * @param input the string to tokenize. May be arbitrarily large. - * @param language the language of the input string. - * @param stemMode the stem mode applied on the returned tokens - * @param removeAccents if true accents and similar are removed from the returned tokens - * @param context the context of this processing - * @return the tokens of the input String. - * @throws ProcessingException If the underlying library throws an Exception. - */ - default Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents, - LinguisticsContext context) { - return tokenize(input, language, stemMode, removeAccents); - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index 42172be680b..e666df15c94 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -7,6 +7,7 @@ import com.yahoo.language.Linguistics; import com.yahoo.language.detect.Detector; import com.yahoo.language.process.CharacterClasses; import com.yahoo.language.process.GramSplitter; +import com.yahoo.language.process.LinguisticsContext; import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.Segmenter; import com.yahoo.language.process.SegmenterImpl; @@ -45,10 +46,10 @@ public class SimpleLinguistics implements Linguistics { } @Override - public Stemmer getStemmer() { return new 
StemmerImpl(getTokenizer()); } + public Stemmer getStemmer(LinguisticsContext context) { return new StemmerImpl(getTokenizer(context)); } @Override - public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); } + public Tokenizer getTokenizer(LinguisticsContext context) { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); } @Override public Normalizer getNormalizer() { return normalizer; } @@ -57,7 +58,7 @@ public class SimpleLinguistics implements Linguistics { public Transformer getTransformer() { return transformer; } @Override - public Segmenter getSegmenter() { return new SegmenterImpl(getTokenizer()); } + public Segmenter getSegmenter(LinguisticsContext context) { return new SegmenterImpl(getTokenizer(context)); } @Override public Detector getDetector() { return detector; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 2728249333e..f0d91995b79 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -3,14 +3,18 @@ package com.yahoo.language.simple; import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; -import com.yahoo.language.process.*; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.SpecialTokenRegistry; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; import com.yahoo.language.simple.kstem.KStemmer; import java.util.ArrayList; import java.util.List; import java.util.function.Function; -import java.util.logging.Logger; -import java.util.logging.Level; /** * <p>A tokenizer which splits on whitespace, normalizes and 
transforms using the given implementations @@ -23,7 +27,6 @@ import java.util.logging.Level; */ public class SimpleTokenizer implements Tokenizer { - private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); private final static int SPACE_CODE = 32; private final Normalizer normalizer; @@ -90,21 +93,13 @@ public class SimpleTokenizer implements Tokenizer { } private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { - String original = token; - log.log(Level.FINEST, () -> "processToken '" + original + "'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); - if (stemMode != StemMode.NONE) { - String oldToken = token; + if (stemMode != StemMode.NONE) token = stemmer.stem(token); - String newToken = token; - log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'"); - } - String result = token; - log.log(Level.FINEST, () -> "processed token is: " + result); - return result; + return token; } } |