aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/Linguistics.java38
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java3
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java3
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java44
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Segmenter.java18
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java4
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java19
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java7
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java23
9 files changed, 91 insertions, 68 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
index 6fa63e657bd..f14c8fc0407 100644
--- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
@@ -4,6 +4,7 @@ package com.yahoo.language;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.CharacterClasses;
import com.yahoo.language.process.GramSplitter;
+import com.yahoo.language.process.LinguisticsContext;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.Segmenter;
import com.yahoo.language.process.Stemmer;
@@ -38,19 +39,36 @@ public interface Linguistics {
CHARACTER_CLASSES
}
+ /** Returns a stemmer for the empty context. Prefer {@link #getStemmer(LinguisticsContext)}. */
+ // TODO: Deprecate this
+ default Stemmer getStemmer() {
+ return getStemmer(LinguisticsContext.empty());
+ }
+
/**
* Returns a thread-unsafe stemmer or lemmatizer.
* This is used at query time to do stemming of search terms to indexes which contains text tokenized
* with stemming turned on
*/
- Stemmer getStemmer();
+ default Stemmer getStemmer(LinguisticsContext linguisticsContext) {
+ return getStemmer(); // NOTE: getStemmer() delegates back here by default, so implementations must override at least one of the two
+ }
+
+ /**
+ * Returns a tokenizer for the empty context. Prefer {@link #getTokenizer(LinguisticsContext)}.
+ */
+ default Tokenizer getTokenizer() {
+ return getTokenizer(LinguisticsContext.empty());
+ }
/**
* Returns a thread-unsafe tokenizer.
* This is used at indexing time to produce an optionally stemmed and
* transformed (accent normalized) stream of indexable tokens.
*/
- Tokenizer getTokenizer();
+ default Tokenizer getTokenizer(LinguisticsContext context) {
+ return getTokenizer(); // NOTE: getTokenizer() delegates back here by default, so implementations must override at least one of the two
+ }
/** Returns a thread-unsafe normalizer. This is used at query time to cjk normalize query text. */
Normalizer getNormalizer();
@@ -60,14 +78,26 @@ public interface Linguistics {
* This is used at query time to do stemming of search terms to indexes which contains text tokenized
* with accent normalization turned on
*/
- Transformer getTransformer();
+ default Transformer getTransformer() {
+ throw new UnsupportedOperationException("getTransformer is not implemented by " + getClass().getName()); // no context-taking variant to delegate to: fail fast instead of calling itself forever
+ }
+
+ /**
+ * Returns a segmenter for the empty context. Prefer {@link #getSegmenter(LinguisticsContext)}.
+ */
+ // TODO: Deprecate this
+ default Segmenter getSegmenter() {
+ return getSegmenter(LinguisticsContext.empty());
+ }
/**
* Returns a thread-unsafe segmenter.
* This is used at query time to find the individual semantic components of search terms to indexes
* tokenized with segmentation.
*/
- Segmenter getSegmenter();
+ default Segmenter getSegmenter(LinguisticsContext context) {
+ return getSegmenter(); // NOTE: getSegmenter() delegates back here by default, so implementations must override at least one of the two
+ }
/**
* Returns a thread-unsafe detector.
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
index 1d96d8a0cdf..31ae396430d 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
@@ -4,6 +4,7 @@ package com.yahoo.language.opennlp;
import com.yahoo.component.annotation.Inject;
import com.yahoo.language.Linguistics;
import com.yahoo.language.detect.Detector;
+import com.yahoo.language.process.LinguisticsContext;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleLinguistics;
@@ -23,7 +24,7 @@ public class OpenNlpLinguistics extends SimpleLinguistics {
}
@Override
- public Tokenizer getTokenizer() {
+ public Tokenizer getTokenizer(LinguisticsContext context) {
return new OpenNlpTokenizer(getNormalizer(), getTransformer());
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 1e28b98f669..0c11c6d5ce9 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -48,8 +48,7 @@ public class OpenNlpTokenizer implements Tokenizer {
}
@Override
- public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents,
- LinguisticsContext context) {
+ public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
Stemmer stemmer = stemmerFor(language, stemMode);
if (stemmer == null)
return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
diff --git a/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java b/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java
index ddb30303b9d..bb556ec64da 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/LinguisticsContext.java
@@ -1,19 +1,53 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;
+import java.util.Objects;
+import java.util.Optional;
+
/**
- * The context in which some text is linguistically processes
+ * The context in which some text is linguistically processed.
*
* @author bratseth
*/
public class LinguisticsContext {
- private final String documentTypeName;
+ private static final LinguisticsContext empty = new LinguisticsContext(Optional.empty(), Optional.empty());
+
+ private final Optional<String> schema;
+ private final Optional<String> field;
- public LinguisticsContext(String documentTypeName) {
- this.documentTypeName = documentTypeName;
+ private LinguisticsContext(Optional<String> schema, Optional<String> field) {
+ this.schema = schema;
+ this.field = field;
}
- public String documentTypeName() { return documentTypeName; }
+ /** Returns the schema we are processing for, if determined. */
+ public Optional<String> schema() { return schema; }
+
+ /** Returns the field we are processing for, if determined. */
+ public Optional<String> field() { return field; }
+
+ public static LinguisticsContext empty() { return empty; } // shared instance with neither schema nor field set
+
+ public static class Builder { // unset values become Optional.empty() in build()
+
+ private String schema = null;
+ private String field = null;
+
+ public Builder schema(String schema) {
+ this.schema = Objects.requireNonNull(schema);
+ return this;
+ }
+
+ public Builder field(String field) {
+ this.field = Objects.requireNonNull(field);
+ return this;
+ }
+
+ public LinguisticsContext build() {
+ return new LinguisticsContext(Optional.ofNullable(schema), Optional.ofNullable(field));
+ }
+
+ }
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
index a1df10f481b..6bcafc4ffbd 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -25,27 +25,9 @@ public interface Segmenter {
* @param language language of input text.
* @return the list of segments.
* @throws ProcessingException if an exception is encountered during processing
- * @deprecated use the method with a context
*/
- @Deprecated // TODO: Remove on Vespa 8
default List<String> segment(String input, Language language) {
return List.of();
}
- /**
- * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized
- * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only
- * contains word-characters, any punctuation and spacing tokens will be removed.
- *
- * This default implementation calls the method without a context.
- *
- * @param input the text to segment.
- * @param language language of input text.
- * @return the list of segments.
- * @throws ProcessingException if an exception is encountered during processing
- */
- default List<String> segment(String input, Language language, LinguisticsContext context) {
- return segment(input, language);
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
index 7c1148dac50..de99d7226c3 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
@@ -17,9 +17,9 @@ public class SegmenterImpl implements Segmenter {
}
@Override
- public List<String> segment(String input, Language language, LinguisticsContext context) {
+ public List<String> segment(String input, Language language) {
List<String> segments = new ArrayList<>();
- for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false, context)) {
+ for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false)) {
findSegments(token, segments);
}
if (segments.isEmpty()) {
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
index c9e432dd459..975a32c8852 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
@@ -22,28 +22,9 @@ public interface Tokenizer {
* @param removeAccents if true accents and similar are removed from the returned tokens
* @return the tokens of the input String.
* @throws ProcessingException If the underlying library throws an Exception.
- * @deprecated use tokenize with a context instead
*/
- @Deprecated // TODO: Remove on Vespa 8
default Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
return List.of();
}
- /**
- * Returns the tokens produced from an input string under the rules of the given Language and additional options.
- * This dsefault implementation delegates to the tokenize method without context.
- *
- * @param input the string to tokenize. May be arbitrarily large.
- * @param language the language of the input string.
- * @param stemMode the stem mode applied on the returned tokens
- * @param removeAccents if true accents and similar are removed from the returned tokens
- * @param context the context of this processing
- * @return the tokens of the input String.
- * @throws ProcessingException If the underlying library throws an Exception.
- */
- default Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents,
- LinguisticsContext context) {
- return tokenize(input, language, stemMode, removeAccents);
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index 42172be680b..e666df15c94 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -7,6 +7,7 @@ import com.yahoo.language.Linguistics;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.CharacterClasses;
import com.yahoo.language.process.GramSplitter;
+import com.yahoo.language.process.LinguisticsContext;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.Segmenter;
import com.yahoo.language.process.SegmenterImpl;
@@ -45,10 +46,10 @@ public class SimpleLinguistics implements Linguistics {
}
@Override
- public Stemmer getStemmer() { return new StemmerImpl(getTokenizer()); }
+ public Stemmer getStemmer(LinguisticsContext context) { return new StemmerImpl(getTokenizer(context)); }
@Override
- public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); }
+ public Tokenizer getTokenizer(LinguisticsContext context) { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); }
@Override
public Normalizer getNormalizer() { return normalizer; }
@@ -57,7 +58,7 @@ public class SimpleLinguistics implements Linguistics {
public Transformer getTransformer() { return transformer; }
@Override
- public Segmenter getSegmenter() { return new SegmenterImpl(getTokenizer()); }
+ public Segmenter getSegmenter(LinguisticsContext context) { return new SegmenterImpl(getTokenizer(context)); }
@Override
public Detector getDetector() { return detector; }
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 2728249333e..f0d91995b79 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -3,14 +3,18 @@ package com.yahoo.language.simple;
import com.yahoo.language.Language;
import com.yahoo.language.LinguisticsCase;
-import com.yahoo.language.process.*;
+import com.yahoo.language.process.Normalizer;
+import com.yahoo.language.process.SpecialTokenRegistry;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.process.Transformer;
import com.yahoo.language.simple.kstem.KStemmer;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
-import java.util.logging.Logger;
-import java.util.logging.Level;
/**
* <p>A tokenizer which splits on whitespace, normalizes and transforms using the given implementations
@@ -23,7 +27,6 @@ import java.util.logging.Level;
*/
public class SimpleTokenizer implements Tokenizer {
- private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());
private final static int SPACE_CODE = 32;
private final Normalizer normalizer;
@@ -90,21 +93,13 @@ public class SimpleTokenizer implements Tokenizer {
}
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
- String original = token;
- log.log(Level.FINEST, () -> "processToken '" + original + "'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE) {
- String oldToken = token;
+ if (stemMode != StemMode.NONE)
token = stemmer.stem(token);
- String newToken = token;
- log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
- }
- String result = token;
- log.log(Level.FINEST, () -> "processed token is: " + result);
- return result;
+ return token;
}
}