aboutsummaryrefslogtreecommitdiffstats
path: root/indexinglanguage/src
diff options
context:
space:
mode:
Diffstat (limited to 'indexinglanguage/src')
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java3
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java3
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java3
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java1
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java10
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java2
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java27
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java14
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java15
-rw-r--r--indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java1
-rw-r--r--indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java50
11 files changed, 68 insertions, 61 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java
index 0b4308d68a9..3c3f75a6693 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldPathUpdateAdapter.java
@@ -66,9 +66,6 @@ public class FieldPathUpdateAdapter implements UpdateAdapter {
return adapter.setOutputValue(exp, fieldName, fieldValue);
}
- @Override
- public DocumentType getDocumentType() { return adapter.getDocumentType(); }
-
@SuppressWarnings({ "unchecked", "rawtypes" })
private void createUpdatesAt(List<FieldPathEntry> path, FieldValue value, int idx, DocumentUpdate out) {
FieldPath updatePath = update.getFieldPath();
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java
index 0ab962cd908..dac710d560b 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/FieldUpdateAdapter.java
@@ -95,9 +95,6 @@ public class FieldUpdateAdapter implements UpdateAdapter {
return adapter.setOutputValue(exp, fieldName, fieldValue);
}
- @Override
- public DocumentType getDocumentType() { return adapter.getDocumentType(); }
-
public static FieldUpdateAdapter fromPartialUpdate(DocumentAdapter documentAdapter, ValueUpdate valueUpdate) {
return new FieldUpdateAdapter(null, documentAdapter, new PartialBuilder(valueUpdate));
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java
index 6bca95e3f47..783346d2aa7 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/IdentityFieldPathUpdateAdapter.java
@@ -67,7 +67,4 @@ public class IdentityFieldPathUpdateAdapter implements UpdateAdapter {
fwdAdapter.tryOutputType(exp, fieldName, valueType);
}
- @Override
- public DocumentType getDocumentType() { return fwdAdapter.getDocumentType(); }
-
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java
index f36c44539c7..36d0c9212dc 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/SimpleDocumentAdapter.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.indexinglanguage;
import com.yahoo.document.DataType;
import com.yahoo.document.Document;
+import com.yahoo.document.DocumentType;
import com.yahoo.document.Field;
import com.yahoo.document.FieldPath;
import com.yahoo.document.datatypes.FieldValue;
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java
index 650e7ee06ff..4f4541ba5ee 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExecutionContext.java
@@ -11,6 +11,7 @@ import com.yahoo.language.detect.Detection;
import java.util.HashMap;
import java.util.Map;
+import java.util.Objects;
/**
* @author Simon Thoresen Hult
@@ -74,12 +75,6 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl
return this;
}
- @Override
- public DocumentType getDocumentType() {
- if (adapter == null) return null; // Only happens in tests
- return adapter.getDocumentType();
- }
-
public FieldValueAdapter getAdapter() {
return adapter;
}
@@ -98,8 +93,7 @@ public class ExecutionContext implements FieldTypeAdapter, FieldValueAdapter, Cl
}
public ExecutionContext setLanguage(Language language) {
- language.getClass();
- this.language = language;
+ this.language = Objects.requireNonNull(language);
return this;
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java
index d5b595490cb..1d07318c32d 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/FieldValueAdapter.java
@@ -10,8 +10,6 @@ import com.yahoo.document.datatypes.FieldValue;
*/
public interface FieldValueAdapter extends FieldTypeAdapter {
- DocumentType getDocumentType();
-
FieldValue getInputValue(String fieldName);
FieldValue getInputValue(FieldPath fieldPath);
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
index 91bd85420e0..3f2b6a5825a 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
@@ -2,9 +2,12 @@
package com.yahoo.vespa.indexinglanguage.expressions;
import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.Field;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.LinguisticsContext;
import com.yahoo.language.process.StemMode;
import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;
@@ -15,6 +18,7 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;
public final class TokenizeExpression extends Expression {
private final Linguistics linguistics;
+ private LinguisticsContext linguisticsContext = LinguisticsContext.empty();
private final AnnotatorConfig config;
public TokenizeExpression(Linguistics linguistics, AnnotatorConfig config) {
@@ -32,17 +36,24 @@ public final class TokenizeExpression extends Expression {
}
@Override
+ public void setStatementOutput(DocumentType documentType, Field field) {
+ linguisticsContext = new LinguisticsContext.Builder().schema(documentType.getName())
+ .field( field.getName())
+ .build();
+ }
+
+ @Override
protected void doExecute(ExecutionContext context) {
StringFieldValue input = (StringFieldValue)context.getValue();
StringFieldValue output = input.clone();
context.setValue(output);
- AnnotatorConfig cfg = new AnnotatorConfig(config);
+ AnnotatorConfig config = new AnnotatorConfig(this.config);
Language lang = context.resolveLanguage(linguistics);
if (lang != null) {
- cfg.setLanguage(lang);
+ config.setLanguage(lang);
}
- LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, cfg);
+ LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, linguisticsContext, config);
annotator.annotate(output, context);
}
@@ -74,13 +85,8 @@ public final class TokenizeExpression extends Expression {
@Override
public boolean equals(Object obj) {
- if (!(obj instanceof TokenizeExpression)) {
- return false;
- }
- TokenizeExpression rhs = (TokenizeExpression)obj;
- if (!config.equals(rhs.config)) {
- return false;
- }
+ if ( ! (obj instanceof TokenizeExpression rhs)) return false;
+ if ( ! config.equals(rhs.config)) return false;
return true;
}
@@ -88,4 +94,5 @@ public final class TokenizeExpression extends Expression {
public int hashCode() {
return getClass().hashCode() + config.hashCode();
}
+
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 441ac711cc3..03efee5f271 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable {
private Language language;
private StemMode stemMode;
private boolean removeAccents;
- private int maxTermOccurences;
+ private int maxTermOccurrences;
private int maxTokenizeLength;
public static final int DEFAULT_MAX_TERM_OCCURRENCES;
@@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable {
language = Language.ENGLISH;
stemMode = StemMode.NONE;
removeAccents = false;
- maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
+ maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
}
@@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable {
language = rhs.language;
stemMode = rhs.stemMode;
removeAccents = rhs.removeAccents;
- maxTermOccurences = rhs.maxTermOccurences;
+ maxTermOccurrences = rhs.maxTermOccurrences;
maxTokenizeLength = rhs.maxTokenizeLength;
}
@@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable {
}
public int getMaxTermOccurrences() {
- return maxTermOccurences;
+ return maxTermOccurrences;
}
public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
- this.maxTermOccurences = maxTermCount;
+ this.maxTermOccurrences = maxTermCount;
return this;
}
@@ -110,7 +110,7 @@ public class AnnotatorConfig implements Cloneable {
if (removeAccents != rhs.removeAccents) {
return false;
}
- if (maxTermOccurences != rhs.maxTermOccurences) {
+ if (maxTermOccurrences != rhs.maxTermOccurrences) {
return false;
}
if (maxTokenizeLength != rhs.maxTokenizeLength) {
@@ -122,6 +122,6 @@ public class AnnotatorConfig implements Cloneable {
@Override
public int hashCode() {
return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
}
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 879a6b2ce8e..18f09a72fc9 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase;
public class LinguisticsAnnotator {
private final Linguistics factory;
+ private final LinguisticsContext linguisticsContext;
private final AnnotatorConfig config;
private static class TermOccurrences {
@@ -56,8 +57,9 @@ public class LinguisticsAnnotator {
* @param factory the linguistics factory to use when annotating
* @param config the linguistics config to use
*/
- public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) {
+ public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) {
this.factory = factory;
+ this.linguisticsContext = context;
this.config = config;
}
@@ -70,15 +72,14 @@ public class LinguisticsAnnotator {
public boolean annotate(StringFieldValue text, ExecutionContext context) {
if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
- Tokenizer tokenizer = factory.getTokenizer();
+ Tokenizer tokenizer = factory.getTokenizer(linguisticsContext);
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
: text.getString().substring(0, config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input,
config.getLanguage(),
config.getStemMode(),
- config.getRemoveAccents(),
- new LinguisticsContext(context.getDocumentType().getName()));
+ config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
for (Token token : tokens)
@@ -93,9 +94,9 @@ public class LinguisticsAnnotator {
* Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
* original.
*
- * @param termToLowerCase The term to lower case.
- * @param origTerm The original term.
- * @return the created TERM annotation.
+ * @param termToLowerCase the term to lower case
+ * @param origTerm the original term
+ * @return the created TERM annotation
*/
public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
String annotationValue = toLowerCase(termToLowerCase);
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java
index cfb82034545..f1fef2f0489 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/SimpleTestAdapter.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.indexinglanguage;
import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
import com.yahoo.document.Field;
import com.yahoo.document.FieldPath;
import com.yahoo.document.datatypes.FieldValue;
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 335fccf597e..160d645d966 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -8,6 +8,7 @@ import com.yahoo.document.annotation.SpanTrees;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.LinguisticsContext;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
@@ -15,8 +16,8 @@ import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.language.simple.SimpleToken;
+import com.yahoo.vespa.indexinglanguage.expressions.ExecutionContext;
import org.junit.Test;
-import org.mockito.Mockito;
import java.util.*;
import java.util.stream.Collectors;
@@ -156,9 +157,9 @@ public class LinguisticsAnnotatorTestCase {
Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
Collections.<String, String>emptyMap());
- new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
+ new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext());
- assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+ assertTrue(new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext()));
assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
}
@@ -174,36 +175,33 @@ public class LinguisticsAnnotatorTestCase {
Linguistics linguistics = new SimpleLinguistics();
- LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12));
+ LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), new AnnotatorConfig().setMaxTokenLength(12));
- assertTrue(annotator.annotate(shortValue));
+ assertTrue(annotator.annotate(shortValue, new ExecutionContext()));
assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS));
assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string");
- assertTrue(annotator.annotate(cappedValue));
+ assertTrue(annotator.annotate(cappedValue, new ExecutionContext()));
assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
}
@Test
public void requireThatMaxTermOccurencesIsHonored() {
final String inputTerm = "foo";
- final String stemmedInputTerm = "bar"; // completely different from
- // inputTerm for safer test
+ final String stemmedInputTerm = "bar";
final String paddedInputTerm = inputTerm + " ";
final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
+ final int inputTermOccurrence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) {
expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length())
.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm)));
}
for (TokenType type : TokenType.values()) {
- if (!type.isIndexable()) {
- continue;
- }
+ if ( ! type.isIndexable()) continue;
StringBuilder input = new StringBuilder();
- Token[] tokens = new Token[inputTermOccurence];
- for (int i = 0; i < inputTermOccurence; ++i) {
+ Token[] tokens = new Token[inputTermOccurrence];
+ for (int i = 0; i < inputTermOccurrence; ++i) {
SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
t.setOffset(i * paddedInputTerm.length());
tokens[i] = t;
@@ -235,14 +233,30 @@ public class LinguisticsAnnotatorTestCase {
private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
StringFieldValue val = new StringFieldValue(str);
- assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+ assertEquals(expected != null, new LinguisticsAnnotator(linguistics, LinguisticsContext.empty(), CONFIG).annotate(val, new ExecutionContext()));
assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
}
private static Linguistics newLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) {
- Linguistics linguistics = Mockito.mock(Linguistics.class);
- Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms));
- return linguistics;
+ return new MyLinguistics(tokens, replacementTerms);
+ }
+
+ private static class MyLinguistics extends SimpleLinguistics {
+
+ private final List<? extends Token> tokens;
+ private final Map<String, String> replacementTerms;
+
+ public MyLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) {
+ super();
+ this.tokens = tokens;
+ this.replacementTerms = replacementTerms;
+ }
+
+ @Override
+ public Tokenizer getTokenizer(LinguisticsContext context) {
+ return new MyTokenizer(tokens, replacementTerms);
+ }
+
}
private static class MyTokenizer implements Tokenizer {