summary refs log tree commit diff stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@yahoo-inc.com> 2017-06-14 14:41:18 +0200
committer Jon Bratseth <bratseth@yahoo-inc.com> 2017-06-14 14:41:18 +0200
commit 6ff3df19226036b8ee1bb559f9d73cab40e8d2a0 (patch)
tree 355a7b0623b58983ba655b868341fe479a22eb3d /indexinglanguage
parent b7f9e7ceaef72489d76683537973b639f8895b84 (diff)
Remove carriage return
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java 108
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java 2
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java 212
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java 26
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java 2
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java 498
6 files changed, 424 insertions, 424 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java
index f648a0e38e4..9596bf60cae 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java
@@ -1,55 +1,55 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.indexinglanguage.expressions;
-
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Stack;
-
-/**
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
- */
-public class MathResolver {
-
- private final List<Item> items = new LinkedList<>();
-
- public void push(ArithmeticExpression.Operator op, Expression exp) {
- op.getClass(); // throws NullPointerException
- if (items.isEmpty() && op != ArithmeticExpression.Operator.ADD) {
- throw new IllegalArgumentException("First item in an arithmetic operation must be an addition.");
- }
- items.add(new Item(op, exp));
- }
-
- public Expression resolve() {
- Stack<Item> stack = new Stack<>();
- stack.push(items.remove(0));
- while (!items.isEmpty()) {
- Item item = items.remove(0);
- while (stack.size() > 1 && stack.peek().op.precedes(item.op)) {
- pop(stack);
- }
- stack.push(item);
- }
- while (stack.size() > 1) {
- pop(stack);
- }
- return stack.remove(0).exp;
- }
-
- private void pop(Stack<Item> stack) {
- Item rhs = stack.pop();
- Item lhs = stack.peek();
- lhs.exp = new ArithmeticExpression(lhs.exp, rhs.op, rhs.exp);
- }
-
- private static class Item {
-
- final ArithmeticExpression.Operator op;
- Expression exp;
-
- Item(ArithmeticExpression.Operator op, Expression exp) {
- this.op = op;
- this.exp = exp;
- }
- }
-}
\ No newline at end of file
+package com.yahoo.vespa.indexinglanguage.expressions;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Stack;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class MathResolver {
+
+ private final List<Item> items = new LinkedList<>();
+
+ public void push(ArithmeticExpression.Operator op, Expression exp) {
+ op.getClass(); // throws NullPointerException
+ if (items.isEmpty() && op != ArithmeticExpression.Operator.ADD) {
+ throw new IllegalArgumentException("First item in an arithmetic operation must be an addition.");
+ }
+ items.add(new Item(op, exp));
+ }
+
+ public Expression resolve() {
+ Stack<Item> stack = new Stack<>();
+ stack.push(items.remove(0));
+ while (!items.isEmpty()) {
+ Item item = items.remove(0);
+ while (stack.size() > 1 && stack.peek().op.precedes(item.op)) {
+ pop(stack);
+ }
+ stack.push(item);
+ }
+ while (stack.size() > 1) {
+ pop(stack);
+ }
+ return stack.remove(0).exp;
+ }
+
+ private void pop(Stack<Item> stack) {
+ Item rhs = stack.pop();
+ Item lhs = stack.peek();
+ lhs.exp = new ArithmeticExpression(lhs.exp, rhs.op, rhs.exp);
+ }
+
+ private static class Item {
+
+ final ArithmeticExpression.Operator op;
+ Expression exp;
+
+ Item(ArithmeticExpression.Operator op, Expression exp) {
+ this.op = op;
+ this.exp = exp;
+ }
+ }
+}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java
index 427c777db5a..703cf30f6e0 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java
@@ -18,4 +18,4 @@ public class PassthroughExpression extends OutputExpression {
public boolean equals(Object obj) {
return super.equals(obj) && obj instanceof PassthroughExpression;
}
-}
\ No newline at end of file
+}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 09364c796f0..48e78392498 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -1,106 +1,106 @@
-// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.indexinglanguage.linguistics;
-
-import com.yahoo.language.Language;
-import com.yahoo.language.process.StemMode;
-import com.yahoo.vespa.configdefinition.IlscriptsConfig;
-
-/**
- * @author Simon Thoresen
- */
-public class AnnotatorConfig implements Cloneable {
-
- private Language language;
- private StemMode stemMode;
- private boolean removeAccents;
- private int maxTermOccurences;
-
- public static final int DEFAULT_MAX_TERM_OCCURRENCES;
-
- static {
- IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder());
- DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences();
- }
-
- public AnnotatorConfig() {
- language = Language.ENGLISH;
- stemMode = StemMode.NONE;
- removeAccents = false;
- maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
- }
-
- public AnnotatorConfig(AnnotatorConfig rhs) {
- language = rhs.language;
- stemMode = rhs.stemMode;
- removeAccents = rhs.removeAccents;
- maxTermOccurences = rhs.maxTermOccurences;
- }
-
- public Language getLanguage() {
- return language;
- }
-
- public AnnotatorConfig setLanguage(Language language) {
- this.language = language;
- return this;
- }
-
- public StemMode getStemMode() {
- return stemMode;
- }
-
- public AnnotatorConfig setStemMode(StemMode stemMode) {
- this.stemMode = stemMode;
- return this;
- }
-
- public AnnotatorConfig setStemMode(String name) {
- this.stemMode = StemMode.valueOf(name);
- return this;
- }
-
- public boolean getRemoveAccents() {
- return removeAccents;
- }
-
- public AnnotatorConfig setRemoveAccents(boolean removeAccents) {
- this.removeAccents = removeAccents;
- return this;
- }
-
- public int getMaxTermOccurrences() {
- return maxTermOccurences;
- }
-
- public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
- this.maxTermOccurences = maxTermCount;
- return this;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof AnnotatorConfig)) {
- return false;
- }
- AnnotatorConfig rhs = (AnnotatorConfig)obj;
- if (!language.equals(rhs.language)) {
- return false;
- }
- if (!stemMode.equals(rhs.stemMode)) {
- return false;
- }
- if (removeAccents != rhs.removeAccents) {
- return false;
- }
- if (maxTermOccurences != rhs.maxTermOccurences) {
- return false;
- }
- return true;
- }
-
- @Override
- public int hashCode() {
- return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences;
- }
-}
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.indexinglanguage.linguistics;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.vespa.configdefinition.IlscriptsConfig;
+
+/**
+ * @author Simon Thoresen
+ */
+public class AnnotatorConfig implements Cloneable {
+
+ private Language language;
+ private StemMode stemMode;
+ private boolean removeAccents;
+ private int maxTermOccurences;
+
+ public static final int DEFAULT_MAX_TERM_OCCURRENCES;
+
+ static {
+ IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder());
+ DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences();
+ }
+
+ public AnnotatorConfig() {
+ language = Language.ENGLISH;
+ stemMode = StemMode.NONE;
+ removeAccents = false;
+ maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
+ }
+
+ public AnnotatorConfig(AnnotatorConfig rhs) {
+ language = rhs.language;
+ stemMode = rhs.stemMode;
+ removeAccents = rhs.removeAccents;
+ maxTermOccurences = rhs.maxTermOccurences;
+ }
+
+ public Language getLanguage() {
+ return language;
+ }
+
+ public AnnotatorConfig setLanguage(Language language) {
+ this.language = language;
+ return this;
+ }
+
+ public StemMode getStemMode() {
+ return stemMode;
+ }
+
+ public AnnotatorConfig setStemMode(StemMode stemMode) {
+ this.stemMode = stemMode;
+ return this;
+ }
+
+ public AnnotatorConfig setStemMode(String name) {
+ this.stemMode = StemMode.valueOf(name);
+ return this;
+ }
+
+ public boolean getRemoveAccents() {
+ return removeAccents;
+ }
+
+ public AnnotatorConfig setRemoveAccents(boolean removeAccents) {
+ this.removeAccents = removeAccents;
+ return this;
+ }
+
+ public int getMaxTermOccurrences() {
+ return maxTermOccurences;
+ }
+
+ public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
+ this.maxTermOccurences = maxTermCount;
+ return this;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (!(obj instanceof AnnotatorConfig)) {
+ return false;
+ }
+ AnnotatorConfig rhs = (AnnotatorConfig)obj;
+ if (!language.equals(rhs.language)) {
+ return false;
+ }
+ if (!stemMode.equals(rhs.stemMode)) {
+ return false;
+ }
+ if (removeAccents != rhs.removeAccents) {
+ return false;
+ }
+ if (maxTermOccurences != rhs.maxTermOccurences) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences;
+ }
+}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java
index de9d26547da..5e9ffd677cf 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java
@@ -1,14 +1,14 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.indexinglanguage.parser;
-
-import com.yahoo.javacc.FastCharStream;
-
-/**
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
- */
-public final class IndexingInput extends FastCharStream implements CharStream {
-
- public IndexingInput(String input) {
- super(input);
- }
-}
+package com.yahoo.vespa.indexinglanguage.parser;
+
+import com.yahoo.javacc.FastCharStream;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public final class IndexingInput extends FastCharStream implements CharStream {
+
+ public IndexingInput(String input) {
+ super(input);
+ }
+}
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java
index 097e0f21bc1..95eab5f9ef8 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java
@@ -38,4 +38,4 @@ public class AttributeExpressionTestCase {
public void requireThatExpressionCanBeExecuted() {
assertExecute(new AttributeExpression("foo"));
}
-}
\ No newline at end of file
+}
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 805bdc96904..5882e2c19c6 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -1,250 +1,250 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.indexinglanguage.linguistics;
-
-import com.yahoo.document.annotation.Annotation;
-import com.yahoo.document.annotation.AnnotationTypes;
-import com.yahoo.document.annotation.SpanTree;
-import com.yahoo.document.annotation.SpanTrees;
-import com.yahoo.document.datatypes.StringFieldValue;
-import com.yahoo.language.Language;
-import com.yahoo.language.Linguistics;
-import com.yahoo.language.process.StemMode;
-import com.yahoo.language.process.Token;
-import com.yahoo.language.process.TokenType;
-import com.yahoo.language.process.Tokenizer;
-import com.yahoo.language.simple.SimpleToken;
-
-import org.junit.Test;
-import org.mockito.Mockito;
-
-import java.util.*;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-/**
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
- */
-public class LinguisticsAnnotatorTestCase {
-
- private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
-
- // --------------------------------------------------------------------------------
- //
- // Tests
- //
- // --------------------------------------------------------------------------------
-
- @Test
- public void requireThatAnnotateFailsWithZeroTokens() {
- assertAnnotations(null, "foo");
- }
-
- @Test
- public void requireThatAnnotateFailsWithoutIndexableTokenString() {
- for (TokenType type : TokenType.values()) {
- if (type.isIndexable()) {
- continue;
- }
- assertAnnotations(null, "foo", newToken("foo", "bar", type));
- }
- }
-
- @Test
- public void requireThatIndexableTokenStringsAreAnnotated() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (TokenType type : TokenType.values()) {
- if (!type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo", newToken("foo", "bar", type));
- }
- }
-
- @Test
- public void requireThatSpecialTokenStringsAreAnnotatedRegardlessOfType() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (TokenType type : TokenType.values()) {
- assertAnnotations(expected, "foo", newToken("foo", "bar", type, true));
- }
- }
-
- @Test
- public void requireThatTermAnnotationsAreEmptyIfOrigIsLowerCase() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM));
- for (boolean specialToken : Arrays.asList(true, false)) {
- for (TokenType type : TokenType.values()) {
- if (!specialToken && !type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken));
- }
- }
- }
-
- @Test
- public void requireThatTermAnnotationsAreLowerCased() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (boolean specialToken : Arrays.asList(true, false)) {
- for (TokenType type : TokenType.values()) {
- if (!specialToken && !type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken));
- }
- }
- }
-
- @Test
- public void requireThatCompositeTokensAreFlattened() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foo")));
- expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
-
- SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
- .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
- assertAnnotations(expected, "foobarbaz", token);
- }
-
- @Test
- public void requireThatCompositeSpecialTokensAreNotFlattened() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM,
- new StringFieldValue("foobarbaz")));
-
- SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
- .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
- assertAnnotations(expected, "foobarbaz", token);
- }
-
- @Test
- public void requireThatErrorTokensAreSkipped() {
- assertAnnotations(null, "foo", new SimpleToken("foo").setType(TokenType.ALPHABETIC)
- .setOffset(-1));
- }
-
- @Test
- public void requireThatTermReplacementsAreApplied() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (boolean specialToken : Arrays.asList(true, false)) {
- for (TokenType type : TokenType.values()) {
- if (!specialToken && !type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo",
- newLinguistics(Arrays.asList(newToken("foo", "foo", type, specialToken)),
- Collections.singletonMap("foo", "bar")));
- }
- }
- }
-
- @Test
- public void requireThatExistingAnnotationsAreKept() {
- SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS);
- spanTree.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
-
- StringFieldValue val = new StringFieldValue("foo");
- val.setSpanTree(spanTree);
-
- Linguistics linguistics = newLinguistics(Arrays.asList(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
- Collections.<String, String>emptyMap());
- new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
-
- assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
- assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
- }
-
- @Test
- public void requireThatMaxTermOccurencesIsHonored() {
- final String inputTerm = "foo";
- final String stemmedInputTerm = "bar"; // completely different from
- // inputTerm for safer test
- final String paddedInputTerm = inputTerm + " ";
- final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
- for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) {
- expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length())
- .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm)));
- }
- for (TokenType type : TokenType.values()) {
- if (!type.isIndexable()) {
- continue;
- }
- StringBuilder input = new StringBuilder();
- Token[] tokens = new Token[inputTermOccurence];
- for (int i = 0; i < inputTermOccurence; ++i) {
- SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
- t.setOffset(i * paddedInputTerm.length());
- tokens[i] = t;
- input.append(paddedInputTerm);
- }
- assertAnnotations(expected, input.toString(), tokens);
- }
- }
-
- // --------------------------------------------------------------------------------
- //
- // Utilities
- //
- // --------------------------------------------------------------------------------
-
- private static SimpleToken newToken(String orig, String stem, TokenType type) {
- return newToken(orig, stem, type, false);
- }
-
- private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) {
- return new SimpleToken(orig).setTokenString(stem)
- .setType(type)
- .setSpecialToken(specialToken);
- }
-
- private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
- assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap()));
- }
-
- private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
- StringFieldValue val = new StringFieldValue(str);
- assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
- assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
- }
-
- private static Linguistics newLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) {
- Linguistics linguistics = Mockito.mock(Linguistics.class);
- Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms));
- return linguistics;
- }
-
- private static class MyTokenizer implements Tokenizer {
-
- final List<Token> tokens;
- final Map<String, String> replacementTerms;
-
- public MyTokenizer(List<? extends Token> tokens, Map<String, String> replacementTerms) {
- this.tokens = new ArrayList<>(tokens);
- this.replacementTerms = replacementTerms;
- }
-
- @Override
- public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- return tokens;
- }
-
- @Override
- public String getReplacementTerm(String term) {
- String replacement = replacementTerms.get(term);
- return replacement != null ? replacement : term;
- }
- }
-}
+package com.yahoo.vespa.indexinglanguage.linguistics;
+
+import com.yahoo.document.annotation.Annotation;
+import com.yahoo.document.annotation.AnnotationTypes;
+import com.yahoo.document.annotation.SpanTree;
+import com.yahoo.document.annotation.SpanTrees;
+import com.yahoo.document.datatypes.StringFieldValue;
+import com.yahoo.language.Language;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleToken;
+
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import java.util.*;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class LinguisticsAnnotatorTestCase {
+
+ private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
+
+ // --------------------------------------------------------------------------------
+ //
+ // Tests
+ //
+ // --------------------------------------------------------------------------------
+
+ @Test
+ public void requireThatAnnotateFailsWithZeroTokens() {
+ assertAnnotations(null, "foo");
+ }
+
+ @Test
+ public void requireThatAnnotateFailsWithoutIndexableTokenString() {
+ for (TokenType type : TokenType.values()) {
+ if (type.isIndexable()) {
+ continue;
+ }
+ assertAnnotations(null, "foo", newToken("foo", "bar", type));
+ }
+ }
+
+ @Test
+ public void requireThatIndexableTokenStringsAreAnnotated() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
+ for (TokenType type : TokenType.values()) {
+ if (!type.isIndexable()) {
+ continue;
+ }
+ assertAnnotations(expected, "foo", newToken("foo", "bar", type));
+ }
+ }
+
+ @Test
+ public void requireThatSpecialTokenStringsAreAnnotatedRegardlessOfType() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
+ for (TokenType type : TokenType.values()) {
+ assertAnnotations(expected, "foo", newToken("foo", "bar", type, true));
+ }
+ }
+
+ @Test
+ public void requireThatTermAnnotationsAreEmptyIfOrigIsLowerCase() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM));
+ for (boolean specialToken : Arrays.asList(true, false)) {
+ for (TokenType type : TokenType.values()) {
+ if (!specialToken && !type.isIndexable()) {
+ continue;
+ }
+ assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken));
+ }
+ }
+ }
+
+ @Test
+ public void requireThatTermAnnotationsAreLowerCased() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
+ for (boolean specialToken : Arrays.asList(true, false)) {
+ for (TokenType type : TokenType.values()) {
+ if (!specialToken && !type.isIndexable()) {
+ continue;
+ }
+ assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken));
+ }
+ }
+ }
+
+ @Test
+ public void requireThatCompositeTokensAreFlattened() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foo")));
+ expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
+ expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
+
+ SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
+ .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+ .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+ .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+ .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+ assertAnnotations(expected, "foobarbaz", token);
+ }
+
+ @Test
+ public void requireThatCompositeSpecialTokensAreNotFlattened() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM,
+ new StringFieldValue("foobarbaz")));
+
+ SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
+ .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+ .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+ .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+ .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+ assertAnnotations(expected, "foobarbaz", token);
+ }
+
+ @Test
+ public void requireThatErrorTokensAreSkipped() {
+ assertAnnotations(null, "foo", new SimpleToken("foo").setType(TokenType.ALPHABETIC)
+ .setOffset(-1));
+ }
+
+ @Test
+ public void requireThatTermReplacementsAreApplied() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
+ for (boolean specialToken : Arrays.asList(true, false)) {
+ for (TokenType type : TokenType.values()) {
+ if (!specialToken && !type.isIndexable()) {
+ continue;
+ }
+ assertAnnotations(expected, "foo",
+ newLinguistics(Arrays.asList(newToken("foo", "foo", type, specialToken)),
+ Collections.singletonMap("foo", "bar")));
+ }
+ }
+ }
+
+ @Test
+ public void requireThatExistingAnnotationsAreKept() {
+ SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS);
+ spanTree.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
+
+ StringFieldValue val = new StringFieldValue("foo");
+ val.setSpanTree(spanTree);
+
+ Linguistics linguistics = newLinguistics(Arrays.asList(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
+ Collections.<String, String>emptyMap());
+ new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
+
+ assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+ assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
+ }
+
+ @Test
+ public void requireThatMaxTermOccurencesIsHonored() {
+ final String inputTerm = "foo";
+ final String stemmedInputTerm = "bar"; // completely different from
+ // inputTerm for safer test
+ final String paddedInputTerm = inputTerm + " ";
+ final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
+ for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) {
+ expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length())
+ .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm)));
+ }
+ for (TokenType type : TokenType.values()) {
+ if (!type.isIndexable()) {
+ continue;
+ }
+ StringBuilder input = new StringBuilder();
+ Token[] tokens = new Token[inputTermOccurence];
+ for (int i = 0; i < inputTermOccurence; ++i) {
+ SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
+ t.setOffset(i * paddedInputTerm.length());
+ tokens[i] = t;
+ input.append(paddedInputTerm);
+ }
+ assertAnnotations(expected, input.toString(), tokens);
+ }
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // Utilities
+ //
+ // --------------------------------------------------------------------------------
+
+ private static SimpleToken newToken(String orig, String stem, TokenType type) {
+ return newToken(orig, stem, type, false);
+ }
+
+ private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) {
+ return new SimpleToken(orig).setTokenString(stem)
+ .setType(type)
+ .setSpecialToken(specialToken);
+ }
+
+ private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
+ assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap()));
+ }
+
+ private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
+ StringFieldValue val = new StringFieldValue(str);
+ assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+ assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
+ }
+
+ private static Linguistics newLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) {
+ Linguistics linguistics = Mockito.mock(Linguistics.class);
+ Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms));
+ return linguistics;
+ }
+
+ private static class MyTokenizer implements Tokenizer {
+
+ final List<Token> tokens;
+ final Map<String, String> replacementTerms;
+
+ public MyTokenizer(List<? extends Token> tokens, Map<String, String> replacementTerms) {
+ this.tokens = new ArrayList<>(tokens);
+ this.replacementTerms = replacementTerms;
+ }
+
+ @Override
+ public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+ return tokens;
+ }
+
+ @Override
+ public String getReplacementTerm(String term) {
+ String replacement = replacementTerms.get(term);
+ return replacement != null ? replacement : term;
+ }
+ }
+}