diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-06-14 14:41:18 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-06-14 14:41:18 +0200 |
commit | 6ff3df19226036b8ee1bb559f9d73cab40e8d2a0 (patch) | |
tree | 355a7b0623b58983ba655b868341fe479a22eb3d /indexinglanguage | |
parent | b7f9e7ceaef72489d76683537973b639f8895b84 (diff) |
Remove carriage return
Diffstat (limited to 'indexinglanguage')
6 files changed, 424 insertions, 424 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java index f648a0e38e4..9596bf60cae 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/MathResolver.java @@ -1,55 +1,55 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.indexinglanguage.expressions;
-
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Stack;
-
-/**
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
- */
-public class MathResolver {
-
- private final List<Item> items = new LinkedList<>();
-
- public void push(ArithmeticExpression.Operator op, Expression exp) {
- op.getClass(); // throws NullPointerException
- if (items.isEmpty() && op != ArithmeticExpression.Operator.ADD) {
- throw new IllegalArgumentException("First item in an arithmetic operation must be an addition.");
- }
- items.add(new Item(op, exp));
- }
-
- public Expression resolve() {
- Stack<Item> stack = new Stack<>();
- stack.push(items.remove(0));
- while (!items.isEmpty()) {
- Item item = items.remove(0);
- while (stack.size() > 1 && stack.peek().op.precedes(item.op)) {
- pop(stack);
- }
- stack.push(item);
- }
- while (stack.size() > 1) {
- pop(stack);
- }
- return stack.remove(0).exp;
- }
-
- private void pop(Stack<Item> stack) {
- Item rhs = stack.pop();
- Item lhs = stack.peek();
- lhs.exp = new ArithmeticExpression(lhs.exp, rhs.op, rhs.exp);
- }
-
- private static class Item {
-
- final ArithmeticExpression.Operator op;
- Expression exp;
-
- Item(ArithmeticExpression.Operator op, Expression exp) {
- this.op = op;
- this.exp = exp;
- }
- }
-}
\ No newline at end of file +package com.yahoo.vespa.indexinglanguage.expressions; + +import java.util.LinkedList; +import java.util.List; +import java.util.Stack; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +public class MathResolver { + + private final List<Item> items = new LinkedList<>(); + + public void push(ArithmeticExpression.Operator op, Expression exp) { + op.getClass(); // throws NullPointerException + if (items.isEmpty() && op != ArithmeticExpression.Operator.ADD) { + throw new IllegalArgumentException("First item in an arithmetic operation must be an addition."); + } + items.add(new Item(op, exp)); + } + + public Expression resolve() { + Stack<Item> stack = new Stack<>(); + stack.push(items.remove(0)); + while (!items.isEmpty()) { + Item item = items.remove(0); + while (stack.size() > 1 && stack.peek().op.precedes(item.op)) { + pop(stack); + } + stack.push(item); + } + while (stack.size() > 1) { + pop(stack); + } + return stack.remove(0).exp; + } + + private void pop(Stack<Item> stack) { + Item rhs = stack.pop(); + Item lhs = stack.peek(); + lhs.exp = new ArithmeticExpression(lhs.exp, rhs.op, rhs.exp); + } + + private static class Item { + + final ArithmeticExpression.Operator op; + Expression exp; + + Item(ArithmeticExpression.Operator op, Expression exp) { + this.op = op; + this.exp = exp; + } + } +} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java index 427c777db5a..703cf30f6e0 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/PassthroughExpression.java @@ -18,4 +18,4 @@ public class PassthroughExpression extends OutputExpression { public boolean equals(Object obj) { return super.equals(obj) && obj instanceof PassthroughExpression; } -}
\ No newline at end of file +} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 09364c796f0..48e78392498 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -1,106 +1,106 @@ -// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.indexinglanguage.linguistics;
-
-import com.yahoo.language.Language;
-import com.yahoo.language.process.StemMode;
-import com.yahoo.vespa.configdefinition.IlscriptsConfig;
-
-/**
- * @author Simon Thoresen
- */
-public class AnnotatorConfig implements Cloneable {
-
- private Language language;
- private StemMode stemMode;
- private boolean removeAccents;
- private int maxTermOccurences;
-
- public static final int DEFAULT_MAX_TERM_OCCURRENCES;
-
- static {
- IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder());
- DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences();
- }
-
- public AnnotatorConfig() {
- language = Language.ENGLISH;
- stemMode = StemMode.NONE;
- removeAccents = false;
- maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
- }
-
- public AnnotatorConfig(AnnotatorConfig rhs) {
- language = rhs.language;
- stemMode = rhs.stemMode;
- removeAccents = rhs.removeAccents;
- maxTermOccurences = rhs.maxTermOccurences;
- }
-
- public Language getLanguage() {
- return language;
- }
-
- public AnnotatorConfig setLanguage(Language language) {
- this.language = language;
- return this;
- }
-
- public StemMode getStemMode() {
- return stemMode;
- }
-
- public AnnotatorConfig setStemMode(StemMode stemMode) {
- this.stemMode = stemMode;
- return this;
- }
-
- public AnnotatorConfig setStemMode(String name) {
- this.stemMode = StemMode.valueOf(name);
- return this;
- }
-
- public boolean getRemoveAccents() {
- return removeAccents;
- }
-
- public AnnotatorConfig setRemoveAccents(boolean removeAccents) {
- this.removeAccents = removeAccents;
- return this;
- }
-
- public int getMaxTermOccurrences() {
- return maxTermOccurences;
- }
-
- public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
- this.maxTermOccurences = maxTermCount;
- return this;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof AnnotatorConfig)) {
- return false;
- }
- AnnotatorConfig rhs = (AnnotatorConfig)obj;
- if (!language.equals(rhs.language)) {
- return false;
- }
- if (!stemMode.equals(rhs.stemMode)) {
- return false;
- }
- if (removeAccents != rhs.removeAccents) {
- return false;
- }
- if (maxTermOccurences != rhs.maxTermOccurences) {
- return false;
- }
- return true;
- }
-
- @Override
- public int hashCode() {
- return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences;
- }
-}
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.indexinglanguage.linguistics; + +import com.yahoo.language.Language; +import com.yahoo.language.process.StemMode; +import com.yahoo.vespa.configdefinition.IlscriptsConfig; + +/** + * @author Simon Thoresen + */ +public class AnnotatorConfig implements Cloneable { + + private Language language; + private StemMode stemMode; + private boolean removeAccents; + private int maxTermOccurences; + + public static final int DEFAULT_MAX_TERM_OCCURRENCES; + + static { + IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder()); + DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences(); + } + + public AnnotatorConfig() { + language = Language.ENGLISH; + stemMode = StemMode.NONE; + removeAccents = false; + maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES; + } + + public AnnotatorConfig(AnnotatorConfig rhs) { + language = rhs.language; + stemMode = rhs.stemMode; + removeAccents = rhs.removeAccents; + maxTermOccurences = rhs.maxTermOccurences; + } + + public Language getLanguage() { + return language; + } + + public AnnotatorConfig setLanguage(Language language) { + this.language = language; + return this; + } + + public StemMode getStemMode() { + return stemMode; + } + + public AnnotatorConfig setStemMode(StemMode stemMode) { + this.stemMode = stemMode; + return this; + } + + public AnnotatorConfig setStemMode(String name) { + this.stemMode = StemMode.valueOf(name); + return this; + } + + public boolean getRemoveAccents() { + return removeAccents; + } + + public AnnotatorConfig setRemoveAccents(boolean removeAccents) { + this.removeAccents = removeAccents; + return this; + } + + public int getMaxTermOccurrences() { + return maxTermOccurences; + } + + public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) { + this.maxTermOccurences = maxTermCount; + return this; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof AnnotatorConfig)) { + return false; + } + AnnotatorConfig rhs = (AnnotatorConfig)obj; + if (!language.equals(rhs.language)) { + return false; + } + if (!stemMode.equals(rhs.stemMode)) { + return false; + } + if (removeAccents != rhs.removeAccents) { + return false; + } + if (maxTermOccurences != rhs.maxTermOccurences) { + return false; + } + return true; + } + + @Override + public int hashCode() { + return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences; + } +} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java index de9d26547da..5e9ffd677cf 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/parser/IndexingInput.java @@ -1,14 +1,14 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.indexinglanguage.parser;
-
-import com.yahoo.javacc.FastCharStream;
-
-/**
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
- */
-public final class IndexingInput extends FastCharStream implements CharStream {
-
- public IndexingInput(String input) {
- super(input);
- }
-}
+package com.yahoo.vespa.indexinglanguage.parser; + +import com.yahoo.javacc.FastCharStream; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +public final class IndexingInput extends FastCharStream implements CharStream { + + public IndexingInput(String input) { + super(input); + } +} diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java index 097e0f21bc1..95eab5f9ef8 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/AttributeExpressionTestCase.java @@ -38,4 +38,4 @@ public class AttributeExpressionTestCase { public void requireThatExpressionCanBeExecuted() { assertExecute(new AttributeExpression("foo")); } -}
\ No newline at end of file +} diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index 805bdc96904..5882e2c19c6 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -1,250 +1,250 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.indexinglanguage.linguistics;
-
-import com.yahoo.document.annotation.Annotation;
-import com.yahoo.document.annotation.AnnotationTypes;
-import com.yahoo.document.annotation.SpanTree;
-import com.yahoo.document.annotation.SpanTrees;
-import com.yahoo.document.datatypes.StringFieldValue;
-import com.yahoo.language.Language;
-import com.yahoo.language.Linguistics;
-import com.yahoo.language.process.StemMode;
-import com.yahoo.language.process.Token;
-import com.yahoo.language.process.TokenType;
-import com.yahoo.language.process.Tokenizer;
-import com.yahoo.language.simple.SimpleToken;
-
-import org.junit.Test;
-import org.mockito.Mockito;
-
-import java.util.*;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-/**
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
- */
-public class LinguisticsAnnotatorTestCase {
-
- private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
-
- // --------------------------------------------------------------------------------
- //
- // Tests
- //
- // --------------------------------------------------------------------------------
-
- @Test
- public void requireThatAnnotateFailsWithZeroTokens() {
- assertAnnotations(null, "foo");
- }
-
- @Test
- public void requireThatAnnotateFailsWithoutIndexableTokenString() {
- for (TokenType type : TokenType.values()) {
- if (type.isIndexable()) {
- continue;
- }
- assertAnnotations(null, "foo", newToken("foo", "bar", type));
- }
- }
-
- @Test
- public void requireThatIndexableTokenStringsAreAnnotated() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (TokenType type : TokenType.values()) {
- if (!type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo", newToken("foo", "bar", type));
- }
- }
-
- @Test
- public void requireThatSpecialTokenStringsAreAnnotatedRegardlessOfType() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (TokenType type : TokenType.values()) {
- assertAnnotations(expected, "foo", newToken("foo", "bar", type, true));
- }
- }
-
- @Test
- public void requireThatTermAnnotationsAreEmptyIfOrigIsLowerCase() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM));
- for (boolean specialToken : Arrays.asList(true, false)) {
- for (TokenType type : TokenType.values()) {
- if (!specialToken && !type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken));
- }
- }
- }
-
- @Test
- public void requireThatTermAnnotationsAreLowerCased() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (boolean specialToken : Arrays.asList(true, false)) {
- for (TokenType type : TokenType.values()) {
- if (!specialToken && !type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken));
- }
- }
- }
-
- @Test
- public void requireThatCompositeTokensAreFlattened() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foo")));
- expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
-
- SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
- .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
- assertAnnotations(expected, "foobarbaz", token);
- }
-
- @Test
- public void requireThatCompositeSpecialTokensAreNotFlattened() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM,
- new StringFieldValue("foobarbaz")));
-
- SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
- .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
- assertAnnotations(expected, "foobarbaz", token);
- }
-
- @Test
- public void requireThatErrorTokensAreSkipped() {
- assertAnnotations(null, "foo", new SimpleToken("foo").setType(TokenType.ALPHABETIC)
- .setOffset(-1));
- }
-
- @Test
- public void requireThatTermReplacementsAreApplied() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
- for (boolean specialToken : Arrays.asList(true, false)) {
- for (TokenType type : TokenType.values()) {
- if (!specialToken && !type.isIndexable()) {
- continue;
- }
- assertAnnotations(expected, "foo",
- newLinguistics(Arrays.asList(newToken("foo", "foo", type, specialToken)),
- Collections.singletonMap("foo", "bar")));
- }
- }
- }
-
- @Test
- public void requireThatExistingAnnotationsAreKept() {
- SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS);
- spanTree.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
-
- StringFieldValue val = new StringFieldValue("foo");
- val.setSpanTree(spanTree);
-
- Linguistics linguistics = newLinguistics(Arrays.asList(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
- Collections.<String, String>emptyMap());
- new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
-
- assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
- assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
- }
-
- @Test
- public void requireThatMaxTermOccurencesIsHonored() {
- final String inputTerm = "foo";
- final String stemmedInputTerm = "bar"; // completely different from
- // inputTerm for safer test
- final String paddedInputTerm = inputTerm + " ";
- final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
- for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) {
- expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length())
- .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm)));
- }
- for (TokenType type : TokenType.values()) {
- if (!type.isIndexable()) {
- continue;
- }
- StringBuilder input = new StringBuilder();
- Token[] tokens = new Token[inputTermOccurence];
- for (int i = 0; i < inputTermOccurence; ++i) {
- SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
- t.setOffset(i * paddedInputTerm.length());
- tokens[i] = t;
- input.append(paddedInputTerm);
- }
- assertAnnotations(expected, input.toString(), tokens);
- }
- }
-
- // --------------------------------------------------------------------------------
- //
- // Utilities
- //
- // --------------------------------------------------------------------------------
-
- private static SimpleToken newToken(String orig, String stem, TokenType type) {
- return newToken(orig, stem, type, false);
- }
-
- private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) {
- return new SimpleToken(orig).setTokenString(stem)
- .setType(type)
- .setSpecialToken(specialToken);
- }
-
- private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
- assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap()));
- }
-
- private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
- StringFieldValue val = new StringFieldValue(str);
- assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
- assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
- }
-
- private static Linguistics newLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) {
- Linguistics linguistics = Mockito.mock(Linguistics.class);
- Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms));
- return linguistics;
- }
-
- private static class MyTokenizer implements Tokenizer {
-
- final List<Token> tokens;
- final Map<String, String> replacementTerms;
-
- public MyTokenizer(List<? extends Token> tokens, Map<String, String> replacementTerms) {
- this.tokens = new ArrayList<>(tokens);
- this.replacementTerms = replacementTerms;
- }
-
- @Override
- public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- return tokens;
- }
-
- @Override
- public String getReplacementTerm(String term) {
- String replacement = replacementTerms.get(term);
- return replacement != null ? replacement : term;
- }
- }
-}
+package com.yahoo.vespa.indexinglanguage.linguistics; + +import com.yahoo.document.annotation.Annotation; +import com.yahoo.document.annotation.AnnotationTypes; +import com.yahoo.document.annotation.SpanTree; +import com.yahoo.document.annotation.SpanTrees; +import com.yahoo.document.datatypes.StringFieldValue; +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.simple.SimpleToken; + +import org.junit.Test; +import org.mockito.Mockito; + +import java.util.*; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +public class LinguisticsAnnotatorTestCase { + + private static final AnnotatorConfig CONFIG = new AnnotatorConfig(); + + // -------------------------------------------------------------------------------- + // + // Tests + // + // -------------------------------------------------------------------------------- + + @Test + public void requireThatAnnotateFailsWithZeroTokens() { + assertAnnotations(null, "foo"); + } + + @Test + public void requireThatAnnotateFailsWithoutIndexableTokenString() { + for (TokenType type : TokenType.values()) { + if (type.isIndexable()) { + continue; + } + assertAnnotations(null, "foo", newToken("foo", "bar", type)); + } + } + + @Test + public void requireThatIndexableTokenStringsAreAnnotated() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); + for (TokenType type : TokenType.values()) { + if (!type.isIndexable()) { + continue; + } + assertAnnotations(expected, "foo", newToken("foo", "bar", type)); + } + } + + @Test + public void requireThatSpecialTokenStringsAreAnnotatedRegardlessOfType() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); + for (TokenType type : TokenType.values()) { + assertAnnotations(expected, "foo", newToken("foo", "bar", type, true)); + } + } + + @Test + public void requireThatTermAnnotationsAreEmptyIfOrigIsLowerCase() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM)); + for (boolean specialToken : Arrays.asList(true, false)) { + for (TokenType type : TokenType.values()) { + if (!specialToken && !type.isIndexable()) { + continue; + } + assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken)); + } + } + } + + @Test + public void requireThatTermAnnotationsAreLowerCased() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); + for (boolean specialToken : Arrays.asList(true, false)) { + for (TokenType type : TokenType.values()) { + if (!specialToken && !type.isIndexable()) { + continue; + } + assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken)); + } + } + } + + @Test + public void requireThatCompositeTokensAreFlattened() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foo"))); + expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); + expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz"))); + + SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC) + .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + assertAnnotations(expected, "foobarbaz", token); + } + + @Test + public void requireThatCompositeSpecialTokensAreNotFlattened() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM, + new StringFieldValue("foobarbaz"))); + + SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true) + .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + assertAnnotations(expected, "foobarbaz", token); + } + + @Test + public void requireThatErrorTokensAreSkipped() { + assertAnnotations(null, "foo", new SimpleToken("foo").setType(TokenType.ALPHABETIC) + .setOffset(-1)); + } + + @Test + public void requireThatTermReplacementsAreApplied() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); + for (boolean specialToken : Arrays.asList(true, false)) { + for (TokenType type : TokenType.values()) { + if (!specialToken && !type.isIndexable()) { + continue; + } + assertAnnotations(expected, "foo", + newLinguistics(Arrays.asList(newToken("foo", "foo", type, specialToken)), + Collections.singletonMap("foo", "bar"))); + } + } + } + + @Test + public void requireThatExistingAnnotationsAreKept() { + SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS); + spanTree.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz"))); + + StringFieldValue val = new StringFieldValue("foo"); + val.setSpanTree(spanTree); + + Linguistics linguistics = newLinguistics(Arrays.asList(newToken("foo", "bar", TokenType.ALPHABETIC, false)), + Collections.<String, String>emptyMap()); + new LinguisticsAnnotator(linguistics, CONFIG).annotate(val); + + assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS)); + } + + @Test + public void requireThatMaxTermOccurencesIsHonored() { + final String inputTerm = "foo"; + final String stemmedInputTerm = "bar"; // completely different from + // inputTerm for safer test + final String paddedInputTerm = inputTerm + " "; + final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2; + for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) { + expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length()) + .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm))); + } + for (TokenType type : TokenType.values()) { + if (!type.isIndexable()) { + continue; + } + StringBuilder input = new StringBuilder(); + Token[] tokens = new Token[inputTermOccurence]; + for (int i = 0; i < inputTermOccurence; ++i) { + SimpleToken t = newToken(inputTerm, stemmedInputTerm, type); + t.setOffset(i * paddedInputTerm.length()); + tokens[i] = t; + input.append(paddedInputTerm); + } + assertAnnotations(expected, input.toString(), tokens); + } + } + + // -------------------------------------------------------------------------------- + // + // Utilities + // + // -------------------------------------------------------------------------------- + + private static SimpleToken newToken(String orig, String stem, TokenType type) { + return newToken(orig, stem, type, false); + } + + private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) { + return new SimpleToken(orig).setTokenString(stem) + .setType(type) + .setSpecialToken(specialToken); + } + + private static void assertAnnotations(SpanTree expected, String value, Token... tokens) { + assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap())); + } + + private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) { + StringFieldValue val = new StringFieldValue(str); + assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS)); + } + + private static Linguistics newLinguistics(List<? extends Token> tokens, Map<String, String> replacementTerms) { + Linguistics linguistics = Mockito.mock(Linguistics.class); + Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms)); + return linguistics; + } + + private static class MyTokenizer implements Tokenizer { + + final List<Token> tokens; + final Map<String, String> replacementTerms; + + public MyTokenizer(List<? extends Token> tokens, Map<String, String> replacementTerms) { + this.tokens = new ArrayList<>(tokens); + this.replacementTerms = replacementTerms; + } + + @Override + public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { + return tokens; + } + + @Override + public String getReplacementTerm(String term) { + String replacement = replacementTerms.get(term); + return replacement != null ? replacement : term; + } + } +} |