diff options
author | Harald Musum <musum@vespa.ai> | 2023-11-13 21:34:45 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-11-13 21:34:45 +0100 |
commit | ef5be496bc4857c5923f566251dd527873b248bf (patch) | |
tree | 657d51a4166d3f7cf40e04f0a5972f11d0261afd /indexinglanguage/src/test/java | |
parent | 944d635d00e165166508ef23399e9ed65a87a9c8 (diff) |
Revert "Bratseth/casing take 2"
Diffstat (limited to 'indexinglanguage/src/test/java')
2 files changed, 38 insertions, 55 deletions
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java index b4e266ab3eb..bcde8751de8 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java @@ -57,8 +57,8 @@ public class NGramTestCase { new NGramExpression(new SimpleLinguistics(), 3).execute(context); StringFieldValue value = (StringFieldValue)context.getValue(); - assertEquals("Grams are pure annotations - field value is unchanged", - "en gul Bille sang... ", value.getString()); + assertEquals("Grams are pure annotations - field value is unchanged", "en gul Bille sang... ", + value.getString()); SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS); assertNotNull(gramTree); SpanList grams = (SpanList)gramTree.getRoot(); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index a4dbe1fe826..67bff3843ee 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -19,6 +19,7 @@ import org.junit.Test; import org.mockito.Mockito; import java.util.*; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -28,6 +29,8 @@ import static org.junit.Assert.assertTrue; */ public class LinguisticsAnnotatorTestCase { + private static final AnnotatorConfig CONFIG = new AnnotatorConfig(); + @Test public void requireThatAnnotateFailsWithZeroTokens() { assertAnnotations(null, "foo"); @@ -39,7 +42,7 @@ public class LinguisticsAnnotatorTestCase { if (type.isIndexable()) { continue; } - assertAnnotations(null, "foo", token("foo", "bar", type)); + assertAnnotations(null, "foo", newToken("foo", "bar", type)); } } @@ -51,27 +54,7 @@ public class LinguisticsAnnotatorTestCase { if (!type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", token("foo", "bar", type)); - } - } - - @Test - public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() { - SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); - expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla"))); - var span2 = expected.spanList().span(0, 4); - span2.annotate(new Annotation(AnnotationTypes.TERM)); - span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car"))); - var span3 = expected.spanList().span(0, 8); - span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes"))); - span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx"))); - span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex"))); - for (TokenType type : TokenType.values()) { - if (!type.isIndexable()) continue; - assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"), - token("Tesla", "tesla", type), - token("cars", "car", type), - SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex"))); + assertAnnotations(expected, "foo", newToken("foo", "bar", type)); } } @@ -80,7 +63,7 @@ public class LinguisticsAnnotatorTestCase { SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); for (TokenType type : TokenType.values()) { - assertAnnotations(expected, "foo", token("foo", "bar", type, true)); + assertAnnotations(expected, "foo", newToken("foo", "bar", type, true)); } } @@ -93,21 +76,21 @@ public class LinguisticsAnnotatorTestCase { if (!specialToken && !type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken)); + assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken)); } } } @Test - public void requireThatTermAnnotationsPreserveCasing() { + public void requireThatTermAnnotationsAreLowerCased() { SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); - expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("BaR"))); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); for (boolean specialToken : Arrays.asList(true, false)) { for (TokenType type : TokenType.values()) { if (!specialToken && !type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken)); + assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken)); } } } @@ -119,11 +102,11 @@ public class LinguisticsAnnotatorTestCase { expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz"))); - SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC) - .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) - .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) - .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) - .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC) + .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); assertAnnotations(expected, "foobarbaz", token); } @@ -133,11 +116,11 @@ public class LinguisticsAnnotatorTestCase { expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foobarbaz"))); - SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true) - .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) - .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) - .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) - .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true) + .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); assertAnnotations(expected, "foobarbaz", token); } @@ -157,8 +140,7 @@ public class LinguisticsAnnotatorTestCase { continue; } assertAnnotations(expected, "foo", - new AnnotatorConfig(), - newLinguistics(List.of(token("foo", "foo", type, specialToken)), + newLinguistics(List.of(newToken("foo", "foo", type, specialToken)), Collections.singletonMap("foo", "bar"))); } } @@ -172,9 +154,11 @@ public class LinguisticsAnnotatorTestCase { StringFieldValue val = new StringFieldValue("foo"); val.setSpanTree(spanTree); - Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)), + Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)), Collections.<String, String>emptyMap()); - assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val)); + new LinguisticsAnnotator(linguistics, CONFIG).annotate(val); + + assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS)); } @@ -202,7 +186,7 @@ public class LinguisticsAnnotatorTestCase { } @Test - public void requireThatMaxTermOccurrencesIsHonored() { + public void requireThatMaxTermOccurencesIsHonored() { final String inputTerm = "foo"; final String stemmedInputTerm = "bar"; // completely different from // inputTerm for safer test @@ -220,7 +204,7 @@ public class LinguisticsAnnotatorTestCase { StringBuilder input = new StringBuilder(); Token[] tokens = new Token[inputTermOccurence]; for (int i = 0; i < inputTermOccurence; ++i) { - SimpleToken t = token(inputTerm, stemmedInputTerm, type); + SimpleToken t = newToken(inputTerm, stemmedInputTerm, type); t.setOffset(i * paddedInputTerm.length()); tokens[i] = t; input.append(paddedInputTerm); @@ -230,29 +214,28 @@ public class LinguisticsAnnotatorTestCase { } // -------------------------------------------------------------------------------- + // // Utilities + // + // -------------------------------------------------------------------------------- - private static SimpleToken token(String orig, String stem, TokenType type) { - return token(orig, stem, type, false); + private static SimpleToken newToken(String orig, String stem, TokenType type) { + return newToken(orig, stem, type, false); } - private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) { + private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) { return new SimpleToken(orig).setTokenString(stem) .setType(type) .setSpecialToken(specialToken); } private static void assertAnnotations(SpanTree expected, String value, Token... tokens) { - assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(Arrays.asList(tokens), Collections.emptyMap())); - } - - private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) { - assertAnnotations(expected, value, config, newLinguistics(Arrays.asList(tokens), Collections.emptyMap())); + assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap())); } - private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) { + private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) { StringFieldValue val = new StringFieldValue(str); - assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val)); + assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS)); } |