// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.indexinglanguage.linguistics;

import com.yahoo.document.annotation.Annotation;
import com.yahoo.document.annotation.AnnotationTypes;
import com.yahoo.document.annotation.SpanTree;
import com.yahoo.document.annotation.SpanTrees;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.language.simple.SimpleToken;
import org.junit.Test;
import org.mockito.Mockito;

import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * @author Simon Thoresen Hult
 */
public class LinguisticsAnnotatorTestCase {

    @Test
    public void requireThatAnnotateFailsWithZeroTokens() {
        assertAnnotations(null, "foo");
    }

    @Test
    public void requireThatAnnotateFailsWithoutIndexableTokenString() {
        for (TokenType type : TokenType.values()) {
            if (type.isIndexable()) {
                continue;
            }
            assertAnnotations(null, "foo", token("foo", "bar", type));
        }
    }

    @Test
    public void requireThatIndexableTokenStringsAreAnnotated() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
        for (TokenType type : TokenType.values()) {
            if (!type.isIndexable()) {
                continue;
            }
            assertAnnotations(expected, "foo", token("foo", "bar", type));
        }
    }

    @Test
    public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        var span1 = expected.spanList().span(0, 6);
        span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
        span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("teslas")));
        var span2 = expected.spanList().span(0, 4);
        span2.annotate(new Annotation(AnnotationTypes.TERM));
        span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
        var span3 = expected.spanList().span(0, 8);
        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes")));
        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx")));
        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex")));
        for (TokenType type : TokenType.values()) {
            if (!type.isIndexable()) continue;
            assertAnnotations(expected, "Tesla cars",
                              new AnnotatorConfig().setStemMode("ALL"),
                              token("Teslas", "tesla", type),
                              token("cars", "car", type),
                              SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
        }
    }

    @Test
    public void requireThatSpecialTokenStringsAreAnnotatedRegardlessOfType() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
        for (TokenType type : TokenType.values()) {
            assertAnnotations(expected, "foo", token("foo", "bar", type, true));
        }
    }

    @Test
    public void requireThatTermAnnotationsAreEmptyIfOrigIsLowerCase() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM));
        for (boolean specialToken : List.of(true, false)) {
            for (TokenType type : TokenType.values()) {
                if (!specialToken && !type.isIndexable()) {
                    continue;
                }
                assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken));
            }
        }
    }
    @Test
    public void requireThatTermAnnotationsPreserveCasing() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("BaR")));
        for (boolean specialToken : List.of(true, false)) {
            for (TokenType type : TokenType.values()) {
                if (!specialToken && !type.isIndexable()) {
                    continue;
                }
                assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken));
            }
        }
    }

    @Test
    public void requireThatCompositeTokensAreFlattened() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foo")));
        expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
        expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
        SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
                .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
                .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
                        .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
                        .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
        assertAnnotations(expected, "foobarbaz", token);
    }

    @Test
    public void requireThatCompositeSpecialTokensAreNotFlattened() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foobarbaz")));
        SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
                .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
                .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
                        .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
                        .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
        assertAnnotations(expected, "foobarbaz", token);
    }

    @Test
    public void requireThatErrorTokensAreSkipped() {
        assertAnnotations(null, "foo", new SimpleToken("foo").setType(TokenType.ALPHABETIC)
                                                             .setOffset(-1));
    }

    @Test
    public void requireThatTermReplacementsAreApplied() {
        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
        for (boolean specialToken : List.of(true, false)) {
            for (TokenType type : TokenType.values()) {
                if (!specialToken && !type.isIndexable()) {
                    continue;
                }
                assertAnnotations(expected, "foo",
                                  new AnnotatorConfig(),
                                  newLinguistics(List.of(token("foo", "foo", type, specialToken)),
                                                 Map.of("foo", "bar")));
            }
        }
    }

    @Test
    public void requireThatExistingAnnotationsAreKept() {
        SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS);
        spanTree.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));

        StringFieldValue val = new StringFieldValue("foo");
        val.setSpanTree(spanTree);

        Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)), Map.of());
        assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val));
        assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
    }
    @Test
    public void requireThatTokenizeCappingWorks() {
        String shortString = "short string";
        SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS);
        spanTree.setStringFieldValue(new StringFieldValue(shortString));
        spanTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM));
        spanTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM));

        StringFieldValue shortValue = new StringFieldValue(shortString);

        Linguistics linguistics = new SimpleLinguistics();
        LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12));

        assertTrue(annotator.annotate(shortValue));
        assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS));
        assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());

        StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string");
        assertTrue(annotator.annotate(cappedValue));
        assertEquals(shortString + " a longer string",
                     cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
    }

    @Test
    public void requireThatMaxTermOccurrencesIsHonored() {
        final String inputTerm = "foo";
        final String stemmedInputTerm = "bar"; // completely different from inputTerm for safer test
        final String paddedInputTerm = inputTerm + " ";
        final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
        final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
        for (int i = 0; i < AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES; ++i) {
            expected.spanList().span(i * paddedInputTerm.length(), inputTerm.length())
                    .annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue(stemmedInputTerm)));
        }
        for (TokenType type : TokenType.values()) {
            if (!type.isIndexable()) {
                continue;
            }
            StringBuilder input = new StringBuilder();
            Token[] tokens = new Token[inputTermOccurence];
            for (int i = 0; i < inputTermOccurence; ++i) {
                SimpleToken t = token(inputTerm, stemmedInputTerm, type);
                t.setOffset(i * paddedInputTerm.length());
                tokens[i] = t;
                input.append(paddedInputTerm);
            }
            assertAnnotations(expected, input.toString(), tokens);
        }
    }

    // --------------------------------------------------------------------------------
    // Utilities

    private static SimpleToken token(String orig, String stem, TokenType type) {
        return token(orig, stem, type, false);
    }

    private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) {
        return new SimpleToken(orig).setTokenString(stem)
                                    .setType(type)
                                    .setSpecialToken(specialToken);
    }

    private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
        assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(List.of(tokens), Map.of()));
    }

    private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) {
        assertAnnotations(expected, value, config, newLinguistics(List.of(tokens), Map.of()));
    }
    private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) {
        StringFieldValue val = new StringFieldValue(str);
        assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val));
        assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
    }

    private static Linguistics newLinguistics(List<Token> tokens, Map<String, String> replacementTerms) {
        Linguistics linguistics = Mockito.mock(Linguistics.class);
        Mockito.when(linguistics.getTokenizer()).thenReturn(new MyTokenizer(tokens, replacementTerms));
        return linguistics;
    }

    /** Tokenizer stub that returns a fixed token list, with any replacement terms applied to the token strings. */
    private static class MyTokenizer implements Tokenizer {

        final List<Token> tokens;

        public MyTokenizer(List<Token> tokens, Map<String, String> replacementTerms) {
            this.tokens = tokens.stream().map(token -> replace(token, replacementTerms)).toList();
        }

        private Token replace(Token token, Map<String, String> replacementTerms) {
            String tokenString = token.getTokenString();
            if (tokenString != null && !replacementTerms.isEmpty()) {
                var simpleToken = (SimpleToken) token;
                simpleToken.setTokenString(replacementTerms.getOrDefault(token.getTokenString(), token.getTokenString()));
            }
            return token;
        }

        @Override
        public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
            return tokens;
        }

    }

}