diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-11-10 21:38:23 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-11-10 21:38:23 +0100 |
commit | 90965807bd8a6134fe92f7058b3b0a3287050c2a (patch) | |
tree | ada0de730852649e4e648c0c4093e9288988ea77 /indexinglanguage | |
parent | d74701fe719494819eeb7f5c1af4b59a5c652df6 (diff) |
Prefer first stem to original if non equal
Diffstat (limited to 'indexinglanguage')
3 files changed, 65 insertions, 47 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 684bae3bf97..5c1bf0813c4 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable { private Language language; private StemMode stemMode; private boolean removeAccents; - private int maxTermOccurences; + private int maxTermOccurrences; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; @@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable { language = Language.ENGLISH; stemMode = StemMode.NONE; removeAccents = false; - maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable { language = rhs.language; stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; - maxTermOccurences = rhs.maxTermOccurences; + maxTermOccurrences = rhs.maxTermOccurrences; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable { } public int getMaxTermOccurrences() { - return maxTermOccurences; + return maxTermOccurrences; } public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) { - this.maxTermOccurences = maxTermCount; + this.maxTermOccurrences = maxTermCount; return this; } @@ -109,7 +109,7 @@ public class AnnotatorConfig implements Cloneable { if (removeAccents != rhs.removeAccents) { return false; } - if (maxTermOccurences != rhs.maxTermOccurences) { + if (maxTermOccurrences != rhs.maxTermOccurrences) { return false; } if (maxTokenizeLength != rhs.maxTokenizeLength) { @@ -121,6 +121,7 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; } + } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 04019800d59..74afd30d7ef 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -34,8 +34,8 @@ public class LinguisticsAnnotator { final Map<String, Integer> termOccurrences = new HashMap<>(); final int maxOccurrences; - public TermOccurrences(int maxOccurences) { - this.maxOccurrences = maxOccurences; + public TermOccurrences(int maxOccurrences) { + this.maxOccurrences = maxOccurrences; } boolean termCountBelowLimit(String term) { @@ -126,16 +126,15 @@ public class LinguisticsAnnotator { } if (mode == StemMode.ALL) { Span where = parent.span((int)token.getOffset(), token.getOrig().length()); - addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences); String lowercasedOrig = toLowerCase(token.getOrig()); String termOrIfNullOrig = lowercasedOrig; String term = token.getTokenString(); if (term != null) { - termOrIfNullOrig = term; - } - if (! lowercasedOrig.equals(termOrIfNullOrig)) { addAnnotation(where, term, token.getOrig(), termOccurrences); + termOrIfNullOrig = term; + if ( ! term.equals(lowercasedOrig)) + addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences); } for (int i = 0; i < token.getNumStems(); i++) { String stem = token.getStem(i); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index 8baa4851f5d..c52b877ba3b 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -19,7 +19,6 @@ import org.junit.Test; import org.mockito.Mockito; import java.util.*; -import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -29,8 +28,6 @@ import static org.junit.Assert.assertTrue; */ public class LinguisticsAnnotatorTestCase { - private static final AnnotatorConfig CONFIG = new AnnotatorConfig(); - @Test public void requireThatAnnotateFailsWithZeroTokens() { assertAnnotations(null, "foo"); @@ -42,7 +39,7 @@ public class LinguisticsAnnotatorTestCase { if (type.isIndexable()) { continue; } - assertAnnotations(null, "foo", newToken("foo", "bar", type)); + assertAnnotations(null, "foo", token("foo", "bar", type)); } } @@ -54,7 +51,27 @@ public class LinguisticsAnnotatorTestCase { if (!type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", newToken("foo", "bar", type)); + assertAnnotations(expected, "foo", token("foo", "bar", type)); + } + } + + @Test + public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() { + SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); + expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla"))); + var span2 = expected.spanList().span(0, 4); + span2.annotate(new Annotation(AnnotationTypes.TERM)); + span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car"))); + var span3 = expected.spanList().span(0, 8); + span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes"))); + span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx"))); + span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex"))); + for (TokenType type : TokenType.values()) { + if (!type.isIndexable()) continue; + assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"), + token("Tesla", "tesla", type), + token("cars", "car", type), + SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex"))); } } @@ -63,7 +80,7 @@ public class LinguisticsAnnotatorTestCase { SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); for (TokenType type : TokenType.values()) { - assertAnnotations(expected, "foo", newToken("foo", "bar", type, true)); + assertAnnotations(expected, "foo", token("foo", "bar", type, true)); } } @@ -76,7 +93,7 @@ public class LinguisticsAnnotatorTestCase { if (!specialToken && !type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken)); + assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken)); } } } @@ -90,7 +107,7 @@ public class LinguisticsAnnotatorTestCase { if (!specialToken && !type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", newToken("foo", "BaR", type, specialToken)); + assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken)); } } } @@ -102,11 +119,11 @@ public class LinguisticsAnnotatorTestCase { expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz"))); - SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC) - .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) - .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) - .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) - .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC) + .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); assertAnnotations(expected, "foobarbaz", token); } @@ -116,11 +133,11 @@ public class LinguisticsAnnotatorTestCase { expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foobarbaz"))); - SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true) - .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) - .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) - .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) - .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true) + .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); assertAnnotations(expected, "foobarbaz", token); } @@ -140,7 +157,8 @@ public class LinguisticsAnnotatorTestCase { continue; } assertAnnotations(expected, "foo", - newLinguistics(List.of(newToken("foo", "foo", type, specialToken)), + new AnnotatorConfig(), + newLinguistics(List.of(token("foo", "foo", type, specialToken)), Collections.singletonMap("foo", "bar"))); } } @@ -154,11 +172,9 @@ public class LinguisticsAnnotatorTestCase { StringFieldValue val = new StringFieldValue("foo"); val.setSpanTree(spanTree); - Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)), + Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)), Collections.<String, String>emptyMap()); - new LinguisticsAnnotator(linguistics, CONFIG).annotate(val); - - assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val)); assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS)); } @@ -186,7 +202,7 @@ public class LinguisticsAnnotatorTestCase { } @Test - public void requireThatMaxTermOccurencesIsHonored() { + public void requireThatMaxTermOccurrencesIsHonored() { final String inputTerm = "foo"; final String stemmedInputTerm = "bar"; // completely different from // inputTerm for safer test @@ -204,7 +220,7 @@ public class LinguisticsAnnotatorTestCase { StringBuilder input = new StringBuilder(); Token[] tokens = new Token[inputTermOccurence]; for (int i = 0; i < inputTermOccurence; ++i) { - SimpleToken t = newToken(inputTerm, stemmedInputTerm, type); + SimpleToken t = token(inputTerm, stemmedInputTerm, type); t.setOffset(i * paddedInputTerm.length()); tokens[i] = t; input.append(paddedInputTerm); @@ -214,28 +230,29 @@ public class LinguisticsAnnotatorTestCase { } // -------------------------------------------------------------------------------- - // // Utilities - // - // -------------------------------------------------------------------------------- - private static SimpleToken newToken(String orig, String stem, TokenType type) { - return newToken(orig, stem, type, false); + private static SimpleToken token(String orig, String stem, TokenType type) { + return token(orig, stem, type, false); } - private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) { + private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) { return new SimpleToken(orig).setTokenString(stem) .setType(type) .setSpecialToken(specialToken); } private static void assertAnnotations(SpanTree expected, String value, Token... tokens) { - assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap())); + assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(Arrays.asList(tokens), Collections.emptyMap())); + } + + private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) { + assertAnnotations(expected, value, config, newLinguistics(Arrays.asList(tokens), Collections.emptyMap())); } - private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) { + private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) { StringFieldValue val = new StringFieldValue(str); - assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); + assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val)); assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS)); } @@ -255,6 +272,7 @@ public class LinguisticsAnnotatorTestCase { private Token replace(Token token, Map<String, String> replacementTerms) { var simpleToken = (SimpleToken)token; + System.out.println("Token: " + token + ", getTokenString: " + token.getTokenString()); simpleToken.setTokenString(replacementTerms.getOrDefault(token.getTokenString(), token.getTokenString())); return simpleToken; } |