Prefer first stem to original if not equal

author: Jon Bratseth <bratseth@vespa.ai> 2023-11-10 21:38:23 +0100
committer: Jon Bratseth <bratseth@vespa.ai> 2023-11-10 21:38:23 +0100
commit: 90965807bd8a6134fe92f7058b3b0a3287050c2a (patch)
tree: ada0de730852649e4e648c0c4093e9288988ea77 /indexinglanguage
parent: d74701fe719494819eeb7f5c1af4b59a5c652df6 (diff)
3 files changed, 65 insertions, 47 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 684bae3bf97..5c1bf0813c4 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable {
     private Language language;
     private StemMode stemMode;
     private boolean removeAccents;
-    private int maxTermOccurences;
+    private int maxTermOccurrences;
     private int maxTokenizeLength;
 
     public static final int DEFAULT_MAX_TERM_OCCURRENCES;
@@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable {
         language = Language.ENGLISH;
         stemMode = StemMode.NONE;
         removeAccents = false;
-        maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
+        maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
         maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
     }
 
@@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable {
         language = rhs.language;
         stemMode = rhs.stemMode;
         removeAccents = rhs.removeAccents;
-        maxTermOccurences = rhs.maxTermOccurences;
+        maxTermOccurrences = rhs.maxTermOccurrences;
         maxTokenizeLength = rhs.maxTokenizeLength;
     }
 
@@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable {
     }
 
     public int getMaxTermOccurrences() {
-        return maxTermOccurences;
+        return maxTermOccurrences;
     }
 
     public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
-        this.maxTermOccurences = maxTermCount;
+        this.maxTermOccurrences = maxTermCount;
         return this;
     }
 
@@ -109,7 +109,7 @@ public class AnnotatorConfig implements Cloneable {
         if (removeAccents != rhs.removeAccents) {
             return false;
         }
-        if (maxTermOccurences != rhs.maxTermOccurences) {
+        if (maxTermOccurrences != rhs.maxTermOccurrences) {
             return false;
         }
         if (maxTokenizeLength != rhs.maxTokenizeLength) {
@@ -121,6 +121,7 @@ public class AnnotatorConfig implements Cloneable {
     @Override
     public int hashCode() {
         return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
-               Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
+               Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
     }
+
 }
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 04019800d59..74afd30d7ef 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -34,8 +34,8 @@ public class LinguisticsAnnotator {
         final Map<String, Integer> termOccurrences = new HashMap<>();
         final int maxOccurrences;
 
-        public TermOccurrences(int maxOccurences) {
-            this.maxOccurrences = maxOccurences;
+        public TermOccurrences(int maxOccurrences) {
+            this.maxOccurrences = maxOccurrences;
         }
 
         boolean termCountBelowLimit(String term) {
@@ -126,16 +126,15 @@ public class LinguisticsAnnotator {
         }
         if (mode == StemMode.ALL) {
             Span where = parent.span((int)token.getOffset(), token.getOrig().length());
-            addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
 
             String lowercasedOrig = toLowerCase(token.getOrig());
             String termOrIfNullOrig = lowercasedOrig;
             String term = token.getTokenString();
             if (term != null) {
-                termOrIfNullOrig = term;
-            }
-            if (! lowercasedOrig.equals(termOrIfNullOrig)) {
                 addAnnotation(where, term, token.getOrig(), termOccurrences);
+                termOrIfNullOrig = term;
+                if ( ! term.equals(lowercasedOrig))
+                    addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
             }
             for (int i = 0; i < token.getNumStems(); i++) {
                 String stem = token.getStem(i);
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 8baa4851f5d..c52b877ba3b 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -19,7 +19,6 @@ import org.junit.Test;
 import org.mockito.Mockito;
 
 import java.util.*;
-import java.util.stream.Collectors;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
@@ -29,8 +28,6 @@ import static org.junit.Assert.assertTrue;
  */
 public class LinguisticsAnnotatorTestCase {
 
-    private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
-
     @Test
     public void requireThatAnnotateFailsWithZeroTokens() {
         assertAnnotations(null, "foo");
@@ -42,7 +39,7 @@ public class LinguisticsAnnotatorTestCase {
             if (type.isIndexable()) {
                 continue;
             }
-            assertAnnotations(null, "foo", newToken("foo", "bar", type));
+            assertAnnotations(null, "foo", token("foo", "bar", type));
         }
     }
 
@@ -54,7 +51,27 @@ public class LinguisticsAnnotatorTestCase {
             if (!type.isIndexable()) {
                 continue;
             }
-            assertAnnotations(expected, "foo", newToken("foo", "bar", type));
+            assertAnnotations(expected, "foo", token("foo", "bar", type));
+        }
+    }
+
+    @Test
+    public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
+        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+        expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+        var span2 = expected.spanList().span(0, 4);
+        span2.annotate(new Annotation(AnnotationTypes.TERM));
+        span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
+        var span3 = expected.spanList().span(0, 8);
+        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes")));
+        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx")));
+        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex")));
+        for (TokenType type : TokenType.values()) {
+            if (!type.isIndexable()) continue;
+            assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"),
+                              token("Tesla", "tesla", type),
+                              token("cars", "car", type),
+                              SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
         }
     }
 
@@ -63,7 +80,7 @@ public class LinguisticsAnnotatorTestCase {
         SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
         expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
         for (TokenType type : TokenType.values()) {
-            assertAnnotations(expected, "foo", newToken("foo", "bar", type, true));
+            assertAnnotations(expected, "foo", token("foo", "bar", type, true));
         }
     }
 
@@ -76,7 +93,7 @@ public class LinguisticsAnnotatorTestCase {
                 if (!specialToken && !type.isIndexable()) {
                     continue;
                 }
-                assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken));
+                assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken));
             }
         }
     }
@@ -90,7 +107,7 @@ public class LinguisticsAnnotatorTestCase {
                 if (!specialToken && !type.isIndexable()) {
                     continue;
                 }
-                assertAnnotations(expected, "foo", newToken("foo", "BaR", type, specialToken));
+                assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken));
             }
         }
     }
@@ -102,11 +119,11 @@ public class LinguisticsAnnotatorTestCase {
         expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
         expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
 
-        SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
-                .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
-                .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
-                                      .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
-                                      .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+        SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
+                .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+                .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+                                                                             .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+                                                                             .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
         assertAnnotations(expected, "foobarbaz", token);
     }
 
@@ -116,11 +133,11 @@ public class LinguisticsAnnotatorTestCase {
         expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM,
                                                                new StringFieldValue("foobarbaz")));
 
-        SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
-                .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
-                .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
-                                      .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
-                                      .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+        SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
+                                                                                 .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+                                                                                 .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+                                                                                                                                              .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+                                                                                                                                              .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
         assertAnnotations(expected, "foobarbaz", token);
     }
 
@@ -140,7 +157,8 @@ public class LinguisticsAnnotatorTestCase {
                     continue;
                 }
                 assertAnnotations(expected, "foo",
-                                  newLinguistics(List.of(newToken("foo", "foo", type, specialToken)),
+                                  new AnnotatorConfig(),
+                                  newLinguistics(List.of(token("foo", "foo", type, specialToken)),
                                                  Collections.singletonMap("foo", "bar")));
             }
         }
@@ -154,11 +172,9 @@ public class LinguisticsAnnotatorTestCase {
         StringFieldValue val = new StringFieldValue("foo");
         val.setSpanTree(spanTree);
 
-        Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
+        Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)),
                                                  Collections.<String, String>emptyMap());
-        new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
-
-        assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+        assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val));
         assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
     }
 
@@ -186,7 +202,7 @@ public class LinguisticsAnnotatorTestCase {
     }
 
     @Test
-    public void requireThatMaxTermOccurencesIsHonored() {
+    public void requireThatMaxTermOccurrencesIsHonored() {
         final String inputTerm = "foo";
         final String stemmedInputTerm = "bar"; // completely different from
                                                // inputTerm for safer test
@@ -204,7 +220,7 @@ public class LinguisticsAnnotatorTestCase {
             StringBuilder input = new StringBuilder();
             Token[] tokens = new Token[inputTermOccurence];
             for (int i = 0; i < inputTermOccurence; ++i) {
-                SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
+                SimpleToken t = token(inputTerm, stemmedInputTerm, type);
                 t.setOffset(i * paddedInputTerm.length());
                 tokens[i] = t;
                 input.append(paddedInputTerm);
@@ -214,28 +230,29 @@ public class LinguisticsAnnotatorTestCase {
     }
 
     // --------------------------------------------------------------------------------
-    //
     // Utilities
-    //
-    // --------------------------------------------------------------------------------
 
-    private static SimpleToken newToken(String orig, String stem, TokenType type) {
-        return newToken(orig, stem, type, false);
+    private static SimpleToken token(String orig, String stem, TokenType type) {
+        return token(orig, stem, type, false);
     }
 
-    private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) {
+    private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) {
         return new SimpleToken(orig).setTokenString(stem)
                                     .setType(type)
                                     .setSpecialToken(specialToken);
     }
 
     private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
-        assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap()));
+        assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
+    }
+
+    private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) {
+        assertAnnotations(expected, value, config, newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
     }
 
-    private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
+    private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) {
         StringFieldValue val = new StringFieldValue(str);
-        assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+        assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val));
         assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
     }
 
@@ -255,6 +272,7 @@ public class LinguisticsAnnotatorTestCase {
 
         private Token replace(Token token, Map<String, String> replacementTerms) {
             var simpleToken = (SimpleToken)token;
+            System.out.println("Token: " + token + ", getTokenString: " + token.getTokenString());
             simpleToken.setTokenString(replacementTerms.getOrDefault(token.getTokenString(), token.getTokenString()));
             return simpleToken;
         }
author	Jon Bratseth <bratseth@vespa.ai>	2023-11-10 21:38:23 +0100
committer	Jon Bratseth <bratseth@vespa.ai>	2023-11-10 21:38:23 +0100
commit	90965807bd8a6134fe92f7058b3b0a3287050c2a (patch)
tree	ada0de730852649e4e648c0c4093e9288988ea77 /indexinglanguage
parent	d74701fe719494819eeb7f5c1af4b59a5c652df6 (diff)