If we index the original in addition to stems, lowercase it

author: Jon Bratseth <bratseth@vespa.ai> 2023-11-20 14:31:00 +0100
committer: Jon Bratseth <bratseth@vespa.ai> 2023-11-20 14:31:00 +0100
commit: e6a4803325ed3b5bd5b6867484904aaed508af35 (patch)
tree: a2f834d6c2a1da768c5c6174667ff925206927b6 /indexinglanguage
parent: e8c0a04b67b632ea3f98327d8f39cd0293ad8581 (diff)
2 files changed, 6 insertions, 5 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 52cd8a8ff54..86d4e91a567 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -132,7 +132,7 @@ public class LinguisticsAnnotator {
             if (term != null) {
                 addAnnotation(where, term, token.getOrig(), termOccurrences);
                 if ( ! term.equals(lowercasedOrig))
-                    addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
+                    addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences);
             }
             for (int i = 0; i < token.getNumStems(); i++) {
                 String stem = token.getStem(i);
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index a4dbe1fe826..0bdf98f2ae0 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -58,7 +58,9 @@ public class LinguisticsAnnotatorTestCase {
     @Test
     public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
         SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
-        expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+        var span1 = expected.spanList().span(0, 6);
+        span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+        span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("teslas")));
         var span2 = expected.spanList().span(0, 4);
         span2.annotate(new Annotation(AnnotationTypes.TERM));
         span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
@@ -69,7 +71,7 @@ public class LinguisticsAnnotatorTestCase {
         for (TokenType type : TokenType.values()) {
             if (!type.isIndexable()) continue;
             assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"),
-                              token("Tesla", "tesla", type),
+                              token("Teslas", "tesla", type),
                               token("cars", "car", type),
                               SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
         }
@@ -204,8 +206,7 @@ public class LinguisticsAnnotatorTestCase {
     @Test
     public void requireThatMaxTermOccurrencesIsHonored() {
         final String inputTerm = "foo";
-        final String stemmedInputTerm = "bar"; // completely different from
-                                               // inputTerm for safer test
+        final String stemmedInputTerm = "bar"; // completely different from inputTerm for safer test
         final String paddedInputTerm = inputTerm + " ";
         final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
         final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
author	Jon Bratseth <bratseth@vespa.ai>	2023-11-20 14:31:00 +0100
committer	Jon Bratseth <bratseth@vespa.ai>	2023-11-20 14:31:00 +0100
commit	e6a4803325ed3b5bd5b6867484904aaed508af35 (patch)
tree	a2f834d6c2a1da768c5c6174667ff925206927b6 /indexinglanguage
parent	e8c0a04b67b632ea3f98327d8f39cd0293ad8581 (diff)