diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-11-20 14:31:00 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-11-20 14:31:00 +0100 |
commit | e6a4803325ed3b5bd5b6867484904aaed508af35 (patch) | |
tree | a2f834d6c2a1da768c5c6174667ff925206927b6 /indexinglanguage | |
parent | e8c0a04b67b632ea3f98327d8f39cd0293ad8581 (diff) |
If we index the original in addition to stems, lowercase it
Diffstat (limited to 'indexinglanguage')
2 files changed, 6 insertions, 5 deletions
```diff
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 52cd8a8ff54..86d4e91a567 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -132,7 +132,7 @@ public class LinguisticsAnnotator {
                 if (term != null) {
                     addAnnotation(where, term, token.getOrig(), termOccurrences);
                     if ( ! term.equals(lowercasedOrig))
-                        addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
+                        addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences);
                 }
                 for (int i = 0; i < token.getNumStems(); i++) {
                     String stem = token.getStem(i);
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index a4dbe1fe826..0bdf98f2ae0 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -58,7 +58,9 @@ public class LinguisticsAnnotatorTestCase {
     @Test
     public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
         SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
-        expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+        var span1 = expected.spanList().span(0, 6);
+        span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+        span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("teslas")));
         var span2 = expected.spanList().span(0, 4);
         span2.annotate(new Annotation(AnnotationTypes.TERM));
         span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
@@ -69,7 +71,7 @@ public class LinguisticsAnnotatorTestCase {
         for (TokenType type : TokenType.values()) {
             if (!type.isIndexable()) continue;
             assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"),
-                              token("Tesla", "tesla", type),
+                              token("Teslas", "tesla", type),
                               token("cars", "car", type),
                               SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
         }
@@ -204,8 +206,7 @@ public class LinguisticsAnnotatorTestCase {
     @Test
     public void requireThatMaxTermOccurrencesIsHonored() {
         final String inputTerm = "foo";
-        final String stemmedInputTerm = "bar"; // completely different from
-        // inputTerm for safer test
+        final String stemmedInputTerm = "bar"; // completely different from inputTerm for safer test
         final String paddedInputTerm = inputTerm + " ";
         final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
         final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;
```