summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@vespa.ai> 2023-11-20 14:31:00 +0100
committer Jon Bratseth <bratseth@vespa.ai> 2023-11-20 14:31:00 +0100
commit e6a4803325ed3b5bd5b6867484904aaed508af35 (patch)
tree a2f834d6c2a1da768c5c6174667ff925206927b6 /indexinglanguage
parent e8c0a04b67b632ea3f98327d8f39cd0293ad8581 (diff)
If we index the original in addition to stems, lowercase it
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 2
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java 9
2 files changed, 6 insertions, 5 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 52cd8a8ff54..86d4e91a567 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -132,7 +132,7 @@ public class LinguisticsAnnotator {
if (term != null) {
addAnnotation(where, term, token.getOrig(), termOccurrences);
if ( ! term.equals(lowercasedOrig))
- addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
+ addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences);
}
for (int i = 0; i < token.getNumStems(); i++) {
String stem = token.getStem(i);
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index a4dbe1fe826..0bdf98f2ae0 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -58,7 +58,9 @@ public class LinguisticsAnnotatorTestCase {
@Test
public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+ var span1 = expected.spanList().span(0, 6);
+ span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+ span1.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("teslas")));
var span2 = expected.spanList().span(0, 4);
span2.annotate(new Annotation(AnnotationTypes.TERM));
span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
@@ -69,7 +71,7 @@ public class LinguisticsAnnotatorTestCase {
for (TokenType type : TokenType.values()) {
if (!type.isIndexable()) continue;
assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"),
- token("Tesla", "tesla", type),
+ token("Teslas", "tesla", type),
token("cars", "car", type),
SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
}
@@ -204,8 +206,7 @@ public class LinguisticsAnnotatorTestCase {
@Test
public void requireThatMaxTermOccurrencesIsHonored() {
final String inputTerm = "foo";
- final String stemmedInputTerm = "bar"; // completely different from
- // inputTerm for safer test
+ final String stemmedInputTerm = "bar"; // completely different from inputTerm for safer test
final String paddedInputTerm = inputTerm + " ";
final SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
final int inputTermOccurence = AnnotatorConfig.DEFAULT_MAX_TERM_OCCURRENCES * 2;