about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@vespa.ai> 2023-11-09 09:46:37 +0100
committer Jon Bratseth <bratseth@vespa.ai> 2023-11-09 09:46:37 +0100
commit 6d433b7c567be0ffd4473a32884ec0fbe83a5df3 (patch)
tree 7765eb7be8cdd3e0391ff0e05ae2bd10c2fc0b0e
parent 28fad63cdf0bb5f82a68ddb1195ac747117785d6 (diff)
Don't lowercase linguistics annotations
Tokens are already lowercased by our bundled linguistics components. Lowercasing again when annotating precludes plugging in a linguistics component which preserves casing.
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java 7
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 30
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java 4
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java 2
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java 1
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java 3
6 files changed, 24 insertions, 23 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
index 26058eeb8f3..fdfadf65400 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
@@ -15,6 +15,8 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;
import java.util.Iterator;
+import static com.yahoo.language.LinguisticsCase.toLowerCase;
+
/**
* A filter which splits incoming text into n-grams.
*
@@ -68,8 +70,9 @@ public final class NGramExpression extends Expression {
// annotate gram as a word term
String gramString = gram.extractFrom(output.getString());
- typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList).
- annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString));
+ typedSpan(gram.getStart(),
+ gram.getCodePointCount(),
+ TokenType.ALPHABETIC, spanList).annotate(LinguisticsAnnotator.termAnnotation(toLowerCase(gramString), gramString));
lastPosition = gram.getStart() + gram.getCodePointCount();
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 191d067effe..04019800d59 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -86,24 +86,23 @@ public class LinguisticsAnnotator {
}
/**
- * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
+ * Creates a TERM annotation which has the term as annotation (only) if it is different from the
* original.
*
- * @param termToLowerCase the term to lower case
- * @param origTerm the original term
+ * @param term the term
+ * @param origTerm the original term
* @return the created TERM annotation
*/
- public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
- String annotationValue = toLowerCase(termToLowerCase);
- if (annotationValue.equals(origTerm)) {
+ public static Annotation termAnnotation(String term, String origTerm) {
+ if (term.equals(origTerm))
return new Annotation(AnnotationTypes.TERM);
- }
- return new Annotation(AnnotationTypes.TERM, new StringFieldValue(annotationValue));
+ else
+ return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term));
}
private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) {
if (termOccurrences.termCountBelowLimit(term)) {
- here.annotate(lowerCaseTermAnnotation(term, orig));
+ here.annotate(termAnnotation(term, orig));
}
}
@@ -127,21 +126,20 @@ public class LinguisticsAnnotator {
}
if (mode == StemMode.ALL) {
Span where = parent.span((int)token.getOffset(), token.getOrig().length());
- String lowercasedOrig = toLowerCase(token.getOrig());
addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
- String lowercasedTerm = lowercasedOrig;
+ String lowercasedOrig = toLowerCase(token.getOrig());
+ String termOrIfNullOrig = lowercasedOrig;
String term = token.getTokenString();
if (term != null) {
- lowercasedTerm = toLowerCase(term);
+ termOrIfNullOrig = term;
}
- if (! lowercasedOrig.equals(lowercasedTerm)) {
+ if (! lowercasedOrig.equals(termOrIfNullOrig)) {
addAnnotation(where, term, token.getOrig(), termOccurrences);
}
for (int i = 0; i < token.getNumStems(); i++) {
String stem = token.getStem(i);
- String lowercasedStem = toLowerCase(stem);
- if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) {
+ if (! (lowercasedOrig.equals(stem) || termOrIfNullOrig.equals(stem))) {
addAnnotation(where, stem, token.getOrig(), termOccurrences);
}
}
@@ -149,7 +147,7 @@ public class LinguisticsAnnotator {
String term = token.getTokenString();
if (term == null || term.trim().isEmpty()) return;
if (termOccurrences.termCountBelowLimit(term)) {
- parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
+ parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig()));
}
}
}
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
index bcde8751de8..b4e266ab3eb 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
@@ -57,8 +57,8 @@ public class NGramTestCase {
new NGramExpression(new SimpleLinguistics(), 3).execute(context);
StringFieldValue value = (StringFieldValue)context.getValue();
- assertEquals("Grams are pure annotations - field value is unchanged", "en gul Bille sang... ",
- value.getString());
+ assertEquals("Grams are pure annotations - field value is unchanged",
+ "en gul Bille sang... ", value.getString());
SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS);
assertNotNull(gramTree);
SpanList grams = (SpanList)gramTree.getRoot();
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 67bff3843ee..7f7e039cb9b 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -90,7 +90,7 @@ public class LinguisticsAnnotatorTestCase {
if (!specialToken && !type.isIndexable()) {
continue;
}
- assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken));
+ assertAnnotations(expected, "foo", newToken("foo", "bar", type, specialToken));
}
}
}
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index 5ad6a382abd..f0439a21fec 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -26,6 +26,7 @@ public class LinguisticsCase {
public static String toLowerCase(String in) {
// def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
// Also, at the time of writing, English is the default language for queries
+ if (in == null) return null;
return Lowercase.toLowerCase(in);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 33f5ee7e4bb..9178c2d7e09 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -189,9 +189,8 @@ public class GramSplitter {
@Override
public boolean equals(Object o) {
if (this == o) return true;
- if ( ! (o instanceof Gram)) return false;
+ if ( ! (o instanceof Gram gram)) return false;
- Gram gram = (Gram)o;
if (codePointCount != gram.codePointCount) return false;
if (start != gram.start) return false;
return true;