From 0dfd4fe4c6ddbded490da36e71f27c4b70aa4226 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Thu, 9 Nov 2023 23:05:11 +0100 Subject: Revert "Don't lowercase linguistics annotations" --- .../expressions/NGramExpression.java | 7 ++--- .../linguistics/LinguisticsAnnotator.java | 30 ++++++++++++---------- .../expressions/NGramTestCase.java | 4 +-- .../linguistics/LinguisticsAnnotatorTestCase.java | 6 ++--- .../java/com/yahoo/language/LinguisticsCase.java | 1 - .../com/yahoo/language/process/GramSplitter.java | 3 ++- 6 files changed, 25 insertions(+), 26 deletions(-) diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java index fdfadf65400..26058eeb8f3 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java @@ -15,8 +15,6 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator; import java.util.Iterator; -import static com.yahoo.language.LinguisticsCase.toLowerCase; - /** * A filter which splits incoming text into n-grams. * @@ -70,9 +68,8 @@ public final class NGramExpression extends Expression { // annotate gram as a word term String gramString = gram.extractFrom(output.getString()); - typedSpan(gram.getStart(), - gram.getCodePointCount(), - TokenType.ALPHABETIC, spanList).annotate(LinguisticsAnnotator.termAnnotation(toLowerCase(gramString), gramString)); + typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList). + annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString)); lastPosition = gram.getStart() + gram.getCodePointCount(); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 04019800d59..191d067effe 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -86,23 +86,24 @@ public class LinguisticsAnnotator { } /** - * Creates a TERM annotation which has the term as annotation (only) if it is different from the + * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the * original. * - * @param term the term - * @param origTerm the original term + * @param termToLowerCase the term to lower case + * @param origTerm the original term * @return the created TERM annotation */ - public static Annotation termAnnotation(String term, String origTerm) { - if (term.equals(origTerm)) + public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) { + String annotationValue = toLowerCase(termToLowerCase); + if (annotationValue.equals(origTerm)) { return new Annotation(AnnotationTypes.TERM); - else - return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term)); + } + return new Annotation(AnnotationTypes.TERM, new StringFieldValue(annotationValue)); } private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) { if (termOccurrences.termCountBelowLimit(term)) { - here.annotate(termAnnotation(term, orig)); + here.annotate(lowerCaseTermAnnotation(term, orig)); } } @@ -126,20 +127,21 @@ public class LinguisticsAnnotator { } if (mode == StemMode.ALL) { Span where = parent.span((int)token.getOffset(), token.getOrig().length()); + String lowercasedOrig = toLowerCase(token.getOrig()); addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences); - String lowercasedOrig = toLowerCase(token.getOrig()); - String termOrIfNullOrig = lowercasedOrig; + String lowercasedTerm = lowercasedOrig; String term = token.getTokenString(); if (term != null) { - termOrIfNullOrig = term; + lowercasedTerm = toLowerCase(term); } - if (! lowercasedOrig.equals(termOrIfNullOrig)) { + if (! lowercasedOrig.equals(lowercasedTerm)) { addAnnotation(where, term, token.getOrig(), termOccurrences); } for (int i = 0; i < token.getNumStems(); i++) { String stem = token.getStem(i); - if (! (lowercasedOrig.equals(stem) || termOrIfNullOrig.equals(stem))) { + String lowercasedStem = toLowerCase(stem); + if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) { addAnnotation(where, stem, token.getOrig(), termOccurrences); } } @@ -147,7 +149,7 @@ public class LinguisticsAnnotator { String term = token.getTokenString(); if (term == null || term.trim().isEmpty()) return; if (termOccurrences.termCountBelowLimit(term)) { - parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig())); + parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig())); } } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java index b4e266ab3eb..bcde8751de8 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java @@ -57,8 +57,8 @@ public class NGramTestCase { new NGramExpression(new SimpleLinguistics(), 3).execute(context); StringFieldValue value = (StringFieldValue)context.getValue(); - assertEquals("Grams are pure annotations - field value is unchanged", - "en gul Bille sang... ", value.getString()); + assertEquals("Grams are pure annotations - field value is unchanged", "en gul Bille sang... ", + value.getString()); SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS); assertNotNull(gramTree); SpanList grams = (SpanList)gramTree.getRoot(); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index 8baa4851f5d..67bff3843ee 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -82,15 +82,15 @@ public class LinguisticsAnnotatorTestCase { } @Test - public void requireThatTermAnnotationsPreserveCasing() { + public void requireThatTermAnnotationsAreLowerCased() { SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); - expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("BaR"))); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); for (boolean specialToken : Arrays.asList(true, false)) { for (TokenType type : TokenType.values()) { if (!specialToken && !type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", newToken("foo", "BaR", type, specialToken)); + assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken)); } } } diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java index f0439a21fec..5ad6a382abd 100644 --- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java +++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java @@ -26,7 +26,6 @@ public class LinguisticsCase { public static String toLowerCase(String in) { // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29 // Also, at the time of writing, English is the default language for queries - if (in == null) return null; return Lowercase.toLowerCase(in); } diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java index 9178c2d7e09..33f5ee7e4bb 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -189,8 +189,9 @@ public class GramSplitter { @Override public boolean equals(Object o) { if (this == o) return true; - if ( ! (o instanceof Gram gram)) return false; + if ( ! (o instanceof Gram)) return false; + Gram gram = (Gram)o; if (codePointCount != gram.codePointCount) return false; if (start != gram.start) return false; return true; -- cgit v1.2.3