From a2fa15b1ab6d6a9930381e981694ce5b39a1160c Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Sun, 11 Sep 2022 22:19:47 +0200 Subject: No functional changes --- .../linguistics/LinguisticsAnnotator.java | 27 ++++++++++------------ .../com/yahoo/language/simple/SimpleToken.java | 20 ++++++++-------- .../yahoo/language/simple/SimpleTokenTestCase.java | 4 ++-- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 5986ab44426..173df65a47e 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -63,7 +63,7 @@ public class LinguisticsAnnotator { * Annotates the given string with the appropriate linguistics annotations. * * @param text the text to annotate - * @return whether or not anything was annotated + * @return whether anything was annotated */ public boolean annotate(StringFieldValue text) { if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS. @@ -116,21 +116,18 @@ public class LinguisticsAnnotator { } if ( ! token.isIndexable()) return; } - String orig = token.getOrig(); - int pos = (int)token.getOffset(); - if (pos >= input.length()) { - throw new IllegalArgumentException("Token '" + orig + "' has offset " + pos + ", which is outside the " + - "bounds of the input string; " + input); + if (token.getOffset() >= input.length()) { + throw new IllegalArgumentException(token + " has offset " + token.getOffset() + ", which is outside the " + + "bounds of the input string '" + input + "'"); } - int len = orig.length(); - if (pos + len > input.length()) { - throw new IllegalArgumentException("Token '" + orig + "' has offset " + pos + ", which makes it overflow " + + if (token.getOffset() + token.getOrig().length() > input.length()) { + throw new IllegalArgumentException(token + " has offset " + token.getOffset() + ", which makes it overflow " + "the bounds of the input string; " + input); } if (mode == StemMode.ALL) { - Span where = parent.span(pos, len); - String lowercasedOrig = toLowerCase(orig); - addAnnotation(where, orig, orig, termOccurrences); + Span where = parent.span((int)token.getOffset(), token.getOrig().length()); + String lowercasedOrig = toLowerCase(token.getOrig()); + addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences); String lowercasedTerm = lowercasedOrig; String term = token.getTokenString(); @@ -138,20 +135,20 @@ public class LinguisticsAnnotator { lowercasedTerm = toLowerCase(term); } if (! lowercasedOrig.equals(lowercasedTerm)) { - addAnnotation(where, term, orig, termOccurrences); + addAnnotation(where, term, token.getOrig(), termOccurrences); } for (int i = 0; i < token.getNumStems(); i++) { String stem = token.getStem(i); String lowercasedStem = toLowerCase(stem); if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) { - addAnnotation(where, stem, orig, termOccurrences); + addAnnotation(where, stem, token.getOrig(), termOccurrences); } } } else { String term = token.getTokenString(); if (term == null || term.trim().isEmpty()) return; if (termOccurrences.termCountBelowLimit(term)) { - parent.span(pos, len).annotate(lowerCaseTermAnnotation(term, token.getOrig())); + parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig())); } } } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java index b6ca219afc8..7ed9e1a2f03 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java @@ -7,6 +7,7 @@ import com.yahoo.language.process.TokenType; import java.util.ArrayList; import java.util.List; +import java.util.Objects; /** * @author Mathias Mølster Lidal @@ -115,14 +116,14 @@ public class SimpleToken implements Token { if (!(o instanceof Token other)) return false; if (getType() != other.getType()) return false; - if (!equalsOpt(getOrig(), other.getOrig())) return false; + if (!Objects.equals(getOrig(), other.getOrig())) return false; if (getOffset() != other.getOffset()) return false; - if (!equalsOpt(getScript(), other.getScript())) return false; - if (!equalsOpt(getTokenString(), other.getTokenString())) return false; + if (!Objects.equals(getScript(), other.getScript())) return false; + if (!Objects.equals(getTokenString(), other.getTokenString())) return false; if (isSpecialToken() != other.isSpecialToken()) return false; if (getNumComponents() != other.getNumComponents()) return false; for (int i = 0, len = getNumComponents(); i < len; ++i) { - if (!equalsOpt(getComponent(i), other.getComponent(i))) + if (!Objects.equals(getComponent(i), other.getComponent(i))) return false; } return true; @@ -133,15 +134,12 @@ public class SimpleToken implements Token { return orig.hashCode(); } - private static boolean equalsOpt(Object lhs, Object rhs) { - if (lhs == null || rhs == null) { - return lhs == rhs; - } - return lhs.equals(rhs); - } - @Override public String toString() { + return "token '" + orig + "'"; + } + + public String toDetailString() { return "token : " + getClass().getSimpleName() + " {\n" + toString(this, " ") + "}"; } diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java index c699f9d314b..67d787d8587 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java @@ -138,7 +138,7 @@ public class SimpleTokenTestCase { } @Test - public void requireThatToStringIsExpressive() { + public void testDetailString() { SimpleToken token = new SimpleToken("my_orig"); token.addComponent(new SimpleToken("my_component_1")); token.addComponent(new SimpleToken("my_component_2")); @@ -177,7 +177,7 @@ public class SimpleTokenTestCase { " token string : 'my_token_string'\n" + " type : ALPHABETIC\n" + "}"; - assertEquals(expected, token.toString()); + assertEquals(expected, token.toDetailString()); } @Test -- cgit v1.2.3