summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2022-09-25 22:18:57 +0200
committer GitHub <noreply@github.com> 2022-09-25 22:18:57 +0200
commit 508e348097d20344417a5185f72aca418ca930f0 (patch)
tree 15b5e25be5908710fc23c8224550bcc4b98eb04d /indexinglanguage
parent 81de99f749f119aa8f4f06d5346f6990e785dbc3 (diff)
parent a2fa15b1ab6d6a9930381e981694ce5b39a1160c (diff)
Merge pull request #24007 from vespa-engine/bratseth/cleanup-082
No functional changes
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 27
1 files changed, 12 insertions, 15 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 5986ab44426..173df65a47e 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -63,7 +63,7 @@ public class LinguisticsAnnotator {
* Annotates the given string with the appropriate linguistics annotations.
*
* @param text the text to annotate
- * @return whether or not anything was annotated
+ * @return whether anything was annotated
*/
public boolean annotate(StringFieldValue text) {
if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
@@ -116,21 +116,18 @@ public class LinguisticsAnnotator {
}
if ( ! token.isIndexable()) return;
}
- String orig = token.getOrig();
- int pos = (int)token.getOffset();
- if (pos >= input.length()) {
- throw new IllegalArgumentException("Token '" + orig + "' has offset " + pos + ", which is outside the " +
- "bounds of the input string; " + input);
+ if (token.getOffset() >= input.length()) {
+ throw new IllegalArgumentException(token + " has offset " + token.getOffset() + ", which is outside the " +
+ "bounds of the input string '" + input + "'");
}
- int len = orig.length();
- if (pos + len > input.length()) {
- throw new IllegalArgumentException("Token '" + orig + "' has offset " + pos + ", which makes it overflow " +
+ if (token.getOffset() + token.getOrig().length() > input.length()) {
+ throw new IllegalArgumentException(token + " has offset " + token.getOffset() + ", which makes it overflow " +
"the bounds of the input string; " + input);
}
if (mode == StemMode.ALL) {
- Span where = parent.span(pos, len);
- String lowercasedOrig = toLowerCase(orig);
- addAnnotation(where, orig, orig, termOccurrences);
+ Span where = parent.span((int)token.getOffset(), token.getOrig().length());
+ String lowercasedOrig = toLowerCase(token.getOrig());
+ addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
String lowercasedTerm = lowercasedOrig;
String term = token.getTokenString();
@@ -138,20 +135,20 @@ public class LinguisticsAnnotator {
lowercasedTerm = toLowerCase(term);
}
if (! lowercasedOrig.equals(lowercasedTerm)) {
- addAnnotation(where, term, orig, termOccurrences);
+ addAnnotation(where, term, token.getOrig(), termOccurrences);
}
for (int i = 0; i < token.getNumStems(); i++) {
String stem = token.getStem(i);
String lowercasedStem = toLowerCase(stem);
if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) {
- addAnnotation(where, stem, orig, termOccurrences);
+ addAnnotation(where, stem, token.getOrig(), termOccurrences);
}
}
} else {
String term = token.getTokenString();
if (term == null || term.trim().isEmpty()) return;
if (termOccurrences.termCountBelowLimit(term)) {
- parent.span(pos, len).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
+ parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
}
}
}