about | summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2021-06-15 19:43:59 +0200
committer: Jon Bratseth <bratseth@gmail.com> 2021-06-15 19:43:59 +0200
commit: 0c3afd47b0e2f43bc9aa9e4e60d33c104c37b81b (patch)
tree: b419c9be56d0fd049b49ba6f4258bb5d2f931c00 /indexinglanguage
parent: 8ef499e16e9fb5daede071d36cb523f4d30538c0 (diff)
Require replacements to be applied during tokenization
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 14
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java | 27
2 files changed, 13 insertions, 28 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 8b6ef83f05e..81a5305a778 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -115,9 +115,7 @@ public class LinguisticsAnnotator {
}
return;
}
- if ( ! token.isIndexable()) {
- return;
- }
+ if ( ! token.isIndexable()) return;
}
String orig = token.getOrig();
int pos = (int)token.getOffset();
@@ -138,9 +136,6 @@ public class LinguisticsAnnotator {
String lowercasedTerm = lowercasedOrig;
String term = token.getTokenString();
if (term != null) {
- term = tokenizer.getReplacementTerm(term);
- }
- if (term != null) {
lowercasedTerm = toLowerCase(term);
}
if (! lowercasedOrig.equals(lowercasedTerm)) {
@@ -155,12 +150,7 @@ public class LinguisticsAnnotator {
}
} else {
String term = token.getTokenString();
- if (term != null) {
- term = tokenizer.getReplacementTerm(term);
- }
- if (term == null || term.trim().isEmpty()) {
- return;
- }
+ if (term == null || term.trim().isEmpty()) return;
if (termOccurrences.termCountBelowLimit(term)) {
parent.span(pos, len).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
}
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index afbcf597a46..5f436720990 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -19,6 +19,7 @@ import org.junit.Test;
import org.mockito.Mockito;
import java.util.*;
+import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -30,12 +31,6 @@ public class LinguisticsAnnotatorTestCase {
private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
- // --------------------------------------------------------------------------------
- //
- // Tests
- //
- // --------------------------------------------------------------------------------
-
@Test
public void requireThatAnnotateFailsWithZeroTokens() {
assertAnnotations(null, "foo");
@@ -145,7 +140,7 @@ public class LinguisticsAnnotatorTestCase {
continue;
}
assertAnnotations(expected, "foo",
- newLinguistics(Arrays.asList(newToken("foo", "foo", type, specialToken)),
+ newLinguistics(List.of(newToken("foo", "foo", type, specialToken)),
Collections.singletonMap("foo", "bar")));
}
}
@@ -159,7 +154,7 @@ public class LinguisticsAnnotatorTestCase {
StringFieldValue val = new StringFieldValue("foo");
val.setSpanTree(spanTree);
- Linguistics linguistics = newLinguistics(Arrays.asList(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
+ Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
Collections.<String, String>emptyMap());
new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
@@ -253,11 +248,15 @@ public class LinguisticsAnnotatorTestCase {
private static class MyTokenizer implements Tokenizer {
final List<Token> tokens;
- final Map<String, String> replacementTerms;
public MyTokenizer(List<? extends Token> tokens, Map<String, String> replacementTerms) {
- this.tokens = new ArrayList<>(tokens);
- this.replacementTerms = replacementTerms;
+ this.tokens = tokens.stream().map(token -> replace(token, replacementTerms)).collect(Collectors.toList());
+ }
+
+ private Token replace(Token token, Map<String, String> replacementTerms) {
+ var simpleToken = (SimpleToken)token;
+ simpleToken.setTokenString(replacementTerms.getOrDefault(token.getTokenString(), token.getTokenString()));
+ return simpleToken;
}
@Override
@@ -265,10 +264,6 @@ public class LinguisticsAnnotatorTestCase {
return tokens;
}
- @Override
- public String getReplacementTerm(String term) {
- String replacement = replacementTerms.get(term);
- return replacement != null ? replacement : term;
- }
}
+
}