aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2021-06-15 19:43:59 +0200
committerJon Bratseth <bratseth@gmail.com>2021-06-15 19:43:59 +0200
commit0c3afd47b0e2f43bc9aa9e4e60d33c104c37b81b (patch)
treeb419c9be56d0fd049b49ba6f4258bb5d2f931c00
parent8ef499e16e9fb5daede071d36cb523f4d30538c0 (diff)
Require replacements to be applied during tokenization
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java14
-rw-r--r--indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java27
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Token.java8
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java11
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java4
5 files changed, 24 insertions, 40 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 8b6ef83f05e..81a5305a778 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -115,9 +115,7 @@ public class LinguisticsAnnotator {
}
return;
}
- if ( ! token.isIndexable()) {
- return;
- }
+ if ( ! token.isIndexable()) return;
}
String orig = token.getOrig();
int pos = (int)token.getOffset();
@@ -138,9 +136,6 @@ public class LinguisticsAnnotator {
String lowercasedTerm = lowercasedOrig;
String term = token.getTokenString();
if (term != null) {
- term = tokenizer.getReplacementTerm(term);
- }
- if (term != null) {
lowercasedTerm = toLowerCase(term);
}
if (! lowercasedOrig.equals(lowercasedTerm)) {
@@ -155,12 +150,7 @@ public class LinguisticsAnnotator {
}
} else {
String term = token.getTokenString();
- if (term != null) {
- term = tokenizer.getReplacementTerm(term);
- }
- if (term == null || term.trim().isEmpty()) {
- return;
- }
+ if (term == null || term.trim().isEmpty()) return;
if (termOccurrences.termCountBelowLimit(term)) {
parent.span(pos, len).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
}
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index afbcf597a46..5f436720990 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -19,6 +19,7 @@ import org.junit.Test;
import org.mockito.Mockito;
import java.util.*;
+import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -30,12 +31,6 @@ public class LinguisticsAnnotatorTestCase {
private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
- // --------------------------------------------------------------------------------
- //
- // Tests
- //
- // --------------------------------------------------------------------------------
-
@Test
public void requireThatAnnotateFailsWithZeroTokens() {
assertAnnotations(null, "foo");
@@ -145,7 +140,7 @@ public class LinguisticsAnnotatorTestCase {
continue;
}
assertAnnotations(expected, "foo",
- newLinguistics(Arrays.asList(newToken("foo", "foo", type, specialToken)),
+ newLinguistics(List.of(newToken("foo", "foo", type, specialToken)),
Collections.singletonMap("foo", "bar")));
}
}
@@ -159,7 +154,7 @@ public class LinguisticsAnnotatorTestCase {
StringFieldValue val = new StringFieldValue("foo");
val.setSpanTree(spanTree);
- Linguistics linguistics = newLinguistics(Arrays.asList(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
+ Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
Collections.<String, String>emptyMap());
new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
@@ -253,11 +248,15 @@ public class LinguisticsAnnotatorTestCase {
private static class MyTokenizer implements Tokenizer {
final List<Token> tokens;
- final Map<String, String> replacementTerms;
public MyTokenizer(List<? extends Token> tokens, Map<String, String> replacementTerms) {
- this.tokens = new ArrayList<>(tokens);
- this.replacementTerms = replacementTerms;
+ this.tokens = tokens.stream().map(token -> replace(token, replacementTerms)).collect(Collectors.toList());
+ }
+
+ /**
+ * Applies the configured replacement, if any, to the given token in place.
+ * The token is cast to SimpleToken and the same (mutated) instance is returned;
+ * when the map has no entry for the current token string it is kept unchanged.
+ */
+ private Token replace(Token token, Map<String, String> replacementTerms) {
+ SimpleToken mutableToken = (SimpleToken)token;
+ String currentTerm = token.getTokenString();
+ String replacedTerm = replacementTerms.getOrDefault(currentTerm, currentTerm);
+ mutableToken.setTokenString(replacedTerm);
+ return mutableToken;
@Override
@@ -265,10 +264,6 @@ public class LinguisticsAnnotatorTestCase {
return tokens;
}
- @Override
- public String getReplacementTerm(String term) {
- String replacement = replacementTerms.get(term);
- return replacement != null ? replacement : term;
- }
}
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java
index 73c0ac857ab..70b78ef1a92 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Token.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java
@@ -38,12 +38,12 @@ public interface Token {
TokenScript getScript();
/**
- * Returns token string in a form suitable for indexing: The
- * most lowercased variant of the most processed token form available.
+ * Returns the token string in a form suitable for indexing: The
+ * most lowercased variant of the most processed token form available.
* If called on a compound token this returns a lowercased form of the
* entire word.
- *
- * @return token string value
+ * If this is a special token with a configured replacement,
+ * this will return the replacement token.
*/
String getTokenString();
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
index 7e61cd885a8..5be0a6fa635 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
@@ -23,16 +23,11 @@ public interface Tokenizer {
Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents);
/**
- * Return a replacement for an input token string.
- * This accepts strings returned by Token.getTokenString
- * and returns a replacement which will be used as the index token.
- * The input token string is returned if there is no replacement.
- * <p>
- * This default implementation always returns the input token string.
+ * Not used.
*
- * @param tokenString the token string of the term to lookup a replacement for
- * @return the replacement, if any, or the argument token string if not
+ * @deprecated replacements are already applied in tokens returned by tokenize
*/
+ @Deprecated // Remove on Vespa 8
default String getReplacementTerm(String tokenString) { return tokenString; }
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 122b9b6dff6..7b63650fa94 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -25,6 +25,10 @@ public class SimpleToken implements Token {
this.orig = orig;
}
+ /**
+ * Creates a token with both its original text and its token string set.
+ *
+ * @param orig the original text this token was produced from
+ * @param tokenString the string returned by getTokenString for this token
+ */
+ public SimpleToken(String orig, String tokenString) {
+ this.orig = orig;
+ // Previously the tokenString argument was accepted but silently dropped.
+ this.tokenString = tokenString;
+ }
+
@Override
public String getOrig() {
return orig;