diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-06-15 19:43:59 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-06-15 19:43:59 +0200 |
commit | 0c3afd47b0e2f43bc9aa9e4e60d33c104c37b81b (patch) | |
tree | b419c9be56d0fd049b49ba6f4258bb5d2f931c00 | |
parent | 8ef499e16e9fb5daede071d36cb523f4d30538c0 (diff) |
Require replacements to be applied during tokenization
5 files changed, 24 insertions, 40 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 8b6ef83f05e..81a5305a778 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -115,9 +115,7 @@ public class LinguisticsAnnotator { } return; } - if ( ! token.isIndexable()) { - return; - } + if ( ! token.isIndexable()) return; } String orig = token.getOrig(); int pos = (int)token.getOffset(); @@ -138,9 +136,6 @@ public class LinguisticsAnnotator { String lowercasedTerm = lowercasedOrig; String term = token.getTokenString(); if (term != null) { - term = tokenizer.getReplacementTerm(term); - } - if (term != null) { lowercasedTerm = toLowerCase(term); } if (! lowercasedOrig.equals(lowercasedTerm)) { @@ -155,12 +150,7 @@ public class LinguisticsAnnotator { } } else { String term = token.getTokenString(); - if (term != null) { - term = tokenizer.getReplacementTerm(term); - } - if (term == null || term.trim().isEmpty()) { - return; - } + if (term == null || term.trim().isEmpty()) return; if (termOccurrences.termCountBelowLimit(term)) { parent.span(pos, len).annotate(lowerCaseTermAnnotation(term, token.getOrig())); } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index afbcf597a46..5f436720990 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -19,6 +19,7 @@ import org.junit.Test; import org.mockito.Mockito; import java.util.*; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -30,12 +31,6 @@ public class LinguisticsAnnotatorTestCase { private static final AnnotatorConfig CONFIG = new AnnotatorConfig(); - // -------------------------------------------------------------------------------- - // - // Tests - // - // -------------------------------------------------------------------------------- - @Test public void requireThatAnnotateFailsWithZeroTokens() { assertAnnotations(null, "foo"); @@ -145,7 +140,7 @@ public class LinguisticsAnnotatorTestCase { continue; } assertAnnotations(expected, "foo", - newLinguistics(Arrays.asList(newToken("foo", "foo", type, specialToken)), + newLinguistics(List.of(newToken("foo", "foo", type, specialToken)), Collections.singletonMap("foo", "bar"))); } } @@ -159,7 +154,7 @@ public class LinguisticsAnnotatorTestCase { StringFieldValue val = new StringFieldValue("foo"); val.setSpanTree(spanTree); - Linguistics linguistics = newLinguistics(Arrays.asList(newToken("foo", "bar", TokenType.ALPHABETIC, false)), + Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)), Collections.<String, String>emptyMap()); new LinguisticsAnnotator(linguistics, CONFIG).annotate(val); @@ -253,11 +248,15 @@ public class LinguisticsAnnotatorTestCase { private static class MyTokenizer implements Tokenizer { final List<Token> tokens; - final Map<String, String> replacementTerms; public MyTokenizer(List<? extends Token> tokens, Map<String, String> replacementTerms) { - this.tokens = new ArrayList<>(tokens); - this.replacementTerms = replacementTerms; + this.tokens = tokens.stream().map(token -> replace(token, replacementTerms)).collect(Collectors.toList()); + } + + private Token replace(Token token, Map<String, String> replacementTerms) { + var simpleToken = (SimpleToken)token; + simpleToken.setTokenString(replacementTerms.getOrDefault(token.getTokenString(), token.getTokenString())); + return simpleToken; } @Override @@ -265,10 +264,6 @@ public class LinguisticsAnnotatorTestCase { return tokens; } - @Override - public String getReplacementTerm(String term) { - String replacement = replacementTerms.get(term); - return replacement != null ? replacement : term; - } } + } diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java index 73c0ac857ab..70b78ef1a92 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Token.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java @@ -38,12 +38,12 @@ public interface Token { TokenScript getScript(); /** - * Returns token string in a form suitable for indexing: The - * most lowercased variant of the most processed token form available. + * Returns the token string in a form suitable for indexing: The + * most lowercased variant of the most processed token form available, * If called on a compound token this returns a lowercased form of the * entire word. - * - * @return token string value + * If this is a special token with a configured replacement, + * this will return the replacement token. */ String getTokenString(); diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java index 7e61cd885a8..5be0a6fa635 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java @@ -23,16 +23,11 @@ public interface Tokenizer { Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents); /** - * Return a replacement for an input token string. - * This accepts strings returned by Token.getTokenString - * and returns a replacement which will be used as the index token. - * The input token string is returned if there is no replacement. - * <p> - * This default implementation always returns the input token string. + * Not used. * - * @param tokenString the token string of the term to lookup a replacement for - * @return the replacement, if any, or the argument token string if not + * @deprecated replacements are already applied in tokens returned by tokenize */ + @Deprecated // Remove on Vespa 8 default String getReplacementTerm(String tokenString) { return tokenString; } } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java index 122b9b6dff6..7b63650fa94 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java @@ -25,6 +25,10 @@ public class SimpleToken implements Token { this.orig = orig; } + public SimpleToken(String orig, String tokenString) { + this.orig = orig; + } + @Override public String getOrig() { return orig; |