summaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2021-06-15 19:43:59 +0200
committerJon Bratseth <bratseth@gmail.com>2021-06-15 19:43:59 +0200
commit0c3afd47b0e2f43bc9aa9e4e60d33c104c37b81b (patch)
treeb419c9be56d0fd049b49ba6f4258bb5d2f931c00 /linguistics
parent8ef499e16e9fb5daede071d36cb523f4d30538c0 (diff)
Require replacements to be applied during tokenization
Diffstat (limited to 'linguistics')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Token.java8
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java11
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java4
3 files changed, 11 insertions, 12 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java
index 73c0ac857ab..70b78ef1a92 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Token.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java
@@ -38,12 +38,12 @@ public interface Token {
TokenScript getScript();
/**
- * Returns token string in a form suitable for indexing: The
- * most lowercased variant of the most processed token form available.
+ * Returns the token string in a form suitable for indexing: The
+ * most lowercased variant of the most processed token form available.
* If called on a compound token this returns a lowercased form of the
* entire word.
- *
- * @return token string value
+ * If this is a special token with a configured replacement,
+ * this will return the replacement token.
*/
String getTokenString();
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
index 7e61cd885a8..5be0a6fa635 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
@@ -23,16 +23,11 @@ public interface Tokenizer {
Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents);
/**
- * Return a replacement for an input token string.
- * This accepts strings returned by Token.getTokenString
- * and returns a replacement which will be used as the index token.
- * The input token string is returned if there is no replacement.
- * <p>
- * This default implementation always returns the input token string.
+ * Not used.
*
- * @param tokenString the token string of the term to lookup a replacement for
- * @return the replacement, if any, or the argument token string if not
+ * @deprecated replacements are already applied in tokens returned by tokenize
*/
+ @Deprecated // Remove on Vespa 8
default String getReplacementTerm(String tokenString) { return tokenString; }
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 122b9b6dff6..7b63650fa94 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -25,6 +25,10 @@ public class SimpleToken implements Token {
this.orig = orig;
}
+ public SimpleToken(String orig, String tokenString) {
+ this.orig = orig; // NOTE(review): the tokenString argument is not stored anywhere in the lines shown — confirm whether it should be assigned to a field here
+ }
+
@Override
public String getOrig() {
return orig;