diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java | 38 |
1 files changed, 38 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java new file mode 100644 index 00000000000..d7d1e210de4 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java @@ -0,0 +1,38 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +/** + * Language-sensitive tokenization of a text string. + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public interface Tokenizer { + + /** + * Returns the tokens produced from an input string under the rules of the given Language and additional options + * + * @param input the string to tokenize. May be arbitrarily large. + * @param language the language of the input string. + * @param stemMode the stem mode applied on the returned tokens + * @param removeAccents if true accents and similar are removed from the returned tokens + * @return the tokens of the input String. + * @throws ProcessingException If the underlying library throws an Exception. + */ + Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents); + + /** + * Return a replacement for an input token string. + * This accepts strings returned by Token.getTokenString + * and returns a replacement which will be used as the index token. + * The input token string is returned if there is no replacement. + * <p> + * This default implementation always returns the input token string. + * + * @param tokenString the token string of the term to lookup a replacement for + * @return the replacement, if any, or the argument token string if not + */ + default String getReplacementTerm(String tokenString) { return tokenString; } + +} |