diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/main/java/com/yahoo/language/process |
Publish
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process')
16 files changed, 818 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java new file mode 100644 index 00000000000..0e1327aabcf --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java @@ -0,0 +1,55 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * Determines the class of a given character. Use this rather than java.lang.Character. + * + * @author bratseth + */ +public class CharacterClasses { + + /** + * Returns true for code points which are letters in unicode 3 or 4, plus some additional characters + * which are useful to view as letters even though not defined as such in unicode. + */ + public boolean isLetter(int c) { + if (java.lang.Character.isLetter(c)) return true; + if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters + // if (c == '_') return true; + + // Ticket 3864695, some CJK punctuation YST defined as word characters + if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' || + c == '\u300c' || c == '\u300d' || c == '\u300e' || + c == '\u300f' || c == '\u3010' || c == '\u3011') { + return true; + } + int type = java.lang.Character.getType(c); + return type == java.lang.Character.NON_SPACING_MARK || + type == java.lang.Character.COMBINING_SPACING_MARK || + type == java.lang.Character.ENCLOSING_MARK; + } + + /** + * Returns true for code points which should be considered digits - same as java.lang.Character.isDigit + */ + public boolean isDigit(int c) { + return Character.isDigit(c); + } + + /** Returns true if this is a latin digit (other digits are not consistently parsed into numbers by Java) */ + public boolean isLatinDigit(int c) { + return Character.isDigit(c) && isLatin(c); + } + + /** Returns true if this is a latin character */ + 
public boolean isLatin(int c) { + return Character.UnicodeBlock.of(c).equals(Character.UnicodeBlock.BASIC_LATIN); + } + + /** + * Convenience, returns isLetter(c) || isDigit(c) + */ + public boolean isLetterOrDigit(int c) { + return isLetter(c) || isDigit(c); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java new file mode 100644 index 00000000000..0672582d732 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -0,0 +1,222 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import java.util.*; + +/** + * A class which splits consecutive word character sequences into overlapping character n-grams. + * For example "en gul bille sang" split into 2-grams becomes + * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang". + * <p> + * This class is multithread safe. + * + * @author bratseth + */ +public class GramSplitter { + + private final CharacterClasses characterClasses; + + public GramSplitter(CharacterClasses characterClasses) { + this.characterClasses = characterClasses; + } + + /** + * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length] + * pairs into the input string. + * <p> + * The iterator is implemented as a sliding view over the input string rather than being backed by a + * list, which makes this space efficient for large strings. 
+ * + * @param input the input string to be split, cannot be null + * @param n the gram size, a positive integer + * @return a read only iterator over the resulting grams + * @throws NullPointerException if input==null + * @throws IllegalArgumentException if n is less than 1 + */ + public GramSplitterIterator split(String input, int n) { + if (input == null) { + throw new NullPointerException("input cannot be null"); + } + if (n < 1) { + throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n); + } + return new GramSplitterIterator(input, n, characterClasses); + } + + public static class GramSplitterIterator implements Iterator<Gram> { + + private final CharacterClasses characterClasses; + + /** + * Text to split + */ + private final String input; + + /** + * Gram size + */ + private final int n; + + /** + * Current index + */ + private int i = 0; + + /** + * Whether the last thing that happened was being on a separator (including the start of the string) + */ + private boolean isFirstAfterSeparator = true; + + /** + * The next gram or null if not determined yet + */ + private Gram nextGram = null; + + public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) { + this.input = input; + this.n = n; + this.characterClasses = characterClasses; + } + + @Override + public boolean hasNext() { + if (nextGram != null) { + return true; + } + nextGram = findNext(); + return nextGram != null; + } + + @Override + public Gram next() { + Gram currentGram = nextGram; + if (currentGram == null) { + currentGram = findNext(); + } + if (currentGram == null) { + throw new NoSuchElementException("No next gram at position " + i); + } + nextGram = null; + return currentGram; + } + + private Gram findNext() { + // Skip to next word character + while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) { + i++; + isFirstAfterSeparator = true; + } + if (i >= input.length()) { + return null; + } + + String gram 
= input.substring(i, Math.min(i + n, input.length())); + int nonWordChar = indexOfNonWordChar(gram); + if (nonWordChar == 0) { + throw new RuntimeException("Programming error"); + } + if (nonWordChar > 0) { + gram = gram.substring(0, nonWordChar); + } + + if (gram.length() == n) { // normal case: got a full length gram + i++; + isFirstAfterSeparator = false; + return new Gram(i - 1, gram.length()); + } else { // gram is too short due either to a non-word separator or end of string + if (isFirstAfterSeparator) { // make a gram anyway + i++; + isFirstAfterSeparator = false; + return new Gram(i - 1, gram.length()); + } else { // skip to next + i += gram.length() + 1; + isFirstAfterSeparator = true; + return findNext(); + } + } + } + + private int indexOfNonWordChar(String s) { + for (int i = 0; i < s.length(); i++) { + if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) { + return i; + } + } + return -1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("This iterator is read only"); + } + + /** + * Convenience list which splits the remaining items in this iterator into a list of gram strings + * + * @return an immutable list of extracted grams + */ + public List<String> toExtractedList() { + List<String> gramList = new ArrayList<>(); + while (hasNext()) { + gramList.add(next().extractFrom(input)); + } + return Collections.unmodifiableList(gramList); + } + } + + /** + * An immutable start index and length pair + */ + public static final class Gram { + + private int start, length; + + public Gram(int start, int length) { + this.start = start; + this.length = length; + } + + public int getStart() { + return start; + } + + public int getLength() { + return length; + } + + /** + * Returns this gram as a string from the input string + */ + public String extractFrom(String input) { + return input.substring(start, start + length); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o 
instanceof Gram)) { + return false; + } + + Gram gram = (Gram)o; + + if (length != gram.length) { + return false; + } + if (start != gram.start) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = start; + result = 31 * result + length; + return result; + } + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java new file mode 100644 index 00000000000..f4e1ccc9feb --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java @@ -0,0 +1,19 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * <p>This interface provides NFKC normalization of Strings through the underlying linguistics library.</p> + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias M\u00F8lster Lidal</a> + */ +public interface Normalizer { + + /** + * <p>NFKC normalizes a String.</p> + * + * @param input String to normalize. + * @return The normalized String. + * @throws ProcessingException If underlying library throws an Exception. + */ + public String normalize(String input); +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java new file mode 100644 index 00000000000..ce8b455707c --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java @@ -0,0 +1,18 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +/** + * <p>Exception class indicating that a fatal error occurred during linguistic processing.</p> + * + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class ProcessingException extends RuntimeException { + + public ProcessingException(String message) { + super(message); + } + + public ProcessingException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java new file mode 100644 index 00000000000..73764e06ef6 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java @@ -0,0 +1,29 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.List; + +/** + * <p>Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a + * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK + * processing).</p> + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public interface Segmenter { + + /** + * Split input-string into tokens, and return a list of tokens in unprocessed form (i.e. lowercased, normalized + * and stemmed if applicable, see {@link StemMode} for list of stemming options). It is assumed that the input only + * contains word-characters, any punctuation and spacing tokens will be removed. + * + * @param input the text to segment. + * @param language language of input text. + * @return the list of segments. 
+ * @throws ProcessingException if an exception is encountered during processing + */ + List<String> segment(String input, Language language); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java new file mode 100644 index 00000000000..146d65cb7e2 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java @@ -0,0 +1,45 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class SegmenterImpl implements Segmenter { + + private final Tokenizer tokenizer; + + public SegmenterImpl(Tokenizer tokenizer) { + this.tokenizer = tokenizer; + } + + @Override + public List<String> segment(String input, Language language) { + List<String> segments = new ArrayList<>(); + for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false)) { + findSegments(token, segments); + } + if (segments.isEmpty()) { + segments.add(input); // no segments, return original string + } + return segments; + } + + private void findSegments(Token token, List<String> out) { + int len; + if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) { + if (token.isIndexable()) { + out.add(token.getOrig()); + } + } else { + for (int i = 0; i < len; ++i) { + findSegments(token.getComponent(i), out); + } + } + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemList.java b/linguistics/src/main/java/com/yahoo/language/process/StemList.java new file mode 100644 index 00000000000..d355af87f08 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/StemList.java @@ -0,0 +1,61 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import java.util.AbstractList; +import java.util.ArrayList; + +/** + * A list of strings which does not allow for duplicate elements. + * + * @author steinar + */ +public class StemList extends AbstractList<String> { + private final ArrayList<String> stems; + + public StemList() { + this(new String[0]); + } + + public StemList(String... stems) { + super(); + this.stems = new ArrayList<>(Math.max(stems.length, 3)); + for (String word : stems) { + add(word); + } + } + + @Override + public String get(int i) { + return stems.get(i); + } + + @Override + public int size() { + return stems.size(); + } + + @Override + public String set(int i, String element) { + int existing = stems.indexOf(element); + if (existing >= 0 && existing != i) { + // the element already exists + return element; + } else { + return stems.set(i, element); + } + } + + @Override + public void add(int i, String element) { + int existing = stems.indexOf(element); + if (existing < 0) { + stems.add(i, element); + } + } + + @Override + public String remove(int i) { + return stems.remove(i); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java new file mode 100644 index 00000000000..269b08dcdf7 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java @@ -0,0 +1,45 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * An enum of the stemming modes which can be requested. + * Stemming implementation may support a smaller number of modes by mapping a mode to a more + * inclusive alternative. 
+ * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public enum StemMode { + + NONE(0), + DEFAULT(1), + ALL(2), + SHORTEST(4), + BEST(5); + + private final int value; + + StemMode(int value) { + this.value = value; + } + + /** + * Returns the stem mode as an int + * + * @deprecated do not use + */ + @Deprecated + public int getValue() { + return value; + } + + @Deprecated + public static StemMode valueOf(int value) { + for (StemMode mode : values()) { + if (mode.value == value) { + return mode; + } + } + return NONE; + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java new file mode 100644 index 00000000000..739fd1d9e96 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java @@ -0,0 +1,26 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.List; + +/** + * <p>Interface providing stemming of single words.</p> + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public interface Stemmer { + + /** + * Stem input according to specified stemming mode. + * + * @param input the string to stem. + * @param mode the stemming mode + * @param language the language to use for stemming + * @return list of possible stems. Empty if none. 
+ * @throws ProcessingException thrown if there is an exception stemming this input + */ + List<StemList> stem(String input, StemMode mode, Language language); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java new file mode 100644 index 00000000000..0d175a2bf3e --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java @@ -0,0 +1,46 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class StemmerImpl implements Stemmer { + + private final Tokenizer tokenizer; + + public StemmerImpl(Tokenizer tokenizer) { + this.tokenizer = tokenizer; + } + + @Override + public List<StemList> stem(String input, StemMode stemMode, Language language) { + List<StemList> stems = new ArrayList<>(); + for (Token token : tokenizer.tokenize(input, language, stemMode, false)) { + findStems(token, stems); + } + return stems; + } + + private void findStems(Token token, List<StemList> out) { + int len; + if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) { + if (token.isIndexable()) { + StemList word = new StemList(); + word.add(token.getTokenString()); // takes care of getStem(0) + for (int i = 1; i < token.getNumStems(); i++) { + word.add(token.getStem(i)); + } + out.add(word); + } + } else { + for (int i = 0; i < len; ++i) { + findStems(token.getComponent(i), out); + } + } + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java new file mode 100644 index 00000000000..f1dc6639e11 --- /dev/null +++ 
b/linguistics/src/main/java/com/yahoo/language/process/Token.java @@ -0,0 +1,56 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * Interface providing access to a single token produced by the tokenizer. + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public interface Token { + + /** Returns the type of this token - word, space or punctuation etc. */ + TokenType getType(); + + /** Returns the original form of this token */ + String getOrig(); + + /** Returns the number of stem forms available for this token. */ + int getNumStems(); + + /** Returns the stem at position i */ + String getStem(int i); + + /** + * Returns the number of components, if this token is a compound word + * (e.g. German "kommunikationsfehler"). Otherwise, returns 0 + * + * @return number of components, or 0 if none + */ + int getNumComponents(); + + /** Returns a component token of this */ + Token getComponent(int i); + + /** Returns the offset position of this token */ + long getOffset(); + + /** Returns the script of this token */ + TokenScript getScript(); + + /** + * Returns token string in a form suitable for indexing: The + * most lowercased variant of the most processed token form available. + * If called on a compound token this returns a lowercased form of the + * entire word. + * + * @return token string value + */ + String getTokenString(); + + /** Returns whether this is an instance of a declared special token (e.g. 
c++) */ + boolean isSpecialToken(); + + /** Whether this token should be indexed */ + boolean isIndexable(); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java new file mode 100644 index 00000000000..ba0ad89b454 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java @@ -0,0 +1,77 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * List of token scripts (e.g. latin, japanese, chinese, etc.) which may warrant different + * linguistics treatment. + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public enum TokenScript { + + COMMON, + LATIN, + GREEK, + CYRILLIC, + ARMENIAN, + HEBREW, + ARABIC, + SYRIAC, + THAANA, + DEVANAGARI, + BENGALI, + GURMUKHI, + GUJARATI, + ORIYA, + TAMIL, + TELUGU, + KANNADA, + MALAYALAM, + SINHALA, + THAI, + LAO, + TIBETAN, + MYANMAR, + GEORGIAN, + HANGUL, + ETHIOPIC, + CHEROKEE, + CANADIAN, + OGHAM, + RUNIC, + KHMER, + MONGOLIAN, + HIRAGANA, + KATAKANA, + CHINESE, + HAN, + YI, + OLDITALIC, + GOTHIC, + DESERET, + INHERITED, + TAGALOG, + HANUNOO, + BUHID, + TAGBANWA, + LIMBU, + TAILE, + LINEARB, + UGARITIC, + SHAVIAN, + OSMANYA, + CYPRIOT, + BRAILLE, + ASCII, + BUGINESE, + COPTIC, + GLAGOLITIC, + KHAROSHTHI, + OLDPERSIAN, + SYLOTINAGRI, + TAILUE, + TIFINAGH, + VIETNAMESE, + UNKNOWN; + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java new file mode 100644 index 00000000000..7d880440f1e --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java @@ -0,0 +1,51 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +/** + * An enumeration of token types. + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public enum TokenType { + + UNKNOWN(0), + SPACE(1), + PUNCTUATION(2), + SYMBOL(3), + ALPHABETIC(4), + NUMERIC(5), + MARKER(255); + + private final int value; + + TokenType(int value) { + this.value = value; + } + + /** Returns an int code for this type */ + public int getValue() { return value; } + + /** + * Marker for whether this type of token can be indexed for search. + * Note that a Token can be excluded from an index, even though the token type marks + * it as indexable. + * + * @see com.yahoo.language.process.Token#isIndexable() + * @return whether this type of token can be indexed + */ + public boolean isIndexable() { + switch (this) { + case ALPHABETIC: case NUMERIC: return true; + default: return false; + } + } + + /** Translates this from the int code representation returned from {@link #getValue} */ + public static TokenType valueOf(int value) { + for (TokenType type : values()) { + if (value == type.value) return type; + } + return UNKNOWN; + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java new file mode 100644 index 00000000000..d7d1e210de4 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java @@ -0,0 +1,38 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +/** + * Language-sensitive tokenization of a text string. + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public interface Tokenizer { + + /** + * Returns the tokens produced from an input string under the rules of the given Language and additional options + * + * @param input the string to tokenize. 
May be arbitrarily large. + * @param language the language of the input string. + * @param stemMode the stem mode applied on the returned tokens + * @param removeAccents if true accents and similar are removed from the returned tokens + * @return the tokens of the input String. + * @throws ProcessingException If the underlying library throws an Exception. + */ + Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents); + + /** + * Return a replacement for an input token string. + * This accepts strings returned by Token.getTokenString + * and returns a replacement which will be used as the index token. + * The input token string is returned if there is no replacement. + * <p> + * This default implementation always returns the input token string. + * + * @param tokenString the token string of the term to lookup a replacement for + * @return the replacement, if any, or the argument token string if not + */ + default String getReplacementTerm(String tokenString) { return tokenString; } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java new file mode 100644 index 00000000000..4d288aafaca --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java @@ -0,0 +1,23 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +/** + * Interface for providers of text transformations such as accent removal. + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public interface Transformer { + + /** + * Remove accents from input text. + * + * @param input text to transform. + * @param language language of input text. 
+ * @return text with accents removed, or input-text if the feature is unavailable + * @throws ProcessingException thrown if there is an exception transforming this input + */ + String accentDrop(String input, Language language); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/package-info.java b/linguistics/src/main/java/com/yahoo/language/process/package-info.java new file mode 100644 index 00000000000..de8d82fcf36 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/package-info.java @@ -0,0 +1,7 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +@ExportPackage +@PublicApi +package com.yahoo.language.process; + +import com.yahoo.api.annotations.PublicApi; +import com.yahoo.osgi.annotation.ExportPackage; |