summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/process
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java55
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java222
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Normalizer.java19
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java18
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Segmenter.java29
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java45
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/StemList.java61
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/StemMode.java45
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Stemmer.java26
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java46
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Token.java56
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/TokenScript.java77
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/TokenType.java51
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java38
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Transformer.java23
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/package-info.java7
16 files changed, 818 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
new file mode 100644
index 00000000000..0e1327aabcf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -0,0 +1,55 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * Determines the class of a given character. Use this rather than java.lang.Character.
+ * <p>
+ * All methods take a Unicode code point (an int), not a UTF-16 char.
+ *
+ * @author bratseth
+ */
+public class CharacterClasses {
+
+    /** First of the CJK punctuation code points which are treated as word characters (ticket 3864695) */
+    private static final int FIRST_CJK_WORD_PUNCTUATION = 0x3008;
+
+    /** Last of the CJK punctuation code points which are treated as word characters (ticket 3864695) */
+    private static final int LAST_CJK_WORD_PUNCTUATION = 0x3011;
+
+    /**
+     * Returns true for code points which are letters in unicode 3 or 4, plus some additional characters
+     * which are useful to view as letters even though not defined as such in unicode.
+     * The additions are: digits outside the Basic Latin block, the CJK punctuation U+3008 to U+3011,
+     * and all non-spacing, combining spacing and enclosing marks.
+     */
+    public boolean isLetter(int c) {
+        if (Character.isLetter(c)) return true;
+        if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
+
+        // Ticket 3864695, some CJK punctuation YST defined as word characters
+        if (c >= FIRST_CJK_WORD_PUNCTUATION && c <= LAST_CJK_WORD_PUNCTUATION) return true;
+
+        int type = Character.getType(c);
+        return type == Character.NON_SPACING_MARK ||
+               type == Character.COMBINING_SPACING_MARK ||
+               type == Character.ENCLOSING_MARK;
+    }
+
+    /**
+     * Returns true for code points which should be considered digits - same as java.lang.Character.isDigit
+     */
+    public boolean isDigit(int c) {
+        return Character.isDigit(c);
+    }
+
+    /** Returns true if this is a latin digit (other digits are not consistently parsed into numbers by Java) */
+    public boolean isLatinDigit(int c) {
+        return Character.isDigit(c) && isLatin(c);
+    }
+
+    /**
+     * Returns true if this code point belongs to the Basic Latin unicode block.
+     * Code points which are not a member of any defined block yield false.
+     *
+     * @throws IllegalArgumentException if c is not a valid unicode code point
+     */
+    public boolean isLatin(int c) {
+        // UnicodeBlock.of returns null for code points outside every defined block,
+        // so compare by identity instead of dereferencing the result
+        return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.BASIC_LATIN;
+    }
+
+    /**
+     * Convenience, returns isLetter(c) || isDigit(c)
+     */
+    public boolean isLetterOrDigit(int c) {
+        return isLetter(c) || isDigit(c);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
new file mode 100644
index 00000000000..0672582d732
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -0,0 +1,222 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import java.util.*;
+
+/**
+ * A class which splits consecutive word character sequences into overlapping character n-grams.
+ * For example "en gul bille sang" split into 2-grams becomes
+ * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang".
+ * <p>
+ * This class is multithread safe. The iterators it returns carry a mutable position and are not
+ * synchronized, so each iterator should be used by one thread at a time.
+ *
+ * @author bratseth
+ */
+public class GramSplitter {
+
+    private final CharacterClasses characterClasses;
+
+    public GramSplitter(CharacterClasses characterClasses) {
+        this.characterClasses = characterClasses;
+    }
+
+    /**
+     * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length]
+     * pairs into the input string.
+     * <p>
+     * The iterator is implemented as a sliding view over the input string rather than being backed by a
+     * list, which makes this space efficient for large strings.
+     *
+     * @param input the input string to be split, cannot be null
+     * @param n the gram size, a positive integer
+     * @return a read only iterator over the resulting grams
+     * @throws NullPointerException if input==null
+     * @throws IllegalArgumentException if n is less than 1
+     */
+    public GramSplitterIterator split(String input, int n) {
+        if (input == null) {
+            throw new NullPointerException("input cannot be null");
+        }
+        if (n < 1) {
+            throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n);
+        }
+        return new GramSplitterIterator(input, n, characterClasses);
+    }
+
+    /**
+     * A read only iterator which computes grams lazily while moving through the input string.
+     * <p>
+     * NOTE(review): the position advances one char at a time while characters are classified with
+     * codePointAt, so characters outside the basic multilingual plane (surrogate pairs) do not appear
+     * to be treated as single units - confirm whether such input must be supported.
+     */
+    public static class GramSplitterIterator implements Iterator<Gram> {
+
+        private final CharacterClasses characterClasses;
+
+        /** Text to split */
+        private final String input;
+
+        /** Gram size */
+        private final int n;
+
+        /** Current index */
+        private int i = 0;
+
+        /** Whether the last thing that happened was being on a separator (including the start of the string) */
+        private boolean isFirstAfterSeparator = true;
+
+        /** The next gram or null if not determined yet */
+        private Gram nextGram = null;
+
+        public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) {
+            this.input = input;
+            this.n = n;
+            this.characterClasses = characterClasses;
+        }
+
+        /** Returns whether there is another gram, looking it up and caching it if not already done */
+        @Override
+        public boolean hasNext() {
+            if (nextGram == null) {
+                nextGram = findNext();
+            }
+            return nextGram != null;
+        }
+
+        /**
+         * Returns the next gram and advances this iterator.
+         *
+         * @throws NoSuchElementException if there are no more grams
+         */
+        @Override
+        public Gram next() {
+            Gram currentGram = nextGram;
+            if (currentGram == null) {
+                currentGram = findNext();
+            }
+            if (currentGram == null) {
+                throw new NoSuchElementException("No next gram at position " + i);
+            }
+            nextGram = null;
+            return currentGram;
+        }
+
+        /** Finds the gram starting at or after the current position, or returns null at the end of the input */
+        private Gram findNext() {
+            // Skip to next word character
+            while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) {
+                i++;
+                isFirstAfterSeparator = true;
+            }
+            if (i >= input.length()) {
+                return null;
+            }
+
+            // Take up to n chars, cut at the first non-word character
+            String gram = input.substring(i, Math.min(i + n, input.length()));
+            int nonWordChar = indexOfNonWordChar(gram);
+            if (nonWordChar == 0) { // cannot happen: the loop above stopped on a word character
+                throw new RuntimeException("Programming error");
+            }
+            if (nonWordChar > 0) {
+                gram = gram.substring(0, nonWordChar);
+            }
+
+            // Emit full length grams always, and a too short gram only when it is the first
+            // thing after a separator (i.e. the whole word is shorter than n)
+            if (gram.length() == n || isFirstAfterSeparator) {
+                isFirstAfterSeparator = false;
+                return new Gram(i++, gram.length());
+            }
+
+            // The tail of a word which is already covered by the previous gram: skip past it and its separator
+            i += gram.length() + 1;
+            isFirstAfterSeparator = true;
+            return findNext();
+        }
+
+        /** Returns the index of the first non-word character in s, or -1 if there are none */
+        private int indexOfNonWordChar(String s) {
+            for (int position = 0; position < s.length(); position++) {
+                if (!characterClasses.isLetterOrDigit(s.codePointAt(position))) {
+                    return position;
+                }
+            }
+            return -1;
+        }
+
+        @Override
+        public void remove() {
+            throw new UnsupportedOperationException("This iterator is read only");
+        }
+
+        /**
+         * Convenience list which splits the remaining items in this iterator into a list of gram strings
+         *
+         * @return an immutable list of extracted grams
+         */
+        public List<String> toExtractedList() {
+            List<String> gramList = new ArrayList<>();
+            while (hasNext()) {
+                gramList.add(next().extractFrom(input));
+            }
+            return Collections.unmodifiableList(gramList);
+        }
+    }
+
+    /**
+     * An immutable start index and length pair
+     */
+    public static final class Gram {
+
+        // final, so that instances really are immutable as documented
+        private final int start;
+        private final int length;
+
+        public Gram(int start, int length) {
+            this.start = start;
+            this.length = length;
+        }
+
+        public int getStart() {
+            return start;
+        }
+
+        public int getLength() {
+            return length;
+        }
+
+        /**
+         * Returns this gram as a string from the input string
+         */
+        public String extractFrom(String input) {
+            return input.substring(start, start + length);
+        }
+
+        /** Two grams are equal if they have the same start and the same length */
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (!(o instanceof Gram)) {
+                return false;
+            }
+            Gram other = (Gram)o;
+            return length == other.length && start == other.start;
+        }
+
+        @Override
+        public int hashCode() {
+            return 31 * start + length;
+        }
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
new file mode 100644
index 00000000000..f4e1ccc9feb
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * Provides NFKC normalization of strings through the underlying linguistics library.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Normalizer {
+
+    /**
+     * NFKC normalizes a string.
+     *
+     * @param input the string to normalize
+     * @return the normalized string
+     * @throws ProcessingException if the underlying library throws an exception
+     */
+    String normalize(String input);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
new file mode 100644
index 00000000000..ce8b455707c
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
@@ -0,0 +1,18 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * <p>Exception class indicating that a fatal error occurred during linguistic processing.</p>
+ *
+ * <p>This is an unchecked exception (it extends RuntimeException).</p>
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class ProcessingException extends RuntimeException {
+
+ /** Creates an exception with the given detail message and no cause */
+ public ProcessingException(String message) {
+ super(message);
+ }
+
+ /** Creates an exception with the given detail message, wrapping the given cause */
+ public ProcessingException(String message, Throwable cause) {
+ super(message, cause);
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
new file mode 100644
index 00000000000..73764e06ef6
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.List;
+
+/**
+ * <p>Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
+ * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
+ * processing).</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Segmenter {
+
+ /**
+ * Splits the input string into tokens, and returns a list of tokens in unprocessed form (i.e. lowercased, normalized
+ * and stemmed if applicable, see {@link StemMode} for the list of stemming options). It is assumed that the input only
+ * contains word characters; any punctuation and spacing tokens will be removed.
+ * <p>
+ * NOTE(review): "unprocessed form" and the parenthesis listing processing steps appear to contradict
+ * each other - confirm which form implementations are expected to return.
+ *
+ * @param input the text to segment.
+ * @param language language of input text.
+ * @return the list of segments.
+ * @throws ProcessingException if an exception is encountered during processing
+ */
+ List<String> segment(String input, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
new file mode 100644
index 00000000000..146d65cb7e2
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A segmenter which delegates to a tokenizer and returns the original form of each indexable token,
+ * descending into the components of compound tokens.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SegmenterImpl implements Segmenter {
+
+    private final Tokenizer tokenizer;
+
+    public SegmenterImpl(Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    /**
+     * Tokenizes the input without stemming and accent removal and collects the resulting segments.
+     * If this produces no segments at all, the returned list contains just the input string.
+     */
+    @Override
+    public List<String> segment(String input, Language language) {
+        List<String> result = new ArrayList<>();
+        Iterable<Token> tokens = tokenizer.tokenize(input, language, StemMode.NONE, false);
+        for (Token token : tokens) {
+            findSegments(token, result);
+        }
+        if (result.isEmpty()) { // no segments, return original string
+            result.add(input);
+        }
+        return result;
+    }
+
+    /**
+     * Adds the segments of the given token to out: a token which is not special and has components
+     * contributes the segments of its components, any other token contributes its original form
+     * if it is indexable.
+     */
+    private void findSegments(Token token, List<String> out) {
+        if (!token.isSpecialToken()) {
+            int componentCount = token.getNumComponents();
+            if (componentCount != 0) {
+                for (int c = 0; c < componentCount; c++) {
+                    findSegments(token.getComponent(c), out);
+                }
+                return;
+            }
+        }
+        if (token.isIndexable()) {
+            out.add(token.getOrig());
+        }
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemList.java b/linguistics/src/main/java/com/yahoo/language/process/StemList.java
new file mode 100644
index 00000000000..d355af87f08
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemList.java
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import java.util.AbstractList;
+import java.util.ArrayList;
+
+/**
+ * A list of strings which silently ignores attempts to introduce duplicate elements.
+ *
+ * @author steinar
+ */
+public class StemList extends AbstractList<String> {
+
+    private final ArrayList<String> stems;
+
+    /** Creates an empty stem list */
+    public StemList() {
+        this(new String[0]);
+    }
+
+    /** Creates a stem list holding the given stems in order, with later duplicates left out */
+    public StemList(String... stems) {
+        super();
+        this.stems = new ArrayList<>(Math.max(stems.length, 3));
+        for (int index = 0; index < stems.length; index++) {
+            add(stems[index]);
+        }
+    }
+
+    @Override
+    public String get(int i) {
+        return stems.get(i);
+    }
+
+    @Override
+    public int size() {
+        return stems.size();
+    }
+
+    /**
+     * Replaces the element at position i, unless the given element is already present at another position:
+     * in that case the list is left unchanged and the given element is returned.
+     *
+     * @return the element previously at position i, or the argument if it was already present elsewhere
+     */
+    @Override
+    public String set(int i, String element) {
+        int position = stems.indexOf(element);
+        boolean presentElsewhere = position >= 0 && position != i;
+        return presentElsewhere ? element : stems.set(i, element);
+    }
+
+    /** Inserts the element at position i, or does nothing if the element is already in this list */
+    @Override
+    public void add(int i, String element) {
+        if (stems.contains(element)) {
+            return;
+        }
+        stems.add(i, element);
+    }
+
+    @Override
+    public String remove(int i) {
+        return stems.remove(i);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
new file mode 100644
index 00000000000..269b08dcdf7
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * The stemming modes which can be requested.
+ * A stemming implementation may support a smaller number of modes by mapping a mode to a more
+ * inclusive alternative.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum StemMode {
+
+    NONE(0),
+    DEFAULT(1),
+    ALL(2),
+    SHORTEST(4),
+    BEST(5);
+
+    /** The int code of this mode */
+    private final int value;
+
+    StemMode(int value) {
+        this.value = value;
+    }
+
+    /**
+     * Returns the stem mode as an int
+     *
+     * @deprecated do not use
+     */
+    @Deprecated
+    public int getValue() {
+        return value;
+    }
+
+    /**
+     * Returns the stem mode having the given int code, or NONE if there is no such mode
+     *
+     * @deprecated do not use
+     */
+    @Deprecated
+    public static StemMode valueOf(int value) {
+        StemMode[] modes = values();
+        for (int index = 0; index < modes.length; index++) {
+            StemMode candidate = modes[index];
+            if (candidate.value == value) {
+                return candidate;
+            }
+        }
+        return NONE;
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java
new file mode 100644
index 00000000000..739fd1d9e96
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java
@@ -0,0 +1,26 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.List;
+
+/**
+ * <p>Interface providing stemming of single words.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Stemmer {
+
+ /**
+ * Stems the input according to the specified stemming mode.
+ *
+ * @param input the string to stem.
+ * @param mode the stemming mode
+ * @param language the language to use for stemming
+ * @return a list of stem lists, each holding the possible stems of one word. Empty if none.
+ * @throws ProcessingException thrown if there is an exception stemming this input
+ */
+ List<StemList> stem(String input, StemMode mode, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java
new file mode 100644
index 00000000000..0d175a2bf3e
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java
@@ -0,0 +1,46 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A stemmer which delegates to a tokenizer and collects the stems of each indexable token,
+ * descending into the components of compound tokens.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class StemmerImpl implements Stemmer {
+
+    private final Tokenizer tokenizer;
+
+    public StemmerImpl(Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    /** Tokenizes the input with the given stem mode, without accent removal, and returns one stem list per word */
+    @Override
+    public List<StemList> stem(String input, StemMode stemMode, Language language) {
+        List<StemList> result = new ArrayList<>();
+        Iterable<Token> tokens = tokenizer.tokenize(input, language, stemMode, false);
+        for (Token token : tokens) {
+            findStems(token, result);
+        }
+        return result;
+    }
+
+    /**
+     * Adds the stems of the given token to out: a token which is not special and has components
+     * contributes the stems of its components, any other token contributes a single stem list
+     * if it is indexable.
+     */
+    private void findStems(Token token, List<StemList> out) {
+        if (!token.isSpecialToken()) {
+            int componentCount = token.getNumComponents();
+            if (componentCount != 0) {
+                for (int c = 0; c < componentCount; c++) {
+                    findStems(token.getComponent(c), out);
+                }
+                return;
+            }
+        }
+        if (!token.isIndexable()) {
+            return;
+        }
+        StemList word = new StemList();
+        word.add(token.getTokenString()); // takes care of getStem(0)
+        int stemIndex = 1;
+        while (stemIndex < token.getNumStems()) {
+            word.add(token.getStem(stemIndex));
+            stemIndex++;
+        }
+        out.add(word);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java
new file mode 100644
index 00000000000..f1dc6639e11
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * Interface providing access to a single token produced by the tokenizer.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Token {
+
+ /** Returns the type of this token - word, space or punctuation etc. */
+ TokenType getType();
+
+ /** Returns the original form of this token */
+ String getOrig();
+
+ /** Returns the number of stem forms available for this token. */
+ int getNumStems();
+
+ /** Returns the stem at position i */
+ String getStem(int i);
+
+ /**
+ * Returns the number of components if this token is a compound word
+ * (e.g. German "kommunikationsfehler"). Otherwise, returns 0.
+ *
+ * @return number of components, or 0 if none
+ */
+ int getNumComponents();
+
+ /** Returns the component token of this at position i */
+ Token getComponent(int i);
+
+ /** Returns the offset position of this token */
+ long getOffset();
+
+ /** Returns the script of this token */
+ TokenScript getScript();
+
+ /**
+ * Returns token string in a form suitable for indexing: The
+ * most lowercased variant of the most processed token form available.
+ * If called on a compound token this returns a lowercased form of the
+ * entire word.
+ *
+ * @return token string value
+ */
+ String getTokenString();
+
+ /** Returns whether this is an instance of a declared special token (e.g. c++) */
+ boolean isSpecialToken();
+
+ /** Returns whether this token should be indexed */
+ boolean isIndexable();
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java
new file mode 100644
index 00000000000..ba0ad89b454
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java
@@ -0,0 +1,77 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * List of token scripts (e.g. latin, japanese, chinese, etc.) which may warrant different
+ * linguistics treatment.
+ * <p>
+ * NOTE(review): the constants are not in alphabetical order; presumably the declaration order mirrors
+ * script codes of an underlying linguistics library - confirm before reordering or inserting constants.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum TokenScript {
+
+ COMMON,
+ LATIN,
+ GREEK,
+ CYRILLIC,
+ ARMENIAN,
+ HEBREW,
+ ARABIC,
+ SYRIAC,
+ THAANA,
+ DEVANAGARI,
+ BENGALI,
+ GURMUKHI,
+ GUJARATI,
+ ORIYA,
+ TAMIL,
+ TELUGU,
+ KANNADA,
+ MALAYALAM,
+ SINHALA,
+ THAI,
+ LAO,
+ TIBETAN,
+ MYANMAR,
+ GEORGIAN,
+ HANGUL,
+ ETHIOPIC,
+ CHEROKEE,
+ CANADIAN,
+ OGHAM,
+ RUNIC,
+ KHMER,
+ MONGOLIAN,
+ HIRAGANA,
+ KATAKANA,
+ CHINESE,
+ HAN,
+ YI,
+ OLDITALIC,
+ GOTHIC,
+ DESERET,
+ INHERITED,
+ TAGALOG,
+ HANUNOO,
+ BUHID,
+ TAGBANWA,
+ LIMBU,
+ TAILE,
+ LINEARB,
+ UGARITIC,
+ SHAVIAN,
+ OSMANYA,
+ CYPRIOT,
+ BRAILLE,
+ ASCII,
+ BUGINESE,
+ COPTIC,
+ GLAGOLITIC,
+ KHAROSHTHI,
+ OLDPERSIAN,
+ SYLOTINAGRI,
+ TAILUE,
+ TIFINAGH,
+ VIETNAMESE,
+ UNKNOWN;
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
new file mode 100644
index 00000000000..7d880440f1e
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
@@ -0,0 +1,51 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * The types a token can have.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum TokenType {
+
+    UNKNOWN(0),
+    SPACE(1),
+    PUNCTUATION(2),
+    SYMBOL(3),
+    ALPHABETIC(4),
+    NUMERIC(5),
+    MARKER(255);
+
+    /** The int code of this type */
+    private final int value;
+
+    TokenType(int value) {
+        this.value = value;
+    }
+
+    /** Returns an int code for this type */
+    public int getValue() {
+        return value;
+    }
+
+    /**
+     * Marker for whether this type of token can be indexed for search: true for
+     * alphabetic and numeric tokens only.
+     * Note that a Token can be excluded from an index, even though the token type marks
+     * it as indexable.
+     *
+     * @see com.yahoo.language.process.Token#isIndexable()
+     * @return whether this type of token can be indexed
+     */
+    public boolean isIndexable() {
+        return this == ALPHABETIC || this == NUMERIC;
+    }
+
+    /**
+     * Translates this from the int code representation returned from {@link #getValue}.
+     * Codes which do not belong to any type translate to UNKNOWN.
+     */
+    public static TokenType valueOf(int value) {
+        TokenType[] types = values();
+        for (int index = 0; index < types.length; index++) {
+            TokenType candidate = types[index];
+            if (candidate.value == value) {
+                return candidate;
+            }
+        }
+        return UNKNOWN;
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
new file mode 100644
index 00000000000..d7d1e210de4
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+/**
+ * Language-sensitive tokenization of a text string.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Tokenizer {
+
+ /**
+ * Returns the tokens produced from an input string under the rules of the given Language and additional options
+ *
+ * @param input the string to tokenize. May be arbitrarily large.
+ * @param language the language of the input string.
+ * @param stemMode the stem mode applied on the returned tokens
+ * @param removeAccents if true accents and similar are removed from the returned tokens
+ * @return the tokens of the input String.
+ * @throws ProcessingException If the underlying library throws an Exception.
+ */
+ Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents);
+
+ /**
+ * Returns a replacement for an input token string.
+ * This accepts strings returned by {@link Token#getTokenString}
+ * and returns a replacement which will be used as the index token.
+ * The input token string is returned if there is no replacement.
+ * <p>
+ * This default implementation always returns the input token string.
+ *
+ * @param tokenString the token string of the term to lookup a replacement for
+ * @return the replacement, if any, or the argument token string if not
+ */
+ default String getReplacementTerm(String tokenString) { return tokenString; }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
new file mode 100644
index 00000000000..4d288aafaca
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
@@ -0,0 +1,23 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+/**
+ * Interface for providers of text transformations such as accent removal.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Transformer {
+
+ /**
+ * Removes accents from the input text.
+ *
+ * @param input text to transform.
+ * @param language language of input text.
+ * @return text with accents removed, or input-text if the feature is unavailable
+ * @throws ProcessingException thrown if there is an exception transforming this input
+ */
+ String accentDrop(String input, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/package-info.java b/linguistics/src/main/java/com/yahoo/language/process/package-info.java
new file mode 100644
index 00000000000..de8d82fcf36
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/package-info.java
@@ -0,0 +1,7 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language.process;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;