Publish

author: Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
committer: Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
commit: 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree: 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/main/java/com/yahoo/language/process
16 files changed, 818 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
new file mode 100644
index 00000000000..0e1327aabcf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -0,0 +1,55 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * Determines the class of a given character. Use this rather than java.lang.Character.
+ * <p>
+ * All methods take a Unicode code point and are stateless.
+ *
+ * @author bratseth
+ */
+public class CharacterClasses {
+
+    /**
+     * Returns true for code points which are letters in unicode 3 or 4, plus some additional characters
+     * which are useful to view as letters even though not defined as such in unicode:
+     * non-latin digits, a fixed set of CJK brackets and all combining/enclosing marks.
+     *
+     * @param c the code point to classify
+     * @return true if the code point should be treated as a letter
+     */
+    public boolean isLetter(int c) {
+        if (Character.isLetter(c)) return true;
+        if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
+        // if (c == '_') return true;
+
+        // Ticket 3864695, some CJK punctuation YST defined as word characters
+        if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
+            c == '\u300c' || c == '\u300d' || c == '\u300e' ||
+            c == '\u300f' || c == '\u3010' || c == '\u3011') {
+            return true;
+        }
+        // Marks attach to a preceding letter, so they are kept as part of the word
+        int type = Character.getType(c);
+        return type == Character.NON_SPACING_MARK ||
+               type == Character.COMBINING_SPACING_MARK ||
+               type == Character.ENCLOSING_MARK;
+    }
+
+    /**
+     * Returns true for code points which should be considered digits - same as java.lang.Character.isDigit
+     *
+     * @param c the code point to classify
+     */
+    public boolean isDigit(int c) {
+        return Character.isDigit(c);
+    }
+
+    /**
+     * Returns true if this is a latin digit (other digits are not consistently parsed into numbers by Java)
+     *
+     * @param c the code point to classify
+     */
+    public boolean isLatinDigit(int c) {
+        return Character.isDigit(c) && isLatin(c);
+    }
+
+    /**
+     * Returns true if this is a latin character, meaning a code point in the Basic Latin unicode block.
+     * Code points which are invalid, or which do not belong to any unicode block, are not latin.
+     *
+     * @param c the code point to classify
+     */
+    public boolean isLatin(int c) {
+        // UnicodeBlock.of throws on invalid code points and returns null for code points outside
+        // every defined block, so guard the first case and compare from the constant side for the second
+        return Character.isValidCodePoint(c) &&
+               Character.UnicodeBlock.BASIC_LATIN.equals(Character.UnicodeBlock.of(c));
+    }
+
+    /**
+     * Convenience, returns isLetter(c) || isDigit(c)
+     *
+     * @param c the code point to classify
+     */
+    public boolean isLetterOrDigit(int c) {
+        return isLetter(c) || isDigit(c);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
new file mode 100644
index 00000000000..0672582d732
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -0,0 +1,222 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import java.util.*;
+
+/**
+ * A class which splits consecutive word character sequences into overlapping character n-grams.
+ * For example "en gul bille sang" split into 2-grams becomes
+ * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang".
+ * <p>
+ * This class is multithread safe. The iterators it returns hold mutable position state,
+ * so each iterator should be used by a single thread.
+ *
+ * @author bratseth
+ */
+public class GramSplitter {
+
+    private final CharacterClasses characterClasses;
+
+    public GramSplitter(CharacterClasses characterClasses) {
+        this.characterClasses = characterClasses;
+    }
+
+    /**
+     * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length]
+     * pairs into the input string.
+     * <p>
+     * The iterator is implemented as a sliding view over the input string rather than being backed by a
+     * list, which makes this space efficient for large strings.
+     *
+     * @param input the input string to be split, cannot be null
+     * @param n     the gram size, a positive integer
+     * @return a read only iterator over the resulting grams
+     * @throws NullPointerException     if input==null
+     * @throws IllegalArgumentException if n is less than 1
+     */
+    public GramSplitterIterator split(String input, int n) {
+        Objects.requireNonNull(input, "input cannot be null");
+        if (n < 1) {
+            throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n);
+        }
+        return new GramSplitterIterator(input, n, characterClasses);
+    }
+
+    /**
+     * A read only iterator which lazily produces the grams of one input string.
+     * <p>
+     * NOTE(review): positions advance one char at a time while classification reads a full code point
+     * (codePointAt) - the behavior for supplementary characters (surrogate pairs) should be confirmed.
+     */
+    public static class GramSplitterIterator implements Iterator<Gram> {
+
+        private final CharacterClasses characterClasses;
+
+        /**
+         * Text to split
+         */
+        private final String input;
+
+        /**
+         * Gram size
+         */
+        private final int n;
+
+        /**
+         * Current index
+         */
+        private int i = 0;
+
+        /**
+         * Whether the last thing that happened was being on a separator (including the start of the string)
+         */
+        private boolean isFirstAfterSeparator = true;
+
+        /**
+         * The next gram or null if not determined yet
+         */
+        private Gram nextGram = null;
+
+        public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) {
+            this.input = input;
+            this.n = n;
+            this.characterClasses = characterClasses;
+        }
+
+        /** Returns whether there is another gram, looking it up (and caching it) if not already known */
+        @Override
+        public boolean hasNext() {
+            if (nextGram != null) {
+                return true;
+            }
+            nextGram = findNext();
+            return nextGram != null;
+        }
+
+        /**
+         * Returns the next gram
+         *
+         * @throws NoSuchElementException if there are no more grams in the input
+         */
+        @Override
+        public Gram next() {
+            Gram currentGram = nextGram;
+            if (currentGram == null) { // hasNext was not called first
+                currentGram = findNext();
+            }
+            if (currentGram == null) {
+                throw new NoSuchElementException("No next gram at position " + i);
+            }
+            nextGram = null;
+            return currentGram;
+        }
+
+        /**
+         * Advances the position to just past the start of the next gram and returns that gram,
+         * or returns null if the rest of the input contains no more grams.
+         */
+        private Gram findNext() {
+            // Skip to next word character
+            while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) {
+                i++;
+                isFirstAfterSeparator = true;
+            }
+            if (i >= input.length()) {
+                return null;
+            }
+
+            // Take up to n chars, then cut at the first non-word character, if any
+            String gram = input.substring(i, Math.min(i + n, input.length()));
+            int nonWordChar = indexOfNonWordChar(gram);
+            if (nonWordChar == 0) { // cannot happen: the loop above stopped on a word character
+                throw new RuntimeException("Programming error");
+            }
+            if (nonWordChar > 0) {
+                gram = gram.substring(0, nonWordChar);
+            }
+
+            if (gram.length() == n) { // normal case: got a full length gram
+                i++;
+                isFirstAfterSeparator = false;
+                return new Gram(i - 1, gram.length());
+            } else { // gram is too short due either to a non-word separator or end of string
+                if (isFirstAfterSeparator) { // make a gram anyway
+                    i++;
+                    isFirstAfterSeparator = false;
+                    return new Gram(i - 1, gram.length());
+                } else { // skip to next
+                    i += gram.length() + 1; // past the short tail and the separator following it
+                    isFirstAfterSeparator = true;
+                    return findNext();
+                }
+            }
+        }
+
+        /** Returns the index of the first non-word character in the given string, or -1 if there are none */
+        private int indexOfNonWordChar(String s) {
+            for (int i = 0; i < s.length(); i++) {
+                if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) {
+                    return i;
+                }
+            }
+            return -1;
+        }
+
+        @Override
+        public void remove() {
+            throw new UnsupportedOperationException("This iterator is read only");
+        }
+
+        /**
+         * Convenience list which splits the remaining items in this iterator into a list of gram strings
+         *
+         * @return an immutable list of extracted grams
+         */
+        public List<String> toExtractedList() {
+            List<String> gramList = new ArrayList<>();
+            while (hasNext()) {
+                gramList.add(next().extractFrom(input));
+            }
+            return Collections.unmodifiableList(gramList);
+        }
+    }
+
+    /**
+     * An immutable start index and length pair
+     */
+    public static final class Gram {
+
+        // final, so that instances really are immutable as documented
+        private final int start;
+        private final int length;
+
+        public Gram(int start, int length) {
+            this.start = start;
+            this.length = length;
+        }
+
+        public int getStart() {
+            return start;
+        }
+
+        public int getLength() {
+            return length;
+        }
+
+        /**
+         * Returns this gram as a string from the input string
+         */
+        public String extractFrom(String input) {
+            return input.substring(start, start + length);
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (!(o instanceof Gram)) {
+                return false;
+            }
+            Gram other = (Gram)o;
+            return start == other.start && length == other.length;
+        }
+
+        @Override
+        public int hashCode() {
+            int result = start;
+            result = 31 * result + length;
+            return result;
+        }
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
new file mode 100644
index 00000000000..f4e1ccc9feb
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * <p>This interface provides NFKC normalization of Strings through the underlying linguistics library.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Normalizer {
+
+    /**
+     * <p>NFKC normalizes a String.</p>
+     *
+     * @param input String to normalize.
+     * @return The normalized String.
+     * @throws ProcessingException If underlying library throws an Exception.
+     */
+    String normalize(String input);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
new file mode 100644
index 00000000000..ce8b455707c
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
@@ -0,0 +1,18 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * <p>Exception class indicating that a fatal error occurred during linguistic processing.</p>
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class ProcessingException extends RuntimeException {
+
+    /**
+     * Creates an exception carrying only a message.
+     *
+     * @param message the detail message, passed on to {@link RuntimeException}
+     */
+    public ProcessingException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates an exception carrying a message and the underlying cause.
+     *
+     * @param message the detail message, passed on to {@link RuntimeException}
+     * @param cause   the throwable which caused this, passed on to {@link RuntimeException}
+     */
+    public ProcessingException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
new file mode 100644
index 00000000000..73764e06ef6
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.List;
+
+/**
+ * <p>Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
+ * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
+ * processing).</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Segmenter {
+
+    /**
+     * Splits the input string into tokens, and returns a list of tokens in unprocessed form (i.e. lowercased, normalized
+     * and stemmed if applicable, see {@link StemMode} for list of stemming options). It is assumed that the input only
+     * contains word-characters, any punctuation and spacing tokens will be removed.
+     * <p>
+     * NOTE(review): "unprocessed" and "lowercased, normalized and stemmed" read as contradictory - confirm which
+     * form implementations are expected to return.
+     *
+     * @param input the text to segment.
+     * @param language language of input text.
+     * @return the list of segments.
+     * @throws ProcessingException if an exception is encountered during processing
+     */
+    List<String> segment(String input, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
new file mode 100644
index 00000000000..146d65cb7e2
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A segmenter which delegates to a tokenizer and returns the original form of the indexable tokens it produces.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SegmenterImpl implements Segmenter {
+
+    private final Tokenizer tokenizer;
+
+    public SegmenterImpl(Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    /**
+     * Tokenizes the input without stemming or accent removal and collects the segments of each token.
+     *
+     * @return the segments found, or a list containing just the input if none were found
+     */
+    @Override
+    public List<String> segment(String input, Language language) {
+        Iterable<Token> tokens = tokenizer.tokenize(input, language, StemMode.NONE, false);
+        List<String> result = new ArrayList<>();
+        for (Token current : tokens) {
+            findSegments(current, result);
+        }
+        if (!result.isEmpty()) {
+            return result;
+        }
+        result.add(input); // no segments, return original string
+        return result;
+    }
+
+    /**
+     * Adds the original form of the token to the output if it is a leaf (a special token, or one without
+     * components) which is indexable, and otherwise descends into each of its components in order.
+     */
+    private void findSegments(Token token, List<String> out) {
+        // Special tokens are never broken up, so their components are not even consulted
+        boolean leaf = token.isSpecialToken();
+        int componentCount = 0;
+        if (!leaf) {
+            componentCount = token.getNumComponents();
+            leaf = componentCount == 0;
+        }
+
+        if (leaf) {
+            if (token.isIndexable()) {
+                out.add(token.getOrig());
+            }
+            return;
+        }
+        for (int k = 0; k < componentCount; k++) {
+            findSegments(token.getComponent(k), out);
+        }
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemList.java b/linguistics/src/main/java/com/yahoo/language/process/StemList.java
new file mode 100644
index 00000000000..d355af87f08
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemList.java
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import java.util.AbstractList;
+import java.util.ArrayList;
+
+/**
+ * A string list which silently refuses to hold the same element twice.
+ *
+ * @author steinar
+ */
+public class StemList extends AbstractList<String> {
+    private final ArrayList<String> uniqueStems;
+
+    /** Creates an empty list */
+    public StemList() {
+        this(new String[0]);
+    }
+
+    /** Creates a list holding the given stems in order, with repeated occurrences left out */
+    public StemList(String... stems) {
+        super();
+        int capacity = stems.length > 3 ? stems.length : 3;
+        this.uniqueStems = new ArrayList<>(capacity);
+        for (int k = 0; k < stems.length; k++) {
+            add(stems[k]);
+        }
+    }
+
+    @Override
+    public String get(int i) {
+        return uniqueStems.get(i);
+    }
+
+    @Override
+    public int size() {
+        return uniqueStems.size();
+    }
+
+    /**
+     * Replaces the element at the given position, unless the new element is already present at
+     * another position, in which case nothing changes and the argument itself is returned.
+     */
+    @Override
+    public String set(int i, String element) {
+        int at = uniqueStems.indexOf(element);
+        boolean heldElsewhere = at >= 0 && at != i;
+        return heldElsewhere ? element : uniqueStems.set(i, element);
+    }
+
+    /** Inserts the element at the given position, unless this list already contains it */
+    @Override
+    public void add(int i, String element) {
+        if (uniqueStems.contains(element)) {
+            return;
+        }
+        uniqueStems.add(i, element);
+    }
+
+    @Override
+    public String remove(int i) {
+        return uniqueStems.remove(i);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
new file mode 100644
index 00000000000..269b08dcdf7
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * The stemming modes a caller may ask for.
+ * A stemming implementation is free to support fewer modes, by treating a mode as a more
+ * inclusive alternative.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum StemMode {
+
+    NONE(0),
+    DEFAULT(1),
+    ALL(2),
+    SHORTEST(4),
+    BEST(5);
+
+    private final int value;
+
+    StemMode(int value) {
+        this.value = value;
+    }
+
+    /**
+     * Returns the int code of this stem mode
+     *
+     * @deprecated do not use
+     */
+    @Deprecated
+    public int getValue() {
+        return value;
+    }
+
+    /** Returns the stem mode having the given int code, or NONE if no mode has that code */
+    @Deprecated
+    public static StemMode valueOf(int value) {
+        StemMode[] modes = values();
+        StemMode found = NONE;
+        for (int k = 0; k < modes.length; k++) {
+            if (modes[k].value == value) {
+                found = modes[k];
+                break; // codes are unique, the first match is the only one
+            }
+        }
+        return found;
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java
new file mode 100644
index 00000000000..739fd1d9e96
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java
@@ -0,0 +1,26 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.List;
+
+/**
+ * <p>Interface providing stemming of single words.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Stemmer {
+
+    /**
+     * Stems the input according to the specified stemming mode.
+     *
+     * @param input    the string to stem.
+     * @param mode     the stemming mode
+     * @param language the language to use for stemming
+     * @return list of possible stems, as one {@link StemList} per stemmed unit of the input
+     *         (in {@link StemmerImpl}: one per indexable token). Empty if none.
+     * @throws ProcessingException thrown if there is an exception stemming this input
+     */
+    List<StemList> stem(String input, StemMode mode, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java
new file mode 100644
index 00000000000..0d175a2bf3e
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java
@@ -0,0 +1,46 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A stemmer which delegates to a tokenizer and collects the stems of the indexable tokens it produces.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class StemmerImpl implements Stemmer {
+
+    private final Tokenizer tokenizer;
+
+    public StemmerImpl(Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    /** Tokenizes the input with the given stem mode, without accent removal, and gathers the stems of each token */
+    @Override
+    public List<StemList> stem(String input, StemMode stemMode, Language language) {
+        Iterable<Token> tokens = tokenizer.tokenize(input, language, stemMode, false);
+        List<StemList> result = new ArrayList<>();
+        for (Token current : tokens) {
+            findStems(current, result);
+        }
+        return result;
+    }
+
+    /**
+     * Adds a stem list for the token to the output if it is a leaf (a special token, or one without
+     * components) which is indexable, and otherwise descends into each of its components in order.
+     */
+    private void findStems(Token token, List<StemList> out) {
+        // Special tokens are never broken up, so their components are not even consulted
+        boolean leaf = token.isSpecialToken();
+        int componentCount = 0;
+        if (!leaf) {
+            componentCount = token.getNumComponents();
+            leaf = componentCount == 0;
+        }
+
+        if (!leaf) {
+            for (int k = 0; k < componentCount; k++) {
+                findStems(token.getComponent(k), out);
+            }
+            return;
+        }
+        if (!token.isIndexable()) {
+            return;
+        }
+
+        StemList forms = new StemList();
+        forms.add(token.getTokenString()); // takes care of getStem(0)
+        int stemIndex = 1;
+        while (stemIndex < token.getNumStems()) {
+            forms.add(token.getStem(stemIndex));
+            stemIndex++;
+        }
+        out.add(forms);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java
new file mode 100644
index 00000000000..f1dc6639e11
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * Interface providing access to a single token produced by the tokenizer.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Token {
+
+    /** Returns the type of this token - word, space or punctuation etc. */
+    TokenType getType();
+
+    /** Returns the original form of this token */
+    String getOrig();
+
+    /** Returns the number of stem forms available for this token. */
+    int getNumStems();
+
+    /** Returns the stem at position i */
+    String getStem(int i);
+
+    /**
+     * Returns the number of components, if this token is a compound word
+     * (e.g. german "kommunikationsfehler"). Otherwise, returns 0.
+     *
+     * @return number of components, or 0 if none
+     */
+    int getNumComponents();
+
+    /** Returns the component token of this at position i */
+    Token getComponent(int i);
+
+    /** Returns the offset position of this token */
+    long getOffset();
+
+    /** Returns the script of this token */
+    TokenScript getScript();
+
+    /**
+     * Returns token string in a form suitable for indexing: The
+     * most lowercased variant of the most processed token form available.
+     * If called on a compound token this returns a lowercased form of the
+     * entire word.
+     *
+     * @return token string value
+     */
+    String getTokenString();
+
+    /** Returns whether this is an instance of a declared special token (e.g. c++) */
+    boolean isSpecialToken();
+
+    /** Returns whether this token should be indexed */
+    boolean isIndexable();
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java
new file mode 100644
index 00000000000..ba0ad89b454
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java
@@ -0,0 +1,77 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * List of token scripts (e.g. latin, japanese, chinese, etc.) which may warrant different
+ * linguistics treatment. This is the type returned by {@link Token#getScript()}.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum TokenScript {
+
+    COMMON,
+    LATIN,
+    GREEK,
+    CYRILLIC,
+    ARMENIAN,
+    HEBREW,
+    ARABIC,
+    SYRIAC,
+    THAANA,
+    DEVANAGARI,
+    BENGALI,
+    GURMUKHI,
+    GUJARATI,
+    ORIYA,
+    TAMIL,
+    TELUGU,
+    KANNADA,
+    MALAYALAM,
+    SINHALA,
+    THAI,
+    LAO,
+    TIBETAN,
+    MYANMAR,
+    GEORGIAN,
+    HANGUL,
+    ETHIOPIC,
+    CHEROKEE,
+    CANADIAN,
+    OGHAM,
+    RUNIC,
+    KHMER,
+    MONGOLIAN,
+    HIRAGANA,
+    KATAKANA,
+    CHINESE,
+    HAN,
+    YI,
+    OLDITALIC,
+    GOTHIC,
+    DESERET,
+    INHERITED,
+    TAGALOG,
+    HANUNOO,
+    BUHID,
+    TAGBANWA,
+    LIMBU,
+    TAILE,
+    LINEARB,
+    UGARITIC,
+    SHAVIAN,
+    OSMANYA,
+    CYPRIOT,
+    BRAILLE,
+    ASCII,
+    BUGINESE,
+    COPTIC,
+    GLAGOLITIC,
+    KHAROSHTHI,
+    OLDPERSIAN,
+    SYLOTINAGRI,
+    TAILUE,
+    TIFINAGH,
+    VIETNAMESE,
+    UNKNOWN; // presumably the value used when no script could be determined - confirm against implementations
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
new file mode 100644
index 00000000000..7d880440f1e
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
@@ -0,0 +1,51 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * The kinds of token a tokenizer can produce, each with a fixed int code.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum TokenType {
+
+    UNKNOWN(0),
+    SPACE(1),
+    PUNCTUATION(2),
+    SYMBOL(3),
+    ALPHABETIC(4),
+    NUMERIC(5),
+    MARKER(255);
+
+    private final int value;
+
+    TokenType(int value) {
+        this.value = value;
+    }
+
+    /** Returns the int code of this type */
+    public int getValue() {
+        return value;
+    }
+
+    /**
+     * Returns whether tokens of this type can be indexed for search, which is the case for
+     * alphabetic and numeric tokens only.
+     * Note that a Token can be excluded from an index, even though the token type marks
+     * it as indexable.
+     *
+     * @see com.yahoo.language.process.Token#isIndexable()
+     * @return whether this type of token can be indexed
+     */
+    public boolean isIndexable() {
+        return this == ALPHABETIC || this == NUMERIC;
+    }
+
+    /**
+     * Translates the int code representation returned from {@link #getValue} back to a type
+     *
+     * @return the type having the given code, or UNKNOWN if no type has that code
+     */
+    public static TokenType valueOf(int value) {
+        TokenType[] types = values();
+        TokenType found = UNKNOWN;
+        for (int k = 0; k < types.length; k++) {
+            if (types[k].value == value) {
+                found = types[k];
+                break; // codes are unique, the first match is the only one
+            }
+        }
+        return found;
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
new file mode 100644
index 00000000000..d7d1e210de4
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+/**
+ * Language-sensitive tokenization of a text string.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Tokenizer {
+
+    /**
+     * Returns the tokens produced from an input string under the rules of the given Language and additional options
+     *
+     * @param input the string to tokenize. May be arbitrarily large.
+     * @param language the language of the input string.
+     * @param stemMode the stem mode applied on the returned tokens
+     * @param removeAccents if true accents and similar are removed from the returned tokens
+     * @return the tokens of the input String.
+     * @throws ProcessingException If the underlying library throws an Exception.
+     */
+    Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents);
+
+    /**
+     * Returns a replacement for an input token string.
+     * This accepts strings returned by Token.getTokenString
+     * and returns a replacement which will be used as the index token.
+     * The input token string is returned if there is no replacement.
+     * <p>
+     * This default implementation always returns the input token string.
+     *
+     * @param tokenString the token string of the term to look up a replacement for
+     * @return the replacement, if any, or the argument token string if not
+     */
+    default String getReplacementTerm(String tokenString) { return tokenString; }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
new file mode 100644
index 00000000000..4d288aafaca
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
@@ -0,0 +1,23 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+/**
+ * Interface for providers of text transformations such as accent removal.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Transformer {
+
+    /**
+     * Removes accents from the input text.
+     *
+     * @param input    text to transform.
+     * @param language language of input text.
+     * @return text with accents removed, or input-text if the feature is unavailable
+     * @throws ProcessingException thrown if there is an exception transforming this input
+     */
+    String accentDrop(String input, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/package-info.java b/linguistics/src/main/java/com/yahoo/language/process/package-info.java
new file mode 100644
index 00000000000..de8d82fcf36
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/package-info.java
@@ -0,0 +1,7 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language.process;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;
author	Jon Bratseth <bratseth@yahoo-inc.com>	2016-06-15 23:09:44 +0200
committer	Jon Bratseth <bratseth@yahoo-inc.com>	2016-06-15 23:09:44 +0200
commit	72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree	2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/main/java/com/yahoo/language/process