From 72231250ed81e10d66bfe70701e64fa5fe50f712 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Wed, 15 Jun 2016 23:09:44 +0200 Subject: Publish --- .../src/main/java/com/yahoo/language/Language.java | 615 +++++++++ .../main/java/com/yahoo/language/Linguistics.java | 101 ++ .../java/com/yahoo/language/LinguisticsCase.java | 31 + .../java/com/yahoo/language/LocaleFactory.java | 57 + .../yahoo/language/detect/AbstractDetector.java | 25 + .../java/com/yahoo/language/detect/Detection.java | 47 + .../yahoo/language/detect/DetectionException.java | 14 + .../java/com/yahoo/language/detect/Detector.java | 44 + .../main/java/com/yahoo/language/detect/Hint.java | 38 + .../com/yahoo/language/detect/package-info.java | 7 + .../main/java/com/yahoo/language/package-info.java | 7 + .../yahoo/language/process/CharacterClasses.java | 55 + .../com/yahoo/language/process/GramSplitter.java | 222 +++ .../com/yahoo/language/process/Normalizer.java | 19 + .../language/process/ProcessingException.java | 18 + .../java/com/yahoo/language/process/Segmenter.java | 29 + .../com/yahoo/language/process/SegmenterImpl.java | 45 + .../java/com/yahoo/language/process/StemList.java | 61 + .../java/com/yahoo/language/process/StemMode.java | 45 + .../java/com/yahoo/language/process/Stemmer.java | 26 + .../com/yahoo/language/process/StemmerImpl.java | 46 + .../java/com/yahoo/language/process/Token.java | 56 + .../com/yahoo/language/process/TokenScript.java | 77 ++ .../java/com/yahoo/language/process/TokenType.java | 51 + .../java/com/yahoo/language/process/Tokenizer.java | 38 + .../com/yahoo/language/process/Transformer.java | 23 + .../com/yahoo/language/process/package-info.java | 7 + .../com/yahoo/language/simple/SimpleDetector.java | 179 +++ .../yahoo/language/simple/SimpleLinguistics.java | 61 + .../yahoo/language/simple/SimpleNormalizer.java | 16 + .../com/yahoo/language/simple/SimpleToken.java | 188 +++ .../com/yahoo/language/simple/SimpleTokenType.java | 68 + 
.../com/yahoo/language/simple/SimpleTokenizer.java | 76 ++ .../yahoo/language/simple/SimpleTransformer.java | 25 + .../yahoo/language/simple/kstem/CharArrayMap.java | 661 +++++++++ .../yahoo/language/simple/kstem/CharArraySet.java | 184 +++ .../language/simple/kstem/CharacterUtils.java | 375 +++++ .../yahoo/language/simple/kstem/KStemData1.java | 716 ++++++++++ .../yahoo/language/simple/kstem/KStemData2.java | 715 ++++++++++ .../yahoo/language/simple/kstem/KStemData3.java | 715 ++++++++++ .../yahoo/language/simple/kstem/KStemData4.java | 715 ++++++++++ .../yahoo/language/simple/kstem/KStemData5.java | 715 ++++++++++ .../yahoo/language/simple/kstem/KStemData6.java | 715 ++++++++++ .../yahoo/language/simple/kstem/KStemData7.java | 715 ++++++++++ .../yahoo/language/simple/kstem/KStemData8.java | 614 +++++++++ .../com/yahoo/language/simple/kstem/KStemmer.java | 1426 ++++++++++++++++++++ .../language/simple/kstem/OpenStringBuilder.java | 136 ++ .../com/yahoo/language/simple/package-info.java | 9 + .../java/com/yahoo/language/LanguageTestCase.java | 107 ++ .../com/yahoo/language/LocaleFactoryTestCase.java | 52 + .../language/detect/AbstractDetectorTestCase.java | 61 + .../process/AbstractTokenizerTestCase.java | 66 + .../language/process/GramSplitterTestCase.java | 150 ++ .../language/process/NormalizationTestCase.java | 35 + .../process/ProcessingExceptionTestCase.java | 27 + .../language/process/SegmenterImplTestCase.java | 43 + .../yahoo/language/process/StemListTestCase.java | 73 + .../yahoo/language/process/StemModeTestCase.java | 27 + .../language/process/StemmerImplTestCase.java | 68 + .../yahoo/language/process/TokenTypeTestCase.java | 38 + .../language/process/TokenizationTestCase.java | 233 ++++ .../language/simple/SimpleDetectorTestCase.java | 89 ++ .../language/simple/SimpleNormalizerTestCase.java | 34 + .../yahoo/language/simple/SimpleTokenTestCase.java | 194 +++ .../language/simple/SimpleTokenTypeTestCase.java | 43 + 
.../language/simple/SimpleTokenizerTestCase.java | 36 + .../language/simple/SimpleTransformerTestCase.java | 40 + .../com/yahoo/language/simple/TokenizerTester.java | 69 + 68 files changed, 12313 insertions(+) create mode 100644 linguistics/src/main/java/com/yahoo/language/Language.java create mode 100644 linguistics/src/main/java/com/yahoo/language/Linguistics.java create mode 100644 linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java create mode 100644 linguistics/src/main/java/com/yahoo/language/LocaleFactory.java create mode 100644 linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java create mode 100644 linguistics/src/main/java/com/yahoo/language/detect/Detection.java create mode 100644 linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java create mode 100644 linguistics/src/main/java/com/yahoo/language/detect/Detector.java create mode 100644 linguistics/src/main/java/com/yahoo/language/detect/Hint.java create mode 100644 linguistics/src/main/java/com/yahoo/language/detect/package-info.java create mode 100644 linguistics/src/main/java/com/yahoo/language/package-info.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/Normalizer.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/Segmenter.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/StemList.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/StemMode.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/Stemmer.java create mode 100644 
linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/Token.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/TokenScript.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/TokenType.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/Transformer.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/package-info.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java create 
mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java create mode 100644 linguistics/src/main/java/com/yahoo/language/simple/package-info.java create mode 100644 linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java create mode 100644 
linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java create mode 100644 linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java (limited to 'linguistics/src') diff --git a/linguistics/src/main/java/com/yahoo/language/Language.java b/linguistics/src/main/java/com/yahoo/language/Language.java new file mode 100644 index 00000000000..0fade0d7299 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/Language.java @@ -0,0 +1,615 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language; + +import com.yahoo.text.Lowercase; + +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +/** + * @author rpito + */ +public enum Language { + + /** Language tag "un". */ + UNKNOWN("un"), + + /** Language tag "ab". */ + ABKHAZIAN("ab"), + + /** Language tag "aa". */ + AFAR("aa"), + + /** Language tag "af". */ + AFRIKAANS("af"), + + /** Language tag "sq". */ + ALBANIAN("sq"), + + /** Language tag "am". */ + AMHARIC("am"), + + /** Language tag "ar". */ + ARABIC("ar"), + + /** Language tag "hy". */ + ARMENIAN("hy"), + + /** Language tag "as". */ + ASSAMESE("as"), + + /** Language tag "ay". */ + AYMARA("ay"), + + /** Language tag "az". */ + AZERBAIJANI("az"), + + /** Language tag "ba". */ + BASHKIR("ba"), + + /** Language tag "eu". */ + BASQUE("eu"), + + /** Language tag "bn". */ + BENGALI("bn"), + + /** Language tag "dz". */ + BHUTANI("dz"), + + /** Language tag "bh". */ + BIHARI("bh"), + + /** Language tag "bi". 
*/ + BISLAMA("bi"), + + /** Language tag "br". */ + BRETON("br"), + + /** Language tag "bug". */ + BUGINESE("bug"), + + /** Language tag "bg". */ + BULGARIAN("bg"), + + /** Language tag "my". */ + BURMESE("my"), + + /** Language tag "be". */ + BYELORUSSIAN("be"), + + /** Language tag "km". */ + CAMBODIAN("km"), + + /** Language tag "ca". */ + CATALAN("ca"), + + /** Language tag "chr". */ + CHEROKEE("chr"), + + /** + * Language tag "zh-hans". + * + * @see #fromLocale(Locale) + */ + CHINESE_SIMPLIFIED("zh-hans"), + + /** + * Language tag "zh-hant". + * + * @see #fromLocale(Locale) + */ + CHINESE_TRADITIONAL("zh-hant"), + + /** Language tag "cop". */ + COPTIC("cop"), + + /** Language tag "co". */ + CORSICAN("co"), + + /** Language tag "hr". */ + CROATIAN("hr"), + + /** Language tag "cs". */ + CZECH("cs"), + + /** Language tag "da". */ + DANISH("da"), + + /** Language tag "div". */ + DIVEHI("div"), + + /** Language tag "nl". */ + DUTCH("nl"), + + /** Language tag "en". */ + ENGLISH("en"), + + /** Language tag "eo". */ + ESPERANTO("eo"), + + /** Language tag "et". */ + ESTONIAN("et"), + + /** Language tag "fo". */ + FAROESE("fo"), + + /** Language tag "fj". */ + FIJI("fj"), + + /** Language tag "fi". */ + FINNISH("fi"), + + /** Language tag "fr". */ + FRENCH("fr"), + + /** Language tag "fy". */ + FRISIAN("fy"), + + /** Language tag "gl". */ + GALICIAN("gl"), + + /** Language tag "ka". */ + GEORGIAN("ka"), + + /** Language tag "de". */ + GERMAN("de"), + + /** Language tag "got". */ + GOTHIC("got"), + + /** Language tag "el". */ + GREEK("el"), + + /** Language tag "kl". */ + GREENLANDIC("kl"), + + /** Language tag "gn". */ + GUARANI("gn"), + + /** Language tag "gu". */ + GUJARATI("gu"), + + /** Language tag "ha". */ + HAUSA("ha"), + + /** + * Language tag "he". + * + * @see #fromLocale(Locale) + */ + HEBREW("he"), + + /** Language tag "hi". */ + HINDI("hi"), + + /** Language tag "hu". */ + HUNGARIAN("hu"), + + /** Language tag "is". 
*/ + ICELANDIC("is"), + + /** + * Language tag "id". + * + * @see #fromLocale(Locale) + */ + INDONESIAN("id"), + + /** Language tag "ia". */ + INTERLINGUA("ia"), + + /** Language tag "ie". */ + INTERLINGUE("ie"), + + /** Language tag "iu". */ + INUKTITUT("iu"), + + /** Language tag "ik". */ + INUPIAK("ik"), + + /** Language tag "ga". */ + IRISH("ga"), + + /** Language tag "it". */ + ITALIAN("it"), + + /** Language tag "ja". */ + JAPANESE("ja"), + + /** Language tag "jw". */ + JAVANESE("jw"), + + /** Language tag "kn". */ + KANNADA("kn"), + + /** Language tag "ks". */ + KASHMIRI("ks"), + + /** Language tag "kk". */ + KAZAKH("kk"), + + /** Language tag "rw". */ + KINYARWANDA("rw"), + + /** Language tag "ky". */ + KIRGHIZ("ky"), + + /** Language tag "rn". */ + KIRUNDI("rn"), + + /** Language tag "ko". */ + KOREAN("ko"), + + /** Language tag "ku". */ + KURDISH("ku"), + + /** Language tag "lo". */ + LAOTHIAN("lo"), + + /** Language tag "la". */ + LATIN("la"), + + /** Language tag "lv". */ + LATVIAN("lv"), + + /** Language tag "ln". */ + LINGALA("ln"), + + /** Language tag "lt". */ + LITHUANIAN("lt"), + + /** Language tag "mk". */ + MACEDONIAN("mk"), + + /** Language tag "mg". */ + MALAGASY("mg"), + + /** Language tag "ms". */ + MALAY("ms"), + + /** Language tag "ml". */ + MALAYALAM("ml"), + + /** Language tag "mt". */ + MALTESE("mt"), + + /** Language tag "mni". */ + MANIPURI("mni"), + + /** Language tag "mi". */ + MAORI("mi"), + + /** Language tag "mr". */ + MARATHI("mr"), + + /** Language tag "mo". */ + MOLDAVIAN("mo"), + + /** Language tag "mn". */ + MONGOLIAN("mn"), + + /** Language tag "mun". */ + MUNDA("mun"), + + /** Language tag "na". */ + NAURU("na"), + + /** Language tag "ne". */ + NEPALI("ne"), + + /** + * Language tag "nb". + * + * @see #fromLocale(Locale) + */ + NORWEGIAN_BOKMAL("nb"), + + /** Language tag "nn". */ + NORWEGIAN_NYNORSK("nn"), + + /** Language tag "oc". */ + OCCITAN("oc"), + + /** Language tag "or". 
*/ + ORIYA("or"), + + /** Language tag "om". */ + OROMO("om"), + + /** Language tag "ps". */ + PASHTO("ps"), + + /** Language tag "fa". */ + PERSIAN("fa"), + + /** Language tag "pl". */ + POLISH("pl"), + + /** Language tag "pt". */ + PORTUGUESE("pt"), + + /** Language tag "pa". */ + PUNJABI("pa"), + + /** Language tag "qu". */ + QUECHUA("qu"), + + /** Language tag "rm". */ + RHAETO_ROMANCE("rm"), + + /** Language tag "ro". */ + ROMANIAN("ro"), + + /** Language tag "ru". */ + RUSSIAN("ru"), + + /** Language tag "sm". */ + SAMOAN("sm"), + + /** Language tag "sg". */ + SANGHO("sg"), + + /** Language tag "sa". */ + SANSKRIT("sa"), + + /** Language tag "gd". */ + SCOTS_GAELIC("gd"), + + /** Language tag "sr". */ + SERBIAN("sr"), + + /** Language tag "sh". */ + SERBO_CROATIAN("sh"), + + /** Language tag "st". */ + SESOTHO("st"), + + /** Language tag "tn". */ + SETSWANA("tn"), + + /** Language tag "sn". */ + SHONA("sn"), + + /** Language tag "ii". */ + SICHUAN_YI("ii"), + + /** Language tag "sd". */ + SINDHI("sd"), + + /** Language tag "si". */ + SINHALESE("si"), + + /** Language tag "ss". */ + SISWATI("ss"), + + /** Language tag "sk". */ + SLOVAK("sk"), + + /** Language tag "sl". */ + SLOVENIAN("sl"), + + /** Language tag "so". */ + SOMALI("so"), + + /** Language tag "es". */ + SPANISH("es"), + + /** Language tag "su". */ + SUNDANESE("su"), + + /** Language tag "sw". */ + SWAHILI("sw"), + + /** Language tag "sv". */ + SWEDISH("sv"), + + /** Language tag "syr". */ + SYRIAC("syr"), + + /** Language tag "fil". */ + TAGALOG("fil"), + + /** Language tag "tg". */ + TAJIK("tg"), + + /** Language tag "ta". */ + TAMIL("ta"), + + /** Language tag "tt". */ + TATAR("tt"), + + /** Language tag "te". */ + TELUGU("te"), + + /** Language tag "th". */ + THAI("th"), + + /** Language tag "bo". */ + TIBETAN("bo"), + + /** Language tag "ti". */ + TIGRINYA("ti"), + + /** Language tag "to". */ + TONGA("to"), + + /** Language tag "ts". */ + TSONGA("ts"), + + /** Language tag "tr". 
*/ + TURKISH("tr"), + + /** Language tag "tk". */ + TURKMEN("tk"), + + /** Language tag "tw". */ + TWI("tw"), + + /** Language tag "uga". */ + UGARITIC("uga"), + + /** Language tag "ug". */ + UIGHUR("ug"), + + /** Language tag "uk". */ + UKRAINIAN("uk"), + + /** Language tag "ur". */ + URDU("ur"), + + /** Language tag "uz". */ + UZBEK("uz"), + + /** Language tag "vi". */ + VIETNAMESE("vi"), + + /** Language tag "vo". */ + VOLAPUK("vo"), + + /** Language tag "cy". */ + WELSH("cy"), + + /** Language tag "wo". */ + WOLOF("wo"), + + /** Language tag "xh". */ + XHOSA("xh"), + + /** + * Language tag "yi". + * + * @see #fromLocale(Locale) + */ + YIDDISH("yi"), + + /** Language tag "yo". */ + YORUBA("yo"), + + /** Language tag "za". */ + ZHUANG("za"), + + /** Language tag "zu". */ + ZULU("zu"); + + private static final Map index = new HashMap<>(); + private final String code; + + static { + for (Language language : values()) { + index.put(language.code, language); + } + } + + private Language(String code) { + this.code = code; + } + + public String languageCode() { + return code; + } + + /** + * Returns whether this is a "cjk" language. CJK is here not a linguistic term, it is basically whether the language + * has loose word order and a non-rigid use of space. + * + * @return True if this is a CJK language. + */ + public boolean isCjk() { + switch (this) { + case CHINESE_SIMPLIFIED: + case CHINESE_TRADITIONAL: + case JAPANESE: + case KOREAN: + case THAI: + return true; + default: + return false; + } + } + + /** + *

Convenience method for calling fromLocale(LocaleFactory.fromLanguageTag(languageTag)).

+ * + * @param languageTag The language tag for which the Language to return. + * @return The corresponding Language, or {@link #UNKNOWN} if not known. + */ + public static Language fromLanguageTag(String languageTag) { + if (languageTag == null) { + return UNKNOWN; + } + return fromLocale(LocaleFactory.fromLanguageTag(languageTag)); + } + + /** + *

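Returns the Language whose {@link #languageCode()} is equal to locale.getLanguage(), with + * the following additions: "in" maps to {@link #INDONESIAN}, "iw" to {@link #HEBREW}, "ji" to {@link #YIDDISH}, "no" to {@link #NORWEGIAN_BOKMAL}, and "zh" to {@link #CHINESE_SIMPLIFIED} when the country is "cn" or the variant is "hans" (both compared case-insensitively) and to {@link #CHINESE_TRADITIONAL} otherwise.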

+ * + * + * @param locale The locale for which the Language to return. + * @return The corresponding Language, or {@link #UNKNOWN} if not known. + */ + public static Language fromLocale(Locale locale) { + String str = locale.getLanguage(); + if (str.equals("in")) { + return INDONESIAN; // Locale converts 'id' to 'in' + } + if (str.equals("iw")) { + return HEBREW; // Locale converts 'he' to 'iw' + } + if (str.equals("ji")) { + return YIDDISH; // Locale converts 'yi' to 'ji' + } + if (str.equals("no")) { + return NORWEGIAN_BOKMAL; // alias for 'nb' + } + if (str.equals("zh")) { + if (locale.getCountry().equalsIgnoreCase("cn") || + locale.getVariant().equalsIgnoreCase("hans")) { + return CHINESE_SIMPLIFIED; + } + return CHINESE_TRADITIONAL; + } + Language ret = index.get(str); + return ret != null ? ret : UNKNOWN; + } + + /** + * Returns the language from an encoding, or {@link #UNKNOWN} if it cannot be determined. + * + * @param encoding The name of the encoding to derive the Language from. + * @return the language given by the encoding, or {@link #UNKNOWN} if not determined. 
+ */ + public static Language fromEncoding(String encoding) { + if (encoding == null) { + return UNKNOWN; + } + return fromLowerCasedEncoding(Lowercase.toLowerCase(encoding)); + } + + private static Language fromLowerCasedEncoding(String encoding) { + if (encoding.equals("gb2312")) { + return CHINESE_SIMPLIFIED; + } + if (encoding.equals("big5")) { + return CHINESE_TRADITIONAL; + } + if (encoding.equals("euc-jp") || + encoding.equals("iso-2022-jp") || + encoding.equals("shift-jis")) { + return JAPANESE; + } + if (encoding.equals("euc-kr")) { + return KOREAN; + } + return UNKNOWN; + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java new file mode 100644 index 00000000000..7a6e224b221 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java @@ -0,0 +1,101 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language; + +import com.yahoo.collections.Tuple2; +import com.yahoo.component.Version; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.process.CharacterClasses; +import com.yahoo.language.process.GramSplitter; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.Segmenter; +import com.yahoo.language.process.Stemmer; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; +import com.yahoo.language.simple.SimpleLinguistics; + +/** + *

Factory of linguistic processors. For technical reasons this provides more flexibility to provide separate + * components for different operations than is needed in many cases; in particular the tokenizer should typically + * stem, transform and normalize using the same operations as provided directly by this. A set of adaptors is + * provided that makes this easy to achieve. Refer to the {@link com.yahoo.language.simple.SimpleLinguistics} implementation + * to set this up.

+ * + *

Thread safety: Instances of this factory type must be thread safe but the processors + * returned by the factory methods need not be. Clients should request separate processor instances + * for each thread.

+ * + * @author Mathias Moelster Lidal + * @author Simon Thoresen + * @author bratseth + */ +public interface Linguistics { + + enum Component { + STEMMER, + TOKENIZER, + NORMALIZER, + TRANSFORMER, + SEGMENTER, + DETECTOR, + GRAM_SPLITTER, + CHARACTER_CLASSES + } + + /** The same as new com.yahoo.language.simple.SimpleLinguistics(). Prefer using that directly. */ + Linguistics SIMPLE = new SimpleLinguistics(); + + /** + * Returns a thread-unsafe stemmer or lemmatizer. + * This is used at query time to do stemming of search terms to indexes which contain text tokenized + * with stemming turned on. + */ + Stemmer getStemmer(); + + /** + * Returns a thread-unsafe tokenizer. + * This is used at indexing time to produce an optionally stemmed and + * transformed (accent normalized) stream of indexable tokens. + */ + Tokenizer getTokenizer(); + + /** Returns a thread-unsafe normalizer. This is used at query time to cjk normalize query text. */ + Normalizer getNormalizer(); + + /** + * Returns a thread-unsafe transformer. + * This is used at query time to do accent normalization of search terms to indexes which contain text tokenized + * with accent normalization turned on. + */ + Transformer getTransformer(); + + /** + * Returns a thread-unsafe segmenter. + * This is used at query time to find the individual semantic components of search terms to indexes + * tokenized with segmentation. + */ + Segmenter getSegmenter(); + + /** + * Returns a thread-unsafe detector. + * The language of the text is a parameter to other linguistic operations. + * This is used to determine the language of a query or document field when not specified explicitly. + */ + Detector getDetector(); + + /** + * Returns a thread-unsafe gram splitter. + * This is used to split query or document text into fixed-length grams which allows matching without needing + * or using segmented tokens. + */ + GramSplitter getGramSplitter(); + + /** Returns a thread-unsafe character classes instance. 
*/ + CharacterClasses getCharacterClasses(); + + /** + * Returns the name and version of a processor component returned by + * this instance. + */ + Tuple2 getVersion(Linguistics.Component component); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java new file mode 100644 index 00000000000..a34ec9386b0 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java @@ -0,0 +1,31 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language; + +import com.yahoo.text.Lowercase; + +import java.util.Locale; + +/** + * This class provides a case normalization operation to be used e.g. when + * document search should be case insensitive. + * + * @author Simon Thoresen + */ +public class LinguisticsCase { + + /** + *

The lower casing method to use in Vespa when doing language independent processing of natural language data. + * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.

+ *

Return a lowercased version of the given string. Since this is language independent, this is more of a case + * normalization operation than lowercasing.

+ * + * @param in The string to lowercase. + * @return A string containing only lowercase characters. + */ + public static String toLowerCase(String in) { + // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29 + // Also, at the time of writing, English is the default language for queries + return Lowercase.toLowerCase(in); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java new file mode 100644 index 00000000000..2610550dfd2 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java @@ -0,0 +1,57 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language; + +import java.util.Locale; + +/** + * @author Simon Thoresen + */ +public final class LocaleFactory { + + private static final Locale UNKNOWN = new Locale("", "", ""); + + private LocaleFactory() { + // hide + } + + /** + *

Implements a simple parser for RFC5646 language tags. The language tag is parsed into a Locale.

+ * + * @param tag The language tag to parse. + * @return The corresponding Locale. + */ + @SuppressWarnings("ConstantConditions") + public static Locale fromLanguageTag(String tag) { + // TODO: Should be replaced by return Locale.forLanguageTag(tag); ? + + tag.getClass(); // throws NullPointerException + tag = tag.trim(); + if (tag.isEmpty()) { + return UNKNOWN; + } + String language = ""; + String region = ""; + String script = ""; + String[] parts = tag.split("-"); + for (int partIdx = 0; partIdx < parts.length; ++partIdx) { + String part = parts[partIdx]; + int partLen = part.length(); + if (partIdx == 0) { + if (partLen == 2 || partLen == 3) { + language = part; + } + } else if (partIdx == 1 || partIdx == 2) { + if (partLen == 2 || partLen == 3) { + region = part; + } else if (partLen == 4) { + script = part; + } + } + } + if (language.isEmpty()) { + return UNKNOWN; + } + return new Locale(language, region, script); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java b/linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java new file mode 100644 index 00000000000..f80f876d248 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java @@ -0,0 +1,25 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.detect; + +import com.yahoo.text.Utf8; + +import java.nio.ByteBuffer; + +/** + * @author Simon Thoresen + */ +public abstract class AbstractDetector implements Detector { + + @Override + public final Detection detect(String input, Hint hint) { + byte[] buf = Utf8.toBytes(input); + return detect(buf, 0, buf.length, hint); + } + + @Override + public final Detection detect(ByteBuffer input, Hint hint) { + byte[] buf = new byte[input.remaining()]; + input.get(buf, 0, buf.length); + return detect(buf, 0, buf.length, hint); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Detection.java b/linguistics/src/main/java/com/yahoo/language/detect/Detection.java new file mode 100644 index 00000000000..e70d70425d4 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/detect/Detection.java @@ -0,0 +1,47 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.detect; + +import com.yahoo.language.Language; + +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; + +/** + * @author Einar M R Rosenvinge + */ +public class Detection { + + private final Language language; + private final String encodingName; + private final boolean local; + + public Detection(Language language, String encodingName, boolean local) { + this.language = language; + this.encodingName = encodingName; + this.local = local; + } + + public Language getLanguage() { + return language; + } + + public Charset getEncoding() { + if (encodingName == null) { + return null; + } + try { + return Charset.forName(encodingName); + } catch (UnsupportedCharsetException e) { + // ignore + } + return null; + } + + public String getEncodingName() { + return encodingName; + } + + public boolean isLocal() { + return local; + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java 
b/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java new file mode 100644 index 00000000000..c97895387fe --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java @@ -0,0 +1,14 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.detect; + +/** + * Exception that is thrown when detection fails. + * + * @author Einar M R Rosenvinge + */ +public final class DetectionException extends RuntimeException { + + public DetectionException(String str) { + super(str); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Detector.java b/linguistics/src/main/java/com/yahoo/language/detect/Detector.java new file mode 100644 index 00000000000..4962d761a5a --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/detect/Detector.java @@ -0,0 +1,44 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.detect; + +import java.nio.ByteBuffer; + +/** + * Interface implemented by all Detectors used for language and encoding detection. + * + * @author Einar M R Rosenvinge + */ +public interface Detector { + + /** + * Detects language and encoding of the supplied byte array, possibly using a language/encoding hint. + * + * @param input the buffer that is to be inspected + * @param offset the offset to detect from + * @param length the size to detect from + * @param hint a hint to the detector, or null for no hint + * @return a single {@link Detection} holding the detected language and encoding + * @throws DetectionException if detection fails + */ + public abstract Detection detect(byte[] input, int offset, int length, Hint hint); + + /** + * Detects language and encoding of the supplied ByteBuffer, possibly using a language/encoding hint. 
+ * + * @param input the buffer that is to be inspected, from its current position to its limit + * @param hint a hint to the detector, or null for no hint + * @return a single {@link Detection} holding the detected language and encoding + * @throws DetectionException if detection fails + */ + public abstract Detection detect(ByteBuffer input, Hint hint); + + /** + * Detects language of the supplied String, possibly using a language hint. + * + * @param input the string that is to be inspected + * @param hint a hint to the detector, or null for no hint + * @return a single {@link Detection} holding the detected language and encoding + * @throws DetectionException if detection fails + */ + public abstract Detection detect(String input, Hint hint); +} diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Hint.java b/linguistics/src/main/java/com/yahoo/language/detect/Hint.java new file mode 100644 index 00000000000..c3fad8bc260 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/detect/Hint.java @@ -0,0 +1,38 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.detect; + +/** + *

A hint that can be given to a {@link Detector}.

+ * + * @author Einar M R Rosenvinge + */ +public class Hint { + + private final String market; + private final String country; + + private Hint(String market, String country) { + this.market = market; + this.country = country; + } + + public String getMarket() { + return market; + } + + public String getCountry() { + return country; + } + + public static Hint newMarketHint(String market) { + return new Hint(market, null); + } + + public static Hint newCountryHint(String country) { + return new Hint(null, country); + } + + public static Hint newInstance(String market, String country) { + return new Hint(market, country); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/detect/package-info.java b/linguistics/src/main/java/com/yahoo/language/detect/package-info.java new file mode 100644 index 00000000000..3ab6309e9e2 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/detect/package-info.java @@ -0,0 +1,7 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +@ExportPackage +@PublicApi +package com.yahoo.language.detect; + +import com.yahoo.api.annotations.PublicApi; +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/linguistics/src/main/java/com/yahoo/language/package-info.java b/linguistics/src/main/java/com/yahoo/language/package-info.java new file mode 100644 index 00000000000..2f5638d6b70 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/package-info.java @@ -0,0 +1,7 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+@ExportPackage +@PublicApi +package com.yahoo.language; + +import com.yahoo.api.annotations.PublicApi; +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java new file mode 100644 index 00000000000..0e1327aabcf --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java @@ -0,0 +1,55 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * Determines the class of a given character. Use this rather than java.lang.Character. + * + * @author bratseth + */ +public class CharacterClasses { + + /** + * Returns true for code points which are letters in unicode 3 or 4, plus some additional characters + * which are useful to view as letters even though not defined as such in unicode. + */ + public boolean isLetter(int c) { + if (java.lang.Character.isLetter(c)) return true; + if (Character.isDigit(c) && ! 
isLatin(c)) return true; // Not considering these digits, so treat them as letters + // if (c == '_') return true; + + // Ticket 3864695, some CJK punctuation YST defined as word characters + if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' || + c == '\u300c' || c == '\u300d' || c == '\u300e' || + c == '\u300f' || c == '\u3010' || c == '\u3011') { + return true; + } + int type = java.lang.Character.getType(c); + return type == java.lang.Character.NON_SPACING_MARK || + type == java.lang.Character.COMBINING_SPACING_MARK || + type == java.lang.Character.ENCLOSING_MARK; + } + + /** + * Returns true for code points which should be considered digits - same as java.lang.Character.isDigit + */ + public boolean isDigit(int c) { + return Character.isDigit(c); + } + + /** Returns true if this is a latin digit (other digits are not consistently parsed into numbers by Java) */ + public boolean isLatinDigit(int c) { + return Character.isDigit(c) && isLatin(c); + } + + /** Returns true if this is a latin character */ + public boolean isLatin(int c) { + return Character.UnicodeBlock.of(c).equals(Character.UnicodeBlock.BASIC_LATIN); + } + + /** + * Convenience, returns isLetter(c) || isDigit(c) + */ + public boolean isLetterOrDigit(int c) { + return isLetter(c) || isDigit(c); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java new file mode 100644 index 00000000000..0672582d732 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -0,0 +1,222 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import java.util.*; + +/** + * A class which splits consecutive word character sequences into overlapping character n-grams. 
+ * For example "en gul bille sang" split into 2-grams becomes + * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang". + *

+ * This class is multithread safe. + * + * @author bratseth + */ +public class GramSplitter { + + private final CharacterClasses characterClasses; + + public GramSplitter(CharacterClasses characterClasses) { + this.characterClasses = characterClasses; + } + + /** + * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length] + * pairs into the input string. + *

+ * The iterator is implemented as a sliding view over the input string rather than being backed by a + * list, which makes this space efficient for large strings. + * + * @param input the input string to be split, cannot be null + * @param n the gram size, a positive integer + * @return a read only iterator over the resulting grams + * @throws NullPointerException if input==null + * @throws IllegalArgumentException if n is less than 1 + */ + public GramSplitterIterator split(String input, int n) { + if (input == null) { + throw new NullPointerException("input cannot be null"); + } + if (n < 1) { + throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n); + } + return new GramSplitterIterator(input, n, characterClasses); + } + + public static class GramSplitterIterator implements Iterator { + + private final CharacterClasses characterClasses; + + /** + * Text to split + */ + private final String input; + + /** + * Gram size + */ + private final int n; + + /** + * Current index + */ + private int i = 0; + + /** + * Whether the last thing that happened was being on a separator (including the start of the string) + */ + private boolean isFirstAfterSeparator = true; + + /** + * The next gram or null if not determined yet + */ + private Gram nextGram = null; + + public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) { + this.input = input; + this.n = n; + this.characterClasses = characterClasses; + } + + @Override + public boolean hasNext() { + if (nextGram != null) { + return true; + } + nextGram = findNext(); + return nextGram != null; + } + + @Override + public Gram next() { + Gram currentGram = nextGram; + if (currentGram == null) { + currentGram = findNext(); + } + if (currentGram == null) { + throw new NoSuchElementException("No next gram at position " + i); + } + nextGram = null; + return currentGram; + } + + private Gram findNext() { + // Skip to next word character + while (i < input.length() && 
!characterClasses.isLetterOrDigit(input.codePointAt(i))) { + i++; + isFirstAfterSeparator = true; + } + if (i >= input.length()) { + return null; + } + + String gram = input.substring(i, Math.min(i + n, input.length())); + int nonWordChar = indexOfNonWordChar(gram); + if (nonWordChar == 0) { + throw new RuntimeException("Programming error"); + } + if (nonWordChar > 0) { + gram = gram.substring(0, nonWordChar); + } + + if (gram.length() == n) { // normal case: got a full length gram + i++; + isFirstAfterSeparator = false; + return new Gram(i - 1, gram.length()); + } else { // gram is too short due either to a non-word separator or end of string + if (isFirstAfterSeparator) { // make a gram anyway + i++; + isFirstAfterSeparator = false; + return new Gram(i - 1, gram.length()); + } else { // skip to next + i += gram.length() + 1; + isFirstAfterSeparator = true; + return findNext(); + } + } + } + + private int indexOfNonWordChar(String s) { + for (int i = 0; i < s.length(); i++) { + if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) { + return i; + } + } + return -1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("This iterator is read only"); + } + + /** + * Convenience list which splits the remaining items in this iterator into a list of gram strings + * + * @return an immutable list of extracted grams + */ + public List toExtractedList() { + List gramList = new ArrayList<>(); + while (hasNext()) { + gramList.add(next().extractFrom(input)); + } + return Collections.unmodifiableList(gramList); + } + } + + /** + * An immutable start index and length pair + */ + public static final class Gram { + + private int start, length; + + public Gram(int start, int length) { + this.start = start; + this.length = length; + } + + public int getStart() { + return start; + } + + public int getLength() { + return length; + } + + /** + * Returns this gram as a string from the input string + */ + public String extractFrom(String input) { + 
return input.substring(start, start + length); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof Gram)) { + return false; + } + + Gram gram = (Gram)o; + + if (length != gram.length) { + return false; + } + if (start != gram.start) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = start; + result = 31 * result + length; + return result; + } + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java new file mode 100644 index 00000000000..f4e1ccc9feb --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java @@ -0,0 +1,19 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + *

This interface provides NFKC normalization of Strings through the underlying linguistics library.

+ * + * @author Mathias M\u00F8lster Lidal + */ +public interface Normalizer { + + /** + *

NFKC normalizes a String.

+ * + * @param input String to normalize. + * @return The normalized String. + * @throws ProcessingException If underlying library throws an Exception. + */ + public String normalize(String input); +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java new file mode 100644 index 00000000000..ce8b455707c --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java @@ -0,0 +1,18 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + *

Exception class indicating that a fatal error occured during linguistic processing.

+ * + * @author Simon Thoresen Hult + */ +public class ProcessingException extends RuntimeException { + + public ProcessingException(String message) { + super(message); + } + + public ProcessingException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java new file mode 100644 index 00000000000..73764e06ef6 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java @@ -0,0 +1,29 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.List; + +/** + *

Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a + * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK + * processing).

+ * + * @author Mathias Mølster Lidal + */ +public interface Segmenter { + + /** + * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized + * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only + * contains word-characters, any punctuation and spacing tokens will be removed. + * + * @param input the text to segment. + * @param language language of input text. + * @return the list of segments. + * @throws ProcessingException if an exception is encountered during processing + */ + List segment(String input, Language language); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java new file mode 100644 index 00000000000..146d65cb7e2 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java @@ -0,0 +1,45 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author Simon Thoresen Hult + */ +public class SegmenterImpl implements Segmenter { + + private final Tokenizer tokenizer; + + public SegmenterImpl(Tokenizer tokenizer) { + this.tokenizer = tokenizer; + } + + @Override + public List segment(String input, Language language) { + List segments = new ArrayList<>(); + for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false)) { + findSegments(token, segments); + } + if (segments.isEmpty()) { + segments.add(input); // no segments, return original string + } + return segments; + } + + private void findSegments(Token token, List out) { + int len; + if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) { + if (token.isIndexable()) { + out.add(token.getOrig()); + } + } else { + for (int i = 0; i < len; ++i) { + findSegments(token.getComponent(i), out); + } + } + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemList.java b/linguistics/src/main/java/com/yahoo/language/process/StemList.java new file mode 100644 index 00000000000..d355af87f08 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/StemList.java @@ -0,0 +1,61 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import java.util.AbstractList; +import java.util.ArrayList; + +/** + * A list of strings which does not allow for duplicate elements. + * + * @author steinar + */ +public class StemList extends AbstractList { + private final ArrayList stems; + + public StemList() { + this(new String[0]); + } + + public StemList(String... 
stems) { + super(); + this.stems = new ArrayList<>(Math.max(stems.length, 3)); + for (String word : stems) { + add(word); + } + } + + @Override + public String get(int i) { + return stems.get(i); + } + + @Override + public int size() { + return stems.size(); + } + + @Override + public String set(int i, String element) { + int existing = stems.indexOf(element); + if (existing >= 0 && existing != i) { + // the element already exists + return element; + } else { + return stems.set(i, element); + } + } + + @Override + public void add(int i, String element) { + int existing = stems.indexOf(element); + if (existing < 0) { + stems.add(i, element); + } + } + + @Override + public String remove(int i) { + return stems.remove(i); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java new file mode 100644 index 00000000000..269b08dcdf7 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java @@ -0,0 +1,45 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * An enum of the stemming modes which can be requested. + * Stemming implementation may support a smaller number of modes by mapping a mode to a more + * inclusive alternative. 
+ * + * @author Mathias Mølster Lidal + */ +public enum StemMode { + + NONE(0), + DEFAULT(1), + ALL(2), + SHORTEST(4), + BEST(5); + + private final int value; + + StemMode(int value) { + this.value = value; + } + + /** + * Returns the stem mode as an int + * + * @deprecated do not use + */ + @Deprecated + public int getValue() { + return value; + } + + @Deprecated + public static StemMode valueOf(int value) { + for (StemMode mode : values()) { + if (mode.value == value) { + return mode; + } + } + return NONE; + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java new file mode 100644 index 00000000000..739fd1d9e96 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java @@ -0,0 +1,26 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.List; + +/** + *

Interface providing stemming of single words.

+ * + * @author Mathias Mølster Lidal + */ +public interface Stemmer { + + /** + * Stem input according to specified stemming mode. + * + * @param input the string to stem. + * @param mode the stemming mode + * @param language the language to use for stemming + * @return list of possible stems. Empty if none. + * @throws ProcessingException thrown if there is an exception stemming this input + */ + List stem(String input, StemMode mode, Language language); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java new file mode 100644 index 00000000000..0d175a2bf3e --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java @@ -0,0 +1,46 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author Simon Thoresen Hult + */ +public class StemmerImpl implements Stemmer { + + private final Tokenizer tokenizer; + + public StemmerImpl(Tokenizer tokenizer) { + this.tokenizer = tokenizer; + } + + @Override + public List stem(String input, StemMode stemMode, Language language) { + List stems = new ArrayList<>(); + for (Token token : tokenizer.tokenize(input, language, stemMode, false)) { + findStems(token, stems); + } + return stems; + } + + private void findStems(Token token, List out) { + int len; + if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) { + if (token.isIndexable()) { + StemList word = new StemList(); + word.add(token.getTokenString()); // takes care of getStem(0) + for (int i = 1; i < token.getNumStems(); i++) { + word.add(token.getStem(i)); + } + out.add(word); + } + } else { + for (int i = 0; i < len; ++i) { + findStems(token.getComponent(i), out); + } + } + } +} diff --git 
a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java new file mode 100644 index 00000000000..f1dc6639e11 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java @@ -0,0 +1,56 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * Interface providing access to a single token produced by the tokenizer. + * + * @author Mathias Mølster Lidal + */ +public interface Token { + + /** Returns the type of this token - word, space or punctuation etc. */ + TokenType getType(); + + /** Returns the original form of this token */ + String getOrig(); + + /** Returns the number of stem forms available for this token. */ + int getNumStems(); + + /** Returns the stem at position i */ + String getStem(int i); + + /** + * Returns the number of components, if this token is a compound word + * (e.g. german "kommunikationsfehler". Otherwise, return 0 + * + * @return number of components, or 0 if none + */ + int getNumComponents(); + + /** Returns a component token of this */ + Token getComponent(int i); + + /** Returns the offset position of this token */ + long getOffset(); + + /** Returns the script of this token */ + TokenScript getScript(); + + /** + * Returns token string in a form suitable for indexing: The + * most lowercased variant of the most processed token form available. + * If called on a compound token this returns a lowercased form of the + * entire word. + * + * @return token string value + */ + String getTokenString(); + + /** Returns whether this is an instance of a declared special token (e.g. 
c++) */ + boolean isSpecialToken(); + + /** Whether this token should be indexed */ + boolean isIndexable(); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java new file mode 100644 index 00000000000..ba0ad89b454 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java @@ -0,0 +1,77 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * List of token scripts (e.g. latin, japanese, chinese, etc.) which may warrant different + * linguistics treatment. + * + * @author Mathias Mølster Lidal + */ +public enum TokenScript { + + COMMON, + LATIN, + GREEK, + CYRILLIC, + ARMENIAN, + HEBREW, + ARABIC, + SYRIAC, + THAANA, + DEVANAGARI, + BENGALI, + GURMUKHI, + GUJARATI, + ORIYA, + TAMIL, + TELUGU, + KANNADA, + MALAYALAM, + SINHALA, + THAI, + LAO, + TIBETAN, + MYANMAR, + GEORGIAN, + HANGUL, + ETHIOPIC, + CHEROKEE, + CANADIAN, + OGHAM, + RUNIC, + KHMER, + MONGOLIAN, + HIRAGANA, + KATAKANA, + CHINESE, + HAN, + YI, + OLDITALIC, + GOTHIC, + DESERET, + INHERITED, + TAGALOG, + HANUNOO, + BUHID, + TAGBANWA, + LIMBU, + TAILE, + LINEARB, + UGARITIC, + SHAVIAN, + OSMANYA, + CYPRIOT, + BRAILLE, + ASCII, + BUGINESE, + COPTIC, + GLAGOLITIC, + KHAROSHTHI, + OLDPERSIAN, + SYLOTINAGRI, + TAILUE, + TIFINAGH, + VIETNAMESE, + UNKNOWN; + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java new file mode 100644 index 00000000000..7d880440f1e --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java @@ -0,0 +1,51 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * An enumeration of token types. 
+ * + * @author Mathias Mølster Lidal + */ +public enum TokenType { + + UNKNOWN(0), + SPACE(1), + PUNCTUATION(2), + SYMBOL(3), + ALPHABETIC(4), + NUMERIC(5), + MARKER(255); + + private final int value; + + TokenType(int value) { + this.value = value; + } + + /** Returns an int code for this type */ + public int getValue() { return value; } + + /** + * Marker for whether this type of token can be indexed for search. + * Note that a Token can be excluded from an index, even though the token type marks + * it as indexable. + * + * @see com.yahoo.language.process.Token#isIndexable() + * @return whether this type of token can be indexed + */ + public boolean isIndexable() { + switch (this) { + case ALPHABETIC: case NUMERIC: return true; + default: return false; + } + } + + /** Translates this from the int code representation returned from {@link #getValue} */ + public static TokenType valueOf(int value) { + for (TokenType type : values()) { + if (value == type.value) return type; + } + return UNKNOWN; + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java new file mode 100644 index 00000000000..d7d1e210de4 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java @@ -0,0 +1,38 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +/** + * Language-sensitive tokenization of a text string. + * + * @author Mathias Mølster Lidal + */ +public interface Tokenizer { + + /** + * Returns the tokens produced from an input string under the rules of the given Language and additional options + * + * @param input the string to tokenize. May be arbitrarily large. + * @param language the language of the input string. 
+ * @param stemMode the stem mode applied on the returned tokens + * @param removeAccents if true accents and similar are removed from the returned tokens + * @return the tokens of the input String. + * @throws ProcessingException If the underlying library throws an Exception. + */ + Iterable tokenize(String input, Language language, StemMode stemMode, boolean removeAccents); + + /** + * Return a replacement for an input token string. + * This accepts strings returned by Token.getTokenString + * and returns a replacement which will be used as the index token. + * The input token string is returned if there is no replacement. + *

+ * This default implementation always returns the input token string. + * + * @param tokenString the token string of the term to lookup a replacement for + * @return the replacement, if any, or the argument token string if not + */ + default String getReplacementTerm(String tokenString) { return tokenString; } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java new file mode 100644 index 00000000000..4d288aafaca --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java @@ -0,0 +1,23 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; + +/** + * Interface for providers of text transformations such as accent removal. + * + * @author Mathias Mølster Lidal + */ +public interface Transformer { + + /** + * Remove accents from input text. + * + * @param input text to transform. + * @param language language of input text. + * @return text with accents removed, or input-text if the feature is unavailable + * @throws ProcessingException thrown if there is an exception stemming this input + */ + String accentDrop(String input, Language language); + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/package-info.java b/linguistics/src/main/java/com/yahoo/language/process/package-info.java new file mode 100644 index 00000000000..de8d82fcf36 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/package-info.java @@ -0,0 +1,7 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+@ExportPackage +@PublicApi +package com.yahoo.language.process; + +import com.yahoo.api.annotations.PublicApi; +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java new file mode 100644 index 00000000000..eca35772296 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -0,0 +1,179 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.text.Utf8; + +import java.nio.ByteBuffer; + +/** + *

Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese, + * Japanese and Korean are supported. There are two ways to guess a String's langCode, by encoding and by character + * set. If the encoding is available this is a very good indication of the langCode. If the encoding is not available, + * then the actual characters in the string can be used to make an educated guess at the String's langCode. Recall a + * String in Java is unicode. Therefore, we can simply look at the unicode blocks of the characters in the string. + * Unfortunately, its not 100% fool-proof. From what I've been able to determine, Korean characters do not overlap with + * Japanese or Chinese characters, so their presence is a good indication of Korean. If a string contains phonetic + * japanese, this is a good indication of Japanese. However, Japanese and Chinese characters occupy many of the same + * character blocks, so if there are no definitive signs of Japanese then it is assumed that the String is Chinese.

+ + * @author Einar M R Rosenvinge + */ +public class SimpleDetector implements Detector { + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false); + } + + @Override + public Detection detect(ByteBuffer input, Hint hint) { + byte[] buf = new byte[input.remaining()]; + input.get(buf, 0, buf.length); + return detect(buf, 0, buf.length, hint); + } + + @Override + public Detection detect(String input, Hint hint) { + return new Detection(guessLanguage(input), Utf8.getCharset().name(), false); + } + + public static Language guessLanguage(byte[] buf, int offset, int length) { + return guessLanguage(Utf8.toString(buf, offset, length)); + } + + public static Language guessLanguage(String input) { + if (input == null || input.length() == 0) { + return Language.UNKNOWN; + } + + // used to record the current theory of language guess, in case of ambiguous characters, such as Chinese + Language soFar = Language.UNKNOWN; + for (int i = 0; i < input.length(); i++) { + char c = input.charAt(i); + Character.UnicodeBlock block = Character.UnicodeBlock.of(c); + + // Check some special cases for Korean. Korean doesn't + // overlap with Japanese or Chinese, so this is a good test. + if ((c >= 0x3200 && c < 0x3220) || // parenthesized hangul + (c >= 0x3260 && c < 0x3280) || // circled hangul + (c >= 0xFFA0 && c < 0xFFE0) || // halfwidth hangul + (c == 0x302E || c == 0x302F) || // hangul tone mark + + // standard Hangul character blocks + block == Character.UnicodeBlock.HANGUL_SYLLABLES || + block == Character.UnicodeBlock.HANGUL_JAMO || + block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) { + return Language.KOREAN; + } + // katakana phonetic extensions. 
+ if (0x31f0 <= c && c <= 0x31ff) { + // See http://www.unicode.org/charts/PDF/U31F0.pdf + // This is a special case because This range of character + // codes is classified as unasigned in + // Character.UnicodeBlock. But clearly it is assigned as + // per above. + return Language.JAPANESE; + } + if (0x31f0 <= c && c <= 0x31ff || // these are standard character blocks for japanese characters. + block == Character.UnicodeBlock.HIRAGANA || + block == Character.UnicodeBlock.KATAKANA || + block == Character.UnicodeBlock.KANBUN) { + // See http://www.unicode.org/charts/PDF/U31F0.pdf + // This is a special case because This range of character + // codes is classified as unasigned in + // Character.UnicodeBlock. But clearly it is assigned as + // per above. + return Language.JAPANESE; + } + if (block == Character.UnicodeBlock.CJK_COMPATIBILITY || + block == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS || + block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS || + block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT || + block == Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT || + block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || + block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || + block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || + block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B) { + // seeing one of these chars, we assume that the text is Chinese, until more concrete evidence is found + soFar = Language.CHINESE_TRADITIONAL; + } + if (block == Character.UnicodeBlock.BOPOMOFO || + block == Character.UnicodeBlock.BOPOMOFO_EXTENDED) { + return Language.CHINESE_TRADITIONAL; + } + if (block == Character.UnicodeBlock.THAI) { + return Language.THAI; + } + } + // got to the end, so return the current best guess + return soFar; + } + + private boolean isTrailingOctet(byte i) { + return ((i >>> 6) & 3) == 2; + } + + // If UTF-8, how many trailing octets are expected? 
+ private int isLeadingFor(byte c) { + int i = c & 0xff; + if ((i & (1 << 7)) == 0) { + return 0; + } else if ((i >>> 5) == ((1 << 3) - 2)) { + return 1; + } else if ((i >>> 4) == ((1 << 4) - 2)) { + return 2; + } else if ((i >>> 3) == ((1 << 5) - 2)) { + return 3; + } else if ((i >>> 2) == ((1 << 6) - 2)) { + return 4; + } else if ((i >>> 1) == ((1 << 7) - 2)) { + return 5; + } else { + return -1; + } + } + + private String guessEncoding(byte[] input) { + boolean isUtf8 = true; + boolean hasHighs = false; + scan: + for (int i = 0; i < input.length; i++) { + final int l = isLeadingFor(input[i]); + if (l < 0 || i + l >= input.length) { + hasHighs = true; + isUtf8 = false; + break; + } + switch (l) { + case 0: + break; + case 5: + isUtf8 = isTrailingOctet(input[++i]); + case 4: + isUtf8 &= isTrailingOctet(input[++i]); + case 3: + isUtf8 &= isTrailingOctet(input[++i]); + case 2: + isUtf8 &= isTrailingOctet(input[++i]); + case 1: + isUtf8 &= isTrailingOctet(input[++i]); + hasHighs = true; + if (!isUtf8) { + break scan; + } + break; + } + } + if (hasHighs && isUtf8) { + return Utf8.getCharset().name(); + } else if (!hasHighs) { + return "US-ASCII"; + } else { + return "ISO-8859-1"; + } + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java new file mode 100644 index 00000000000..857964d5d35 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -0,0 +1,61 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.simple; + +import com.yahoo.collections.Tuple2; +import com.yahoo.component.Version; +import com.yahoo.language.Linguistics; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.process.CharacterClasses; +import com.yahoo.language.process.GramSplitter; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.Segmenter; +import com.yahoo.language.process.SegmenterImpl; +import com.yahoo.language.process.Stemmer; +import com.yahoo.language.process.StemmerImpl; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; + +/** + * Factory of pure Java linguistic processor implementations. + * + * @author bratseth + */ +public class SimpleLinguistics implements Linguistics { + + // Threadsafe instances + private final static Normalizer normalizer = new SimpleNormalizer(); + private final static Transformer transformer = new SimpleTransformer(); + private final static Detector detector = new SimpleDetector(); + private final static CharacterClasses characterClasses = new CharacterClasses(); + private final static GramSplitter gramSplitter = new GramSplitter(characterClasses); + + @Override + public Stemmer getStemmer() { return new StemmerImpl(getTokenizer()); } + + @Override + public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer); } + + @Override + public Normalizer getNormalizer() { return normalizer; } + + @Override + public Transformer getTransformer() { return transformer; } + + @Override + public Segmenter getSegmenter() { return new SegmenterImpl(getTokenizer()); } + + @Override + public Detector getDetector() { return detector; } + + @Override + public GramSplitter getGramSplitter() { return gramSplitter; } + + @Override + public CharacterClasses getCharacterClasses() { return characterClasses; } + + @Override + public Tuple2 getVersion(Component component) { + return new Tuple2<>("yahoo", new Version(1, 0)); + } + +} diff 
--git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java new file mode 100644 index 00000000000..bfc6f813452 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java @@ -0,0 +1,16 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.process.Normalizer; + +/** + * @author Simon Thoresen + */ +public class SimpleNormalizer implements Normalizer { + + @Override + public String normalize(String input) { + return java.text.Normalizer.normalize(input, java.text.Normalizer.Form.NFKC); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java new file mode 100644 index 00000000000..1cf707bf5be --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java @@ -0,0 +1,188 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenScript; +import com.yahoo.language.process.TokenType; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author Mathias Mølster Lidal + */ +public class SimpleToken implements Token { + + private final List components = new ArrayList<>(); + private final String orig; + private TokenType type = TokenType.UNKNOWN; + private TokenScript script = TokenScript.UNKNOWN; + private String tokenString = null; + private boolean specialToken = false; + private long offset = 0; + + public SimpleToken(String orig) { + this.orig = orig; + } + + @Override + public String getOrig() { + return orig; + } + + @Override + public int getNumStems() { + return tokenString != null ? 
1 : 0; + } + + @Override + public String getStem(int i) { + return tokenString; + } + + @Override + public int getNumComponents() { + return components.size(); + } + + @Override + public Token getComponent(int i) { + return components.get(i); + } + + public SimpleToken addComponent(Token token) { + components.add(token); + return this; + } + + @Override + public String getTokenString() { + return tokenString; + } + + public SimpleToken setTokenString(String str) { + tokenString = str; + return this; + } + + @Override + public TokenType getType() { + return type; + } + + public SimpleToken setType(TokenType type) { + this.type = type; + return this; + } + + @Override + public TokenScript getScript() { + return script; + } + + public SimpleToken setScript(TokenScript script) { + this.script = script; + return this; + } + + @Override + public boolean isSpecialToken() { + return specialToken; + } + + public SimpleToken setSpecialToken(boolean specialToken) { + this.specialToken = specialToken; + return this; + } + + @Override + public long getOffset() { + return offset; + } + + public SimpleToken setOffset(long offset) { + this.offset = offset; + return this; + } + + @Override + public int hashCode() { + return orig.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Token)) { + return false; + } + Token rhs = (Token)obj; + if (!getType().equals(rhs.getType())) { + return false; + } + if (!equalsOpt(getOrig(), rhs.getOrig())) { + return false; + } + if (getOffset() != rhs.getOffset()) { + return false; + } + if (!equalsOpt(getScript(), rhs.getScript())) { + return false; + } + if (!equalsOpt(getTokenString(), rhs.getTokenString())) { + return false; + } + if (isSpecialToken() != rhs.isSpecialToken()) { + return false; + } + if (getNumComponents() != rhs.getNumComponents()) { + return false; + } + for (int i = 0, len = getNumComponents(); i < len; ++i) { + if (!equalsOpt(getComponent(i), rhs.getComponent(i))) { + return false; + } 
+ } + return true; + } + + private static boolean equalsOpt(Object lhs, Object rhs) { + if (lhs == null || rhs == null) { + return lhs == rhs; + } + return lhs.equals(rhs); + } + + @Override + public String toString() { + return "token : " + getClass().getSimpleName() + " {\n" + toString(this, " ") + "}"; + } + + private static String toString(Token token, String indent) { + StringBuilder builder = new StringBuilder(); + builder.append(indent).append("components : {\n"); + for (int i = 0, len = token.getNumComponents(); i < len; ++i) { + Token comp = token.getComponent(i); + builder.append(indent).append(" [").append(i).append("] : ").append(comp.getClass().getSimpleName()); + builder.append(" {\n").append(toString(comp, indent + " ")); + builder.append(indent).append(" }\n"); + } + builder.append(indent).append("}\n"); + builder.append(indent).append("offset : ").append(token.getOffset()).append("\n"); + builder.append(indent).append("orig : ").append(quoteString(token.getOrig())).append("\n"); + builder.append(indent).append("script : ").append(token.getScript()).append("\n"); + builder.append(indent).append("special : ").append(token.isSpecialToken()).append("\n"); + builder.append(indent).append("token string : ").append(quoteString(token.getTokenString())).append("\n"); + builder.append(indent).append("type : ").append(token.getType()).append("\n"); + return builder.toString(); + } + + private static String quoteString(String str) { + return str != null ? "'" + str + "'" : null; + } + + @Override + public boolean isIndexable() { + return getType().isIndexable() && (getOrig().length() > 0); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java new file mode 100644 index 00000000000..9d1a6a5dbb8 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java @@ -0,0 +1,68 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.process.TokenType; + +/** + * @author arnej27959 + */ +class SimpleTokenType { + + public static TokenType valueOf(int codePoint) { + switch (Character.getType(codePoint)) { + case Character.NON_SPACING_MARK: + // "combining grave accent" + // and "DEVANAGARI VOWEL SIGN SHORT E" etc + // (letter-like) + case Character.COMBINING_SPACING_MARK: + // "DEVANAGARI VOWEL SIGN SHORT O" + // and similar (letter-like) + case Character.LETTER_NUMBER: + // "SMALL ROMAN NUMERAL SIX" etc (letter-like) + case Character.UPPERCASE_LETTER: + case Character.LOWERCASE_LETTER: + case Character.TITLECASE_LETTER: + case Character.MODIFIER_LETTER: + case Character.OTHER_LETTER: + return TokenType.ALPHABETIC; + + case Character.ENCLOSING_MARK: + // "enclosing circle" etc is symbol-like + case Character.MATH_SYMBOL: + case Character.CURRENCY_SYMBOL: + case Character.MODIFIER_SYMBOL: + case Character.OTHER_SYMBOL: + return TokenType.SYMBOL; + + case Character.OTHER_NUMBER: + // "SUPERSCRIPT TWO", + // "DINGBAT CIRCLED SANS-SERIF DIGIT THREE" + // and more numbers that should mostly normalize + // to digits + case Character.DECIMAL_DIGIT_NUMBER: + return TokenType.NUMERIC; + + case Character.SPACE_SEPARATOR: + case Character.LINE_SEPARATOR: + case Character.PARAGRAPH_SEPARATOR: + return TokenType.SPACE; + + case Character.DASH_PUNCTUATION: + case Character.START_PUNCTUATION: + case Character.END_PUNCTUATION: + case Character.CONNECTOR_PUNCTUATION: + case Character.OTHER_PUNCTUATION: + case Character.INITIAL_QUOTE_PUNCTUATION: + case Character.FINAL_QUOTE_PUNCTUATION: + return TokenType.PUNCTUATION; + + case Character.CONTROL: + case Character.FORMAT: + case Character.SURROGATE: + case Character.PRIVATE_USE: + case Character.UNASSIGNED: + return TokenType.UNKNOWN; + } + throw new 
UnsupportedOperationException(String.valueOf(Character.getType(codePoint))); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java new file mode 100644 index 00000000000..48a12c54e86 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -0,0 +1,76 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.Language; +import com.yahoo.language.LinguisticsCase; +import com.yahoo.language.process.*; +import com.yahoo.language.simple.kstem.KStemmer; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + *

A tokenizer which splits on whitespace, normalizes and transforms using the given implementations + * and stems using the kstem algorithm.

+ * + *

This is not multithread safe.

+ * + * @author Mathias Mølster Lidal + * @author bratseth + */ +public class SimpleTokenizer implements Tokenizer { + + private final static int SPACE_CODE = 32; + private final Normalizer normalizer; + private final Transformer transformer; + private final KStemmer stemmer = new KStemmer(); + + public SimpleTokenizer() { + this(new SimpleNormalizer(), new SimpleTransformer()); + } + + public SimpleTokenizer(Normalizer normalizer) { + this(normalizer, new SimpleTransformer()); + } + + public SimpleTokenizer(Normalizer normalizer, Transformer transformer) { + this.normalizer = normalizer; + this.transformer = transformer; + } + + @Override + public Iterable tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { + if (input.isEmpty()) return Collections.emptyList(); + + List tokens = new ArrayList<>(); + int nextCode = input.codePointAt(0); + TokenType prevType = SimpleTokenType.valueOf(nextCode); + for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) { + nextCode = next < input.length() ? 
input.codePointAt(next) : SPACE_CODE; + TokenType nextType = SimpleTokenType.valueOf(nextCode); + if (!prevType.isIndexable() || !nextType.isIndexable()) { + String original = input.substring(prev, next); + String token = processToken(original, language, stemMode, removeAccents); + tokens.add(new SimpleToken(original).setOffset(prev) + .setType(prevType) + .setTokenString(token)); + prev = next; + prevType = nextType; + } + next += Character.charCount(nextCode); + } + return tokens; + } + + private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { + token = normalizer.normalize(token); + token = LinguisticsCase.toLowerCase(token); + if (removeAccents) + token = transformer.accentDrop(token, language); + if (stemMode != StemMode.NONE) + token = stemmer.stem(token); + return token; + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java new file mode 100644 index 00000000000..409ef44986e --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java @@ -0,0 +1,25 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.Language; +import com.yahoo.language.process.Transformer; + +import java.text.Normalizer; +import java.util.regex.Pattern; + +/** + * Converts all accented characters into their de-accented counterparts followed by their combining diacritics, then + * strips off the diacritics using a regex. 
+ * + * @author Simon Thoresen + */ +public class SimpleTransformer implements Transformer { + + private final static Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); + + @Override + public String accentDrop(String input, Language language) { + return pattern.matcher(Normalizer.normalize(input, Normalizer.Form.NFD)).replaceAll(""); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java new file mode 100644 index 00000000000..355acf41525 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java @@ -0,0 +1,661 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed + * under the terms of the Apache License, Version 2.0. + */ +package com.yahoo.language.simple.kstem; + + +import java.util.Arrays; +import java.util.AbstractMap; +import java.util.AbstractSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +/** + * A simple class that stores key Strings as char[]'s in a + * hash table. Note that this is not a general purpose + * class. For example, it cannot remove items from the + * map, nor does it resize its hash table to be smaller, + * etc. It is designed to be quick to retrieve items + * by char[] keys without the necessity of converting + * to a String first. 
+ */ +public class CharArrayMap extends AbstractMap { + + private static final CharArrayMap EMPTY_MAP = new EmptyCharArrayMap<>(); + + private final static int INIT_SIZE = 8; + private final CharacterUtils charUtils; + private boolean ignoreCase; + private int count; + char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator + V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator + + /** + * Create map with enough capacity to hold startSize terms + * + * @param startSize + * the initial capacity + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + */ + @SuppressWarnings("unchecked") + public CharArrayMap(int startSize, boolean ignoreCase) { + this.ignoreCase = ignoreCase; + int size = INIT_SIZE; + while(startSize + (startSize>>2) > size) + size <<= 1; + keys = new char[size][]; + values = (V[]) new Object[size]; + this.charUtils = CharacterUtils.getInstance(); + } + + /** + * Creates a map from the mappings in another map. + * + * @param c + * a map whose mappings to be copied + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + */ + public CharArrayMap(Map c, boolean ignoreCase) { + this(c.size(), ignoreCase); + putAll(c); + } + + /** Create set from the supplied map (used internally for readonly maps...) */ + private CharArrayMap(CharArrayMap toCopy){ + this.keys = toCopy.keys; + this.values = toCopy.values; + this.ignoreCase = toCopy.ignoreCase; + this.count = toCopy.count; + this.charUtils = toCopy.charUtils; + } + + /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. 
*/ + @Override + public void clear() { + count = 0; + Arrays.fill(keys, null); + Arrays.fill(values, null); + } + + /** true if the len chars of text starting at off + * are in the {@link #keySet()} */ + public boolean containsKey(char[] text, int off, int len) { + return keys[getSlot(text, off, len)] != null; + } + + /** true if the CharSequence is in the {@link #keySet()} */ + public boolean containsKey(CharSequence cs) { + return keys[getSlot(cs)] != null; + } + + @Override + public boolean containsKey(Object o) { + if (o instanceof char[]) { + final char[] text = (char[])o; + return containsKey(text, 0, text.length); + } + return containsKey(o.toString()); + } + + /** returns the value of the mapping of len chars of text + * starting at off */ + public V get(char[] text, int off, int len) { + return values[getSlot(text, off, len)]; + } + + /** returns the value of the mapping of the chars inside this {@code CharSequence} */ + public V get(CharSequence cs) { + return values[getSlot(cs)]; + } + + @Override + public V get(Object o) { + if (o instanceof char[]) { + final char[] text = (char[])o; + return get(text, 0, text.length); + } + return get(o.toString()); + } + + private int getSlot(char[] text, int off, int len) { + int code = getHashCode(text, off, len); + int pos = code & (keys.length-1); + char[] text2 = keys[pos]; + if (text2 != null && !equals(text, off, len, text2)) { + final int inc = ((code>>8)+code)|1; + do { + code += inc; + pos = code & (keys.length-1); + text2 = keys[pos]; + } while (text2 != null && !equals(text, off, len, text2)); + } + return pos; + } + + /** Returns true if the String is in the set */ + private int getSlot(CharSequence text) { + int code = getHashCode(text); + int pos = code & (keys.length-1); + char[] text2 = keys[pos]; + if (text2 != null && !equals(text, text2)) { + final int inc = ((code>>8)+code)|1; + do { + code += inc; + pos = code & (keys.length-1); + text2 = keys[pos]; + } while (text2 != null && !equals(text, 
text2)); + } + return pos; + } + + /** Add the given mapping. */ + public V put(CharSequence text, V value) { + return put(text.toString(), value); // could be more efficient + } + + @Override + public V put(Object o, V value) { + if (o instanceof char[]) { + return put((char[])o, value); + } + return put(o.toString(), value); + } + + /** Add the given mapping. */ + public V put(String text, V value) { + return put(text.toCharArray(), value); + } + + /** Add the given mapping. + * If ignoreCase is true for this Set, the text array will be directly modified. + * The user should never modify this text array after calling this method. + */ + public V put(char[] text, V value) { + if (ignoreCase) { + charUtils.toLowerCase(text, 0, text.length); + } + int slot = getSlot(text, 0, text.length); + if (keys[slot] != null) { + final V oldValue = values[slot]; + values[slot] = value; + return oldValue; + } + keys[slot] = text; + values[slot] = value; + count++; + + if (count + (count>>2) > keys.length) { + rehash(); + } + + return null; + } + + @SuppressWarnings("unchecked") + private void rehash() { + assert keys.length == values.length; + final int newSize = 2*keys.length; + final char[][] oldkeys = keys; + final V[] oldvalues = values; + keys = new char[newSize][]; + values = (V[]) new Object[newSize]; + + for(int i=0; i entry : entrySet()) { + if (sb.length()>1) sb.append(", "); + sb.append(entry); + } + return sb.append('}').toString(); + } + + private EntrySet entrySet = null; + private CharArraySet keySet = null; + + EntrySet createEntrySet() { + return new EntrySet(true); + } + + @Override + public final EntrySet entrySet() { + if (entrySet == null) { + entrySet = createEntrySet(); + } + return entrySet; + } + + // helper for CharArraySet to not produce endless recursion + final Set originalKeySet() { + return super.keySet(); + } + + /** Returns an {@link CharArraySet} view on the map's keys. + * The set will use the same {@code matchVersion} as this map. 
*/ + @Override @SuppressWarnings({"unchecked","rawtypes"}) + public final CharArraySet keySet() { + if (keySet == null) { + // prevent adding of entries + keySet = new CharArraySet((CharArrayMap) this) { + @Override + public boolean add(Object o) { + throw new UnsupportedOperationException(); + } + @Override + public boolean add(CharSequence text) { + throw new UnsupportedOperationException(); + } + @Override + public boolean add(String text) { + throw new UnsupportedOperationException(); + } + @Override + public boolean add(char[] text) { + throw new UnsupportedOperationException(); + } + }; + } + return keySet; + } + + /** public iterator class so efficient methods are exposed to users */ + public class EntryIterator implements Iterator> { + private int pos=-1; + private int lastPos; + private final boolean allowModify; + + private EntryIterator(boolean allowModify) { + this.allowModify = allowModify; + goNext(); + } + + private void goNext() { + lastPos = pos; + pos++; + while (pos < keys.length && keys[pos] == null) pos++; + } + + @Override + public boolean hasNext() { + return pos < keys.length; + } + + /** gets the next key... do not modify the returned char[] */ + public char[] nextKey() { + goNext(); + return keys[lastPos]; + } + + /** gets the next key as a newly created String object */ + public String nextKeyString() { + return new String(nextKey()); + } + + /** returns the value associated with the last key returned */ + public V currentValue() { + return values[lastPos]; + } + + /** sets the value associated with the last key returned */ + public V setValue(V value) { + if (!allowModify) + throw new UnsupportedOperationException(); + V old = values[lastPos]; + values[lastPos] = value; + return old; + } + + /** use nextCharArray() + currentValue() for better efficiency. 
*/ + @Override + public Map.Entry next() { + goNext(); + return new MapEntry(lastPos, allowModify); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + + private final class MapEntry implements Map.Entry { + private final int pos; + private final boolean allowModify; + + private MapEntry(int pos, boolean allowModify) { + this.pos = pos; + this.allowModify = allowModify; + } + + @Override + public Object getKey() { + // we must clone here, as putAll to another CharArrayMap + // with other case sensitivity flag would corrupt the keys + return keys[pos].clone(); + } + + @Override + public V getValue() { + return values[pos]; + } + + @Override + public V setValue(V value) { + if (!allowModify) + throw new UnsupportedOperationException(); + final V old = values[pos]; + values[pos] = value; + return old; + } + + @Override + public String toString() { + return new StringBuilder().append(keys[pos]).append('=') + .append((values[pos] == CharArrayMap.this) ? "(this Map)" : values[pos]) + .toString(); + } + } + + /** public EntrySet class so efficient methods are exposed to users */ + public final class EntrySet extends AbstractSet> { + private final boolean allowModify; + + private EntrySet(boolean allowModify) { + this.allowModify = allowModify; + } + + @Override + public EntryIterator iterator() { + return new EntryIterator(allowModify); + } + + @Override + @SuppressWarnings("unchecked") + public boolean contains(Object o) { + if (!(o instanceof Map.Entry)) + return false; + final Map.Entry e = (Map.Entry)o; + final Object key = e.getKey(); + final Object val = e.getValue(); + final Object v = get(key); + return v == null ? 
val == null : v.equals(val); + } + + @Override + public boolean remove(Object o) { + throw new UnsupportedOperationException(); + } + + @Override + public int size() { + return count; + } + + @Override + public void clear() { + if (!allowModify) + throw new UnsupportedOperationException(); + CharArrayMap.this.clear(); + } + } + + /** + * Returns an unmodifiable {@link CharArrayMap}. This allows to provide + * unmodifiable views of internal map for "read-only" use. + * + * @param map + * a map for which the unmodifiable map is returned. + * @return an new unmodifiable {@link CharArrayMap}. + * @throws NullPointerException + * if the given map is null. + */ + public static CharArrayMap unmodifiableMap(CharArrayMap map) { + if (map == null) + throw new NullPointerException("Given map is null"); + if (map == emptyMap() || map.isEmpty()) + return emptyMap(); + if (map instanceof UnmodifiableCharArrayMap) + return map; + return new UnmodifiableCharArrayMap<>(map); + } + + /** + * Returns a copy of the given map as a {@link CharArrayMap}. If the given map + * is a {@link CharArrayMap} the ignoreCase property will be preserved. + * + * @param map + * a map to copy + * @return a copy of the given map as a {@link CharArrayMap}. If the given map + * is a {@link CharArrayMap} the ignoreCase property as well as the + * matchVersion will be of the given map will be preserved. 
+ */ + @SuppressWarnings("unchecked") + public static CharArrayMap copy(final Map map) { + if(map == EMPTY_MAP) + return emptyMap(); + if(map instanceof CharArrayMap) { + CharArrayMap m = (CharArrayMap) map; + // use fast path instead of iterating all values + // this is even on very small sets ~10 times faster than iterating + final char[][] keys = new char[m.keys.length][]; + System.arraycopy(m.keys, 0, keys, 0, keys.length); + final V[] values = (V[]) new Object[m.values.length]; + System.arraycopy(m.values, 0, values, 0, values.length); + m = new CharArrayMap<>(m); + m.keys = keys; + m.values = values; + return m; + } + // In jdk-9b54 or later, a plain diamond causes compile error with "-source 1.7": + return new CharArrayMap(map, false); + } + + /** Returns an empty, unmodifiable map. */ + @SuppressWarnings("unchecked") + public static CharArrayMap emptyMap() { + return (CharArrayMap) EMPTY_MAP; + } + + // package private CharArraySet instanceof check in CharArraySet + static class UnmodifiableCharArrayMap extends CharArrayMap { + + UnmodifiableCharArrayMap(CharArrayMap map) { + super(map); + } + + @Override + public void clear() { + throw new UnsupportedOperationException(); + } + + @Override + public V put(Object o, V val){ + throw new UnsupportedOperationException(); + } + + @Override + public V put(char[] text, V val) { + throw new UnsupportedOperationException(); + } + + @Override + public V put(CharSequence text, V val) { + throw new UnsupportedOperationException(); + } + + @Override + public V put(String text, V val) { + throw new UnsupportedOperationException(); + } + + @Override + public V remove(Object key) { + throw new UnsupportedOperationException(); + } + + @Override + EntrySet createEntrySet() { + return new EntrySet(false); + } + } + + /** + * Empty array map optimized for speed. + * Contains checks will always return false or throw + * NPE if necessary. 
+ */ + private static final class EmptyCharArrayMap extends UnmodifiableCharArrayMap { + EmptyCharArrayMap() { + super(new CharArrayMap(0, false)); + } + + @Override + public boolean containsKey(char[] text, int off, int len) { + if(text == null) + throw new NullPointerException(); + return false; + } + + @Override + public boolean containsKey(CharSequence cs) { + if(cs == null) + throw new NullPointerException(); + return false; + } + + @Override + public boolean containsKey(Object o) { + if(o == null) + throw new NullPointerException(); + return false; + } + + @Override + public V get(char[] text, int off, int len) { + if(text == null) + throw new NullPointerException(); + return null; + } + + @Override + public V get(CharSequence cs) { + if(cs == null) + throw new NullPointerException(); + return null; + } + + @Override + public V get(Object o) { + if(o == null) + throw new NullPointerException(); + return null; + } + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java new file mode 100644 index 00000000000..df7dc32070b --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java @@ -0,0 +1,184 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed + * under the terms of the Apache License, Version 2.0. + */ +package com.yahoo.language.simple.kstem; + + +import java.util.AbstractSet; +import java.util.Collection; +import java.util.Iterator; +import java.util.Set; + +/** + * A simple class that stores Strings as char[]'s in a + * hash table. Note that this is not a general purpose + * class. For example, it cannot remove items from the + * set, nor does it resize its hash table to be smaller, + * etc. 
It is designed to be quick to test if a char[] + * is in the set without the necessity of converting it + * to a String first. + * + *

+ * Please note: This class implements {@link java.util.Set Set} but + * does not behave like it should in all cases. The generic type is + * {@code Set}, because you can add any object to it, + * that has a string representation. The add methods will use + * {@link Object#toString} and store the result using a {@code char[]} + * buffer. The same behavior have the {@code contains()} methods. + * The {@link #iterator()} returns an {@code Iterator}. + */ +public class CharArraySet extends AbstractSet { + + public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.emptyMap()); + private static final Object PLACEHOLDER = new Object(); + + private final CharArrayMap map; + + /** + * Create set with enough capacity to hold startSize terms + * + * @param startSize + * the initial capacity + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + */ + public CharArraySet(int startSize, boolean ignoreCase) { + this(new CharArrayMap<>(startSize, ignoreCase)); + } + + /** + * Creates a set from a Collection of objects. + * + * @param c + * a collection whose elements to be placed into the set + * @param ignoreCase + * false if and only if the set should be case sensitive + * otherwise true. + */ + public CharArraySet(Collection c, boolean ignoreCase) { + this(c.size(), ignoreCase); + addAll(c); + } + + /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */ + CharArraySet(final CharArrayMap map){ + this.map = map; + } + + /** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. 
*/ + @Override + public void clear() { + map.clear(); + } + + /** true if the len chars of text starting at off + * are in the set */ + public boolean contains(char[] text, int off, int len) { + return map.containsKey(text, off, len); + } + + /** true if the CharSequence is in the set */ + public boolean contains(CharSequence cs) { + return map.containsKey(cs); + } + + @Override + public boolean contains(Object o) { + return map.containsKey(o); + } + + @Override + public boolean add(Object o) { + return map.put(o, PLACEHOLDER) == null; + } + + /** Add this CharSequence into the set */ + public boolean add(CharSequence text) { + return map.put(text, PLACEHOLDER) == null; + } + + /** Add this String into the set */ + public boolean add(String text) { + return map.put(text, PLACEHOLDER) == null; + } + + /** Add this char[] directly to the set. + * If ignoreCase is true for this Set, the text array will be directly modified. + * The user should never modify this text array after calling this method. + */ + public boolean add(char[] text) { + return map.put(text, PLACEHOLDER) == null; + } + + @Override + public int size() { + return map.size(); + } + + /** + * Returns an unmodifiable {@link CharArraySet}. This allows to provide + * unmodifiable views of internal sets for "read-only" use. + * + * @param set + * a set for which the unmodifiable set is returned. + * @return an new unmodifiable {@link CharArraySet}. + * @throws NullPointerException + * if the given set is null. + */ + public static CharArraySet unmodifiableSet(CharArraySet set) { + if (set == null) + throw new NullPointerException("Given set is null"); + if (set == EMPTY_SET) + return EMPTY_SET; + if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap) + return set; + return new CharArraySet(CharArrayMap.unmodifiableMap(set.map)); + } + + /** + * Returns a copy of the given set as a {@link CharArraySet}. If the given set + * is a {@link CharArraySet} the ignoreCase property will be preserved. 
+ * + * @param set + * a set to copy + * @return a copy of the given set as a {@link CharArraySet}. If the given set + * is a {@link CharArraySet} the ignoreCase property as well as the + * matchVersion will be of the given set will be preserved. + */ + public static CharArraySet copy(final Set set) { + if(set == EMPTY_SET) + return EMPTY_SET; + if(set instanceof CharArraySet) { + final CharArraySet source = (CharArraySet) set; + return new CharArraySet(CharArrayMap.copy(source.map)); + } + return new CharArraySet(set, false); + } + + /** + * Returns an {@link Iterator} for {@code char[]} instances in this set. + */ + @Override @SuppressWarnings("unchecked") + public Iterator iterator() { + // use the AbstractSet#keySet()'s iterator (to not produce endless recursion) + return map.originalKeySet().iterator(); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("["); + for (Object item : this) { + if (sb.length()>1) sb.append(", "); + if (item instanceof char[]) { + sb.append((char[]) item); + } else { + sb.append(item); + } + } + return sb.append(']').toString(); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java new file mode 100644 index 00000000000..91bd6286b28 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java @@ -0,0 +1,375 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed + * under the terms of the Apache License, Version 2.0. 
+ */ +package com.yahoo.language.simple.kstem; + + +import java.io.IOException; +import java.io.Reader; + +/** + * {@link CharacterUtils} provides a unified interface to Character-related + * operations to implement backwards compatible character operations. + */ +public abstract class CharacterUtils { + + private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils(); + private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils(); + + /** + * Returns a {@link CharacterUtils} implementation. + */ + public static CharacterUtils getInstance() { + return JAVA_5; + } + + /** + * explicitly returns a version matching java 4 semantics + * @deprecated Only for n-gram backwards compat + */ + @Deprecated + public static CharacterUtils getJava4Instance() { + return JAVA_4; + } + + /** + * Returns the code point at the given index of the {@link CharSequence}. + * + * @param seq + * a character sequence + * @param offset + * the offset to the char values in the chars array to be converted + * + * @return the Unicode code point at the given index + * @throws NullPointerException + * - if the sequence is null. + * @throws IndexOutOfBoundsException + * - if the value offset is negative or not less than the length of + * the character sequence. + */ + public abstract int codePointAt(final CharSequence seq, final int offset); + + /** + * Returns the code point at the given index of the char array where only elements + * with index less than the limit are used. + * + * @param chars + * a character array + * @param offset + * the offset to the char values in the chars array to be converted + * @param limit the index afer the last element that should be used to calculate + * codepoint. + * + * @return the Unicode code point at the given index + * @throws NullPointerException + * - if the array is null. + * @throws IndexOutOfBoundsException + * - if the value offset is negative or not less than the length of + * the char array. 
+ */ + public abstract int codePointAt(final char[] chars, final int offset, final int limit); + + /** Return the number of characters in seq. */ + public abstract int codePointCount(CharSequence seq); + + /** + * Creates a new {@link CharacterBuffer} and allocates a char[] + * of the given bufferSize. + * + * @param bufferSize + * the internal char buffer size, must be >= 2 + * @return a new {@link CharacterBuffer} instance. + */ + public static CharacterBuffer newCharacterBuffer(final int bufferSize) { + if (bufferSize < 2) { + throw new IllegalArgumentException("buffersize must be >= 2"); + } + return new CharacterBuffer(new char[bufferSize], 0, 0); + } + + + /** + * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting + * at the given offset. + * @param buffer the char buffer to lowercase + * @param offset the offset to start at + * @param limit the max char in the buffer to lower case + */ + public final void toLowerCase(final char[] buffer, final int offset, final int limit) { + assert buffer.length >= limit; + assert offset <=0 && offset <= buffer.length; + for (int i = offset; i < limit;) { + i += Character.toChars( + Character.toLowerCase( + codePointAt(buffer, i, limit)), buffer, i); + } + } + + /** + * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting + * at the given offset. + * @param buffer the char buffer to UPPERCASE + * @param offset the offset to start at + * @param limit the max char in the buffer to lower case + */ + public final void toUpperCase(final char[] buffer, final int offset, final int limit) { + assert buffer.length >= limit; + assert offset <=0 && offset <= buffer.length; + for (int i = offset; i < limit;) { + i += Character.toChars( + Character.toUpperCase( + codePointAt(buffer, i, limit)), buffer, i); + } + } + + /** Converts a sequence of Java characters to a sequence of unicode code points. 
+ * @return the number of code points written to the destination buffer */ + public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) { + if (srcLen < 0) { + throw new IllegalArgumentException("srcLen must be >= 0"); + } + int codePointCount = 0; + for (int i = 0; i < srcLen; ) { + final int cp = codePointAt(src, srcOff + i, srcOff + srcLen); + final int charCount = Character.charCount(cp); + dest[destOff + codePointCount++] = cp; + i += charCount; + } + return codePointCount; + } + + /** Converts a sequence of unicode code points to a sequence of Java characters. + * @return the number of chars written to the destination buffer */ + public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) { + if (srcLen < 0) { + throw new IllegalArgumentException("srcLen must be >= 0"); + } + int written = 0; + for (int i = 0; i < srcLen; ++i) { + written += Character.toChars(src[srcOff + i], dest, destOff + written); + } + return written; + } + + /** + * Fills the {@link CharacterBuffer} with characters read from the given + * reader {@link Reader}. This method tries to read numChars + * characters into the {@link CharacterBuffer}, each call to fill will start + * filling the buffer from offset 0 up to numChars. + * In case code points can span across 2 java characters, this method may + * only fill numChars - 1 characters in order not to split in + * the middle of a surrogate pair, even if there are remaining characters in + * the {@link Reader}. + *

+ * This method guarantees + * that the given {@link CharacterBuffer} will never contain a high surrogate + * character as the last element in the buffer unless it is the last available + * character in the reader. In other words, high and low surrogate pairs will + * always be preserved across buffer boarders. + *

+ *

+ * A return value of false means that this method call exhausted + * the reader, but there may be some bytes which have been read, which can be + * verified by checking whether buffer.getLength() > 0. + *

+   *
+   * @param buffer
+   *          the buffer to fill.
+   * @param reader
+   *          the reader to read characters from.
+   * @param numChars
+   *          the number of chars to read
+   * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
+   * @throws IOException
+   *           if the reader throws an {@link IOException}.
+   */
+  public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
+
+  /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
+  public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+    return fill(buffer, reader, buffer.buffer.length);
+  }
+
+  /** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
+   *  code points from <code>index</code>. */
+  public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
+
+  static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException { // reads until len chars are read or the reader reports end of stream; returns the number of chars read
+    int read = 0;
+    while (read < len) {
+      final int r = reader.read(dest, offset + read, len - read);
+      if (r == -1) { // end of stream: return what was read so far
+        break;
+      }
+      read += r;
+    }
+    return read;
+  }
+
+  private static final class Java5CharacterUtils extends CharacterUtils {
+    Java5CharacterUtils() {
+    }
+
+    @Override
+    public int codePointAt(final CharSequence seq, final int offset) {
+      return Character.codePointAt(seq, offset);
+    }
+
+    @Override
+    public int codePointAt(final char[] chars, final int offset, final int limit) {
+     return Character.codePointAt(chars, offset, limit);
+    }
+
+    @Override
+    public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
+      assert buffer.buffer.length >= 2;
+      if (numChars < 2 || numChars > buffer.buffer.length) {
+        throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+      }
+      final char[] charBuffer = buffer.buffer;
+      buffer.offset = 0;
+      final int offset; // 1 when a high surrogate carried over from the previous fill occupies slot 0, else 0
+
+      // Install the previously saved ending high surrogate:
+      if (buffer.lastTrailingHighSurrogate != 0) {
+        charBuffer[0] = buffer.lastTrailingHighSurrogate;
+        buffer.lastTrailingHighSurrogate = 0;
+        offset = 1;
+      } else {
+        offset = 0;
+      }
+
+      final int read = readFully(reader, charBuffer, offset, numChars - offset);
+
+      buffer.length = offset + read;
+      final boolean result = buffer.length == numChars; // true only when all numChars slots were filled
+      if (buffer.length < numChars) {
+        // We failed to fill the buffer. Even if the last char is a high
+        // surrogate, there is nothing we can do
+        return result;
+      }
+
+      if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+        buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length]; // hold the high surrogate back for the next fill and shrink the reported length by one
+      }
+      return result;
+    }
+
+    @Override
+    public int codePointCount(CharSequence seq) {
+      return Character.codePointCount(seq, 0, seq.length());
+    }
+
+    @Override
+    public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+      return Character.offsetByCodePoints(buf, start, count, index, offset);
+    }
+  }
+
+  private static final class Java4CharacterUtils extends CharacterUtils {
+    Java4CharacterUtils() {
+    }
+
+    @Override
+    public int codePointAt(final CharSequence seq, final int offset) {
+      return seq.charAt(offset); // java 4 semantics: every char counts as one code point, surrogates are not combined
+    }
+
+    @Override
+    public int codePointAt(final char[] chars, final int offset, final int limit) {
+      if(offset >= limit)
+        throw new IndexOutOfBoundsException("offset must be less than limit");
+      return chars[offset];
+    }
+
+    @Override
+    public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
+        throws IOException {
+      assert buffer.buffer.length >= 1;
+      if (numChars < 1 || numChars > buffer.buffer.length) {
+        throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
+      }
+      buffer.offset = 0;
+      final int read = readFully(reader, buffer.buffer, 0, numChars);
+      buffer.length = read;
+      buffer.lastTrailingHighSurrogate = 0; // no surrogate carry-over in this implementation
+      return read == numChars;
+    }
+
+    @Override
+    public int codePointCount(CharSequence seq) {
+      return seq.length();
+    }
+
+    @Override
+    public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+      final int result = index + offset;
+      if (result < 0 || result > count) { // NOTE(review): the bound uses count only and ignores start - confirm this is intended
+        throw new IndexOutOfBoundsException();
+      }
+      return result;
+    }
+
+  }
+
+  /**
+   * A simple IO buffer to use with
+   * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
+   */
+  public static final class CharacterBuffer {
+
+    private final char[] buffer;
+    private int offset;
+    private int length;
+    // NOTE: not private so outer class can access without
+    // $access methods:
+    char lastTrailingHighSurrogate;
+
+    CharacterBuffer(char[] buffer, int offset, int length) {
+      this.buffer = buffer;
+      this.offset = offset;
+      this.length = length;
+    }
+
+    /**
+     * Returns the internal buffer (the array itself, not a copy)
+     *
+     * @return the buffer
+     */
+    public char[] getBuffer() {
+      return buffer;
+    }
+
+    /**
+     * Returns the data offset in the internal buffer.
+     *
+     * @return the offset
+     */
+    public int getOffset() {
+      return offset;
+    }
+
+    /**
+     * Return the length of the data in the internal buffer starting at
+     * {@link #getOffset()}
+     *
+     * @return the length
+     */
+    public int getLength() {
+      return length;
+    }
+
+    /**
+     * Resets the CharacterBuffer. All internals are reset to their default
+     * values; the contents of the char array are left untouched.
+     */
+    public void reset() {
+      offset = 0;
+      length = 0;
+      lastTrailingHighSurrogate = 0;
+    }
+  }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java
new file mode 100644
index 00000000000..abdde0d619b
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java
@@ -0,0 +1,716 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/* + * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. + */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData1 { + private KStemData1() { + } +// KStemData1 ... KStemData8 are created from "head_word_list.txt" + static String[] data = { +"aback","abacus","abandon","abandoned","abase", +"abash","abate","abattoir","abbess","abbey", +"abbot","abbreviate","abbreviation","abc","abdicate", +"abdomen","abduct","abed","aberrant","aberration", +"abet","abeyance","abhor","abhorrent","abide", +"abiding","abilities","ability","abject","abjure", +"ablative","ablaut","ablaze","able","ablution", +"ablutions","ably","abnegation","abnormal","abo", +"aboard","abode","abolish","abolition","abominable", +"abominate","abomination","aboriginal","aborigine","abort", +"abortion","abortionist","abortive","abound","about", +"above","aboveboard","abracadabra","abrade","abrasion", +"abrasive","abreast","abridge","abridgement","abridgment", +"abroad","abrogate","abrupt","abscess","abscond", +"absence","absent","absentee","absenteeism","absently", +"absinth","absinthe","absolute","absolutely","absolution", +"absolutism","absolve","absorb","absorbent","absorbing", +"absorption","abstain","abstemious","abstention","abstinence", +"abstract","abstracted","abstraction","abstruse","absurd", +"abundance","abundant","abuse","abusive","abut", +"abutment","abysmal","abyss","acacia","academic", +"academician","academy","accede","accelerate","acceleration", +"accelerator","accent","accentuate","accept","acceptable", +"acceptance","access","accessible","accession","accessory", +"accidence","accident","accidental","acclaim","acclamation", +"acclimatize","acclivity","accolade","accommodate","accommodating", +"accommodation","accommodations","accompaniment","accompanist","accompany", 
+"accomplice","accomplish","accomplished","accomplishment","accord", +"accordance","according","accordingly","accordion","accost", +"account","accountable","accountancy","accountant","accoutrements", +"accredit","accretion","accrue","accumulate","accumulation", +"accumulative","accumulator","accuracy","accurate","accursed", +"accusation","accusative","accuse","accused","accustom", +"accustomed","ace","acerbity","acetate","acetic", +"acetylene","ache","achieve","achievement","achoo", +"acid","acidify","acidity","acidulated","acidulous", +"acknowledge","acknowledgement","acknowledgment","acme","acne", +"acolyte","aconite","acorn","acoustic","acoustics", +"acquaint","acquaintance","acquaintanceship","acquiesce","acquiescent", +"acquire","acquisition","acquisitive","acquit","acquittal", +"acre","acreage","acrid","acrimony","acrobat", +"acrobatic","acrobatics","acronym","across","acrostic", +"act","acting","actinism","action","actionable", +"activate","active","activist","activity","actor", +"actress","acts","actual","actuality","actually", +"actuary","actuate","acuity","acumen","acupuncture", +"acute","adage","adagio","adam","adamant", +"adapt","adaptable","adaptation","adapter","adaptor", +"adc","add","addendum","adder","addict", +"addiction","addictive","addition","additional","additive", +"addle","address","addressee","adduce","adenoidal", +"adenoids","adept","adequate","adhere","adherence", +"adherent","adhesion","adhesive","adieu","adipose", +"adj","adjacent","adjective","adjoin","adjourn", +"adjudge","adjudicate","adjunct","adjure","adjust", +"adjutant","adman","admass","administer","administration", +"administrative","administrator","admirable","admiral","admiralty", +"admiration","admire","admirer","admissible","admission", +"admit","admittance","admitted","admittedly","admixture", +"admonish","admonition","admonitory","ado","adobe", +"adolescent","adopt","adoption","adoptive","adorable", +"adoration","adore","adorn","adornment","adrenalin", 
+"adrift","adroit","adulate","adulation","adult", +"adulterate","adulterer","adultery","adumbrate","adv", +"advance","advanced","advancement","advances","advantage", +"advantageous","advent","adventist","adventitious","adventure", +"adventurer","adventuress","adventurous","adverb","adverbial", +"adversary","adverse","adversity","advert","advertise", +"advertisement","advertising","advice","advisable","advise", +"advisedly","adviser","advisor","advisory","advocacy", +"advocate","adz","adze","aegis","aeon", +"aerate","aerial","aerie","aerobatic","aerobatics", +"aerodrome","aerodynamic","aerodynamics","aeronautics","aeroplane", +"aerosol","aerospace","aertex","aery","aesthete", +"aesthetic","aesthetics","aether","aethereal","aetiology", +"afar","affable","affair","affect","affectation", +"affected","affecting","affection","affectionate","affiance", +"affidavit","affiliate","affiliation","affinity","affirm", +"affirmative","affix","afflict","affliction","affluent", +"afford","afforest","affray","affricate","affront", +"aficionado","afield","afire","aflame","afloat", +"afoot","aforesaid","aforethought","afraid","afresh", +"afrikaans","afrikaner","afro","aft","after", +"afterbirth","aftercare","aftereffect","afterglow","afterlife", +"aftermath","afternoon","afternoons","afters","aftershave", +"aftertaste","afterthought","afterwards","again","against", +"agape","agate","age","ageing","ageless", +"agency","agenda","agent","agglomerate","agglutination", +"agglutinative","aggrandisement","aggrandizement","aggravate","aggravation", +"aggregate","aggregation","aggression","aggressive","aggressor", +"aggrieved","aggro","aghast","agile","agitate", +"agitation","agitator","aglow","agnostic","ago", +"agog","agonise","agonised","agonising","agonize", +"agonized","agonizing","agony","agoraphobia","agoraphobic", +"agrarian","agree","agreeable","agreeably","agreement", +"agriculture","agronomy","aground","ague","aha", +"ahead","ahem","ahoy","aid","ail", 
+"aileron","ailment","aim","aimless","air", +"airbase","airbed","airbladder","airborne","airbrake", +"airbrick","airbus","aircraft","aircraftman","aircrew", +"aircushion","airdrop","airedale","airfield","airflow", +"airforce","airgun","airhole","airhostess","airily", +"airing","airlane","airless","airletter","airlift", +"airline","airliner","airlock","airmail","airman", +"airplane","airpocket","airport","airs","airshaft", +"airship","airsick","airspace","airspeed","airstrip", +"airtight","airway","airwoman","airworthy","airy", +"aisle","aitch","ajar","akimbo","akin", +"alabaster","alack","alacrity","alarm","alarmist", +"alas","albatross","albeit","albino","album", +"albumen","alchemist","alchemy","alcohol","alcoholic", +"alcoholism","alcove","alder","alderman","ale", +"alehouse","alert","alfalfa","alfresco","algae", +"algebra","algorithm","alias","alibi","alien", +"alienate","alienation","alienist","alight","align", +"alignment","alike","alimentary","alimony","aline", +"alinement","alive","alkali","alkaline","all", +"allah","allay","allegation","allege","allegedly", +"allegiance","allegorical","allegory","allegretto","allegro", +"alleluia","allergic","allergy","alleviate","alley", +"alleyway","alliance","allied","alligator","alliteration", +"alliterative","allocate","allocation","allopathy","allot", +"allotment","allow","allowable","allowance","alloy", +"allspice","allude","allure","allurement","allusion", +"alluvial","alluvium","ally","almanac","almanack", +"almighty","almond","almoner","almost","alms", +"aloe","aloft","alone","along","alongside", +"aloof","alopecia","aloud","alpaca","alpenhorn", +"alpenstock","alpha","alphabet","alphabetical","alpine", +"already","alright","alsatian","also","altar", +"altarpiece","alter","alteration","altercation","alternate", +"alternative","alternator","although","altimeter","altitude", +"alto","altogether","altruism","altruist","alum", +"aluminium","alumna","alumnus","alveolar","always", 
+"alyssum","amalgam","amalgamate","amanuensis","amass", +"amateur","amateurish","amatory","amaze","amazing", +"amazon","ambassador","ambassadorial","amber","ambergris", +"ambidextrous","ambience","ambient","ambiguous","ambit", +"ambition","ambitious","ambivalent","amble","ambrosia", +"ambulance","ambush","ame","ameba","ameliorate", +"amen","amenable","amend","amendment","amends", +"amenity","americanise","americanism","americanize","amethyst", +"amiable","amicable","amid","amidships","amir", +"amiss","amity","ammeter","ammo","ammonia", +"ammonite","ammunition","amnesia","amnesty","amoeba", +"amoebic","amok","among","amoral","amorous", +"amorphous","amortise","amortize","amount","amour", +"amp","amperage","ampersand","amphetamine","amphibian", +"amphibious","amphitheater","amphitheatre","amphora","ample", +"amplifier","amplify","amplitude","ampoule","amputate", +"amputee","amuck","amulet","amuse","amusement", +"anachronism","anaconda","anaemia","anaemic","anaesthesia", +"anaesthetic","anaesthetist","anagram","anal","analgesia", +"analgesic","analog","analogize","analogous","analogue", +"analogy","analyse","analysis","analyst","analytic", +"anapaest","anarchic","anarchism","anarchist","anarchy", +"anathema","anathematize","anatomical","anatomist","anatomy", +"ancestor","ancestral","ancestry","anchor","anchorage", +"anchorite","anchovy","ancient","ancients","ancillary", +"and","andante","andiron","androgynous","anecdotal", +"anecdote","anemia","anemometer","anemone","anesthesia", +"anesthetise","anesthetize","anew","angel","angelica", +"angelus","anger","angle","anglican","anglicise", +"anglicism","anglicize","angling","anglophile","anglophilia", +"anglophobe","anglophobia","angora","angostura","angry", +"angst","anguish","anguished","angular","aniline", +"animadversion","animadvert","animal","animalcule","animalism", +"animate","animation","animism","animosity","animus", +"anis","anise","aniseed","ankle","anklet", +"annals","anneal","annex","annexation","annexe", 
+"annihilate","anniversary","annotate","annotation","announce", +"announcement","announcer","annoy","annoyance","annual", +"annuity","annul","annular","annunciation","anode", +"anodyne","anoint","anomalous","anomaly","anon", +"anonymity","anonymous","anopheles","anorak","anorexia", +"another","answer","answerable","ant","antacid", +"antagonism","antagonist","antagonize","antarctic","ante", +"anteater","antecedence","antecedent","antecedents","antechamber", +"antedate","antediluvian","antelope","antenatal","antenna", +"antepenultimate","anterior","anteroom","anthem","anther", +"anthill","anthology","anthracite","anthrax","anthropocentric", +"anthropoid","anthropologist","anthropology","anthropomorphic","anthropomorphism", +"anthropophagous","anthropophagy","antiaircraft","antibiotic","antibody", +"antic","anticipate","anticipation","anticipatory","anticlerical", +"anticlimax","anticlockwise","antics","anticyclone","antidote", +"antifreeze","antigen","antihero","antihistamine","antiknock", +"antilogarithm","antimacassar","antimatter","antimony","antipathetic", +"antipathy","antipersonnel","antipodal","antipodes","antiquarian", +"antiquary","antiquated","antique","antiquity","antirrhinum", +"antiseptic","antisocial","antithesis","antithetic","antitoxin", +"antler","antonym","anus","anvil","anxiety", +"anxious","any","anybody","anyhow","anyplace", +"anyroad","anything","anyway","anywhere","aorta", +"apace","apanage","apart","apartheid","apartment", +"apartments","apathetic","apathy","ape","aperient", +"aperitif","aperture","apex","aphasia","aphasic", +"aphid","aphorism","aphoristic","aphrodisiac","apiarist", +"apiary","apices","apiculture","apiece","apish", +"aplomb","apocalypse","apocalyptic","apocrypha","apocryphal", +"apogee","apologetic","apologetics","apologia","apologise", +"apologist","apologize","apology","apophthegm","apoplectic", +"apoplexy","apostasy","apostate","apostatise","apostatize", +"apostle","apostolic","apostrophe","apostrophize","apothecary", 
+"apothegm","apotheosis","appal","appall","appalling", +"appanage","apparatus","apparel","apparent","apparently", +"apparition","appeal","appealing","appear","appearance", +"appearances","appease","appeasement","appellant","appellate", +"appellation","append","appendage","appendectomy","appendicitis", +"appendix","appertain","appetite","appetizer","appetizing", +"applaud","applause","apple","applejack","appliance", +"applicable","applicant","application","applied","apply", +"appoint","appointment","appointments","apportion","apposite", +"apposition","appraisal","appraise","appreciable","appreciate", +"appreciation","appreciative","apprehend","apprehension","apprehensive", +"apprentice","apprenticeship","apprise","appro","approach", +"approachable","approbation","approbatory","appropriate","appropriation", +"approval","approve","approx","approximate","approximation", +"appurtenance","apricot","april","apron","apropos", +"apse","apt","aptitude","aqualung","aquamarine", +"aquaplane","aquarium","aquatic","aquatint","aqueduct", +"aqueous","aquiline","arab","arabesque","arabic", +"arable","arachnid","arak","arbiter","arbitrary", +"arbitrate","arbitration","arbitrator","arbor","arboreal", +"arboretum","arbour","arc","arcade","arcadia", +"arcane","arch","archaeology","archaic","archaism", +"archangel","archbishop","archbishopric","archdeacon","archdeaconry", +"archdiocese","archduke","archeology","archer","archery", +"archetype","archimandrite","archipelago","architect","architecture", +"archive","archway","arctic","ardent","ardor", +"ardour","arduous","are","area","areca", +"arena","argent","argon","argot","arguable", +"argue","argument","argumentative","aria","arid", +"aries","aright","arise","aristocracy","aristocrat", +"aristocratic","arithmetic","arithmetician","ark","arm", +"armada","armadillo","armament","armature","armband", +"armchair","armed","armful","armhole","armistice", +"armlet","armor","armorer","armorial","armory", 
+"armour","armoured","armourer","armoury","armpit", +"arms","army","aroma","aromatic","arose", +"around","arouse","arpeggio","arquebus","arrack", +"arraign","arrange","arrangement","arrant","arras", +"array","arrears","arrest","arrival","arrive", +"arrogance","arrogant","arrogate","arrow","arrowhead", +"arrowroot","arse","arsenal","arsenic","arson", +"art","artefact","arterial","arteriosclerosis","artery", +"artful","arthritis","artichoke","article","articles", +"articulate","articulated","articulateness","articulation","artifact", +"artifice","artificer","artificial","artillery","artisan", +"artist","artiste","artistic","artistry","artless", +"arts","arty","arum","asbestos","ascend", +"ascendancy","ascendant","ascendency","ascendent","ascension", +"ascent","ascertain","ascetic","ascribe","ascription", +"asepsis","aseptic","asexual","ash","ashamed", +"ashbin","ashcan","ashen","ashes","ashore", +"ashtray","ashy","aside","asinine","ask", +"askance","askew","aslant","asleep","asp", +"asparagus","aspect","aspectual","aspen","asperity", +"aspersion","asphalt","asphodel","asphyxia","asphyxiate", +"aspic","aspidistra","aspirant","aspirate","aspiration", +"aspire","aspirin","ass","assagai","assail", +"assailant","assassin","assassinate","assault","assay", +"assegai","assemblage","assemble","assembly","assemblyman", +"assent","assert","assertion","assertive","assess", +"assessment","assessor","asset","asseverate","assiduity", +"assiduous","assign","assignation","assignment","assimilate", +"assimilation","assist","assistance","assistant","assize", +"assizes","associate","association","assonance","assort", +"assorted","assortment","asst","assuage","assume", +"assumption","assurance","assure","assured","aster", +"asterisk","astern","asteroid","asthma","astigmatic", +"astigmatism","astir","astonish","astonishment","astound", +"astrakhan","astral","astray","astride","astringent", +"astrolabe","astrologer","astrology","astronaut","astronautics", 
+"astronomer","astronomical","astronomy","astrophysics","astute", +"asunder","asylum","asymmetric","atavism","atchoo", +"ate","atelier","atheism","atheist","athlete", +"athletic","athletics","athwart","atishoo","atlas", +"atmosphere","atmospheric","atmospherics","atoll","atom", +"atomic","atomise","atomize","atonal","atonality", +"atone","atop","atrocious","atrocity","atrophy", +"attach","attachment","attack","attain","attainder", +"attainment","attar","attempt","attend","attendance", +"attendant","attention","attentive","attenuate","attest", +"attestation","attested","attic","attire","attitude", +"attitudinise","attitudinize","attorney","attract","attraction", +"attractive","attributable","attribute","attribution","attributive", +"attrition","attune","atypical","aubergine","aubrietia", +"auburn","auction","auctioneer","audacious","audacity", +"audible","audience","audio","audiometer","audit", +"audition","auditor","auditorium","auditory","auger", +"aught","augment","augmentation","augur","augury", +"august","auk","aunt","aura","aural", +"aureole","auricle","auricular","auriferous","aurora", +"auscultation","auspices","auspicious","aussie","austere", +"austerity","australasian","autarchy","autarky","authentic", +"authenticate","authenticity","author","authoress","authorisation", +"authorise","authoritarian","authoritative","authority","authorization", +"authorize","authorship","autism","autistic","auto", +"autobahn","autobiographical","autobiography","autocracy","autocrat", +"autoeroticism","autograph","automat","automate","automatic", +"automation","automatism","automaton","automobile","autonomous", +"autonomy","autopsy","autostrada","autosuggestion","autumn", +"autumnal","auxiliary","avail","available","avalanche", +"avarice","avaricious","avatar","avaunt","avenge", +"avenue","aver","average","averse","aversion", +"aversive","avert","aviary","aviation","aviator", +"avid","avocado","avocation","avocet","avoid", +"avoidance","avoirdupois","avow","avowal","avowed", 
+"avuncular","await","awake","awaken","awakening", +"award","aware","awash","away","awe", +"awesome","awestruck","awful","awfully","awhile", +"awkward","awl","awning","awoke","awoken", +"awry","axe","axiom","axiomatic","axis", +"axle","axolotl","ayah","aye","azalea", +"azimuth","azure","baa","babble","babbler", +"babe","babel","baboo","baboon","babu", +"baby","babyhood","babyish","baccalaureate","baccara", +"baccarat","bacchanal","baccy","bachelor","bacillus", +"back","backache","backbench","backbite","backbone", +"backbreaking","backchat","backcloth","backcomb","backdate", +"backdrop","backer","backfire","backgammon","background", +"backhand","backhanded","backhander","backing","backlash", +"backlog","backmost","backpedal","backside","backslide", +"backspace","backstage","backstairs","backstay","backstroke", +"backtrack","backup","backward","backwards","backwash", +"backwater","backwoods","backwoodsman","backyard","bacon", +"bacteria","bacteriology","bactrian","bad","bade", +"badge","badger","badinage","badly","badminton", +"baffle","baffling","bag","bagatelle","bagful", +"baggage","baggy","bagpipes","bags","bah", +"bail","bailey","bailiff","bairn","bait", +"baize","bake","bakelite","baker","bakery", +"baksheesh","balaclava","balalaika","balance","balanced", +"balcony","bald","balderdash","balding","baldly", +"baldric","bale","baleful","balk","ball", +"ballad","ballade","ballast","ballcock","ballerina", +"ballet","ballistic","ballistics","ballocks","balloon", +"ballooning","balloonist","ballot","ballpoint","ballroom", +"balls","bally","ballyhoo","balm","balmy", +"baloney","balsa","balsam","balustrade","bamboo", +"bamboozle","ban","banal","banana","band", +"bandage","bandana","bandanna","bandbox","bandeau", +"bandit","banditry","bandmaster","bandoleer","bandolier", +"bandsman","bandstand","bandwagon","bandy","bane", +"baneful","bang","banger","bangle","banian", +"banish","banister","banjo","bank","bankbook", +"banker","banking","bankrupt","bankruptcy","banner", 
+"bannock","banns","banquet","banshee","bantam", +"bantamweight","banter","banyan","baobab","baptise", +"baptism","baptist","baptize","bar","barb", +"barbarian","barbaric","barbarise","barbarism","barbarize", +"barbarous","barbecue","barbed","barbel","barber", +"barbican","barbiturate","barcarole","barcarolle","bard", +"bare","bareback","barebacked","barefaced","barefoot", +"bareheaded","barelegged","barely","bargain","barge", +"bargee","baritone","barium","bark","barker", +"barley","barleycorn","barmaid","barman","barmy", +"barn","barnacle","barnstorm","barnyard","barograph", +"barometer","baron","baroness","baronet","baronetcy", +"baronial","barony","baroque","barque","barrack", +"barracks","barracuda","barrage","barred","barrel", +"barren","barricade","barricades","barrier","barring", +"barrister","barrow","bartender","barter","basalt", +"base","baseball","baseboard","baseless","baseline", +"basement","bases","bash","bashful","basic", +"basically","basics","basil","basilica","basilisk", +"basin","basis","bask","basket","basketball", +"basketful","basketry","basketwork","bass","basset", +"bassinet","bassoon","bast","bastard","bastardise", +"bastardize","bastardy","baste","bastinado","bastion", +"bat","batch","bated","bath","bathing", +"bathos","bathrobe","bathroom","baths","bathtub", +"bathysphere","batik","batiste","batman","baton", +"bats","batsman","battalion","batten","batter", +"battery","battle","battleax","battleaxe","battlefield", +"battlements","battleship","batty","bauble","baulk", +"bauxite","bawd","bawdy","bawl","bay", +"bayonet","bayou","bazaar","bazooka","bbc", +"beach","beachcomber","beachhead","beachwear","beacon", +"bead","beading","beadle","beady","beagle", +"beagling","beak","beaker","beam","bean", +"beanpole","beanstalk","bear","bearable","beard", +"bearded","bearer","bearing","bearings","bearish", +"bearskin","beast","beastly","beat","beaten", +"beater","beatific","beatification","beatify","beating", 
+"beatitude","beatitudes","beatnik","beau","beaujolais", +"beaut","beauteous","beautician","beautiful","beautify", +"beauty","beaver","bebop","becalmed","because", +"beck","beckon","become","becoming","bed", +"bedaub","bedbug","bedclothes","bedding","bedeck", +"bedevil","bedewed","bedfellow","bedimmed","bedlam", +"bedouin","bedpan","bedpost","bedraggled","bedridden", +"bedrock","bedroom","bedside","bedsore","bedspread", +"bedstead","bedtime","bee","beech","beef", +"beefcake","beefeater","beefsteak","beefy","beehive", +"beeline","been","beer","beery","beeswax", +"beet","beetle","beetling","beetroot","beeves", +"befall","befit","befitting","before","beforehand", +"befriend","befuddle","beg","beget","beggar", +"beggarly","beggary","begin","beginner","beginning", +"begone","begonia","begorra","begot","begotten", +"begrudge","beguile","begum","begun","behalf", +"behave","behavior","behaviorism","behaviour","behaviourism", +"behead","behemoth","behest","behind","behindhand", +"behold","beholden","behove","beige","being", +"belabor","belabour","belated","belay","belch", +"beleaguer","belfry","belie","belief","believable", +"believe","believer","belittle","bell","belladonna", +"bellboy","belle","bellflower","bellicose","belligerency", +"belligerent","bellow","bellows","belly","bellyache", +"bellyful","belong","belongings","beloved","below", +"belt","belted","belting","beltway","bemoan", +"bemused","ben","bench","bencher","bend", +"bended","bends","beneath","benedictine","benediction", +"benedictus","benefaction","benefactor","benefice","beneficent", +"beneficial","beneficiary","benefit","benevolence","benevolent", +"benighted","benign","benignity","bent","benumbed", +"benzedrine","benzene","benzine","bequeath","bequest", +"berate","bereave","bereaved","bereavement","bereft", +"beret","beriberi","berk","berry","berserk", +"berth","beryl","beseech","beseem","beset", +"besetting","beside","besides","besiege","besmear", +"besmirch","besom","besotted","besought","bespattered", 
+"bespeak","bespoke","best","bestial","bestiality", +"bestiary","bestir","bestow","bestrew","bestride", +"bet","beta","betake","betel","bethel", +"bethink","betide","betimes","betoken","betray", +"betrayal","betroth","betrothal","betrothed","better", +"betterment","betters","bettor","between","betwixt", +"bevel","beverage","bevy","bewail","beware", +"bewilder","bewitch","bey","beyond","bezique", +"bhang","bias","bib","bible","biblical", +"bibliographer","bibliography","bibliophile","bibulous","bicarb", +"bicarbonate","bicentenary","bicentennial","biceps","bicker", +"bicycle","bid","biddable","bidding","bide", +"bidet","biennial","bier","biff","bifocals", +"bifurcate","big","bigamist","bigamous","bigamy", +"bighead","bight","bigot","bigoted","bigotry", +"bigwig","bijou","bike","bikini","bilabial", +"bilateral","bilberry","bile","bilge","bilingual", +"bilious","bilk","bill","billboard","billet", +"billfold","billhook","billiard","billiards","billion", +"billow","billposter","billy","biltong","bimetallic", +"bimetallism","bimonthly","bin","binary","bind", +"binder","bindery","binding","bindweed","binge", +"bingo","binnacle","binocular","binoculars","binomial", +"biochemistry","biodegradable","biographer","biographical","biography", +"biological","biology","biomedical","bionic","biosphere", +"biotechnology","bipartisan","bipartite","biped","biplane", +"birch","bird","birdie","birdlime","birdseed", +"biretta","biro","birth","birthday","birthmark", +"birthplace","birthrate","birthright","biscuit","bisect", +"bisexual","bishop","bishopric","bismuth","bison", +"bisque","bistro","bit","bitch","bitchy", +"bite","biting","bitter","bittern","bitters", +"bittersweet","bitty","bitumen","bituminous","bivalve", +"bivouac","biweekly","bizarre","blab","blabber", +"blabbermouth","black","blackamoor","blackball","blackberry", +"blackbird","blackboard","blackcurrant","blacken","blackguard", +"blackhead","blacking","blackjack","blackleg","blacklist", 
+"blackly","blackmail","blackout","blackshirt","blacksmith", +"blackthorn","bladder","blade","blaeberry","blah", +"blame","blameless","blameworthy","blanch","blancmange", +"bland","blandishments","blank","blanket","blare", +"blarney","blaspheme","blasphemous","blasphemy","blast", +"blasted","blatant","blather","blaze","blazer", +"blazes","blazing","blazon","blazonry","bleach", +"bleachers","bleak","bleary","bleat","bleed", +"bleeder","bleeding","bleep","blemish","blench", +"blend","blender","bless","blessed","blessing", +"blether","blew","blight","blighter","blimey", +"blimp","blind","blinder","blinders","blindfold", +"blink","blinkered","blinkers","blinking","blip", +"bliss","blister","blistering","blithe","blithering", +"blitz","blizzard","bloated","bloater","blob", +"bloc","block","blockade","blockage","blockbuster", +"blockhead","blockhouse","bloke","blond","blood", +"bloodbath","bloodcurdling","bloodhound","bloodless","bloodletting", +"bloodshed","bloodshot","bloodstain","bloodstock","bloodstream", +"bloodsucker","bloodthirsty","bloody","bloom","bloomer", +"bloomers","blooming","blossom","blot","blotch", +"blotter","blotto","blouse","blow","blower", +"blowfly","blowgun","blowhard","blowhole","blowlamp", +"blown","blowout","blowpipe","blowsy","blowy", +"blowzy","blubber","bludgeon","blue","bluebag", +"bluebeard","bluebell","blueberry","bluebird","bluebottle", +"bluecoat","bluefish","bluejacket","blueprint","blues", +"bluestocking","bluff","blunder","blunderbuss","blunt", +"bluntly","blur","blurb","blurt","blush", +"bluster","blustery","boa","boar","board", +"boarder","boarding","boardinghouse","boardroom","boards", +"boardwalk","boast","boaster","boastful","boat", +"boater","boathouse","boatman","boatswain","bob", +"bobbin","bobby","bobcat","bobolink","bobsleigh", +"bobtail","bobtailed","bock","bod","bode", +"bodice","bodily","boding","bodkin","body", +"bodyguard","bodywork","boer","boffin","bog", +"bogey","boggle","boggy","bogie","bogus", 
+"bohemian","boil","boiler","boisterous","bold", +"boldface","boldfaced","bole","bolero","boll", +"bollard","bollocks","boloney","bolshevik","bolshevism", +"bolshy","bolster","bolt","bolthole","bomb", +"bombard","bombardier","bombardment","bombast","bomber", +"bombproof","bombshell","bombsight","bombsite","bonanza", +"bonbon","bond","bondage","bonded","bondholder", +"bonds","bone","boned","bonehead","boner", +"bonesetter","boneshaker","bonfire","bongo","bonhomie", +"bonito","bonkers","bonnet","bonny","bonsai", +"bonus","bony","bonzer","boo","boob", +"boobs","booby","boodle","boohoo","book", +"bookable","bookbindery","bookbinding","bookcase","bookend", +"booking","bookish","bookkeeping","booklet","bookmaker", +"bookmark","bookmobile","bookplate","books","bookseller", +"bookshop","bookstall","bookwork","bookworm","boom", +"boomerang","boon","boor","boost","booster", +"boot","bootblack","booted","bootee","booth", +"bootlace","bootleg","bootless","boots","bootstraps", +"booty","booze","boozer","boozy","bop", +"bopper","boracic","borage","borax","bordeaux", +"bordello","border","borderer","borderland","borderline", +"bore","borealis","borehole","borer","born", +"borne","boron","borough","borrow","borrowing", +"borscht","borshcht","borstal","borzoi","bosh", +"bosom","bosomy","boss","bossy","bosun", +"botanical","botanise","botanist","botanize","botany", +"botch","both","bother","botheration","bothersome", +"bottle","bottleful","bottleneck","bottom","bottomless", +"botulism","boudoir","bouffant","bougainvillaea","bougainvillea", +"bough","bought","bouillabaisse","bouillon","boulder", +"boulevard","bounce","bouncer","bouncing","bouncy", +"bound","boundary","bounden","bounder","boundless", +"bounds","bounteous","bountiful","bounty","bouquet", +"bourbon","bourgeois","bourgeoisie","bourn","bourne", +"bourse","bout","boutique","bouzouki","bovine", +"bovril","bovver","bow","bowdlerise","bowdlerize", +"bowed","bowel","bowels","bower","bowerbird", 
+"bowing","bowl","bowler","bowlful","bowline", +"bowling","bowls","bowman","bowser","bowshot", +"bowsprit","bowwow","box","boxer","boxful", +"boxing","boxwood","boy","boycott","boyfriend", +"boyhood","boyish","boys","bra","brace", +"bracelet","bracelets","braces","bracing","bracken", +"bracket","brackish","bract","bradawl","brae", +"brag","braggadocio","braggart","brahman","braid", +"braille","brain","brainchild","brainless","brainpan", +"brains","brainstorm","brainwash","brainwashing","brainwave", +"brainy","braise","brake","bramble","bran", +"branch","brand","brandish","brandy","brash", +"brass","brasserie","brassiere","brassy","brat", +"bravado","brave","bravo","bravura","brawl", +"brawn","brawny","bray","brazen","brazier", +"bre","breach","bread","breadbasket","breadboard", +"breadcrumb","breaded","breadfruit","breadline","breadth", +"breadthways","breadwinner","break","breakage","breakaway", +"breakdown","breaker","breakfast","breakneck","breakout", +"breakthrough","breakup","breakwater","bream","breast", +"breastbone","breastplate","breaststroke","breastwork","breath", +"breathalyse","breathalyser","breathe","breather","breathing", +"breathless","breathtaking","breathy","breech","breeches", +"breed","breeder","breeding","breeze","breezeblock", +"breezy","brethren","breve","brevet","breviary", +"brevity","brew","brewer","brewery","briar", +"bribe","bribery","brick","brickbat","brickfield", +"bricklayer","brickwork","bridal","bride","bridegroom", +"bridesmaid","bridge","bridgehead","bridgework","bridle", +"brie","brief","briefcase","briefing","briefs", +"brier","brig","brigade","brigadier","brigand", +"brigandage","brigantine","bright","brighten","brill", +"brilliancy","brilliant","brilliantine","brim","brimful", +"brimfull","brimstone","brindled","brine","bring", +"brink","brinkmanship","brioche","briquet","briquette", +"brisk","brisket","bristle","bristly","bristols", +"brit","britches","britisher","briton","brittle", 
+"broach","broad","broadcast","broadcasting","broadcloth", +"broaden","broadloom","broadminded","broadsheet","broadside", +"broadsword","broadways","brocade","broccoli","brochure", +"brogue","broil","broiler","broke","broken", +"broker","brolly","bromide","bromine","bronchial", +"bronchitis","bronco","brontosaurus","bronze","brooch", +"brood","broody","brook","broom","broomstick", +"broth","brothel","brother","brotherhood","brougham", +"brought","brouhaha","brow","browbeat","brown", +"brownie","brownstone","browse","brucellosis","bruin", +"bruise","bruiser","bruising","bruit","brunch", +"brunet","brunette","brunt","brush","brushwood", +"brushwork","brusque","brutal","brutalise","brutality", +"brutalize","brute","brutish","bubble","bubbly", +"buccaneer","buck","buckboard","bucked","bucket", +"buckle","buckler","buckram","buckshee","buckshot", +"buckskin","bucktooth","buckwheat","bucolic","bud", +"buddhism","budding","buddy","budge","budgerigar", +"budget","budgetary","buff","buffalo","buffer", +"buffet","buffoon","buffoonery","bug","bugaboo", +"bugbear","bugger","buggered","buggery","buggy", +"bughouse","bugle","bugrake","buhl","build", +"builder","building","buildup","bulb","bulbous", +"bulbul","bulge","bulk","bulkhead","bulky", +"bull","bulldog","bulldoze","bulldozer","bullet", +"bulletin","bulletproof","bullfight","bullfighting","bullfinch", +"bullfrog","bullheaded","bullion","bullnecked","bullock", +"bullring","bullshit","bully","bullyboy","bulrush", +"bulwark","bum","bumble","bumblebee","bumboat", +"bumf","bummer","bump","bumper","bumph", +"bumpkin","bumptious","bumpy","bun","bunch", +"bundle","bung","bungalow","bunghole","bungle", +"bunion","bunk","bunker","bunkered","bunkhouse", +"bunkum","bunny","bunting","buoy","buoyancy", +"bur","burberry","burble","burden","burdensome", +"burdock","bureau","bureaucracy","bureaucrat","bureaucratic", +"burg","burgeon","burgess","burgh","burgher", +"burglar","burglary","burgle","burgomaster","burgundy", 
+"burial","burlap","burlesque","burly","burn", +"burner","burning","burnish","burnous","burnouse", +"burnt","burp","burr","burro","burrow", +"bursar","bursary","burst","burthen","burton", +"bury","bus","busby","bush","bushbaby", +"bushed","bushel","bushwhack","bushy","business", +"businesslike","businessman","busk","busker","busman", +"bust","bustard","buster","bustle","busy", +"busybody","but","butane","butch","butcher", +"butchery","butler","butt","butter","buttercup", +"butterfingers","butterfly","buttermilk","butterscotch","buttery", +"buttock","buttocks","button","buttonhole","buttonhook", +"buttons","buttress","buxom","buy","buyer", +"buzz","buzzard","buzzer","bye","byelaw", +"bygone","bygones","bylaw","bypass","byplay", +"byre","bystander","byway","byways","byword", +"byzantine","cab","cabal","cabaret","cabbage", +"cabbie","cabby","cabdriver","caber","cabin", +"cabinet","cable","cablegram","caboodle","caboose", +"cabriolet","cacao","cache","cachet","cachou", +"cackle","cacophony","cactus","cad","cadaver", +"cadaverous","caddie","caddy","cadence","cadenza", +"cadet","cadge","cadi","cadmium","cadre", +"caerphilly","caesura","cafeteria","caffeine","caftan", +"cage","cagey","cahoots","caiman","caique", +"cairn","caisson","cajole","cake","calabash", +"calaboose","calamitous","calamity","calcify","calcination", +"calcine","calcium","calculable","calculate","calculating", +"calculation","calculator","calculus","caldron","calendar", +"calender","calends","calf","calfskin","caliber", +"calibrate","calibration","calibre","calico","caliper", +"calipers","caliph","caliphate","calisthenic","calisthenics", +"calk","call","calla","callboy","caller", +"calligraphy","calling","calliper","callipers","callisthenic", +"callisthenics","callous","callow","callus","calm", +"calomel","calorie","calorific","calumniate","calumny", +"calvary","calve","calves","calvinism","calypso", +"calyx","cam","camaraderie","camber","cambric", +"came","camel","camelhair","camellia","camembert", 
+"cameo","camera","cameraman","camisole","camomile", +"camouflage","camp","campaign","campanile","campanology", +"campanula","camper","campfire","campground","camphor", +"camphorated","campion","campsite","campus","camshaft", +"can","canal","canalise","canalize","canard", +"canary","canasta","cancan","cancel","cancellation", +"cancer","cancerous","candela","candelabrum","candid", +"candidate","candidature","candidly","candied","candle", +"candlelight","candlemas","candlepower","candlestick","candlewick", +"candor","candour","candy","candyfloss","candytuft", +"cane","canine","canis","canister","canker", +"canna","cannabis","canned","cannelloni","cannery", +"cannibal","cannibalise","cannibalism","cannibalize","cannon", +"cannonade","cannonball","cannot","canny","canoe", +"canon","canonical","canonicals","canonise","canonize", +"canoodle","canopy","canst","cant","cantab", +"cantabrigian","cantaloup","cantaloupe","cantankerous","cantata", +"canteen","canter","canticle","cantilever","canto", +"canton","cantonment","cantor","canvas","canvass", +"canyon","cap","capabilities","capability","capable", +"capacious","capacity","caparison","cape","caper", +"capillarity","capillary","capital","capitalisation","capitalise", +"capitalism","capitalist","capitalization","capitalize","capitals", +"capitation","capitol","capitulate","capitulation","capitulations", +"capon","capriccio","caprice","capricious","capricorn", +"capsicum","capsize","capstan","capsule","captain", +"caption","captious","captivate","captive","captivity", +"captor","capture","car","carafe","caramel", +"carapace","carat","caravan","caravanning","caravanserai", +"caraway","carbide","carbine","carbohydrate","carbolic", +"carbon","carbonated","carbonation","carboniferous","carbonise", +"carbonize","carborundum","carboy","carbuncle","carburetor", +"carburettor","carcase","carcass","carcinogen","card", +"cardamom","cardboard","cardiac","cardigan","cardinal", +"cardpunch","cards","cardsharp","care","careen", 
+"career","careerist","carefree","careful","careless", +"caress","caret","caretaker","careworn","cargo", +"caribou","caricature","caries","carillon","carious", +"carmelite","carmine","carnage","carnal","carnation", +"carnelian","carnival","carnivore","carnivorous","carob", +"carol","carotid","carousal","carouse","carousel", +"carp","carpal","carpenter","carpentry","carpet", +"carpetbag","carpetbagger","carpeting","carport","carpus", +"carriage","carriageway","carrier","carrion","carrot", +"carroty","carrousel","carry","carryall","carrycot", +"carryout","carsick","cart","cartage","cartel", +"carter","carthorse","cartilage","cartilaginous","cartographer", +"cartography","carton","cartoon","cartridge","cartwheel", +"carve","carver","carving","caryatid","cascade", +"cascara","case","casebook","casein","casework", +}; +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java new file mode 100644 index 00000000000..bd724f048be --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java @@ -0,0 +1,715 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. 
+ */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData2 { + private KStemData2() { + } + static String[] data = { +"cash","cashew","cashier","cashmere","casing", +"casino","cask","casket","casque","cassava", +"casserole","cassette","cassock","cassowary","cast", +"castanets","castaway","castellated","caster","castigate", +"casting","castle","castor","castrate","casual", +"casualty","casuist","casuistry","cat","cataclysm", +"catacomb","catafalque","catalepsy","catalog","catalogue", +"catalpa","catalysis","catalyst","catamaran","catapult", +"cataract","catarrh","catastrophe","catatonic","catcall", +"catch","catcher","catching","catchpenny","catchphrase", +"catchword","catchy","catechise","catechism","catechize", +"categorical","categorise","categorize","category","cater", +"caterer","caterpillar","caterwaul","catfish","catgut", +"catharsis","cathartic","cathedral","catheter","cathode", +"catholic","catholicism","catholicity","catkin","catnap", +"catnip","catsup","cattle","catty","catwalk", +"caucus","caudal","caught","caul","cauldron", +"cauliflower","caulk","causal","causality","causation", +"causative","cause","causeless","causeway","caustic", +"cauterise","cauterize","caution","cautionary","cautious", +"cavalcade","cavalier","cavalry","cavalryman","cave", +"caveat","caveman","cavern","cavernous","caviar", +"caviare","cavil","cavity","cavort","cavy", +"caw","cay","cayman","cease","ceaseless", +"cedar","cede","cedilla","ceiling","celandine", +"celebrant","celebrate","celebrated","celebration","celebrity", +"celerity","celery","celestial","celibacy","celibate", +"cell","cellar","cellarage","cellist","cello", +"cellophane","cellular","celluloid","cellulose","celsius", +"celtic","cement","cemetery","cenotaph","censor", +"censorious","censorship","censure","census","cent", +"centaur","centavo","centenarian","centenary","centennial", +"center","centerboard","centerpiece","centigrade","centigram", 
+"centigramme","centime","centimeter","centimetre","centipede", +"central","centralise","centralism","centralize","centre", +"centreboard","centrepiece","centrifugal","centrifuge","centripetal", +"centrist","centurion","century","cephalic","ceramic", +"ceramics","cereal","cerebellum","cerebral","cerebration", +"cerebrum","ceremonial","ceremonious","ceremony","cerise", +"cert","certain","certainly","certainty","certifiable", +"certificate","certificated","certify","certitude","cerulean", +"cervical","cervix","cessation","cession","cesspit", +"cetacean","chablis","chaconne","chafe","chaff", +"chaffinch","chagrin","chain","chair","chairman", +"chairmanship","chairperson","chairwoman","chaise","chalet", +"chalice","chalk","chalky","challenge","challenging", +"chamber","chamberlain","chambermaid","chambers","chameleon", +"chamiomile","chamois","chamomile","champ","champagne", +"champaign","champion","championship","chance","chancel", +"chancellery","chancellor","chancery","chancy","chandelier", +"chandler","change","changeable","changeless","changeling", +"changeover","channel","chant","chanterelle","chanticleer", +"chantry","chanty","chaos","chaotic","chap", +"chapel","chapelgoer","chaperon","chaperone","chapfallen", +"chaplain","chaplaincy","chaplet","chaps","chapter", +"char","charabanc","character","characterise","characteristic", +"characterization","characterize","characterless","charade","charades", +"charcoal","chard","charge","chargeable","charged", +"charger","chariot","charioteer","charisma","charismatic", +"charitable","charity","charlady","charlatan","charleston", +"charlock","charlotte","charm","charmer","charming", +"chart","charter","chartreuse","charwoman","chary", +"charybdis","chase","chaser","chasm","chassis", +"chaste","chasten","chastise","chastisement","chastity", +"chasuble","chat","chatelaine","chattel","chatter", +"chatterbox","chatty","chauffeur","chauvinism","chauvinist", +"cheap","cheapen","cheapskate","cheat","check", 
+"checkbook","checked","checker","checkerboard","checkers", +"checklist","checkmate","checkoff","checkout","checkpoint", +"checkrail","checkrein","checkroom","checkup","cheddar", +"cheek","cheekbone","cheeky","cheep","cheer", +"cheerful","cheering","cheerio","cheerleader","cheerless", +"cheers","cheery","cheese","cheesecake","cheesecloth", +"cheeseparing","cheetah","chef","chem","chemical", +"chemise","chemist","chemistry","chemotherapy","chenille", +"cheque","chequebook","chequer","cherish","cheroot", +"cherry","cherub","chervil","chess","chessboard", +"chessman","chest","chesterfield","chestnut","chesty", +"chevalier","chevron","chevvy","chevy","chew", +"chi","chianti","chiaroscuro","chic","chicanery", +"chicano","chichi","chick","chicken","chickenfeed", +"chickenhearted","chickpea","chickweed","chicle","chicory", +"chide","chief","chiefly","chieftain","chieftainship", +"chiffon","chiffonier","chiffonnier","chigger","chignon", +"chihuahua","chilblain","child","childbearing","childbirth", +"childhood","childish","childlike","chile","chill", +"chiller","chilli","chilly","chimaera","chime", +"chimera","chimerical","chimney","chimneybreast","chimneypiece", +"chimneypot","chimneystack","chimneysweep","chimpanzee","chin", +"china","chinatown","chinaware","chinchilla","chine", +"chink","chinless","chinook","chinstrap","chintz", +"chinwag","chip","chipboard","chipmunk","chippendale", +"chipping","chippy","chiromancy","chiropody","chiropractic", +"chirp","chirpy","chisel","chiseler","chiseller", +"chit","chitchat","chivalrous","chivalry","chive", +"chivvy","chivy","chloride","chlorinate","chlorine", +"chloroform","chlorophyll","chock","chocolate","choice", +"choir","choirboy","choirmaster","choke","choker", +"chokey","choky","choler","cholera","choleric", +"cholesterol","chomp","choose","choosey","choosy", +"chop","chopfallen","chophouse","chopper","choppers", +"choppy","chopstick","choral","chorale","chord", +"chore","choreographer","choreography","chorine","chorister", 
+"chortle","chorus","chose","chosen","chow", +"chowder","christ","christen","christendom","christening", +"christian","christianity","christlike","christmastime","chromatic", +"chrome","chromium","chromosome","chronic","chronicle", +"chronograph","chronological","chronology","chronometer","chrysalis", +"chrysanthemum","chub","chubby","chuck","chuckle", +"chug","chukker","chum","chummy","chump", +"chunk","chunky","church","churchgoer","churching", +"churchwarden","churchyard","churl","churlish","churn", +"chute","chutney","cia","cicada","cicatrice", +"cicerone","cid","cider","cif","cigar", +"cigaret","cigarette","cinch","cincture","cinder", +"cinderella","cinders","cine","cinema","cinematograph", +"cinematography","cinnamon","cinquefoil","cipher","circa", +"circadian","circle","circlet","circuit","circuitous", +"circular","circularise","circularize","circulate","circulation", +"circumcise","circumcision","circumference","circumflex","circumlocution", +"circumnavigate","circumscribe","circumscription","circumspect","circumstance", +"circumstances","circumstantial","circumvent","circus","cirque", +"cirrhosis","cirrus","cissy","cistern","citadel", +"citation","cite","citizen","citizenry","citizenship", +"citron","citrous","citrus","city","civet", +"civic","civics","civies","civil","civilian", +"civilisation","civilise","civility","civilization","civilize", +"civilly","civvies","clack","clad","claim", +"claimant","clairvoyance","clairvoyant","clam","clambake", +"clamber","clammy","clamor","clamorous","clamour", +"clamp","clampdown","clamshell","clan","clandestine", +"clang","clanger","clangor","clangour","clank", +"clannish","clansman","clap","clapboard","clapper", +"clapperboard","clappers","claptrap","claque","claret", +"clarification","clarify","clarinet","clarinetist","clarinettist", +"clarion","clarity","clarts","clash","clasp", +"class","classic","classical","classicism","classicist", +"classics","classification","classified","classify","classless", 
+"classmate","classroom","classy","clatter","clause", +"claustrophobia","claustrophobic","clavichord","clavicle","claw", +"clay","claymore","clean","cleaner","cleanliness", +"cleanly","cleanse","cleanser","cleanup","clear", +"clearance","clearing","clearinghouse","clearly","clearout", +"clearway","cleat","cleavage","cleave","cleaver", +"clef","cleft","clematis","clemency","clement", +"clench","clerestory","clergy","clergyman","clerical", +"clerihew","clerk","clever","clew","click", +"client","clientele","cliff","cliffhanger","climacteric", +"climactic","climate","climatic","climatology","climax", +"climb","climber","clime","clinch","clincher", +"cline","cling","clinging","clingy","clinic", +"clinical","clink","clinker","clip","clipboard", +"clipper","clippers","clippie","clipping","clique", +"cliquey","cliquish","clitoris","cloaca","cloak", +"cloakroom","clobber","cloche","clock","clockwise", +"clockwork","clod","cloddish","clodhopper","clog", +"cloggy","cloister","clone","clop","close", +"closed","closedown","closefisted","closet","closure", +"clot","cloth","clothe","clothes","clothesbasket", +"clotheshorse","clothesline","clothier","clothing","cloture", +"cloud","cloudbank","cloudburst","cloudless","cloudy", +"clout","clove","cloven","clover","cloverleaf", +"clown","clownish","cloy","club","clubbable", +"clubfoot","clubhouse","cluck","clue","clueless", +"clump","clumsy","clung","cluster","clutch", +"clutches","clutter","coach","coachbuilder","coachman", +"coachwork","coadjutor","coagulant","coagulate","coal", +"coalbunker","coalesce","coalface","coalfield","coalhole", +"coalhouse","coalition","coalmine","coalscuttle","coarse", +"coarsen","coast","coastal","coaster","coastguard", +"coastguardsman","coastline","coastwise","coat","coating", +"coax","cob","cobalt","cobber","cobble", +"cobbler","cobblers","cobblestone","cobra","cobweb", +"cocaine","coccyx","cochineal","cochlea","cock", +"cockade","cockatoo","cockchafer","cockcrow","cockerel", 
+"cockeyed","cockfight","cockhorse","cockle","cockleshell", +"cockney","cockpit","cockroach","cockscomb","cocksure", +"cocktail","cocky","coco","cocoa","coconut", +"cocoon","cod","coda","coddle","code", +"codeine","codex","codger","codicil","codify", +"codling","codpiece","codswallop","coed","coeducation", +"coefficient","coelacanth","coequal","coerce","coercion", +"coercive","coeternal","coeval","coexist","coexistence", +"coffee","coffeepot","coffer","cofferdam","coffers", +"coffin","cog","cogency","cogent","cogitate", +"cogitation","cognac","cognate","cognition","cognitive", +"cognizance","cognizant","cognomen","cognoscenti","cogwheel", +"cohabit","cohere","coherence","coherent","cohesion", +"cohesive","cohort","coif","coiffeur","coiffure", +"coil","coin","coinage","coincide","coincidence", +"coincident","coincidental","coir","coitus","coke", +"col","cola","colander","cold","coleslaw", +"coley","colic","colicky","colitis","collaborate", +"collaboration","collaborationist","collage","collapse","collapsible", +"collar","collarbone","collate","collateral","collation", +"colleague","collect","collected","collection","collective", +"collectivise","collectivism","collectivize","collector","colleen", +"college","collegiate","collide","collie","collier", +"colliery","collision","collocate","collocation","colloquial", +"colloquialism","colloquy","collude","collusion","collywobbles", +"cologne","colon","colonel","colonial","colonialism", +"colonialist","colonies","colonise","colonist","colonize", +"colonnade","colony","color","coloration","coloratura", +"colored","colorfast","colorful","coloring","colorless", +"colors","colossal","colossally","colossus","colostrum", +"colour","coloured","colourfast","colourful","colouring", +"colourless","colours","colt","colter","coltish", +"columbine","column","columnist","coma","comatose", +"comb","combat","combatant","combative","comber", +"combination","combinations","combinatorial","combine","combo", 
+"combustible","combustion","come","comeback","comecon", +"comedian","comedienne","comedown","comedy","comely", +"comer","comestible","comet","comfit","comfort", +"comfortable","comforter","comfrey","comfy","comic", +"comical","comics","cominform","coming","comintern", +"comity","comma","command","commandant","commandeer", +"commander","commanding","commandment","commando","commemorate", +"commemoration","commemorative","commence","commencement","commend", +"commendable","commendation","commendatory","commensurable","commensurate", +"comment","commentary","commentate","commentator","commerce", +"commercial","commercialise","commercialism","commercialize","commie", +"commiserate","commiseration","commissar","commissariat","commissary", +"commission","commissionaire","commissioner","commit","commitment", +"committal","committed","committee","committeeman","commode", +"commodious","commodity","commodore","common","commonage", +"commonalty","commoner","commonly","commonplace","commons", +"commonweal","commonwealth","commotion","communal","commune", +"communicable","communicant","communicate","communication","communications", +"communicative","communion","communism","communist","community", +"commutable","commutation","commutative","commutator","commute", +"commuter","compact","compacted","companion","companionable", +"companionship","companionway","company","comparable","comparative", +"comparatively","compare","comparison","compartment","compartmentalise", +"compartmentalize","compass","compassion","compassionate","compatibility", +"compatible","compatriot","compeer","compel","compendious", +"compendium","compensate","compensation","compensatory","compere", +"compete","competence","competent","competition","competitive", +"competitor","compilation","compile","complacency","complacent", +"complain","complainant","complaint","complaisance","complaisant", +"complement","complementary","complete","completely","completion", 
+"complex","complexion","complexity","compliance","compliant", +"complicate","complicated","complication","complicity","compliment", +"complimentary","compliments","complin","compline","comply", +"compo","component","comport","comportment","compose", +"composer","composite","composition","compositor","compost", +"composure","compote","compound","comprehend","comprehensible", +"comprehension","comprehensive","compress","compressible","compression", +"compressor","comprise","compromise","comptometer","comptroller", +"compulsion","compulsive","compulsory","compunction","computation", +"compute","computer","computerize","comrade","comradeship", +"coms","con","concatenate","concatenation","concave", +"concavity","conceal","concealment","concede","conceit", +"conceited","conceivable","conceive","concentrate","concentrated", +"concentration","concentric","concept","conception","conceptual", +"conceptualise","conceptualize","concern","concerned","concernedly", +"concerning","concert","concerted","concertgoer","concertina", +"concertmaster","concerto","concession","concessionaire","concessive", +"conch","conchology","concierge","conciliate","conciliation", +"conciliatory","concise","concision","conclave","conclude", +"conclusion","conclusive","concoct","concoction","concomitance", +"concomitant","concord","concordance","concordant","concordat", +"concourse","concrete","concubinage","concubine","concupiscence", +"concur","concurrence","concurrent","concuss","concussion", +"condemn","condemnation","condensation","condense","condenser", +"condescend","condescension","condign","condiment","condition", +"conditional","conditions","condole","condolence","condom", +"condominium","condone","condor","conduce","conducive", +"conduct","conduction","conductive","conductivity","conductor", +"conduit","cone","coney","confabulate","confabulation", +"confection","confectioner","confectionery","confederacy","confederate", +"confederation","confer","conference","confess","confessed", 
+"confession","confessional","confessor","confetti","confidant", +"confide","confidence","confident","confidential","confiding", +"configuration","confine","confinement","confines","confirm", +"confirmation","confirmed","confiscate","confiscatory","conflagration", +"conflate","conflict","confluence","conform","conformable", +"conformation","conformist","conformity","confound","confounded", +"confraternity","confront","confrontation","confucian","confucianism", +"confuse","confusion","confute","conga","congeal", +"congenial","congenital","congest","congestion","conglomerate", +"conglomeration","congrats","congratulate","congratulations","congratulatory", +"congregate","congregation","congregational","congregationalism","congress", +"congressional","congressman","congruent","congruity","congruous", +"conic","conical","conifer","coniferous","conj", +"conjectural","conjecture","conjoin","conjoint","conjugal", +"conjugate","conjugation","conjunction","conjunctiva","conjunctive", +"conjunctivitis","conjuncture","conjure","conjurer","conjuror", +"conk","conker","conkers","connect","connected", +"connection","connective","connexion","connivance","connive", +"connoisseur","connotation","connotative","connote","connubial", +"conquer","conquest","conquistador","consanguineous","consanguinity", +"conscience","conscientious","conscious","consciousness","conscript", +"conscription","consecrate","consecration","consecutive","consensus", +"consent","consequence","consequent","consequential","consequently", +"conservancy","conservation","conservationist","conservatism","conservative", +"conservatoire","conservatory","conserve","consider","considerable", +"considerably","considerate","consideration","considered","considering", +"consign","consignee","consigner","consignment","consignor", +"consist","consistency","consistent","consistory","consolation", +"consolatory","console","consolidate","consols","consonance", +"consonant","consort","consortium","conspectus","conspicuous", 
+"conspiracy","conspirator","conspiratorial","conspire","constable", +"constabulary","constancy","constant","constellation","consternation", +"constipate","constipation","constituency","constituent","constitute", +"constitution","constitutional","constitutionalism","constitutionally","constitutive", +"constrain","constrained","constraint","constrict","constriction", +"constrictor","construct","construction","constructive","constructor", +"construe","consubstantiation","consul","consular","consulate", +"consult","consultancy","consultant","consultation","consultative", +"consulting","consume","consumer","consummate","consummation", +"consumption","consumptive","contact","contagion","contagious", +"contain","contained","container","containerise","containerize", +"containment","contaminate","contamination","contemplate","contemplation", +"contemplative","contemporaneous","contemporary","contempt","contemptible", +"contemptuous","contend","contender","content","contented", +"contention","contentious","contentment","contents","contest", +"contestant","context","contextual","contiguity","contiguous", +"continence","continent","continental","contingency","contingent", +"continual","continuance","continuation","continue","continuity", +"continuo","continuous","continuum","contort","contortion", +"contortionist","contour","contraband","contrabass","contraception", +"contraceptive","contract","contractile","contraction","contractor", +"contractual","contradict","contradiction","contradictory","contradistinction", +"contrail","contraindication","contralto","contraption","contrapuntal", +"contrariety","contrariwise","contrary","contrast","contravene", +"contravention","contretemps","contribute","contribution","contributor", +"contributory","contrite","contrition","contrivance","contrive", +"contrived","control","controller","controversial","controversy", +"controvert","contumacious","contumacy","contumelious","contumely", 
+"contuse","contusion","conundrum","conurbation","convalesce", +"convalescence","convalescent","convection","convector","convene", +"convener","convenience","convenient","convenor","convent", +"conventicle","convention","conventional","conventionality","converge", +"conversant","conversation","conversational","conversationalist","conversazione", +"converse","conversion","convert","converter","convertible", +"convex","convexity","convey","conveyance","conveyancer", +"conveyancing","conveyer","conveyor","convict","conviction", +"convince","convinced","convincing","convivial","convocation", +"convoke","convoluted","convolution","convolvulus","convoy", +"convulse","convulsion","convulsive","cony","coo", +"cook","cooker","cookery","cookhouse","cookie", +"cooking","cookout","cool","coolant","cooler", +"coolie","coon","coop","cooper","cooperate", +"cooperation","cooperative","coordinate","coordinates","coordination", +"coot","cop","cope","copeck","copier", +"copilot","coping","copingstone","copious","copper", +"copperhead","copperplate","coppersmith","coppice","copra", +"coptic","copula","copulate","copulative","copy", +"copybook","copyboy","copycat","copydesk","copyhold", +"copyist","copyright","copywriter","coquetry","coquette", +"cor","coracle","coral","corbel","cord", +"cordage","cordial","cordiality","cordially","cordillera", +"cordite","cordon","cords","corduroy","core", +"corelate","coreligionist","corer","corespondent","corgi", +"coriander","corinthian","cork","corkage","corked", +"corker","corkscrew","corm","cormorant","corn", +"corncob","corncrake","cornea","cornelian","corner", +"cornerstone","cornet","cornfield","cornflakes","cornflower", +"cornice","cornish","cornucopia","corny","corolla", +"corollary","corona","coronary","coronation","coroner", +"coronet","corpora","corporal","corporate","corporation", +"corporeal","corps","corpse","corpulence","corpulent", +"corpus","corpuscle","corral","correct","correction", 
+"correctitude","corrective","correlate","correlation","correlative", +"correspond","correspondence","correspondent","corresponding","corridor", +"corrie","corrigendum","corroborate","corroboration","corroborative", +"corroboree","corrode","corrosion","corrosive","corrugate", +"corrugation","corrupt","corruption","corsage","corsair", +"corse","corselet","corset","cortex","cortisone", +"corundum","coruscate","corvette","cos","cosh", +"cosignatory","cosine","cosmetic","cosmetician","cosmic", +"cosmogony","cosmology","cosmonaut","cosmopolitan","cosmos", +"cosset","cost","costermonger","costive","costly", +"costs","costume","costumier","cosy","cot", +"cotangent","cote","coterie","coterminous","cotillion", +"cottage","cottager","cottar","cotter","cotton", +"cottonseed","cottontail","cotyledon","couch","couchant", +"couchette","cougar","cough","could","couldst", +"coulter","council","councillor","counsel","counsellor", +"counselor","count","countable","countdown","countenance", +"counter","counteract","counterattack","counterattraction","counterbalance", +"counterblast","counterclaim","counterclockwise","counterespionage","counterfeit", +"counterfoil","counterintelligence","counterirritant","countermand","countermarch", +"countermeasure","counteroffensive","counterpane","counterpart","counterpoint", +"counterpoise","countersign","countersink","countertenor","countervail", +"countess","countinghouse","countless","countrified","country", +"countryman","countryside","county","coup","couple", +"couplet","coupling","coupon","courage","courageous", +"courgette","courier","course","courser","coursing", +"court","courteous","courtesan","courtesy","courthouse", +"courtier","courting","courtly","courtroom","courtship", +"courtyard","couscous","cousin","couture","cove", +"coven","covenant","coventry","cover","coverage", +"covering","coverlet","covert","covet","covetous", +"covey","cow","coward","cowardice","cowardly", +"cowbell","cowboy","cowcatcher","cower","cowgirl", 
+"cowhand","cowheel","cowherd","cowhide","cowl", +"cowlick","cowling","cowman","cowpat","cowpox", +"cowrie","cowry","cowshed","cowslip","cox", +"coxcomb","coy","coyote","coypu","cozen", +"cozy","cpa","crab","crabbed","crabby", +"crabgrass","crabwise","crack","crackbrained","crackdown", +"cracked","cracker","crackers","crackle","crackleware", +"crackling","crackpot","cracksman","crackup","cradle", +"craft","craftsman","crafty","crag","craggy", +"crake","cram","crammer","cramp","cramped", +"crampon","cramps","cranberry","crane","cranial", +"cranium","crank","crankshaft","cranky","cranny", +"crap","crape","crappy","craps","crash", +"crashing","crass","crate","crater","cravat", +"crave","craven","craving","crawl","crawler", +"crawlers","crayfish","crayon","craze","crazy", +"creak","creaky","cream","creamer","creamery", +"creamy","crease","create","creation","creative", +"creativity","creator","creature","credence","credentials", +"credibility","credible","credit","creditable","creditor", +"credo","credulous","creed","creek","creel", +"creep","creeper","creepers","creeps","creepy", +"cremate","crematorium","crenelated","crenellated","creole", +"creosote","crept","crepuscular","crescendo","crescent", +"cress","crest","crested","crestfallen","cretaceous", +"cretin","cretonne","crevasse","crevice","crew", +"crewman","crib","cribbage","crick","cricket", +"cricketer","crier","cries","crikey","crime", +"criminal","criminology","crimp","crimplene","crimson", +"cringe","crinkle","crinkly","crinoid","crinoline", +"cripes","cripple","crisis","crisp","crispy", +"crisscross","criterion","critic","critical","criticise", +"criticism","criticize","critique","critter","croak", +"crochet","crock","crockery","crocodile","crocus", +"croft","crofter","croissant","cromlech","crone", +"crony","crook","crooked","croon","crooner", +"crop","cropper","croquet","croquette","crore", +"crosier","cross","crossbar","crossbeam","crossbenches", 
+"crossbones","crossbow","crossbred","crossbreed","crosscheck", +"crosscurrent","crosscut","crossfire","crossing","crossover", +"crosspatch","crosspiece","crossply","crossroad","crossroads", +"crosstree","crosswalk","crosswind","crosswise","crossword", +"crotch","crotchet","crotchety","crouch","croup", +"croupier","crouton","crow","crowbar","crowd", +"crowded","crowfoot","crown","crozier","crucial", +"crucible","crucifix","crucifixion","cruciform","crucify", +"crude","crudity","cruel","cruelty","cruet", +"cruise","cruiser","crumb","crumble","crumbly", +"crummy","crumpet","crumple","crunch","crupper", +"crusade","cruse","crush","crust","crustacean", +"crusty","crutch","crux","cry","crybaby", +"crying","crypt","cryptic","cryptogram","cryptography", +"crystal","crystalline","crystallise","crystallize","cub", +"cubbyhole","cube","cubic","cubical","cubicle", +"cubism","cubit","cubs","cuckold","cuckoldry", +"cuckoo","cucumber","cud","cuddle","cuddlesome", +"cuddly","cudgel","cue","cuff","cuffs", +"cuirass","cuisine","culinary","cull","cullender", +"culminate","culmination","culotte","culottes","culpable", +"culprit","cult","cultivable","cultivate","cultivated", +"cultivation","cultivator","cultural","culture","cultured", +"culvert","cumber","cumbersome","cumin","cummerbund", +"cumulative","cumulonimbus","cumulus","cuneiform","cunnilingus", +"cunning","cunt","cup","cupbearer","cupboard", +"cupid","cupidity","cupola","cuppa","cupping", +"cupric","cur","curable","curacy","curate", +"curative","curator","curb","curd","curdle", +"cure","curettage","curfew","curia","curio", +"curiosity","curious","curl","curler","curlew", +"curlicue","curling","curly","curlycue","curmudgeon", +"currant","currency","current","curriculum","currish", +"curry","curse","cursed","cursive","cursory", +"curt","curtail","curtain","curtains","curtsey", +"curtsy","curvaceous","curvacious","curvature","curve", +"cushion","cushy","cusp","cuspidor","cuss", 
+"cussed","custard","custodial","custodian","custody", +"custom","customary","customer","customs","cut", +"cutaway","cutback","cuticle","cutlass","cutler", +"cutlery","cutlet","cutoff","cutout","cutpurse", +"cutter","cutthroat","cutting","cuttlefish","cutworm", +"cwm","cwt","cyanide","cybernetics","cyclamate", +"cyclamen","cycle","cyclic","cyclist","cyclone", +"cyclopaedia","cyclopedia","cyclostyle","cyclotron","cyder", +"cygnet","cylinder","cymbal","cynic","cynical", +"cynicism","cynosure","cypher","cypress","cyrillic", +"cyst","cystitis","cytology","czar","czarina", +"czech","dab","dabble","dabchick","dabs", +"dace","dachshund","dactyl","dad","daddy", +"dado","daemon","daffodil","daft","dagger", +"dago","daguerreotype","dahlia","daily","dainty", +"daiquiri","dairy","dairying","dairymaid","dairyman", +"dais","daisy","dale","dalliance","dally", +"dalmation","dam","damage","damages","damascene", +"damask","damn","damnable","damnation","damnedest", +"damning","damocles","damp","dampen","damper", +"dampish","damsel","damson","dance","dandelion", +"dander","dandified","dandle","dandruff","dandy", +"danger","dangerous","dangle","dank","dapper", +"dappled","dare","daredevil","daresay","daring", +"dark","darken","darkey","darkroom","darky", +"darling","darn","darning","dart","dartboard", +"dartmoor","darts","dash","dashboard","dashed", +"dashing","data","date","dated","dateless", +"dateline","dates","dative","daub","daughter", +"daunt","dauntless","dauphin","davit","dawdle", +"dawn","day","dayboy","daybreak","daydream", +"daylight","dayroom","days","daytime","daze", +"dazzle","ddt","deacon","dead","deaden", +"deadline","deadlock","deadly","deadpan","deadweight", +"deaf","deafen","deal","dealer","dealing", +"dealings","dean","deanery","dear","dearest", +"dearie","dearly","dearth","deary","death", +"deathbed","deathblow","deathless","deathlike","deathly", +"deathwatch","deb","debar","debark","debase", +"debatable","debate","debater","debauch","debauchee", 
+"debauchery","debenture","debilitate","debility","debit", +"debonair","debone","debouch","debrief","debris", +"debt","debtor","debug","debunk","debut", +"debutante","decade","decadence","decadent","decalogue", +"decamp","decant","decanter","decapitate","decathlon", +"decay","decease","deceased","deceit","deceitful", +"deceive","decelerate","december","decencies","decency", +"decent","decentralise","decentralize","deception","deceptive", +"decibel","decide","decided","decidedly","deciduous", +"decimal","decimalise","decimalize","decimate","decipher", +"decision","decisive","deck","deckchair","deckhand", +"declaim","declamation","declaration","declare","declared", +"declassify","declension","declination","decline","declivity", +"declutch","decoction","decode","decolonise","decolonize", +"decompose","decompress","decongestant","decontaminate","decontrol", +"decorate","decoration","decorative","decorator","decorous", +"decorum","decoy","decrease","decree","decrepit", +"decrepitude","decry","dedicate","dedicated","dedication", +"deduce","deduct","deduction","deductive","deed", +"deem","deep","deepen","deer","deerstalker", +"def","deface","defame","default","defeat", +"defeatism","defecate","defect","defection","defective", +"defence","defend","defendant","defense","defensible", +"defensive","defer","deference","defiance","defiant", +"deficiency","deficient","deficit","defile","define", +"definite","definitely","definition","definitive","deflate", +"deflation","deflationary","deflect","deflection","deflower", +"defoliant","defoliate","deforest","deform","deformation", +"deformity","defraud","defray","defrock","defrost", +"deft","defunct","defuse","defy","degauss", +"degeneracy","degenerate","degeneration","degenerative","degrade", +"degree","dehorn","dehumanise","dehumanize","dehydrate", +"deice","deification","deify","deign","deism", +"deity","dejected","dejection","dekko","delay", +"delectable","delectation","delegacy","delegate","delegation", 
+"delete","deleterious","deletion","delft","deliberate", +"deliberation","deliberative","delicacy","delicate","delicatessen", +"delicious","delight","delightful","delimit","delineate", +"delinquency","delinquent","deliquescent","delirious","delirium", +"deliver","deliverance","delivery","deliveryman","dell", +"delouse","delphic","delphinium","delta","delude", +"deluge","delusion","delusive","delve","demagnetise", +"demagnetize","demagogic","demagogue","demagoguery","demand", +"demanding","demarcate","demarcation","demean","demeanor", +"demeanour","demented","demerit","demesne","demigod", +"demijohn","demilitarise","demilitarize","demise","demist", +"demister","demo","demob","demobilise","demobilize", +"democracy","democrat","democratic","democratise","democratize", +"demography","demolish","demolition","demon","demonetise", +"demonetize","demoniacal","demonic","demonstrable","demonstrate", +"demonstration","demonstrative","demonstrator","demoralise","demoralize", +"demote","demotic","demur","demure","demystify", +"den","denationalise","denationalize","denial","denier", +"denigrate","denim","denims","denizen","denominate", +"denomination","denominational","denominator","denotation","denote", +"denouement","denounce","dense","density","dent", +"dental","dentifrice","dentist","dentistry","denture", +"dentures","denude","denunciation","deny","deodorant", +"deodorise","deodorize","depart","departed","department", +"departure","depend","dependable","dependant","dependence", +"dependency","dependent","depict","depilatory","deplete", +"deplorable","deplore","deploy","deponent","depopulate", +"deport","deportee","deportment","depose","deposit", +"deposition","depositor","depository","depot","deprave", +"depravity","deprecate","deprecatory","depreciate","depreciatory", +"depredation","depress","depressed","depression","deprivation", +"deprive","deprived","depth","depths","deputation", +"depute","deputise","deputize","deputy","derail", 
+"derange","derby","derelict","dereliction","deride", +"derision","derisive","derisory","derivative","derive", +"dermatitis","dermatology","derogate","derogatory","derrick", +"derv","dervish","des","desalinise","desalinize", +"descale","descant","descend","descendant","descended", +"descent","describe","description","descriptive","descry", +"desecrate","desegregate","desensitise","desensitize","desert", +"deserter","desertion","deserts","deserve","deservedly", +"deserving","desiccant","desiccate","desideratum","design", +"designate","designation","designedly","designer","designing", +"designs","desirable","desire","desirous","desist", +"desk","deskwork","desolate","despair","despairing", +"despatch","despatches","desperado","desperate","desperation", +"despicable","despise","despite","despoil","despondent", +"despot","despotic","despotism","dessert","dessertspoon", +"dessertspoonful","destination","destined","destiny","destitute", +"destroy","destroyer","destruction","destructive","desuetude", +"desultory","detach","detached","detachedly","detachment", +"detail","detailed","detain","detainee","detect", +"detection","detective","detector","detention","deter", +"detergent","deteriorate","determinant","determination","determine", +"determined","determiner","determinism","deterrent","detest", +"dethrone","detonate","detonation","detonator","detour", +"detract","detractor","detrain","detriment","detritus", +"deuce","deuced","deuteronomy","devaluation","devalue", +"devastate","devastating","develop","developer","development", +"developmental","deviance","deviant","deviate","deviation", +"deviationist","device","devil","devilish","devilishly", +"devilment","devious","devise","devitalise","devitalize", +"devoid","devolution","devolve","devote","devoted", +"devotee","devotion","devotional","devotions","devour", +"devout","devoutly","dew","dewdrop","dewlap", +"dewpond","dewy","dexterity","dexterous","dextrose", +"dhoti","dhow","diabetes","diabetic","diabolic", 
+"diabolical","diacritic","diacritical","diadem","diaeresis", +"diagnose","diagnosis","diagnostic","diagonal","diagram", +"dial","dialect","dialectic","dialectician","dialog", +"dialogue","diameter","diametrically","diamond","diaper", +"diaphanous","diaphragm","diarist","diarrhea","diarrhoea", +"diary","diaspora","diatom","diatribe","dibble", +"dice","dicey","dichotomy","dick","dicker", +"dickie","dicky","dickybird","dictaphone","dictate", +"dictation","dictator","dictatorial","dictatorship","diction", +"dictionary","dictum","did","didactic","diddle", +"didst","die","diehard","dieresis","diet", +"dietary","dietetic","dietetics","dietician","dietitian", +"differ","difference","different","differential","differentiate", +"difficult","difficulty","diffident","diffract","diffuse", +"diffusion","dig","digest","digestion","digestive", +"digger","digging","diggings","digit","digital", +"dignified","dignify","dignitary","dignity","digraph", +"digress","digression","digs","dike","dilapidated", +"dilapidation","dilapidations","dilate","dilatory","dildo", +"dilemma","dilettante","diligence","diligent","dill", +"dillydally","dilute","dilution","dim","dimension", +"dimensions","diminish","diminuendo","diminution","diminutive", +"dimity","dimple","dimwit","din","dinar", +"dine","diner","dingdong","dinghy","dingle", +"dingo","dingy","dink","dinkum","dinky", +"dinner","dinosaur","dint","diocese","dioxide", +"dip","diphtheria","diphthong","diploma","diplomacy", +"diplomat","diplomatic","diplomatically","diplomatist","dipper", +"dipsomania","dipsomaniac","dipstick","dipswitch","diptych", +"dire","direct","direction","directional","directions", +"directive","directly","director","directorate","directorship", +"directory","direful","dirge","dirigible","dirk", +"dirndl","dirt","dirty","disability","disable", +"disabled","disabuse","disadvantage","disadvantageous","disaffected", +"disaffection","disaffiliate","disafforest","disagree","disagreeable", 
+"disagreement","disallow","disappear","disappearance","disappoint", +"disappointed","disappointing","disappointment","disapprobation","disapproval", +"disapprove","disarm","disarmament","disarrange","disarray", +"disassociate","disaster","disastrous","disavow","disband", +"disbar","disbelief","disbelieve","disburden","disburse", +"disbursement","disc","discard","discern","discerning", +"discernment","discharge","disciple","discipleship","disciplinarian", +"disciplinary","discipline","disclaim","disclaimer","disclose", +"disclosure","disco","discolor","discoloration","discolour", +"discolouration","discomfit","discomfiture","discomfort","discommode", +"discompose","disconcert","disconnect","disconnected","disconnection", +"disconsolate","discontent","discontented","discontinue","discontinuity", +"discontinuous","discord","discordance","discordant","discotheque", +"discount","discountenance","discourage","discouragement","discourse", +"discourteous","discourtesy","discover","discovery","discredit", +"discreditable","discreet","discrepancy","discrete","discretion", +"discretionary","discriminate","discriminating","discrimination","discriminatory", +"discursive","discus","discuss","discussion","disdain", +"disdainful","disease","disembark","disembarrass","disembodied", +"disembowel","disembroil","disenchant","disencumber","disendow", +"disengage","disengaged","disentangle","disequilibrium","disestablish", +"disfavor","disfavour","disfigure","disforest","disfranchise", +"disfrock","disgorge","disgrace","disgraceful","disgruntled", +"disguise","disgust","dish","dishabille","disharmony", +"dishcloth","dishearten","dishes","dishevelled","dishful", +"dishonest","dishonesty","dishonor","dishonorable","dishonour", +"dishonourable","dishwasher","dishwater","dishy","disillusion", +"disillusioned","disillusionment","disincentive","disinclination","disinclined", +"disinfect","disinfectant","disinfest","disingenuous","disinherit", 
+"disintegrate","disinter","disinterested","disjoint","disjointed", +"disjunctive","disk","dislike","dislocate","dislocation", +"dislodge","disloyal","dismal","dismantle","dismast", +"dismay","dismember","dismiss","dismissal","dismount", +"disobedient","disobey","disoblige","disorder","disorderly", +"disorganise","disorganize","disorientate","disown","disparage", +"disparate","disparity","dispassionate","dispatch","dispatches", +"dispel","dispensable","dispensary","dispensation","dispense", +"dispenser","dispersal","disperse","dispersion","dispirit", +"displace","displacement","display","displease","displeasure", +"disport","disposable","disposal","dispose","disposed", +"disposition","dispossess","dispossessed","disproof","disproportion", +"disproportionate","disprove","disputable","disputant","disputation", +"disputatious","dispute","disqualification","disqualify","disquiet", +"disquietude","disquisition","disregard","disrelish","disremember", +"disrepair","disreputable","disrepute","disrespect","disrobe", +"disrupt","dissatisfaction","dissatisfy","dissect","dissection", +"dissemble","disseminate","dissension","dissent","dissenter", +"dissenting","dissertation","disservice","dissever","dissident", +"dissimilar","dissimilarity","dissimulate","dissipate","dissipated", +"dissipation","dissociate","dissoluble","dissolute","dissolution", +"dissolve","dissonance","dissonant","dissuade","distaff", +"distal","distance","distant","distantly","distaste", +}; +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java new file mode 100644 index 00000000000..4c72ef87526 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java @@ -0,0 +1,715 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/* + * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. + */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData3 { + private KStemData3() { + } + static String[] data = { +"distasteful","distemper","distempered","distend","distension", +"distil","distill","distillation","distiller","distillery", +"distinct","distinction","distinctive","distinguish","distinguishable", +"distinguished","distort","distortion","distract","distracted", +"distraction","distrain","distraint","distrait","distraught", +"distress","distressing","distribute","distribution","distributive", +"distributor","district","distrust","distrustful","disturb", +"disturbance","disturbed","disunion","disunite","disunity", +"disuse","disused","disyllabic","disyllable","ditch", +"dither","dithers","ditto","ditty","diuretic", +"diurnal","divagate","divan","dive","diver", +"diverge","divergence","divers","diverse","diversify", +"diversion","diversionary","diversity","divert","divertimento", +"divertissement","divest","divide","dividend","dividers", +"divination","divine","diviner","divingboard","divinity", +"divisible","division","divisive","divisor","divorce", +"divot","divulge","divvy","dixie","dixieland", +"dizzy","djinn","dna","do","dobbin", +"doc","docile","dock","docker","docket", +"dockyard","doctor","doctoral","doctorate","doctrinaire", +"doctrinal","doctrine","document","documentary","documentation", +"dodder","doddering","doddle","dodge","dodgems", +"dodger","dodgy","dodo","doe","doer", +"doeskin","doff","dog","dogcart","dogcatcher", +"dogfight","dogfish","dogged","doggerel","doggie", +"doggo","doggone","doggy","doghouse","dogie", +"dogleg","dogma","dogmatic","dogmatics","dogmatism", +"dogs","dogsbody","dogtooth","dogtrot","dogwood", +"doh","doily","doings","doldrums","dole", 
+"doleful","doll","dollar","dollop","dolly", +"dolmen","dolor","dolorous","dolour","dolphin", +"dolt","domain","dome","domed","domestic", +"domesticate","domesticity","domicile","domiciliary","dominance", +"dominant","dominate","domination","domineer","dominican", +"dominion","domino","dominoes","don","donate", +"donation","donjon","donkey","donkeywork","donnish", +"donor","doodle","doodlebug","doom","doomsday", +"door","doorbell","doorframe","doorkeeper","doorknob", +"doorknocker","doorman","doormat","doornail","doorplate", +"doorscraper","doorstep","doorstopper","doorway","dope", +"dopey","dopy","doric","dormant","dormer", +"dormitory","dormouse","dorsal","dory","dosage", +"dose","doss","dosser","dosshouse","dossier", +"dost","dot","dotage","dote","doth", +"doting","dottle","dotty","double","doubles", +"doublet","doublethink","doubloon","doubly","doubt", +"doubtful","doubtless","douche","dough","doughnut", +"doughty","doughy","dour","douse","dove", +"dovecote","dovetail","dowager","dowdy","dowel", +"dower","down","downbeat","downcast","downdraft", +"downdraught","downer","downfall","downgrade","downhearted", +"downhill","downpour","downright","downstage","downstairs", +"downstream","downtown","downtrodden","downward","downwards", +"downwind","downy","dowry","dowse","doxology", +"doyen","doyley","doze","dozen","dozy", +"dpt","drab","drabs","drachm","drachma", +"draconian","draft","draftee","draftsman","drafty", +"drag","draggled","draggy","dragnet","dragoman", +"dragon","dragonfly","dragoon","drain","drainage", +"drainpipe","drake","dram","drama","dramatic", +"dramatics","dramatise","dramatist","dramatize","drank", +"drape","draper","drapery","drastic","drat", +"draught","draughtboard","draughts","draughtsman","draughty", +"draw","drawback","drawbridge","drawer","drawers", +"drawing","drawl","drawn","drawstring","dray", +"dread","dreadful","dreadfully","dreadnaught","dreadnought", +"dream","dreamboat","dreamer","dreamland","dreamless", 
+"dreamlike","dreamy","drear","dreary","dredge", +"dredger","dregs","drench","dress","dressage", +"dresser","dressing","dressmaker","dressy","drew", +"dribble","driblet","dribs","drier","drift", +"driftage","drifter","driftnet","driftwood","drill", +"drily","drink","drinkable","drinker","drip", +"dripping","drive","drivel","driver","driveway", +"driving","drizzle","drogue","droll","drollery", +"dromedary","drone","drool","droop","drop", +"dropkick","droplet","dropout","dropper","droppings", +"drops","dropsy","dross","drought","drove", +"drover","drown","drowse","drowsy","drub", +"drudge","drudgery","drug","drugget","druggist", +"drugstore","druid","drum","drumbeat","drumfire", +"drumhead","drummer","drumstick","drunk","drunkard", +"drunken","drupe","dry","dryad","dryer", +"dual","dub","dubbin","dubiety","dubious", +"ducal","ducat","duchess","duchy","duck", +"duckboards","duckling","ducks","duckweed","ducky", +"duct","ductile","dud","dude","dudgeon", +"duds","due","duel","duenna","dues", +"duet","duff","duffel","duffer","duffle", +"dug","dugout","duke","dukedom","dukes", +"dulcet","dulcimer","dull","dullard","duly", +"dumb","dumbbell","dumbfound","dumbwaiter","dumfound", +"dummy","dump","dumper","dumpling","dumps", +"dumpy","dun","dunce","dunderhead","dung", +"dungaree","dungarees","dungeon","dunghill","dunk", +"duo","duodecimal","duodenum","duologue","dupe", +"duplex","duplicate","duplicator","duplicity","durable", +"duration","durbar","duress","durex","during", +"durst","dusk","dusky","dust","dustbin", +"dustbowl","dustcart","dustcoat","duster","dustman", +"dustpan","dustsheet","dustup","dusty","dutch", +"dutiable","dutiful","duty","duvet","dwarf", +"dwell","dwelling","dwindle","dyarchy","dye", +"dyestuff","dyeworks","dyke","dynamic","dynamics", +"dynamism","dynamite","dynamo","dynasty","dysentery", +"dyslexia","dyspepsia","dyspeptic","each","eager", +"eagle","eaglet","ear","earache","eardrum", +"eared","earful","earl","earliest","earlobe", 
+"early","earmark","earmuff","earn","earnest", +"earnings","earphone","earpiece","earplug","earring", +"earshot","earth","earthbound","earthen","earthenware", +"earthling","earthly","earthnut","earthquake","earthshaking", +"earthwork","earthworm","earthy","earwax","earwig", +"ease","easel","easily","east","eastbound", +"easter","easterly","eastern","easterner","easternmost", +"easy","easygoing","eat","eatable","eatables", +"eater","eats","eaves","eavesdrop","ebb", +"ebony","ebullience","ebullient","eccentric","eccentricity", +"ecclesiastic","ecclesiastical","ecg","echelon","echo", +"eclectic","eclipse","ecliptic","eclogue","ecological", +"ecologically","ecology","economic","economical","economically", +"economics","economise","economist","economize","economy", +"ecosystem","ecstasy","ecstatic","ect","ectoplasm", +"ecumenical","ecumenicalism","eczema","edam","eddy", +"edelweiss","eden","edge","edgeways","edging", +"edgy","edible","edibles","edict","edification", +"edifice","edify","edit","edition","editor", +"editorial","editorialise","editorialize","educate","educated", +"education","educational","educationist","educator","educe", +"eec","eeg","eel","eerie","efface", +"effect","effective","effectively","effectiveness","effectives", +"effects","effectual","effectually","effectuate","effeminacy", +"effeminate","effendi","effervesce","effete","efficacious", +"efficacy","efficiency","efficient","effigy","efflorescence", +"effluent","efflux","effort","effortless","effrontery", +"effulgence","effulgent","effusion","effusive","eft", +"egalitarian","egg","eggcup","egghead","eggnog", +"eggplant","eggshell","egis","eglantine","ego", +"egocentric","egoism","egoist","egotism","egotist", +"egregious","egress","egret","eiderdown","eight", +"eighteen","eightsome","eighty","eisteddfod","either", +"ejaculate","ejaculation","eject","ejector","eke", +"ekg","elaborate","elaboration","eland","elapse", +"elastic","elasticity","elastoplast","elate","elated", 
+"elation","elbow","elbowroom","elder","elderberry", +"elderflower","elderly","eldest","elect","election", +"electioneer","electioneering","elective","elector","electoral", +"electorate","electric","electrical","electrician","electricity", +"electrify","electrocardiogram","electrocardiograph","electrocute","electrode", +"electroencephalogram","electroencephalograph","electrolysis","electrolyte","electron", +"electronic","electronics","electroplate","eleemosynary","elegant", +"elegiac","elegy","element","elemental","elementary", +"elements","elephant","elephantiasis","elephantine","elevate", +"elevated","elevation","elevator","eleven","elevenses", +"elf","elfin","elfish","elicit","elide", +"eligible","eliminate","elite","elitism","elixir", +"elizabethan","elk","elkhound","ellipse","ellipsis", +"elliptic","elm","elocution","elocutionary","elocutionist", +"elongate","elongation","elope","eloquence","eloquent", +"else","elsewhere","elucidate","elucidatory","elude", +"elusive","elver","elves","elvish","elysian", +"elysium","emaciate","emanate","emancipate","emancipation", +"emasculate","embalm","embankment","embargo","embark", +"embarkation","embarrass","embarrassment","embassy","embattled", +"embed","embellish","ember","embezzle","embitter", +"emblazon","emblem","emblematic","embodiment","embody", +"embolden","embolism","embonpoint","embosomed","emboss", +"embowered","embrace","embrasure","embrocation","embroider", +"embroidery","embroil","embryo","embryonic","emend", +"emendation","emerald","emerge","emergence","emergency", +"emergent","emeritus","emery","emetic","emigrant", +"emigrate","eminence","eminent","eminently","emir", +"emirate","emissary","emission","emit","emmentaler", +"emmenthaler","emollient","emolument","emote","emotion", +"emotional","emotionalism","emotionally","emotive","empanel", +"empathy","emperor","emphasis","emphasise","emphasize", +"emphatic","emphatically","emphysema","empire","empirical", 
+"empiricism","emplacement","emplane","employ","employable", +"employee","employer","employment","emporium","empower", +"empress","emptily","empty","empurpled","empyreal", +"empyrean","emu","emulate","emulation","emulsify", +"emulsion","enable","enabling","enact","enactment", +"enamel","enamelware","enamored","enamoured","encamp", +"encampment","encapsulate","encase","encaustic","encephalitis", +"enchain","enchant","enchanter","enchanting","enchantment", +"encipher","encircle","enclave","enclose","enclosure", +"encode","encomium","encompass","encore","encounter", +"encourage","encouragement","encroach","encroachment","encrust", +"encumber","encumbrance","encyclical","encyclopaedia","encyclopaedic", +"encyclopedia","encyclopedic","end","endanger","endear", +"endearing","endearment","endeavor","endeavour","endemic", +"ending","endive","endless","endocrine","endorse", +"endow","endowment","endpaper","endurance","endure", +"enduring","endways","enema","enemy","energetic", +"energize","energy","enervate","enfeeble","enfilade", +"enfold","enforce","enfranchise","engage","engaged", +"engagement","engaging","engender","engine","engineer", +"engineering","english","englishman","engraft","engrave", +"engraving","engross","engrossing","engulf","enhance", +"enigma","enigmatic","enjoin","enjoy","enjoyable", +"enjoyment","enkindle","enlarge","enlargement","enlighten", +"enlightened","enlightenment","enlist","enliven","enmesh", +"enmity","ennoble","ennui","enormity","enormous", +"enormously","enough","enplane","enquire","enquiring", +"enquiry","enrage","enrapture","enrich","enrol", +"enroll","enrollment","enrolment","ensanguined","ensconce", +"ensemble","enshrine","enshroud","ensign","enslave", +"ensnare","ensue","ensure","entail","entangle", +"entanglement","entente","enter","enteritis","enterprise", +"enterprising","entertain","entertainer","entertaining","entertainment", +"enthral","enthrall","enthrone","enthroned","enthuse", 
+"enthusiasm","enthusiast","entice","enticement","entire", +"entirety","entitle","entity","entomb","entomology", +"entourage","entrails","entrain","entrance","entrant", +"entrap","entreat","entreaty","entrench","entrenched", +"entrenchment","entrepreneur","entresol","entropy","entrust", +"entry","entwine","enumerate","enunciate","enunciation", +"envelop","envenom","enviable","envious","environed", +"environment","environmental","environmentalist","environs","envisage", +"envoi","envoy","envy","enzyme","eon", +"epaulet","epaulette","ephemeral","epic","epicenter", +"epicentre","epicure","epicurean","epidemic","epidermis", +"epidiascope","epiglottis","epigram","epigrammatic","epilepsy", +"epileptic","epilogue","epiphany","episcopacy","episcopal", +"episcopalian","episode","episodic","epistle","epistolary", +"epitaph","epithet","epitome","epitomise","epitomize", +"epoch","eponymous","equability","equable","equal", +"equalise","equalitarian","equality","equalize","equally", +"equanimity","equate","equation","equator","equatorial", +"equerry","equestrian","equidistant","equilateral","equilibrium", +"equine","equinoctial","equinox","equip","equipage", +"equipment","equipoise","equitable","equitation","equities", +"equity","equivalence","equivalent","equivocal","equivocate", +"equivocation","era","eradicate","eradicator","erase", +"eraser","erasure","ere","erect","erectile", +"erection","eremite","erg","ergo","ergonomics", +"ermine","erode","erogenous","erosion","erotic", +"erotica","eroticism","err","errand","errant", +"erratic","erratum","erroneous","error","ersatz", +"erse","eructation","erudite","erupt","eruption", +"erysipelas","escalate","escalator","escalope","escapade", +"escape","escapee","escapement","escapism","escapology", +"escarpment","eschatology","eschew","escort","escritoire", +"escutcheon","eskimo","esophagus","esoteric","esp", +"espalier","especial","especially","esperanto","espionage", +"esplanade","espousal","espouse","espresso","espy", 
+"essay","essence","essential","essentially","establish", +"establishment","estaminet","estate","esteem","esthete", +"esthetic","esthetics","estimable","estimate","estimation", +"estimator","estrange","estrangement","estrogen","estuary", +"etch","etching","eternal","eternity","ether", +"ethereal","ethic","ethical","ethically","ethics", +"ethnic","ethnically","ethnographer","ethnography","ethnologist", +"ethnology","ethos","ethyl","etiolate","etiology", +"etiquette","etymologist","etymology","eucalyptus","eucharist", +"euclidean","euclidian","eugenic","eugenics","eulogise", +"eulogist","eulogistic","eulogize","eulogy","eunuch", +"euphemism","euphemistic","euphonious","euphonium","euphony", +"euphoria","euphuism","eurasian","eureka","eurhythmic", +"eurhythmics","eurocrat","eurodollar","eurythmic","eurythmics", +"euthanasia","evacuate","evacuee","evade","evaluate", +"evanescent","evangelic","evangelical","evangelise","evangelist", +"evangelize","evaporate","evasion","evasive","eve", +"even","evening","evenings","evens","evensong", +"event","eventful","eventide","eventual","eventuality", +"eventually","eventuate","ever","evergreen","everlasting", +"everlastingly","evermore","every","everybody","everyday", +"everything","everywhere","evict","evidence","evident", +"evidently","evil","evildoer","evince","eviscerate", +"evocative","evoke","evolution","evolutionary","evolve", +"ewe","ewer","exacerbate","exact","exacting", +"exaction","exactly","exaggerate","exaggeration","exalt", +"exaltation","exalted","exam","examination","examine", +"example","exasperate","exasperation","excavate","excavation", +"excavator","exceed","exceedingly","excel","excellence", +"excellency","excellent","excelsior","except","excepted", +"excepting","exception","exceptionable","exceptional","excerpt", +"excess","excesses","excessive","exchange","exchequer", +"excise","excision","excitable","excite","excited", +"excitement","exciting","exclaim","exclamation","exclamatory", 
+"exclude","excluding","exclusion","exclusive","exclusively", +"excogitate","excommunicate","excommunication","excoriate","excrement", +"excrescence","excreta","excrete","excretion","excruciating", +"exculpate","excursion","excursionist","excusable","excuse", +"execrable","execrate","executant","execute","execution", +"executioner","executive","executor","exegesis","exemplary", +"exemplification","exemplify","exempt","exemption","exercise", +"exercises","exert","exertion","exeunt","exhalation", +"exhale","exhaust","exhaustion","exhaustive","exhibit", +"exhibition","exhibitionism","exhibitor","exhilarate","exhilarating", +"exhort","exhortation","exhume","exigency","exigent", +"exiguous","exile","exist","existence","existent", +"existential","existentialism","existing","exit","exodus", +"exogamy","exonerate","exorbitant","exorcise","exorcism", +"exorcist","exorcize","exotic","expand","expanse", +"expansion","expansive","expatiate","expatriate","expect", +"expectancy","expectant","expectation","expectations","expectorate", +"expediency","expedient","expedite","expedition","expeditionary", +"expeditious","expel","expend","expendable","expenditure", +"expense","expenses","expensive","experience","experienced", +"experiment","experimental","experimentation","expert","expertise", +"expiate","expiration","expire","explain","explanation", +"explanatory","expletive","explicable","explicate","explicit", +"explode","exploded","exploit","exploration","exploratory", +"explore","explosion","explosive","expo","exponent", +"exponential","export","exportation","exporter","expose", +"exposition","expostulate","exposure","expound","express", +"expression","expressionism","expressionless","expressive","expressly", +"expressway","expropriate","expulsion","expunge","expurgate", +"exquisite","extant","extemporaneous","extempore","extemporise", +"extemporize","extend","extension","extensive","extent", +"extenuate","extenuation","exterior","exteriorise","exteriorize", 
+"exterminate","external","externalise","externalize","externally", +"externals","exterritorial","extinct","extinction","extinguish", +"extinguisher","extirpate","extol","extort","extortion", +"extortionate","extortions","extra","extract","extraction", +"extracurricular","extraditable","extradite","extrajudicial","extramarital", +"extramural","extraneous","extraordinarily","extraordinary","extrapolate", +"extraterrestrial","extraterritorial","extravagance","extravagant","extravaganza", +"extravert","extreme","extremely","extremism","extremities", +"extremity","extricate","extrinsic","extrovert","extrude", +"exuberance","exuberant","exude","exult","exultant", +"exultation","eye","eyeball","eyebrow","eyecup", +"eyeful","eyeglass","eyeglasses","eyelash","eyelet", +"eyelid","eyeliner","eyepiece","eyes","eyeshot", +"eyesight","eyesore","eyestrain","eyetooth","eyewash", +"eyewitness","eyot","eyrie","eyry","fabian", +"fable","fabled","fabric","fabricate","fabrication", +"fabulous","fabulously","face","facecloth","faceless", +"facet","facetious","facial","facile","facilitate", +"facilities","facility","facing","facings","facsimile", +"fact","faction","factious","factitious","factor", +"factorial","factorise","factorize","factory","factotum", +"factual","faculty","fad","fade","faeces", +"faerie","faery","fag","fagged","faggot", +"fagot","fahrenheit","faience","fail","failing", +"failure","fain","faint","fair","fairground", +"fairly","fairway","fairy","fairyland","faith", +"faithful","faithfully","faithless","fake","fakir", +"falcon","falconer","falconry","fall","fallacious", +"fallacy","fallen","fallible","fallout","fallow", +"falls","false","falsehood","falsetto","falsies", +"falsify","falsity","falter","fame","famed", +"familial","familiar","familiarise","familiarity","familiarize", +"familiarly","family","famine","famish","famished", +"famous","famously","fan","fanatic","fanaticism", +"fancier","fancies","fanciful","fancy","fancywork", 
+"fandango","fanfare","fang","fanlight","fanny", +"fantasia","fantastic","fantasy","far","faraway", +"farce","fare","farewell","farfetched","farinaceous", +"farm","farmer","farmhand","farmhouse","farming", +"farmyard","farrago","farrier","farrow","farsighted", +"fart","farther","farthest","farthing","fascia", +"fascinate","fascinating","fascination","fascism","fascist", +"fashion","fashionable","fast","fasten","fastener", +"fastening","fastidious","fastness","fat","fatal", +"fatalism","fatalist","fatality","fatally","fate", +"fated","fateful","fates","fathead","father", +"fatherhood","fatherly","fathom","fathomless","fatigue", +"fatigues","fatless","fatted","fatten","fatty", +"fatuity","fatuous","faucet","fault","faultfinding", +"faultless","faulty","faun","fauna","favor", +"favorable","favored","favorite","favoritism","favour", +"favourable","favoured","favourite","favouritism","favours", +"fawn","fay","faze","fbi","fealty", +"fear","fearful","fearless","fearsome","feasible", +"feast","feat","feather","featherbed","featherbrained", +"featherweight","feathery","feature","featureless","features", +"febrile","february","feces","feckless","fecund", +"fed","federal","federalism","federalist","federate", +"federation","fee","feeble","feebleminded","feed", +"feedback","feedbag","feeder","feel","feeler", +"feeling","feelings","feet","feign","feint", +"feldspar","felicitate","felicitous","felicity","feline", +"fell","fellah","fellatio","fellow","fellowship", +"felon","felony","felspar","felt","felucca", +"fem","female","feminine","femininity","feminism", +"feminist","femur","fen","fence","fencer", +"fencing","fend","fender","fennel","feoff", +"feral","ferment","fermentation","fern","ferocious", +"ferocity","ferret","ferroconcrete","ferrous","ferrule", +"ferry","ferryboat","ferryman","fertile","fertilise", +"fertility","fertilize","fertilizer","ferule","fervent", +"fervid","fervor","fervour","festal","fester", +"festival","festive","festivity","festoon","fetal", 
+"fetch","fetching","fete","fetid","fetish", +"fetishism","fetishist","fetlock","fetter","fettle", +"fetus","feud","feudal","feudalism","feudatory", +"fever","fevered","feverish","feverishly","few", +"fey","fez","fiasco","fiat","fib", +"fiber","fiberboard","fiberglass","fibre","fibreboard", +"fibreglass","fibrositis","fibrous","fibula","fichu", +"fickle","fiction","fictional","fictionalisation","fictionalization", +"fictitious","fiddle","fiddler","fiddlesticks","fiddling", +"fidelity","fidget","fidgets","fidgety","fie", +"fief","field","fielder","fieldwork","fiend", +"fiendish","fiendishly","fierce","fiery","fiesta", +"fife","fifteen","fifth","fifty","fig", +"fight","fighter","figment","figurative","figure", +"figured","figurehead","figures","figurine","filament", +"filbert","filch","file","filet","filial", +"filibuster","filigree","filings","fill","filler", +"fillet","filling","fillip","filly","film", +"filmable","filmstrip","filmy","filter","filth", +"filthy","fin","finable","final","finale", +"finalise","finalist","finality","finalize","finally", +"finance","finances","financial","financially","financier", +"finch","find","finder","finding","fine", +"fineable","finely","finery","finesse","finger", +"fingerboard","fingering","fingernail","fingerplate","fingerpost", +"fingerprint","fingerstall","fingertip","finicky","finis", +"finish","finished","finite","fink","fiord", +"fir","fire","firearm","fireball","firebomb", +"firebox","firebrand","firebreak","firebrick","firebug", +"fireclay","firecracker","firedamp","firedog","firefly", +"fireguard","firelight","firelighter","fireman","fireplace", +"firepower","fireproof","fireside","firestorm","firetrap", +"firewalking","firewatcher","firewater","firewood","firework", +"fireworks","firkin","firm","firmament","first", +"firstborn","firstfruits","firsthand","firstly","firth", +"firtree","fiscal","fish","fishcake","fisherman", +"fishery","fishing","fishmonger","fishplate","fishwife", 
+"fishy","fissile","fission","fissionable","fissure", +"fist","fisticuffs","fistula","fit","fitful", +"fitment","fitness","fitted","fitter","fitting", +"five","fiver","fives","fix","fixation", +"fixative","fixed","fixedly","fixity","fixture", +"fizz","fizzle","fizzy","fjord","flabbergast", +"flabby","flaccid","flag","flagellant","flagellate", +"flageolet","flagon","flagpole","flagrancy","flagrant", +"flagship","flagstaff","flagstone","flail","flair", +"flak","flake","flaky","flambeau","flamboyant", +"flame","flamenco","flaming","flamingo","flammable", +"flan","flange","flank","flannel","flannelette", +"flannels","flap","flapjack","flapper","flare", +"flared","flares","flash","flashback","flashbulb", +"flashcube","flasher","flashgun","flashlight","flashy", +"flask","flat","flatcar","flatfish","flatfoot", +"flatiron","flatlet","flatly","flatten","flatter", +"flattery","flattop","flatulence","flaunt","flautist", +"flavor","flavoring","flavour","flavouring","flaw", +"flawless","flax","flaxen","flay","flea", +"fleabag","fleabite","fleapit","fleck","fledged", +"fledgling","flee","fleece","fleecy","fleet", +"fleeting","flesh","fleshings","fleshly","fleshpot", +"fleshy","flew","flex","flexible","flibbertigibbet", +"flick","flicker","flicks","flier","flies", +"flight","flightless","flighty","flimsy","flinch", +"fling","flint","flintlock","flinty","flip", +"flippancy","flippant","flipper","flipping","flirt", +"flirtation","flirtatious","flit","flitch","flivver", +"float","floatation","floating","flock","floe", +"flog","flogging","flood","floodgate","floodlight", +"floor","floorboard","flooring","floorwalker","floosy", +"floozy","flop","floppy","flora","floral", +"floriculture","florid","florin","florist","floss", +"flotation","flotilla","flounce","flounder","flour", +"flourish","flourmill","floury","flout","flow", +"flower","flowerbed","flowered","flowering","flowerless", +"flowerpot","flowery","flowing","flown","flu", +"fluctuate","flue","fluency","fluent","fluff", 
+"fluffy","fluid","fluidity","fluke","flukey", +"fluky","flume","flummery","flummox","flung", +"flunk","flunkey","flunky","fluorescent","fluoridate", +"fluoride","fluorine","flurry","flush","flushed", +"fluster","flute","fluting","flutist","flutter", +"fluvial","flux","fly","flyaway","flyblown", +"flyby","flycatcher","flyer","flying","flyleaf", +"flyover","flypaper","flypast","flysheet","flyswatter", +"flytrap","flyweight","flywheel","flywhisk","foal", +"foam","fob","focal","focus","fodder", +"foe","foeman","foetal","foetus","fog", +"fogbank","fogbound","fogey","foggy","foghorn", +"fogy","foible","foil","foist","fold", +"foldaway","folder","foliage","folio","folk", +"folklore","folklorist","folks","folksy","folktale", +"folkway","follicle","follow","follower","following", +"folly","foment","fomentation","fond","fondant", +"fondle","fondly","fondu","fondue","font", +"food","foodstuff","fool","foolery","foolhardy", +"foolish","foolproof","foolscap","foot","footage", +"football","footbath","footboard","footbridge","footer", +"footfall","foothill","foothold","footing","footle", +"footlights","footling","footloose","footman","footnote", +"footpad","footpath","footplate","footprint","footrace", +"footsie","footslog","footsore","footstep","footstool", +"footsure","footwear","footwork","fop","foppish", +"for","forage","foray","forbear","forbearance", +"forbearing","forbid","forbidden","forbidding","force", +"forced","forceful","forcemeat","forceps","forces", +"forcible","forcibly","ford","fore","forearm", +"forebode","foreboding","forecast","forecastle","foreclose", +"foreclosure","forecourt","foredoomed","forefather","forefinger", +"forefoot","forefront","forego","foregoing","foreground", +"forehand","forehead","foreign","foreigner","foreknowledge", +"foreland","foreleg","forelock","foreman","foremost", +"forename","forenoon","forensic","foreordain","forepart", +"foreplay","forerunner","foresail","foresee","foreseeable", 
+"foreshadow","foreshore","foreshorten","foresight","foreskin", +"forest","forestall","forester","forestry","foreswear", +"foretaste","foretell","forethought","forever","forewarn", +"forewent","forewoman","foreword","forfeit","forfeiture", +"forgather","forgave","forge","forger","forgery", +"forget","forgetful","forging","forgivable","forgive", +"forgiveable","forgiveness","forgiving","forgo","fork", +"forked","forkful","forklift","forlorn","form", +"formal","formaldehyde","formalin","formalise","formalism", +"formality","formalize","format","formation","formative", +"formbook","former","formerly","formica","formidable", +"formless","formula","formulaic","formulate","formulation", +"fornicate","fornication","forrader","forsake","forsooth", +"forswear","forsythia","fort","forte","forth", +"forthcoming","forthright","forthwith","fortieth","fortification", +"fortify","fortissimo","fortitude","fortnight","fortnightly", +"fortress","fortuitous","fortunate","fortunately","fortune", +"forty","forum","forward","forwarding","forwardly", +"forwardness","forwent","foss","fosse","fossil", +"fossilise","fossilize","foster","fought","foul", +"found","foundation","foundations","founder","foundling", +"foundry","fount","fountain","fountainhead","four", +"foureyes","fourpenny","fours","foursquare","fourteen", +"fourth","fowl","fox","foxglove","foxhole", +"foxhound","foxhunt","foxtrot","foxy","foyer", +"fracas","fraction","fractional","fractionally","fractious", +"fracture","fragile","fragment","fragmentary","fragmentation", +"fragrance","fragrant","frail","frailty","frame", +"frames","framework","franc","franchise","franciscan", +"frank","frankfurter","frankincense","franklin","frankly", +"frantic","fraternal","fraternise","fraternity","fraternize", +"fratricide","frau","fraud","fraudulence","fraudulent", +"fraught","fraulein","fray","frazzle","freak", +"freakish","freckle","free","freebee","freebie", +"freeboard","freebooter","freeborn","freedman","freedom", 
+"freehand","freehanded","freehold","freeholder","freelance", +"freeload","freely","freeman","freemason","freemasonry", +"freepost","freesia","freestanding","freestone","freestyle", +"freethinker","freeway","freewheel","freewheeling","freewill", +"freeze","freezer","freezing","freight","freighter", +"freightliner","frenchman","frenetic","frenzied","frenzy", +"frequency","frequent","fresco","fresh","freshen", +"fresher","freshet","freshly","freshwater","fret", +"fretful","fretsaw","fretwork","freudian","friable", +"friar","friary","fricassee","fricative","friction", +"friday","fridge","friend","friendless","friendly", +"friends","friendship","frier","frieze","frig", +"frigate","frigging","fright","frighten","frightened", +"frightful","frightfully","frigid","frigidity","frill", +"frilled","frills","frilly","fringe","frippery", +"frisbee","frisian","frisk","frisky","frisson", +"fritter","frivolity","frivolous","frizz","frizzle", +"frizzy","fro","frock","frog","frogged", +"frogman","frogmarch","frogspawn","frolic","frolicsome", +"from","frond","front","frontage","frontal", +"frontbench","frontier","frontiersman","frontispiece","frost", +"frostbite","frostbitten","frostbound","frosting","frosty", +"froth","frothy","frown","frowst","frowsty", +"frowsy","frowzy","froze","frozen","frs", +"fructification","fructify","frugal","frugality","fruit", +"fruitcake","fruiterer","fruitful","fruition","fruitless", +"fruits","fruity","frump","frustrate","frustration", +"fry","fryer","fuchsia","fuck","fucker", +"fucking","fuddle","fudge","fuehrer","fuel", +"fug","fugitive","fugue","fuhrer","fulcrum", +"fulfil","fulfill","fulfillment","fulfilment","full", +"fullback","fuller","fully","fulmar","fulminate", +"fulmination","fulness","fulsome","fumble","fume", +"fumes","fumigate","fun","function","functional", +"functionalism","functionalist","functionary","fund","fundamental", +"fundamentalism","fundamentally","funds","funeral","funerary", 
+"funereal","funfair","fungicide","fungoid","fungous", +"fungus","funicular","funk","funky","funnel", +"funnies","funnily","funny","fur","furbelow", +"furbish","furious","furiously","furl","furlong", +"furlough","furnace","furnish","furnishings","furniture", +"furore","furrier","furrow","furry","further", +"furtherance","furthermore","furthermost","furthest","furtive", +"fury","furze","fuse","fused","fuselage", +"fusilier","fusillade","fusion","fuss","fusspot", +"fussy","fustian","fusty","futile","futility", +"future","futureless","futures","futurism","futuristic", +"futurity","fuzz","fuzzy","gab","gabardine", +"gabble","gaberdine","gable","gabled","gad", +"gadabout","gadfly","gadget","gadgetry","gaelic", +"gaff","gaffe","gaffer","gag","gaga", +"gaggle","gaiety","gaily","gain","gainful", +"gainfully","gainsay","gait","gaiter","gal", +"gala","galactic","galantine","galaxy","gale", +"gall","gallant","gallantry","galleon","gallery", +"galley","gallic","gallicism","gallivant","gallon", +"gallop","galloping","gallows","gallstone","galore", +"galosh","galumph","galvanic","galvanise","galvanism", +"galvanize","gambit","gamble","gamboge","gambol", +"game","gamecock","gamekeeper","games","gamesmanship", +"gamey","gamma","gammon","gammy","gamp", +"gamut","gamy","gander","gang","ganger", +"gangling","ganglion","gangplank","gangrene","gangster", +"gangway","gannet","gantry","gaol","gaolbird", +"gaoler","gap","gape","gapes","garage", +"garb","garbage","garble","garden","gardenia", +"gardening","gargantuan","gargle","gargoyle","garish", +"garland","garlic","garment","garner","garnet", +"garnish","garret","garrison","garrote","garrotte", +"garrulity","garrulous","garter","gas","gasbag", +"gaseous","gash","gasholder","gasify","gasket", +"gaslight","gasman","gasolene","gasoline","gasp", +"gassy","gastric","gastritis","gastroenteritis","gastronomy", +"gasworks","gat","gate","gatecrash","gatehouse", +"gatekeeper","gatepost","gateway","gather","gathering", 
+"gauche","gaucherie","gaucho","gaudy","gauge", +"gaunt","gauntlet","gauze","gave","gavel", +"gavotte","gawk","gawky","gawp","gay", +"gayness","gaze","gazebo","gazelle","gazette", +"gazetteer","gazump","gce","gear","gearbox", +"gecko","gee","geese","geezer","geisha", +"gel","gelatine","gelatinous","geld","gelding", +"gelignite","gem","gemini","gen","gendarme", +"gender","gene","genealogist","genealogy","genera", +"general","generalisation","generalise","generalissimo","generality", +"generalization","generalize","generally","generate","generation", +"generative","generator","generic","generous","genesis", +"genetic","geneticist","genetics","genial","geniality", +"genie","genital","genitals","genitive","genius", +"genocide","genre","gent","genteel","gentian", +"gentile","gentility","gentle","gentlefolk","gentleman", +"gentlemanly","gentlewoman","gently","gentry","gents", +"genuflect","genuine","genus","geocentric","geographer", +"geography","geologist","geology","geometric","geometry", +"geophysics","geopolitics","georgette","geranium","geriatric", +"geriatrician","geriatrics","germ","germane","germanic", +"germicide","germinal","germinate","gerontology","gerrymander", +"gerund","gestalt","gestapo","gestation","gesticulate", +"gesture","get","getaway","getup","geum", +"gewgaw","geyser","gharry","ghastly","ghat", +"ghaut","ghee","gherkin","ghetto","ghi", +"ghost","ghostly","ghoul","ghoulish","ghq", +"ghyll","giant","giantess","gibber","gibberish", +"gibbet","gibbon","gibbous","gibe","giblets", +"giddy","gift","gifted","gig","gigantic", +"giggle","gigolo","gild","gilded","gilding", +"gill","gillie","gilly","gilt","gimcrack", +"gimlet","gimmick","gimmicky","gin","ginger", +"gingerbread","gingerly","gingham","gingivitis","gingko", +"ginkgo","ginseng","gipsy","giraffe","gird", +"girder","girdle","girl","girlfriend","girlhood", +"girlie","girlish","girly","giro","girt", +"girth","gist","give","giveaway","given", +"gizzard","glacial","glacier","glad","gladden", 
+"glade","gladiator","gladiolus","gladly","glamor", +"glamorise","glamorize","glamorous","glamour","glamourous", +"glance","glancing","gland","glandular","glare", +"glaring","glass","glassblower","glasscutter","glasses", +"glasshouse","glassware","glassworks","glassy","glaucoma", +"glaucous","glaze","glazier","glazing","glc", +"gleam","glean","gleaner","gleanings","glebe", +"glee","gleeful","glen","glengarry","glib", +"glide","glider","gliding","glimmer","glimmerings", +"glimpse","glint","glissade","glissando","glisten", +"glister","glitter","glittering","gloaming","gloat", +"global","globe","globefish","globetrotter","globular", +"globule","glockenspiel","gloom","gloomy","gloria", +"glorification","glorify","glorious","glory","gloss", +"glossary","glossy","glottal","glottis","glove", +"glow","glower","glowing","glucose","glue", +"gluey","glum","glut","gluten","glutinous", +"glutton","gluttonous","gluttony","glycerin","glycerine", +"gnarled","gnash","gnat","gnaw","gnawing", +"gneiss","gnocchi","gnome","gnp","gnu", +"goad","goal","goalkeeper","goalmouth","goalpost", +"goat","goatee","goatherd","goatskin","gob", +"gobbet","gobble","gobbledegook","gobbledygook","gobbler", +"goblet","goblin","god","godchild","goddam", +"goddamn","goddie","godforsaken","godhead","godless", +"godlike","godly","godown","godparent","gods", +"godsend","godspeed","goer","goggle","goggles", +"goings","goiter","goitre","gold","goldbeater", +"golden","goldfield","goldfinch","goldfish","goldmine", +"goldsmith","golf","goliath","golliwog","golly", +"gollywog","gonad","gondola","gondolier","gone", +"goner","gong","gonna","gonorrhea","gonorrhoea", +"goo","good","goodbye","goodish","goodly", +"goodness","goodnight","goods","goodwill","goody", +"gooey","goof","goofy","googly","goon", +"goose","gooseberry","gooseflesh","goosestep","gopher", +"gore","gorge","gorgeous","gorgon","gorgonzola", +"gorilla","gormandise","gormandize","gormless","gorse", +"gory","gosh","gosling","gospel","gossamer", 
+"gossip","gossipy","got","gothic","gotta", +"gotten","gouache","gouda","gouge","goulash", +"gourd","gourmand","gourmet","gout","gouty", +"govern","governance","governess","governing","government", +"governor","gown","gpo","grab","grace", +"graceful","graceless","graces","gracious","gradation", +"grade","gradient","gradual","graduate","graduation", +"graffiti","graft","grafter","grail","grain", +"gram","grammar","grammarian","grammatical","gramme", +"gramophone","grampus","gran","granary","grand", +"grandad","grandchild","granddad","granddaughter","grandee", +"grandeur","grandfather","grandiloquent","grandiose","grandma", +"grandmother","grandpa","grandparent","grandson","grandstand", +"grange","granite","grannie","granny","grant", +}; +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java new file mode 100644 index 00000000000..4ffb6d00cc5 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java @@ -0,0 +1,715 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. 
+ */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData4 { + private KStemData4() { + } + static String[] data = { +"granular","granulate","granule","grape","grapefruit", +"grapeshot","grapevine","graph","graphic","graphical", +"graphically","graphite","graphology","grapnel","grapple", +"grasp","grasping","grass","grasshopper","grassland", +"grassy","grate","grateful","grater","gratification", +"gratify","gratifying","grating","gratis","gratitude", +"gratuitous","gratuity","grave","gravel","gravelly", +"gravestone","graveyard","gravitate","gravitation","gravity", +"gravure","gravy","gray","graybeard","grayish", +"graze","grease","greasepaint","greaseproof","greaser", +"greasy","great","greatcoat","greater","greatly", +"grebe","grecian","greed","greedy","green", +"greenback","greenery","greenfly","greengage","greengrocer", +"greenhorn","greenhouse","greenish","greenroom","greens", +"greenwood","greet","greeting","gregarious","gremlin", +"grenade","grenadier","grenadine","grew","grey", +"greybeard","greyhound","greyish","grid","griddle", +"gridiron","grief","grievance","grieve","grievous", +"griffin","grill","grim","grimace","grime", +"grimy","grin","grind","grinder","grindstone", +"gringo","grip","gripe","gripes","gripping", +"grisly","grist","gristle","grit","grits", +"grizzle","grizzled","groan","groat","groats", +"grocer","groceries","grocery","grog","groggy", +"groin","groom","groove","groover","groovy", +"grope","gropingly","gross","grotesque","grotto", +"grotty","grouch","ground","grounding","groundless", +"groundnut","grounds","groundsel","groundsheet","groundsman", +"groundwork","group","groupie","grouping","grouse", +"grove","grovel","grow","grower","growl", +"grown","growth","groyne","grub","grubby", +"grudge","grudging","gruel","grueling","gruelling", +"gruesome","gruff","grumble","grumbling","grumpy", +"grundyism","grunt","gryphon","guano","guarantee", 
+"guarantor","guaranty","guard","guarded","guardhouse", +"guardian","guardianship","guardrail","guardroom","guardsman", +"guava","gubernatorial","gudgeon","guerilla","guerrilla", +"guess","guesswork","guest","guesthouse","guestroom", +"guffaw","guidance","guide","guidelines","guild", +"guilder","guildhall","guile","guileless","guillemot", +"guillotine","guilt","guilty","guinea","guipure", +"guise","guitar","gulch","gulden","gulf", +"gull","gullet","gulley","gullible","gully", +"gulp","gum","gumbo","gumboil","gumboot", +"gumdrop","gummy","gumption","gun","gunboat", +"gundog","gunfire","gunge","gunman","gunmetal", +"gunnel","gunner","gunnery","gunnysack","gunpoint", +"gunpowder","gunrunner","gunshot","gunshy","gunsmith", +"gunwale","guppy","gurgle","guru","gush", +"gusher","gushing","gushy","gusset","gust", +"gustatory","gusto","gusty","gut","gutless", +"guts","gutsy","gutter","guttersnipe","guttural", +"guv","guvnor","guy","guzzle","gym", +"gymkhana","gymnasium","gymnast","gymnastic","gymnastics", +"gymslip","gynaecology","gynecology","gyp","gypsum", +"gypsy","gyrate","gyration","gyroscope","gyves", +"haberdasher","haberdashery","habiliment","habit","habitable", +"habitat","habitation","habitual","habituate","hacienda", +"hack","hackles","hackney","hackneyed","hacksaw", +"hackwork","had","haddock","hadji","haft", +"hag","haggard","haggis","haggle","hagiography", +"haiku","hail","hailstone","hailstorm","hair", +"hairbrush","haircut","hairdo","hairdresser","hairgrip", +"hairless","hairline","hairnet","hairpiece","hairpin", +"hairspring","hairy","hajji","hake","halberd", +"halcyon","hale","half","halfback","halfpence", +"halfpenny","halfpennyworth","halftone","halfway","halibut", +"halitosis","hall","halleluja","halliard","hallmark", +"hallo","hallow","hallstand","hallucinate","hallucination", +"hallucinatory","hallucinogenic","hallway","halma","halo", +"halt","halter","halterneck","halting","halve", +"halves","halyard","ham","hamadryad","hamburger", 
+"hamlet","hammer","hammock","hamper","hamster", +"hamstring","hand","handbag","handball","handbarrow", +"handbill","handbook","handbrake","handcart","handclap", +"handcuff","handcuffs","handful","handgun","handhold", +"handicap","handicraft","handiwork","handkerchief","handle", +"handlebars","handler","handloom","handmade","handmaiden", +"handout","handpick","handrail","handshake","handsome", +"handstand","handwork","handwriting","handwritten","handy", +"handyman","hang","hangar","hangdog","hanger", +"hanging","hangings","hangman","hangnail","hangout", +"hangover","hangup","hank","hanker","hankering", +"hankie","hanky","hansard","hansom","hap", +"haphazard","hapless","haply","happen","happening", +"happily","happiness","happy","harangue","harass", +"harassment","harbinger","harbor","harbour","hard", +"hardback","hardboard","hardbound","harden","hardheaded", +"hardihood","hardiness","hardly","hardness","hardship", +"hardtop","hardware","hardwearing","hardwood","hardy", +"hare","harebell","harebrained","harelip","harem", +"haricot","hark","harlequin","harlequinade","harlot", +"harm","harmless","harmonic","harmonica","harmonise", +"harmonium","harmonize","harmony","harness","harp", +"harpoon","harpsichord","harpy","harquebus","harridan", +"harrier","harrow","harrowing","harry","harsh", +"hart","hartal","hartebeest","harvest","harvester", +"has","hash","hashish","hasp","hassle", +"hassock","hast","haste","hasten","hasty", +"hat","hatband","hatch","hatchback","hatchery", +"hatchet","hatching","hatchway","hate","hateful", +"hath","hatless","hatpin","hatred","hatter", +"hauberk","haughty","haul","haulage","haulier", +"haulm","haunch","haunt","haunting","hautbois", +"hautboy","hauteur","havana","have","haven", +"haver","haversack","haves","havoc","haw", +"hawk","hawker","hawser","hawthorn","hay", +"haycock","hayfork","haymaker","haystack","haywire", +"hazard","hazardous","haze","hazel","hazy", +"head","headache","headband","headboard","headcheese", 
+"headdress","header","headfirst","headgear","headhunter", +"heading","headland","headless","headlight","headline", +"headlong","headman","headmaster","headphone","headpiece", +"headquarters","headrest","headroom","headset","headship", +"headshrinker","headstall","headstone","headstrong","headway", +"headwind","headword","heady","heal","health", +"healthful","healthy","heap","hear","hearer", +"hearing","hearken","hearsay","hearse","heart", +"heartache","heartbeat","heartbreak","heartbreaking","heartbroken", +"heartburn","hearten","heartening","heartfelt","hearth", +"hearthrug","heartily","heartless","heartrending","heartsease", +"heartsick","heartstrings","heartthrob","heartwarming","heartwood", +"hearty","heat","heated","heater","heath", +"heathen","heather","heating","heatstroke","heave", +"heaven","heavenly","heavenwards","heavy","heavyhearted", +"heavyweight","hebdomadal","hebraic","hebrew","hecatomb", +"heck","heckle","hectare","hectic","hector", +"hedge","hedgehog","hedgehop","hedgerow","hedonism", +"heed","heel","heelball","hefty","hegemony", +"hegira","heifer","height","heighten","heinous", +"heir","heiress","heirloom","hejira","held", +"helicopter","heliograph","heliotrope","heliport","helium", +"hell","hellcat","hellene","hellenic","hellenistic", +"hellish","hellishly","hello","helm","helmet", +"helmeted","helmsman","helot","help","helpful", +"helping","helpless","helpmate","helve","hem", +"hemisphere","hemline","hemlock","hemoglobin","hemophilia", +"hemophiliac","hemorrhage","hemorrhoid","hemp","hempen", +"hemstitch","hen","henbane","hence","henceforth", +"henchman","henna","hennaed","henpecked","hepatitis", +"heptagon","her","herald","heraldic","heraldry", +"herb","herbaceous","herbage","herbal","herbalist", +"herbivorous","herculean","herd","herdsman","here", +"hereabouts","hereafter","hereby","hereditament","hereditary", +"heredity","herein","hereinafter","hereof","heresy", +"heretic","hereto","heretofore","hereunder","hereupon", 
+"herewith","heritable","heritage","hermaphrodite","hermetic", +"hermit","hermitage","hernia","hero","heroic", +"heroics","heroin","heroism","heron","heronry", +"herpes","herr","herring","herringbone","hers", +"herself","hertz","hesitancy","hesitant","hesitate", +"hesitation","hesperus","hessian","heterodox","heterodoxy", +"heterogeneous","heterosexual","heuristic","heuristics","hew", +"hewer","hex","hexagon","hexagram","hexameter", +"hey","heyday","hiatus","hibernate","hibiscus", +"hiccough","hiccup","hick","hickory","hide", +"hideaway","hidebound","hideous","hiding","hie", +"hierarchy","hieroglyph","hieroglyphics","high","highball", +"highborn","highboy","highbrow","higher","highfalutin", +"highland","highlander","highlands","highlight","highly", +"highness","highpitched","highroad","highway","highwayman", +"hijack","hike","hilarious","hilarity","hill", +"hillbilly","hillock","hillside","hilly","hilt", +"him","himself","hind","hinder","hindmost", +"hindquarters","hindrance","hindsight","hindu","hinduism", +"hinge","hint","hinterland","hip","hipbath", +"hippie","hippodrome","hippopotamus","hippy","hipster", +"hire","hireling","hirsute","his","hiss", +"hist","histamine","histology","historian","historic", +"historical","history","histrionic","histrionics","hit", +"hitch","hitchhike","hither","hitherto","hive", +"hives","hms","hoard","hoarding","hoarfrost", +"hoarse","hoary","hoax","hob","hobble", +"hobbledehoy","hobby","hobbyhorse","hobgoblin","hobnail", +"hobnob","hobo","hock","hockey","hod", +"hodgepodge","hoe","hog","hoggish","hogmanay", +"hogshead","hogwash","hoist","hold","holdall", +"holder","holding","holdover","holdup","hole", +"holiday","holidaymaker","holiness","holler","hollow", +"holly","hollyhock","hollywood","holocaust","holograph", +"holstein","holster","holy","homage","homburg", +"home","homecoming","homegrown","homeland","homelike", +"homely","homemade","homeopath","homeopathy","homeric", +"homesick","homespun","homestead","hometown","homeward", 
+"homewards","homework","homey","homicidal","homicide", +"homiletic","homiletics","homily","homing","hominy", +"homoeopath","homoeopathy","homogeneous","homogenise","homogenize", +"homograph","homonym","homophone","homosexual","homy", +"hone","honest","honestly","honesty","honey", +"honeybee","honeycomb","honeycombed","honeydew","honeyed", +"honeymoon","honeysuckle","honk","honkie","honky", +"honor","honorable","honorarium","honorary","honorific", +"honors","honour","honourable","honours","hooch", +"hood","hooded","hoodlum","hoodoo","hoodwink", +"hooey","hoof","hook","hookah","hooked", +"hooker","hookey","hookup","hookworm","hooky", +"hooligan","hoop","hooray","hoot","hooter", +"hoover","hooves","hop","hope","hopeful", +"hopefully","hopeless","hopper","hopscotch","horde", +"horizon","horizontal","hormone","horn","hornbeam", +"hornbill","horned","hornet","hornpipe","horny", +"horology","horoscope","horrendous","horrible","horrid", +"horrific","horrify","horror","horrors","horse", +"horseback","horsebox","horseflesh","horsefly","horsehair", +"horselaugh","horseman","horsemanship","horsemeat","horseplay", +"horsepower","horseracing","horseradish","horseshit","horseshoe", +"horsewhip","horsewoman","horsy","hortative","horticulture", +"hosanna","hose","hosier","hosiery","hospice", +"hospitable","hospital","hospitalise","hospitality","hospitalize", +"host","hostage","hostel","hosteler","hosteller", +"hostelry","hostess","hostile","hostilities","hostility", +"hostler","hot","hotbed","hotchpotch","hotel", +"hotelier","hotfoot","hothead","hothouse","hotly", +"hotplate","hotpot","hottentot","hound","hour", +"hourglass","houri","hourly","house","houseboat", +"housebound","houseboy","housebreaker","housebroken","housecoat", +"housecraft","housedog","housefather","housefly","houseful", +"household","householder","housekeeper","housekeeping","housemaid", +"houseman","housemaster","housemother","houseroom","housetops", 
+"housewarming","housewife","housewifery","housework","housing", +"hove","hovel","hover","hovercraft","how", +"howdah","howdy","however","howitzer","howl", +"howler","howling","howsoever","hoyden","hrh", +"hub","hubbub","hubby","hubcap","hubris", +"huckaback","huckleberry","huckster","huddle","hue", +"huff","huffish","huffy","hug","huge", +"hugely","huguenot","huh","hula","hulk", +"hulking","hull","hullabaloo","hullo","hum", +"human","humane","humanise","humanism","humanitarian", +"humanitarianism","humanities","humanity","humanize","humankind", +"humanly","humble","humbug","humdinger","humdrum", +"humerus","humid","humidify","humidity","humidor", +"humiliate","humility","hummingbird","hummock","humor", +"humorist","humorous","humour","hump","humpback", +"humph","humus","hun","hunch","hunchback", +"hundred","hundredweight","hung","hunger","hungry", +"hunk","hunkers","hunt","hunter","hunting", +"huntress","huntsman","hurdle","hurl","hurling", +"hurray","hurricane","hurried","hurry","hurt", +"hurtful","hurtle","husband","husbandman","husbandry", +"hush","husk","husky","hussar","hussy", +"hustings","hustle","hustler","hut","hutch", +"hutment","huzza","huzzah","hyacinth","hyaena", +"hybrid","hybridise","hybridize","hydra","hydrangea", +"hydrant","hydrate","hydraulic","hydraulics","hydrocarbon", +"hydroelectric","hydrofoil","hydrogen","hydrophobia","hydroplane", +"hydroponics","hydrotherapy","hyena","hygiene","hygienic", +"hymen","hymeneal","hymn","hymnal","hyperbola", +"hyperbole","hyperbolic","hypercritical","hypermarket","hypersensitive", +"hyphen","hyphenate","hypnosis","hypnotise","hypnotism", +"hypnotist","hypnotize","hypo","hypochondria","hypochondriac", +"hypocrisy","hypocrite","hypodermic","hypotenuse","hypothermia", +"hypothesis","hypothetical","hysterectomy","hysteria","hysterical", +"hysterics","iamb","iberian","ibex","ibidem", +"ibis","icbm","ice","iceberg","icebound", +"icebox","icebreaker","icefall","icehouse","iceman", 
+"icicle","icing","icon","iconoclast","icy", +"idea","ideal","idealise","idealism","idealist", +"idealize","ideally","idem","identical","identification", +"identify","identikit","identity","ideogram","ideology", +"ides","idiocy","idiom","idiomatic","idiosyncrasy", +"idiot","idle","idol","idolater","idolatrous", +"idolatry","idolise","idolize","idyl","idyll", +"igloo","igneous","ignite","ignition","ignoble", +"ignominious","ignominy","ignoramus","ignorance","ignorant", +"ignore","iguana","ikon","ilex","ilk", +"ill","illegal","illegality","illegible","illegitimate", +"illiberal","illicit","illimitable","illiterate","illness", +"illogical","illuminate","illuminating","illumination","illuminations", +"illusion","illusionist","illusory","illustrate","illustration", +"illustrative","illustrator","illustrious","image","imagery", +"imaginable","imaginary","imagination","imaginative","imagine", +"imam","imbalance","imbecile","imbecility","imbed", +"imbibe","imbroglio","imbue","imitate","imitation", +"imitative","imitator","immaculate","immanence","immanent", +"immaterial","immature","immeasurable","immediacy","immediate", +"immediately","immemorial","immense","immensely","immensity", +"immerse","immersion","immigrant","immigrate","imminence", +"imminent","immobile","immobilise","immobilize","immoderate", +"immodest","immolate","immoral","immorality","immortal", +"immortalise","immortality","immortalize","immovable","immune", +"immunise","immunize","immure","immutable","imp", +"impact","impacted","impair","impala","impale", +"impalpable","impanel","impart","impartial","impassable", +"impasse","impassioned","impassive","impatience","impatient", +"impeach","impeccable","impecunious","impedance","impede", +"impediment","impedimenta","impel","impending","impenetrable", +"impenitent","imperative","imperceptible","imperfect","imperial", +"imperialism","imperialist","imperialistic","imperil","imperious", +"imperishable","impermanent","impermeable","impersonal","impersonate", 
+"impertinent","imperturbable","impervious","impetigo","impetuous", +"impetus","impiety","impinge","impious","impish", +"implacable","implant","implement","implicate","implication", +"implicit","implore","implosion","imply","impolite", +"impolitic","imponderable","import","importance","important", +"importation","importunate","importune","impose","imposing", +"imposition","impossible","impostor","imposture","impotent", +"impound","impoverish","impracticable","impractical","imprecation", +"impregnable","impregnate","impresario","impress","impression", +"impressionable","impressionism","impressionist","impressionistic","impressive", +"imprimatur","imprint","imprison","improbability","improbable", +"impromptu","improper","impropriety","improve","improvement", +"improvident","improvise","imprudent","impudent","impugn", +"impulse","impulsion","impulsive","impunity","impure", +"impurity","imputation","impute","inability","inaccessible", +"inaccurate","inaction","inactive","inadequacy","inadequate", +"inadmissible","inadvertent","inalienable","inamorata","inane", +"inanimate","inanition","inanity","inapplicable","inappropriate", +"inapt","inaptitude","inarticulate","inartistic","inattention", +"inattentive","inaudible","inaugural","inaugurate","inauspicious", +"inboard","inborn","inbound","inbred","inbreeding", +"inc","incalculable","incandescent","incantation","incapable", +"incapacitate","incapacity","incarcerate","incarnate","incarnation", +"incautious","incendiarism","incendiary","incense","incentive", +"inception","incertitude","incessant","incest","incestuous", +"inch","inchoate","incidence","incident","incidental", +"incidentally","incidentals","incinerate","incinerator","incipience", +"incipient","incise","incision","incisive","incisor", +"incite","incivility","inclement","inclination","incline", +"inclined","inclose","inclosure","include","included", +"including","inclusion","inclusive","incognito","incoherent", 
+"incombustible","income","incoming","incommensurable","incommensurate", +"incommode","incommodious","incommunicable","incommunicado","incommunicative", +"incomparable","incompatible","incompetence","incompetent","incomplete", +"incomprehensible","incomprehensibly","incomprehension","inconceivable","inconclusive", +"incongruity","incongruous","inconsequent","inconsequential","inconsiderable", +"inconsiderate","inconsistent","inconsolable","inconspicuous","inconstant", +"incontestable","incontinent","incontrovertible","inconvenience","inconvenient", +"incorporate","incorporated","incorporeal","incorrect","incorrigible", +"incorruptible","increase","increasingly","incredible","incredulity", +"incredulous","increment","incriminate","incrust","incrustation", +"incubate","incubation","incubator","incubus","inculcate", +"inculpate","incumbency","incumbent","incur","incurable", +"incurious","incursion","incurved","indebted","indecent", +"indecipherable","indecision","indecisive","indecorous","indecorum", +"indeed","indefatigable","indefensible","indefinable","indefinite", +"indefinitely","indelible","indelicate","indemnification","indemnify", +"indemnity","indent","indentation","indenture","independence", +"independent","indescribable","indestructible","indeterminable","indeterminate", +"index","indian","indicate","indication","indicative", +"indicator","indices","indict","indictable","indifferent", +"indigenous","indigent","indigestible","indigestion","indignant", +"indignation","indignity","indigo","indirect","indiscernible", +"indiscipline","indiscreet","indiscretion","indiscriminate","indispensable", +"indisposed","indisposition","indisputable","indissoluble","indistinct", +"indistinguishable","individual","individualise","individualism","individuality", +"individualize","individually","indivisible","indocile","indoctrinate", +"indolent","indomitable","indoor","indoors","indorse", +"indrawn","indubitable","induce","inducement","induct", 
+"induction","inductive","indue","indulge","indulgence", +"indulgent","industrial","industrialise","industrialism","industrialist", +"industrialize","industrious","industry","inebriate","inedible", +"ineducable","ineffable","ineffaceable","ineffective","ineffectual", +"inefficient","inelastic","inelegant","ineligible","ineluctable", +"inept","ineptitude","inequality","inequitable","inequity", +"ineradicable","inert","inertia","inescapable","inessential", +"inestimable","inevitable","inexact","inexactitude","inexcusable", +"inexhaustible","inexorable","inexpediency","inexpedient","inexpensive", +"inexperience","inexperienced","inexpert","inexpiable","inexplicable", +"inexplicably","inexpressible","inextinguishable","inextricable","infallible", +"infallibly","infamous","infamy","infancy","infant", +"infanticide","infantile","infantry","infantryman","infatuated", +"infatuation","infect","infection","infectious","infelicitous", +"infer","inference","inferential","inferior","infernal", +"inferno","infertile","infest","infidel","infidelity", +"infield","infighting","infiltrate","infiltration","infinite", +"infinitesimal","infinitive","infinitude","infinity","infirm", +"infirmary","infirmity","inflame","inflamed","inflammable", +"inflammation","inflammatory","inflatable","inflate","inflated", +"inflation","inflationary","inflect","inflection","inflexible", +"inflexion","inflict","infliction","inflow","influence", +"influential","influenza","influx","info","inform", +"informal","informant","information","informative","informed", +"informer","infra","infraction","infrared","infrastructure", +"infrequent","infringe","infuriate","infuse","infusion", +"ingathering","ingenious","ingenuity","ingenuous","ingest", +"inglenook","inglorious","ingoing","ingot","ingraft", +"ingrained","ingratiate","ingratiating","ingratitude","ingredient", +"ingress","ingrown","inhabit","inhabitant","inhale", +"inhaler","inharmonious","inhere","inherent","inherently", 
+"inherit","inheritance","inhibit","inhibited","inhibition", +"inhospitable","inhuman","inhumane","inhumanity","inimical", +"inimitable","iniquitous","iniquity","initial","initially", +"initiate","initiation","initiative","inject","injection", +"injudicious","injunction","injure","injurious","injury", +"injustice","ink","inkbottle","inkling","inkpad", +"inkstand","inkwell","inky","inlaid","inland", +"inlay","inlet","inmate","inmost","inn", +"innards","innate","inner","inning","innings", +"innkeeper","innocent","innocuous","innovate","innovation", +"innuendo","innumerable","inoculate","inoffensive","inoperable", +"inoperative","inopportune","inordinate","inorganic","input", +"inquest","inquietude","inquire","inquiring","inquiry", +"inquisition","inquisitive","inquisitor","inquisitorial","inroad", +"inrush","insalubrious","insane","insanitary","insanity", +"insatiable","insatiate","inscribe","inscription","inscrutable", +"insect","insecticide","insectivore","insectivorous","insecure", +"inseminate","insemination","insensate","insensibility","insensible", +"insensitive","inseparable","insert","insertion","inset", +"inshore","inside","insider","insidious","insight", +"insignia","insignificant","insincere","insinuate","insinuation", +"insipid","insist","insistence","insistency","insistent", +"insole","insolent","insoluble","insolvable","insolvent", +"insomnia","insomniac","insouciance","inspect","inspection", +"inspector","inspectorate","inspectorship","inspiration","inspire", +"inspired","instability","install","installation","installment", +"instalment","instance","instant","instantaneous","instantly", +"instead","instep","instigate","instigation","instil", +"instill","instinct","instinctive","institute","institution", +"instruct","instruction","instructive","instructor","instructress", +"instrument","instrumental","instrumentalist","instrumentality","instrumentation", +"insubordinate","insubstantial","insufferable","insufficiency","insufficient", 
+"insular","insularity","insulate","insulation","insulator", +"insulin","insult","insuperable","insupportable","insurance", +"insure","insured","insurer","insurgent","insurmountable", +"insurrection","intact","intaglio","intake","intangible", +"integer","integral","integrate","integrated","integrity", +"integument","intellect","intellectual","intelligence","intelligent", +"intelligentsia","intelligible","intemperate","intend","intended", +"intense","intensifier","intensify","intensity","intensive", +"intent","intention","intentional","intentions","inter", +"interact","interaction","interbreed","intercalary","intercalate", +"intercede","intercept","interceptor","intercession","interchange", +"interchangeable","intercity","intercollegiate","intercom","intercommunicate", +"intercommunion","intercontinental","intercourse","interdenominational","interdependent", +"interdict","interest","interested","interesting","interests", +"interface","interfere","interference","interim","interior", +"interject","interjection","interlace","interlard","interleave", +"interline","interlinear","interlink","interlock","interlocutor", +"interloper","interlude","intermarriage","intermarry","intermediary", +"intermediate","interment","intermezzo","interminable","intermingle", +"intermission","intermittent","intern","internal","internalise", +"internalize","international","internationale","internationalise","internationalism", +"internationalize","interne","internecine","internee","internment", +"interpellate","interpenetrate","interpersonal","interplanetary","interplay", +"interpol","interpolate","interpolation","interpose","interposition", +"interpret","interpretation","interpretative","interpreter","interracial", +"interregnum","interrelate","interrelation","interrogate","interrogative", +"interrogatory","interrupt","intersect","intersection","intersperse", +"interstate","interstellar","interstice","intertribal","intertwine", 
+"interurban","interval","intervene","intervention","interview", +"interweave","intestate","intestinal","intestine","intimacy", +"intimate","intimidate","intimidation","into","intolerable", +"intolerant","intonation","intone","intoxicant","intoxicate", +"intractable","intramural","intransigent","intransitive","intravenous", +"intrench","intrepid","intricacy","intricate","intrigue", +"intrinsic","intro","introduce","introduction","introductory", +"introit","introspection","introspective","introvert","introverted", +"intrude","intruder","intrusion","intrusive","intrust", +"intuit","intuition","intuitive","intumescence","inundate", +"inundation","inure","invade","invalid","invalidate", +"invalidism","invaluable","invariable","invasion","invective", +"inveigh","inveigle","invent","invention","inventive", +"inventor","inventory","inverse","inversion","invert", +"invertebrate","invest","investigate","investiture","investment", +"inveterate","invidious","invigilate","invigorate","invincible", +"inviolable","inviolate","invisible","invitation","invite", +"inviting","invocation","invoice","invoke","involuntary", +"involve","involved","invulnerable","inward","inwardness", +"inwards","inwrought","iodin","iodine","iodise", +"iodize","ion","ionic","ionise","ionize", +"ionosphere","iota","iou","ipa","ira", +"irascible","irate","ire","iridescent","iridium", +"irishman","irk","irksome","iron","ironclad", +"ironic","ironically","ironing","ironmonger","ironmongery", +"ironmould","irons","ironstone","ironware","ironwork", +"ironworks","irony","irradiate","irrational","irreconcilable", +"irrecoverable","irredeemable","irreducible","irrefutable","irregular", +"irregularity","irrelevance","irrelevant","irreligious","irremediable", +"irremovable","irreparable","irreplaceable","irrepressible","irreproachable", +"irresistible","irresolute","irresponsible","irretrievable","irreverent", +"irreversible","irrevocable","irrigate","irritable","irritant", 
+"irritate","irritation","irruption","isinglass","islam", +"island","islander","isle","islet","ism", +"isobar","isolate","isolated","isolation","isolationism", +"isotherm","isotope","israelite","issue","isthmus", +"ita","italic","italicise","italicize","italics", +"itch","itchy","item","itemise","itemize", +"iterate","itinerant","itinerary","itn","its", +"itself","itv","iud","ivied","ivory", +"ivy","jab","jabber","jack","jackal", +"jackanapes","jackaroo","jackass","jackboot","jackdaw", +"jackeroo","jacket","jackpot","jackrabbit","jacobean", +"jacobite","jade","jaded","jaffa","jag", +"jagged","jaguar","jail","jailbird","jailbreak", +"jailer","jailor","jalopy","jam","jamb", +"jamboree","jammy","jangle","janissary","janitor", +"january","japan","jape","japonica","jar", +"jargon","jasmine","jasper","jaundice","jaundiced", +"jaunt","jaunty","javelin","jaw","jawbone", +"jawbreaker","jaws","jay","jaywalk","jazz", +"jazzy","jealous","jealousy","jeans","jeep", +"jeer","jehovah","jejune","jell","jellied", +"jello","jelly","jellyfish","jemmy","jenny", +"jeopardise","jeopardize","jeopardy","jerboa","jeremiad", +"jerk","jerkin","jerky","jeroboam","jerry", +"jersey","jest","jester","jesting","jesuit", +"jesuitical","jet","jetsam","jettison","jetty", +"jew","jewel","jeweled","jeweler","jewelled", +"jeweller","jewellery","jewelry","jewess","jewish", +"jezebel","jib","jibe","jiffy","jig", +"jigger","jiggered","jiggle","jigsaw","jihad", +"jilt","jiminy","jimjams","jimmy","jingle", +"jingo","jingoism","jinks","jinn","jinrikisha", +"jinx","jitney","jitterbug","jitters","jiujitsu", +"jive","jnr","job","jobber","jobbery", +"jobbing","jobless","jockey","jockstrap","jocose", +"jocular","jocund","jodhpurs","jog","joggle", +"john","johnny","join","joiner","joinery", +"joint","joist","joke","joker","jollification", +"jollity","jolly","jolt","jolty","jonah", +"jonquil","josh","jostle","jot","jotter", +"jotting","joule","journal","journalese","journalism", 
+"journalist","journey","journeyman","joust","jove", +"jovial","jowl","joy","joyful","joyless", +"joyous","joyride","joystick","jubilant","jubilation", +"jubilee","judaic","judaism","judder","judge", +"judgement","judgment","judicature","judicial","judiciary", +"judicious","judo","jug","juggernaut","juggle", +"juice","juicy","jujitsu","juju","jujube", +"jukebox","julep","july","jumble","jumbo", +"jump","jumper","jumps","jumpy","junction", +"juncture","june","jungle","junior","juniper", +"junk","junket","junketing","junkie","junky", +"junoesque","junta","jupiter","juridical","jurisdiction", +"jurisprudence","jurist","juror","jury","juryman", +"just","justice","justifiable","justification","justified", +"justify","jut","jute","juvenile","juxtapose", +"juxtaposition","kaffir","kafir","kaftan","kail", +"kaiser","kale","kaleidoscope","kaleidoscopic","kalends", +"kampong","kangaroo","kaolin","kapok","kappa", +"kaput","karat","karate","karma","katydid", +"kayak","kazoo","kebab","kebob","kedgeree", +"keel","keelhaul","keen","keep","keeper", +"keeping","keeps","keepsake","keg","kelp", +"kelvin","ken","kennel","kennels","kepi", +"kept","kerb","kerchief","kerfuffle","kernel", +"kerosene","kerosine","kersey","kestrel","ketch", +"ketchup","kettle","kettledrum","key","keyboard", +"keyhole","keyless","keynote","keypunch","keystone", +"khaki","khalif","khalifate","khan","kibbutz", +"kibosh","kick","kickback","kicker","kickoff", +"kicks","kid","kiddie","kiddy","kidnap", +"kidney","kike","kill","killer","killing", +"killjoy","kiln","kilo","kilogram","kilogramme", +"kilohertz","kiloliter","kilolitre","kilometer","kilometre", +"kilowatt","kilt","kimono","kin","kind", +"kindergarten","kindle","kindling","kindly","kindness", +"kindred","kine","kinetic","kinetics","kinfolk", +"king","kingcup","kingdom","kingfisher","kingly", +"kingmaker","kingpin","kings","kingship","kink", +"kinky","kinsfolk","kinship","kinsman","kiosk", +"kip","kipper","kirk","kirsch","kirtle", 
+"kismet","kiss","kisser","kit","kitchen", +"kitchenette","kite","kitsch","kitten","kittenish", +"kittiwake","kitty","kiwi","klaxon","kleenex", +"kleptomania","kleptomaniac","knack","knacker","knackered", +"knapsack","knave","knavery","knead","knee", +"kneecap","kneel","knell","knew","knickerbockers", +"knickers","knife","knight","knighthood","knightly", +"knit","knitter","knitting","knitwear","knives", +"knob","knobbly","knobkerrie","knock","knockabout", +"knockdown","knocker","knockers","knockout","knoll", +"knot","knothole","knotty","knout","know", +"knowing","knowingly","knowledge","knowledgeable","known", +"knuckle","koala","kohl","kohlrabi","kookaburra", +"kopeck","kopek","kopje","koppie","koran", +"kosher","kowtow","kraal","kremlin","kris", +"krona","krone","kudos","kukri","kumis", +"kumquat","kuomintang","kurus","kvass","kwashiorkor", +"kwela","laager","lab","label","labial", +"labor","laboratory","laborer","laborious","labour", +"labourer","labourite","labrador","laburnum","labyrinth", +"lace","lacerate","laceration","lachrymal","lachrymose", +"lack","lackadaisical","lackey","lacking","lackluster", +"lacklustre","laconic","lacquer","lacrosse","lactation", +"lactic","lactose","lacuna","lacy","lad", +"ladder","laddie","laddy","laden","ladies", +"lading","ladle","lady","ladybird","ladylike", +"ladyship","lag","lager","laggard","lagging", +"lagoon","laid","lain","lair","laird", +"laity","lake","lam","lama","lamaism", +"lamasery","lamb","lambaste","lambent","lambkin", +"lamblike","lambskin","lame","lament","lamentable", +"lamentation","laminate","lamming","lamp","lampoon", +"lamppost","lamprey","lampshade","lance","lancer", +"lancers","lancet","land","landau","landed", +"landfall","landing","landlady","landlocked","landlord", +"landlubber","landmark","landmine","lands","landscape", +"landslide","landslip","landward","landwards","lane", +"language","languid","languish","languor","lank", +"lanky","lanolin","lantern","lanternslide","lanyard", 
+"lap","lapdog","lapel","lapidary","lapse", +"lapsed","lapwing","larboard","larceny","larch", +"lard","larder","large","largely","largess", +"largesse","largo","lariat","lark","larkspur", +"larrup","larva","laryngeal","laryngitis","laryngoscope", +"larynx","lasagna","lascivious","laser","lash", +"lashing","lashings","lass","lasso","last", +"lasting","lastly","lat","latch","latchkey", +"late","latecomer","lately","latent","lateral", +"latest","latex","lath","lathe","lather", +"latin","latinise","latinize","latitude","latitudes", +"latitudinal","latitudinarian","latrine","latter","latterly", +"lattice","laud","laudable","laudanum","laudatory", +"laugh","laughable","laughingstock","laughter","launch", +"launder","launderette","laundress","laundry","laureate", +"laurel","laurels","lava","lavatory","lave", +"lavender","lavish","law","lawful","lawless", +"lawn","lawsuit","lawyer","lax","laxative", +"laxity","lay","layabout","layer","layette", +"layman","layout","laze","lazy","lbw", +"lcm","lea","leach","lead","leaden", +"leader","leadership","leading","leads","leaf", +"leafage","leafed","leaflet","leafy","league", +"leak","leakage","leaky","lean","leaning", +"leap","leapfrog","learn","learned","learner", +"learning","lease","leasehold","leash","least", +"leastways","leather","leatherette","leathery","leave", +"leaved","leaven","leavening","leaves","leavings", +"lech","lecher","lecherous","lechery","lectern", +"lecture","lecturer","lectureship","led","ledge", +"ledger","lee","leech","leek","leer", +"leery","lees","leeward","leeway","left", +"leftist","leftovers","leftward","leftwards","leg", +"legacy","legal","legalise","legality","legalize", +"legate","legatee","legation","legato","legend", +"legendary","leger","legerdemain","legged","leggings", +"leggy","legible","legion","legionary","legislate", +"legislation","legislative","legislator","legislature","legit", +"legitimate","legitimatise","legitimatize","legroom","legume", 
+"leguminous","lei","leisure","leisured","leisurely", +"leitmotif","leitmotive","lemming","lemon","lemonade", +"lemur","lend","length","lengthen","lengthways", +"lengthy","lenience","lenient","lenity","lens", +"lent","lentil","lento","leo","leonine", +"leopard","leotard","leper","leprechaun","leprosy", +"lesbian","lesion","less","lessee","lessen", +"lesser","lesson","lessor","lest","let", +"letdown","lethal","lethargy","letraset","letter", +"letterbox","lettered","letterhead","lettering","letterpress", +"letters","letting","lettuce","letup","leucocyte", +"leucotomy","leukaemia","leukemia","leukocyte","levee", +"level","leveler","leveller","lever","leverage", +"leveret","leviathan","levitate","levity","levodopa", +"levy","lewd","lexical","lexicographer","lexicography", +"lexicon","lexis","liability","liable","liaise", +"liaison","liana","liar","lib","libation", +"libel","libellous","libelous","liberal","liberalise", +"liberalism","liberality","liberalize","liberally","liberate", +"liberated","liberation","libertarian","liberties","libertine", +"liberty","libidinous","libido","libra","librarian", +"library","librettist","libretto","lice","licence", +"licenced","license","licensed","licensee","licentiate", +"licentious","lichen","licit","lick","licking", +"licorice","lid","lido","lie","lieder", +"lief","liege","lien","lieu","lieutenant", +"life","lifeblood","lifeboat","lifeguard","lifeless", +"lifelike","lifeline","lifelong","lifer","lifetime", +"lift","liftboy","ligament","ligature","light", +"lighten","lighter","lighterage","lighthouse","lighting", +"lightly","lightness","lightning","lights","lightship", +"lightweight","ligneous","lignite","likable","like", +"likeable","likelihood","likely","liken","likeness", +"likes","likewise","liking","lilac","lilliputian", +"lilo","lilt","lily","limb","limber", +"limbo","lime","limeade","limejuice","limekiln", +"limelight","limerick","limestone","limey","limit", +"limitation","limited","limiting","limitless","limn", 
+"limousine","limp","limpet","limpid","limy", +"linchpin","linctus","linden","line","lineage", +"lineal","lineament","linear","lineman","linen", +"lineout","liner","linertrain","lines","lineshooter", +"linesman","lineup","ling","linger","lingerie", +"lingering","lingo","lingual","linguist","linguistic", +"linguistics","liniment","lining","link","linkage", +"linkman","links","linkup","linnet","linocut", +"linoleum","linotype","linseed","lint","lintel", +"lion","lionize","lip","lipid","lipstick", +"liquefaction","liquefy","liquescent","liqueur","liquid", +"liquidate","liquidation","liquidator","liquidity","liquidize", +"liquidizer","liquor","liquorice","lira","lisle", +"lisp","lissom","lissome","list","listen", +"listenable","listener","listless","lists","lit", +"litany","litchi","liter","literacy","literal", +"literally","literary","literate","literati","literature", +"lithe","lithium","lithograph","lithographic","lithography", +"litigant","litigate","litigation","litigious","litmus", +"litotes","litre","litter","litterateur","litterbin", +"litterlout","little","littoral","liturgical","liturgy", +"livable","live","liveable","livelihood","livelong", +"lively","liven","liver","liveried","liverish", +"livery","liveryman","lives","livestock","livid", +"living","lizard","llama","load","loaded", +"loadstar","loadstone","loaf","loafsugar","loam", +"loan","loanword","loath","loathe","loathing", +"loathsome","loaves","lob","lobby","lobed", +"lobotomy","lobster","lobsterpot","local","locale", +"localise","localism","locality","localize","locally", +"locate","located","location","loch","loci", +}; +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java new file mode 100644 index 00000000000..c917c7ace29 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java @@ -0,0 +1,715 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. + */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData5 { + private KStemData5() { + } + static String[] data = { +"lock","locker","locket","lockjaw","locknut", +"lockout","locks","locksmith","lockstitch","lockup", +"loco","locomotion","locomotive","locum","locus", +"locust","locution","lode","lodestar","lodestone", +"lodge","lodgement","lodger","lodging","lodgings", +"lodgment","loess","loft","lofted","lofty", +"log","loganberry","logarithm","logarithmic","logbook", +"logger","loggerheads","loggia","logic","logical", +"logically","logician","logistic","logistics","logjam", +"logrolling","loin","loincloth","loins","loiter", +"loll","lollipop","lollop","lolly","lone", +"lonely","loner","lonesome","long","longboat", +"longbow","longevity","longhaired","longhand","longheaded", +"longhop","longing","longish","longitude","longitudinal", +"longship","longshoreman","longsighted","longstanding","longstop", +"longsuffering","longueur","longways","longwearing","longwinded", +"longwise","loo","loofa","loofah","look", +"looker","lookout","looks","loom","loon", +"loony","loop","loophole","loose","loosebox", +"loosen","loot","lop","lope","loppings", +"loquacious","loquat","lord","lordly","lords", +"lordship","lore","lorgnette","lorn","lorry", +"lose","loser","loss","lost","lot", +"loth","lotion","lottery","lotto","lotus", +"loud","loudhailer","loudmouth","loudspeaker","lough", +"lounge","lounger","lour","louse","lousy", +"lout","louver","louvre","lovable","love", +"loveable","lovebird","lovechild","loveless","lovelorn", +"lovely","lovemaking","lover","lovers","lovesick", +"lovey","loving","low","lowborn","lowbred", 
+"lowbrow","lowdown","lower","lowermost","lowland", +"lowlander","lowly","loyal","loyalist","loyalty", +"lozenge","lsd","ltd","lubber","lubricant", +"lubricate","lubricator","lubricious","lucerne","lucid", +"luck","luckless","lucky","lucrative","lucre", +"ludicrous","ludo","luff","lug","luggage", +"lugger","lughole","lugsail","lugubrious","lugworm", +"lukewarm","lull","lullaby","lumbago","lumbar", +"lumber","lumberjack","lumberman","lumberyard","luminary", +"luminous","lumme","lummox","lummy","lump", +"lumpish","lumpy","lunacy","lunar","lunate", +"lunatic","lunch","lunchtime","lung","lunge", +"lungfish","lungpower","lupin","lurch","lure", +"lurgy","lurid","lurk","luscious","lush", +"lust","luster","lustful","lustre","lustrous", +"lusty","lutanist","lute","lutenist","luv", +"luxuriant","luxuriate","luxurious","luxury","lychee", +"lychgate","lye","lymph","lymphatic","lynch", +"lynx","lyre","lyrebird","lyric","lyrical", +"lyricism","lyricist","lyrics","mac","macabre", +"macadam","macadamise","macadamize","macaroni","macaroon", +"macaw","mace","macerate","mach","machete", +"machiavellian","machination","machine","machinegun","machinery", +"machinist","mackerel","mackintosh","macrobiotic","macrocosm", +"mad","madam","madame","madcap","madden", +"maddening","madder","made","madeira","mademoiselle", +"madhouse","madly","madman","madness","madonna", +"madrigal","maelstrom","maenad","maestro","mafia", +"mag","magazine","magenta","maggot","maggoty", +"magi","magic","magical","magician","magisterial", +"magistracy","magistrate","magma","magnanimity","magnanimous", +"magnate","magnesia","magnesium","magnet","magnetic", +"magnetise","magnetism","magnetize","magneto","magnificat", +"magnification","magnificent","magnifier","magnify","magniloquent", +"magnitude","magnolia","magnum","magpie","magus", +"maharaja","maharajah","maharanee","maharani","mahatma", +"mahlstick","mahogany","mahout","maid","maiden", +"maidenhair","maidenhead","maidenhood","maidenly","maidservant", 
+"mail","mailbag","mailbox","maim","main", +"mainland","mainline","mainly","mainmast","mains", +"mainsail","mainspring","mainstay","mainstream","maintain", +"maintenance","maisonette","maisonnette","maize","majestic", +"majesty","majolica","major","majordomo","majorette", +"majority","make","maker","makeshift","making", +"makings","malachite","maladjusted","maladministration","maladroit", +"malady","malaise","malapropism","malapropos","malaria", +"malarial","malay","malcontent","malcontented","male", +"malediction","malefactor","maleficent","malevolent","malfeasance", +"malformation","malformed","malfunction","malice","malicious", +"malign","malignancy","malignant","malignity","malinger", +"mall","mallard","malleable","mallet","mallow", +"malmsey","malnutrition","malodorous","malpractice","malt", +"malthusian","maltreat","maltster","mama","mamba", +"mambo","mamma","mammal","mammary","mammon", +"mammoth","mammy","man","manacle","manage", +"manageable","management","manager","manageress","managerial", +"manatee","mandarin","mandate","mandatory","mandible", +"mandolin","mandrake","mandrill","maneuver","maneuverable", +"manful","manganese","mange","manger","mangle", +"mango","mangosteen","mangrove","mangy","manhandle", +"manhole","manhood","manhour","mania","maniac", +"maniacal","manic","manicure","manicurist","manifest", +"manifestation","manifesto","manifold","manikin","manila", +"manilla","manipulate","manipulation","mankind","manly", +"manna","manned","mannequin","manner","mannered", +"mannerism","mannerly","manners","mannikin","mannish", +"manoeuverable","manoeuvre","manometer","manor","manorial", +"manpower","mansard","manse","manservant","mansion", +"mansions","manslaughter","mantelpiece","mantelshelf","mantilla", +"mantis","mantle","mantrap","manual","manufacture", +"manufacturer","manumit","manure","manuscript","manx", +"many","maoism","maori","map","maple", +"mapping","maquis","mar","marabou","marabout", +"maraschino","marathon","maraud","marble","marbled", 
+"marbles","marc","marcasite","march","marchioness", +"margarine","margin","marginal","marguerite","marigold", +"marihuana","marijuana","marimba","marina","marinade", +"marinate","marine","mariner","marionette","marital", +"maritime","marjoram","mark","markdown","marked", +"marker","market","marketeer","marketer","marketing", +"marketplace","marking","marksman","marksmanship","markup", +"marl","marlinespike","marmalade","marmoreal","marmoset", +"marmot","marocain","maroon","marquee","marquess", +"marquetry","marquis","marriage","marriageable","married", +"marrow","marrowbone","marrowfat","marry","mars", +"marsala","marseillaise","marsh","marshal","marshmallow", +"marshy","marsupial","mart","marten","martial", +"martian","martin","martinet","martini","martinmas", +"martyr","martyrdom","marvel","marvellous","marvelous", +"marxism","marzipan","mascara","mascot","masculine", +"masculinity","maser","mash","mashie","mask", +"masked","masochism","mason","masonic","masonry", +"masque","masquerade","mass","massacre","massage", +"masses","masseur","massif","massive","massy", +"mast","mastectomy","master","masterful","masterly", +"mastermind","masterpiece","mastership","masterstroke","mastery", +"masthead","mastic","masticate","mastiff","mastitis", +"mastodon","mastoid","mastoiditis","masturbate","mat", +"matador","match","matchbox","matching","matchless", +"matchlock","matchmaker","matchstick","matchwood","mate", +"material","materialise","materialism","materialist","materialize", +"maternal","maternity","matey","mathematician","mathematics", +"matins","matriarch","matriarchy","matricide","matriculate", +"matrimony","matrix","matron","matronly","matt", +"matter","matting","mattins","mattock","mattress", +"maturation","mature","maturity","maudlin","maul", +"maulstick","maunder","mausoleum","mauve","maverick", +"maw","mawkish","maxi","maxim","maximal", +"maximise","maximize","maximum","may","maybe", +"maybeetle","mayday","mayfly","mayhem","mayonnaise", 
+"mayor","mayoralty","mayoress","maypole","mayst", +"maze","mazed","mazurka","mccarthyism","mead", +"meadow","meadowsweet","meager","meagre","meal", +"mealie","mealtime","mealy","mealybug","mean", +"meander","meanderings","meaning","meaningful","meaningless", +"means","meant","meantime","meanwhile","measles", +"measly","measurable","measure","measured","measureless", +"measurement","meat","meatball","meaty","mecca", +"mechanic","mechanical","mechanics","mechanise","mechanism", +"mechanistic","mechanize","medal","medalist","medallion", +"medallist","meddle","meddlesome","media","mediaeval", +"medial","median","mediate","medic","medical", +"medicament","medicare","medicate","medication","medicinal", +"medicine","medico","medieval","mediocre","mediocrity", +"meditate","meditation","meditative","mediterranean","medium", +"medlar","medley","meed","meek","meerschaum", +"meet","meeting","meetinghouse","megadeath","megahertz", +"megalith","megalithic","megalomania","megalomaniac","megaphone", +"megaton","megrim","meiosis","melancholia","melancholic", +"melancholy","meld","melee","meliorate","meliorism", +"mellifluous","mellow","melodic","melodious","melodrama", +"melodramatic","melody","melon","melt","melting", +"member","membership","membrane","membranous","memento", +"memo","memoir","memoirs","memorabilia","memorable", +"memorandum","memorial","memorise","memorize","memory", +"memsahib","men","menace","menagerie","mend", +"mendacious","mendacity","mendelian","mendicant","mending", +"menfolk","menial","meningitis","meniscus","menopause", +"menses","menstrual","menstruate","mensurable","mensuration", +"mental","mentality","menthol","mentholated","mention", +"mentor","menu","meow","mephistopheles","mercantile", +"mercenary","mercer","mercerise","mercerize","merchandise", +"merchant","merchantman","merciful","merciless","mercurial", +"mercury","mercy","mere","merely","meretricious", +"merge","merger","meridian","meridional","meringue", 
+"merino","merit","meritocracy","meritorious","mermaid", +"merman","merriment","merry","merrymaking","mesa", +"mescalin","mescaline","mesdames","mesdemoiselles","meseems", +"mesh","mesmeric","mesmerise","mesmerism","mesmerist", +"mesmerize","mess","message","messenger","messiah", +"messianic","messieurs","messmate","messrs","messuage", +"messy","mestizo","met","metabolic","metabolise", +"metabolism","metabolize","metacarpal","metal","metalanguage", +"metallic","metallurgist","metallurgy","metalwork","metamorphose", +"metamorphosis","metaphor","metaphorical","metaphysics","metatarsal", +"mete","metempsychosis","meteor","meteoric","meteorite", +"meteoroid","meteorologist","meteorology","meter","methane", +"methinks","method","methodical","methodism","methodology", +"meths","methuselah","meticulous","metre","metric", +"metrical","metrication","metricise","metricize","metro", +"metronome","metropolis","metropolitan","mettle","mettlesome", +"mew","mews","mezzanine","mezzo","mezzotint", +"miaow","miasma","mica","mice","michaelmas", +"mick","mickey","microbe","microbiologist","microbiology", +"microcosm","microelectronics","microfiche","microfilm","micromesh", +"micrometer","micron","microorganism","microphone","microscope", +"microscopic","microsecond","microwave","mid","midair", +"midcourse","midday","midden","middle","middlebrow", +"middleman","middleweight","middling","midge","midget", +"midi","midland","midlands","midmost","midnight", +"midpoint","midriff","midshipman","midships","midst", +"midsummer","midway","midweek","midwest","midwicket", +"midwife","midwifery","mien","miffed","might", +"mightily","mighty","mignonette","migraine","migrant", +"migrate","migration","migratory","mikado","mike", +"milady","mild","mildew","mildly","mile", +"mileage","mileometer","miler","milestone","milieu", +"militancy","militant","militarise","militarism","militarize", +"military","militate","militia","militiaman","milk", +"milker","milkmaid","milkman","milksop","milkweed", 
+"milky","mill","millboard","milldam","millenarian", +"millenium","millepede","miller","millet","millibar", +"milligram","milligramme","milliliter","millilitre","millimeter", +"millimetre","milliner","millinery","million","millionaire", +"millipede","millpond","millrace","millstone","millwheel", +"millwright","milometer","milord","milt","mime", +"mimeograph","mimetic","mimic","mimicry","mimosa", +"min","minaret","minatory","mince","mincemeat", +"mincer","mincingly","mind","minded","mindful", +"mindless","mine","minefield","minelayer","miner", +"mineral","mineralogist","mineralogy","minestrone","minesweeper", +"mingle","mingy","mini","miniature","miniaturist", +"minibus","minim","minimal","minimise","minimize", +"minimum","mining","minion","minister","ministerial", +"ministrant","ministration","ministry","miniver","mink", +"minnow","minor","minority","minotaur","minster", +"minstrel","minstrelsy","mint","minuet","minus", +"minuscule","minute","minutely","minuteman","minutes", +"minutia","minx","miracle","miraculous","mirage", +"mire","mirror","mirth","miry","misadventure", +"misadvise","misalliance","misanthrope","misanthropy","misapplication", +"misapply","misapprehend","misapprehension","misappropriate","misbegotten", +"misbehave","misbehaved","misbehavior","misbehaviour","miscalculate", +"miscall","miscarry","miscast","miscegenation","miscellaneous", +"miscellany","mischance","mischief","mischievous","misconceive", +"misconception","misconduct","misconstruction","misconstrue","miscount", +"miscreant","miscue","misdate","misdeal","misdeed", +"misdemeanor","misdemeanour","misdirect","misdoing","miser", +"miserable","miserably","miserly","misery","misfire", +"misfit","misfortune","misgiving","misgovern","misguide", +"misguided","mishandle","mishap","mishear","mishit", +"mishmash","misinform","misinterpret","misjudge","misjudgement", +"misjudgment","mislay","mislead","mismanage","mismatch", +"misname","misnomer","misogynist","misogyny","misplace", 
+"misprint","mispronounce","mispronunciation","misquote","misread", +"misreport","misrepresent","misrule","miss","missal", +"misshapen","missile","missing","mission","missionary", +"missis","missive","misspell","misspend","misstate", +"misstatement","missus","missy","mist","mistake", +"mistaken","mister","mistime","mistletoe","mistral", +"mistranslate","mistress","mistrial","mistrust","mistrustful", +"mists","misty","misunderstand","misunderstanding","misuse", +"mite","miter","mitigate","mitosis","mitre", +"mitt","mitten","mix","mixed","mixer", +"mixture","mizen","mizzen","mizzenmast","mizzle", +"mnemonic","mnemonics","moa","moan","moat", +"moated","mob","mobile","mobilisation","mobilise", +"mobility","mobilization","mobilize","mobster","moccasin", +"mocha","mock","mockers","mockery","mockingbird", +"modal","mode","model","moderate","moderately", +"moderation","moderations","moderato","moderator","modern", +"modernise","modernism","modernistic","modernity","modernize", +"modest","modesty","modicum","modification","modifier", +"modify","modish","mods","modular","modulate", +"modulation","module","moggy","mogul","moh", +"mohair","mohammedan","mohammedanism","moiety","moist", +"moisten","moisture","moisturise","moisturize","moke", +"molar","molasses","mold","molder","molding", +"moldy","mole","molecular","molecule","molehill", +"moleskin","molest","moll","mollify","mollusc", +"mollusk","mollycoddle","molt","molten","molto", +"molybdenum","mom","moment","momentarily","momentary", +"momentous","moments","momentum","momma","mommy", +"monarch","monarchic","monarchism","monarchist","monarchy", +"monastery","monastic","monasticism","monaural","monday", +"monetary","money","moneybags","moneybox","moneychanger", +"moneyed","moneylender","moneymaker","moneys","monger", +"mongol","mongolism","mongoose","mongrel","monies", +"monitor","monk","monkey","mono","monochrome", +"monocle","monogamous","monogamy","monogram","monograph", 
+"monolith","monolithic","monolog","monologue","monomania", +"monomaniac","mononucleosis","monophonic","monophthong","monoplane", +"monopolise","monopolist","monopolize","monopoly","monorail", +"monosyllabic","monosyllable","monotheism","monotone","monotonous", +"monotony","monotype","monoxide","monsieur","monsignor", +"monsoon","monster","monstrance","monstrosity","monstrous", +"montage","month","monthly","monument","monumental", +"monumentally","moo","mooch","moocow","mood", +"moody","moon","moonbeam","mooncalf","moonlight", +"moonlit","moonshine","moonstone","moonstruck","moony", +"moor","moorhen","moorings","moorish","moorland", +"moose","moot","mop","mope","moped", +"moppet","moquette","moraine","moral","morale", +"moralise","moralist","moralistic","morality","moralize", +"morally","morals","morass","moratorium","morbid", +"morbidity","mordant","more","morello","moreover", +"mores","moresque","morganatic","morgue","moribund", +"mormon","mormonism","morn","morning","mornings", +"morocco","moron","moronic","morose","morpheme", +"morphemics","morpheus","morphine","morphology","morrow", +"morsel","mortal","mortality","mortally","mortar", +"mortarboard","mortgage","mortgagee","mortgagor","mortice", +"mortician","mortification","mortify","mortise","mortuary", +"mosaic","moselle","mosey","moslem","mosque", +"mosquito","moss","mossy","most","mostly", +"mote","motel","motet","moth","mothball", +"mothballs","mother","motherhood","motherly","mothproof", +"motif","motion","motionless","motions","motivate", +"motivation","motive","motley","motocross","motor", +"motorbike","motorboat","motorcade","motorcar","motorcycle", +"motorcyclist","motoring","motorise","motorist","motorize", +"motorman","motorway","mottled","motto","mould", +"moulder","moulding","mouldy","moult","mound", +"mount","mountain","mountaineer","mountaineering","mountainous", +"mountainside","mountaintop","mountebank","mountie","mourn", +"mourner","mournful","mourning","mouse","mouser", 
+"mousetrap","moussaka","mousse","moustache","mousy", +"mouth","mouthful","mouthorgan","mouthpiece","mouthwash", +"movable","move","moveable","movement","movements", +"mover","movie","movies","moving","mow", +"mower","mpg","mph","mra","mrs", +"msc","much","muchness","mucilage","muck", +"muckheap","muckrake","mucky","mucous","mucus", +"mud","muddle","muddy","mudflat","mudguard", +"mudpack","mudslinger","muesli","muezzin","muff", +"muffin","muffle","muffler","mufti","mug", +"mugger","muggins","muggy","mugwump","muhammadan", +"muhammadanism","mulatto","mulberry","mulch","mulct", +"mule","muleteer","mulish","mull","mullah", +"mullet","mulligatawny","mullion","mullioned","multifarious", +"multiform","multilateral","multilingual","multimillionaire","multiple", +"multiplex","multiplication","multiplicity","multiply","multiracial", +"multistorey","multitude","multitudinous","mum","mumble", +"mummer","mummery","mummify","mumming","mummy", +"mumps","munch","mundane","municipal","municipality", +"munificence","munificent","muniments","munition","munitions", +"mural","murder","murderous","murk","murky", +"murmur","murphy","murrain","muscatel","muscle", +"muscled","muscleman","muscovite","muscular","muse", +"museum","mush","mushroom","mushy","music", +"musical","musically","musician","musicianship","musk", +"musket","musketeer","musketry","muskmelon","muskrat", +"musky","muslim","muslin","musquash","muss", +"mussel","must","mustache","mustachio","mustang", +"mustard","muster","musty","mutable","mutant", +"mutation","mute","muted","mutilate","mutilation", +"mutineer","mutinous","mutiny","mutt","mutter", +"mutton","muttonchops","mutual","mutuality","muzak", +"muzzle","muzzy","mycology","myelitis","myna", +"mynah","myopia","myriad","myrrh","myrtle", +"myself","mysterious","mystery","mystic","mystical", +"mysticism","mystification","mystify","mystique","myth", +"mythical","mythological","mythologist","mythology","myxomatosis", +"nab","nabob","nacelle","nacre","nadir", 
+"nag","naiad","nail","nailbrush","naive", +"naivete","naivety","naked","name","namedrop", +"nameless","namely","nameplate","namesake","nanny", +"nap","napalm","naphtha","naphthalene","napkin", +"nappy","narc","narcissism","narcissus","narcotic", +"nark","narky","narrate","narration","narrative", +"narrator","narrow","narrowly","narrows","narwhal", +"nasal","nasalise","nasalize","nascent","nasturtium", +"nasty","natal","nation","national","nationalise", +"nationalism","nationalist","nationalistic","nationality","nationalize", +"nationwide","native","nativity","nato","natter", +"natty","natural","naturalise","naturalism","naturalist", +"naturalistic","naturalize","naturally","naturalness","nature", +"naturism","naturopath","naught","naughty","nausea", +"nauseate","nauseous","nautch","nautical","nautilus", +"naval","nave","navel","navigable","navigate", +"navigation","navigator","navvy","navy","nay", +"nazi","nco","neanderthal","neapolitan","near", +"nearby","nearly","nearside","nearsighted","neat", +"nebula","nebular","nebulous","necessaries","necessarily", +"necessary","necessitate","necessitous","necessity","neck", +"neckband","neckerchief","necklace","necklet","neckline", +"necktie","neckwear","necromancer","necromancy","necrophilia", +"necrophiliac","necropolis","nectar","nectarine","need", +"needful","needle","needless","needlessly","needlewoman", +"needlework","needs","needy","nefarious","negate", +"negative","neglect","neglectful","negligee","negligence", +"negligent","negligible","negotiable","negotiate","negotiation", +"negress","negro","negus","neigh","neighbor", +"neighborhood","neighboring","neighborly","neighbour","neighbourhood", +"neighbouring","neighbourly","neither","nelson","nemesis", +"neoclassical","neocolonialism","neolithic","neologism","neon", +"neonate","neophyte","neoplasm","nephew","nephritis", +"nepotism","neptune","nereid","nerve","nerveless", +"nerves","nervous","nervy","ness","nest", +"nesting","nestle","nestling","nestor","net", 
+"netball","nether","nethermost","nets","nett", +"netting","nettle","network","neural","neuralgia", +"neurasthenia","neurasthenic","neuritis","neurologist","neurology", +"neurosis","neurotic","neuter","neutral","neutralise", +"neutrality","neutralize","neutralizer","neutron","never", +"nevermore","nevertheless","new","newborn","newcomer", +"newel","newfangled","newfoundland","newly","newlywed", +"newmarket","news","newsagent","newsboy","newscast", +"newscaster","newsletter","newsmonger","newspaper","newsprint", +"newsreel","newsroom","newssheet","newsstand","newsvendor", +"newsworthy","newsy","newt","newtonian","next", +"nexus","nhs","niacin","nib","nibble", +"niblick","nibs","nice","nicely","nicety", +"niche","nick","nickel","nicker","nicknack", +"nickname","nicotine","niece","niff","nifty", +"niggard","niggardly","nigger","niggle","niggling", +"nigh","night","nightcap","nightclothes","nightclub", +"nightdress","nightfall","nighthawk","nightingale","nightjar", +"nightlife","nightlight","nightline","nightlong","nightly", +"nightmare","nights","nightshade","nightshirt","nightstick", +"nighttime","nihilism","nilotic","nimble","nimbus", +"nimrod","nincompoop","nine","ninepin","ninepins", +"nines","nineteen","ninety","ninny","ninth", +"nip","nipper","nippers","nipping","nipple", +"nippy","nirvana","nisi","nit","niter", +"nitpick","nitpicking","nitrate","nitre","nitric", +"nitrochalk","nitrogen","nitroglycerin","nitroglycerine","nitrous", +"nitwit","nix","nob","nobble","nobility", +"noble","nobleman","nobly","nobody","nocturnal", +"nocturne","nod","nodal","noddle","nodular", +"nodule","noel","noes","nog","noggin", +"nohow","noise","noisome","noisy","nomad", +"nomadic","nomenclature","nominal","nominate","nomination", +"nominative","nominee","nonage","nonagenarian","nonaggression", +"nonaligned","nonalignment","nonassertive","nonce","nonchalance", +"nonchalant","noncombatant","noncommittal","nonconductor","nonconformist", 
+"nonconformity","noncontributory","nondescript","none","nonentity", +"nonesuch","nonetheless","nonfiction","nonflammable","nonintervention", +"nonobservance","nonpareil","nonpayment","nonplus","nonproliferation", +"nonresident","nonrestrictive","nonsense","nonsensical","nonskid", +"nonsmoker","nonstandard","nonstarter","nonstick","nonstop", +"nonunion","nonverbal","nonviolence","nonviolent","nonwhite", +"noodle","nook","noon","noonday","noose", +"nope","nor","nordic","norm","normal", +"normalise","normality","normalize","normally","norman", +"normative","north","northbound","northeast","northeaster", +"northeasterly","northeastern","northeastward","northeastwards","northerly", +"northern","northerner","northernmost","northward","northwards", +"northwest","northwester","northwesterly","northwestern","northwestward", +"northwestwards","nos","nose","nosebag","nosebleed", +"nosecone","nosedive","nosegay","nosey","nosh", +"nostalgia","nostril","nostrum","nosy","not", +"notability","notable","notably","notarise","notarize", +"notary","notation","notch","note","notebook", +"notecase","noted","notepaper","noteworthy","nothing", +"nothingness","notice","noticeable","notifiable","notification", +"notify","notion","notional","notions","notoriety", +"notorious","notwithstanding","nougat","nought","noun", +"nourish","nourishment","nous","nova","novel", +"novelette","novelettish","novelist","novella","novelty", +"november","novice","noviciate","novitiate","novocaine", +"now","nowadays","nowhere","nowise","noxious", +"nozzle","nth","nuance","nub","nubile", +"nuclear","nucleus","nude","nudge","nudism", +"nudity","nugatory","nugget","nuisance","null", +"nullah","nullify","nullity","numb","number", +"numberless","numberplate","numbers","numbly","numbskull", +"numeracy","numeral","numerate","numeration","numerator", +"numerical","numerology","numerous","numinous","numismatic", +"numismatics","numskull","nun","nuncio","nunnery", +"nuptial","nuptials","nurse","nurseling","nursemaid", 
+"nursery","nurseryman","nursing","nursling","nurture", +"nut","nutcase","nutcracker","nuthouse","nutmeg", +"nutria","nutrient","nutriment","nutrition","nutritious", +"nutritive","nuts","nutshell","nutty","nuzzle", +"nylon","nylons","nymph","nymphet","nymphomania", +"nymphomaniac","oaf","oak","oaken","oakum", +"oap","oar","oarlock","oarsman","oarsmanship", +"oasis","oat","oatcake","oath","oatmeal", +"oats","obbligato","obdurate","obeah","obedient", +"obeisance","obelisk","obese","obey","obfuscate", +"obituary","object","objection","objectionable","objective", +"objector","oblation","obligate","obligation","obligatory", +"oblige","obliging","oblique","obliterate","oblivion", +"oblivious","oblong","obloquy","obnoxious","oboe", +"oboist","obscene","obscenity","obscurantism","obscure", +"obscurity","obsequies","obsequious","observable","observance", +"observant","observation","observations","observatory","observe", +"observer","observing","obsess","obsession","obsessional", +"obsessive","obsidian","obsolescent","obsolete","obstacle", +"obstetrician","obstetrics","obstinate","obstreperous","obstruct", +"obstruction","obstructionism","obstructive","obtain","obtainable", +"obtrude","obtrusive","obtuse","obverse","obviate", +"obvious","obviously","ocarina","occasion","occasional", +"occident","occidental","occult","occupancy","occupant", +"occupation","occupational","occupier","occupy","occur", +"occurrence","ocean","oceangoing","oceanography","ocelot", +"ocher","ochre","octagon","octane","octave", +"octavo","octet","october","octogenarian","octopus", +"octosyllabic","ocular","oculist","odalisque","odd", +"oddball","oddity","oddly","oddment","odds", +"ode","odious","odium","odor","odoriferous", +"odorous","odour","odyssey","oecumenical","oecumenicalism", +"oesophagus","oestrogen","off","offal","offbeat", +"offence","offend","offender","offense","offensive", +"offer","offering","offertory","offhand","office", +"officeholder","officer","offices","official","officialdom", 
+"officialese","officially","officiate","officious","offing", +"offish","offprint","offset","offshoot","offshore", +"offside","offspring","offstage","oft","often", +"ogle","ogre","ohm","oho","oil", +"oilcake","oilcan","oilcloth","oiled","oilfield", +"oilman","oilrig","oils","oilskin","oilskins", +"oily","oink","ointment","okapi","okay", +"okra","old","olden","oldish","oldster", +"oleaginous","oleander","oleograph","olfactory","oligarch", +"oligarchy","olive","olympiad","olympian","olympic", +"ombudsman","omega","omelet","omelette","omen", +"ominous","omission","omit","omnibus","omnipotent", +"omnipresent","omniscient","omnivorous","once","oncoming", +"one","onerous","oneself","onetime","ongoing", +"onion","onlooker","only","onomatopoeia","onrush", +"onset","onshore","onside","onslaught","onto", +"ontology","onus","onward","onwards","onyx", +"oodles","oof","oomph","oops","ooze", +"opacity","opal","opalescent","opaque","ope", +"open","opencast","opener","openhearted","opening", +"openly","openwork","opera","operable","operate", +"operation","operational","operative","operator","operetta", +"ophthalmia","ophthalmic","ophthalmology","ophthalmoscope","opiate", +"opine","opinion","opinionated","opium","opossum", +"opponent","opportune","opportunism","opportunity","oppose", +"opposite","opposition","oppress","oppression","oppressive", +"oppressor","opprobrious","opprobrium","ops","opt", +"optative","optic","optical","optician","optics", +"optimism","optimum","option","optional","opulence", +"opulent","opus","oracle","oracular","oral", +"orange","orangeade","orangeman","orangutang","oration", +"orator","oratorical","oratorio","oratory","orb", +"orbit","orchard","orchestra","orchestral","orchestrate", +"orchid","ordain","ordeal","order","ordered", +"orderly","orders","ordinal","ordinance","ordinand", +"ordinarily","ordinary","ordinate","ordination","ordnance", +"ordure","ore","oregano","organ","organdie", +"organdy","organic","organisation","organise","organised", 
+"organism","organist","organization","organize","organized", +"orgasm","orgiastic","orgy","orient","oriental", +"orientalist","orientate","orientation","orifice","origin", +"original","originality","originally","originate","oriole", +"orison","orlon","ormolu","ornament","ornamental", +"ornamentation","ornate","ornery","ornithology","orotund", +"orphan","orphanage","orrery","orrisroot","orthodontic", +"orthodontics","orthodox","orthodoxy","orthography","orthopaedic", +"orthopaedics","orthopedic","orthopedics","ortolan","oryx", +"oscar","oscillate","oscillation","oscillator","oscillograph", +"oscilloscope","osculation","osier","osmosis","osprey", +"osseous","ossification","ossify","ostensible","ostentation", +"osteoarthritis","osteopath","osteopathy","ostler","ostracise", +"ostracize","ostrich","other","otherwise","otherworldly", +"otiose","otter","ottoman","oubliette","ouch", +"ought","ounce","our","ours","ourselves", +"ousel","oust","out","outback","outbalance", +"outbid","outbound","outbrave","outbreak","outbuilding", +"outburst","outcast","outcaste","outclass","outcome", +"outcrop","outcry","outdated","outdistance","outdo", +"outdoor","outdoors","outer","outermost","outface", +"outfall","outfield","outfight","outfit","outflank", +"outflow","outfox","outgeneral","outgoing","outgoings", +"outgrow","outgrowth","outhouse","outing","outlandish", +"outlast","outlaw","outlay","outlet","outline", +"outlive","outlook","outlying","outmaneuver","outmanoeuvre", +"outmarch","outmatch","outmoded","outmost","outnumber", +"outpatient","outplay","outpoint","outpost","outpourings", +"output","outrage","outrageous","outrange","outrank", +"outride","outrider","outrigger","outright","outrival", +"outrun","outsell","outset","outshine","outside", +"outsider","outsize","outskirts","outsmart","outspoken", +"outspread","outstanding","outstay","outstretched","outstrip", +"outtalk","outvote","outward","outwardly","outwards", +"outwear","outweigh","outwit","outwork","outworn", 
+"ouzel","ouzo","ova","oval","ovarian", +"ovary","ovation","oven","ovenware","over", +"overact","overage","overall","overalls","overarch", +"overarm","overawe","overbalance","overbear","overbearing", +"overbid","overblown","overboard","overburden","overcall", +"overcapitalise","overcapitalize","overcast","overcharge","overcloud", +"overcoat","overcome","overcompensate","overcrop","overcrowd", +"overdevelop","overdo","overdone","overdose","overdraft", +"overdraw","overdrawn","overdress","overdrive","overdue", +"overestimate","overexpose","overflow","overfly","overgrown", +"overgrowth","overhand","overhang","overhaul","overhead", +"overheads","overhear","overjoyed","overkill","overland", +"overlap","overlay","overleaf","overleap","overload", +"overlong","overlook","overlord","overly","overman", +"overmaster","overmuch","overnight","overpass","overpay", +"overplay","overpopulated","overpopulation","overpower","overpowering", +"overprint","overrate","overreach","override","overriding", +"overrule","overrun","overseas","oversee","overseer", +"oversell","oversexed","overshadow","overshoe","overshoot", +"overside","oversight","oversimplify","oversleep","overspill", +"overstate","overstatement","overstay","oversteer","overstep", +"overstock","overstrung","overstuffed","oversubscribed","overt", +"overtake","overtax","overthrow","overtime","overtone", +"overtones","overtop","overtrump","overture","overtures", +"overturn","overweening","overweight","overwhelm","overwhelming", +"overwork","overwrought","oviduct","oviparous","ovoid", +"ovulate","ovum","owe","owl","owlet", +"owlish","own","owner","ownership","oxbridge", +"oxcart","oxeye","oxide","oxidise","oxidize", +"oxon","oxonian","oxtail","oxyacetylene","oxygen", +"oxygenate","oyez","oyster","oystercatcher","ozone", +"pabulum","pace","pacemaker","pacesetter","pachyderm", +"pacific","pacifier","pacifism","pacifist","pacify", +"pack","package","packed","packer","packet", +"packing","packsaddle","pact","pad","padding", 
+"paddle","paddock","paddy","padlock","padre", +"paean","paederast","paederasty","paediatrician","paediatrics", +"paella","paeony","pagan","paganism","page", +"pageant","pageantry","pagination","pagoda","paid", +"pail","paillasse","pain","pained","painful", +"painkiller","painless","pains","painstaking","paint", +"paintbrush","painter","painting","paints","paintwork", +"pair","paisley","pajama","pajamas","pal", +"palace","paladin","palais","palakeen","palanquin", +"palatable","palatal","palatalize","palate","palatial", +"palatinate","palaver","pale","paleface","paleography", +"paleolithic","paleontology","palette","palfrey","palimpsest", +"palindrome","paling","palings","palisade","palish", +"pall","palladian","pallbearer","pallet","palliasse", +"palliate","palliation","palliative","pallid","pallor", +"pally","palm","palmer","palmetto","palmist", +"palmistry","palmy","palomino","palpable","palpate", +"palpitate","palpitation","palsied","palsy","palter", +"paltry","pampas","pamper","pamphlet","pamphleteer", +"pan","panacea","panache","panama","panatela", +"panatella","pancake","panchromatic","pancreas","panda", +"pandemic","pandemonium","pander","pandit","panegyric", +"panel","paneling","panelist","panelling","panellist", +"pang","panhandle","panic","panicky","panjabi", +"panjandrum","pannier","pannikin","panoplied","panoply", +"panorama","panpipes","pansy","pant","pantaloon", +"pantaloons","pantechnicon","pantheism","pantheon","panther", +"panties","pantile","panto","pantograph","pantomime", +"pantry","pants","panty","panzer","pap", +"papa","papacy","papadum","papal","papaya", +"paper","paperback","paperboy","paperhanger","papers", +"paperweight","paperwork","papery","papist","papoose", +"pappy","paprika","papyrus","par","parable", +"parabola","parachute","parachutist","paraclete","parade", +"paradigm","paradigmatic","paradise","paradisiacal","paradox", +"paraffin","paragon","paragraph","parakeet","parallel", 
+"parallelism","parallelogram","paralyse","paralysis","paralytic", +"paralyze","paramilitary","paramount","paramountcy","paramour", +"paranoia","paranoiac","paranoid","parapet","paraphernalia", +"paraphrase","paraplegia","paraplegic","paraquat","paras", +"parasite","parasitic","parasol","parathyroid","paratrooper", +"paratroops","paratyphoid","parboil","parcel","parch", +"parchment","pard","pardon","pardonable","pardonably", +"pardoner","pare","parent","parentage","parental", +"parenthesis","parenthetic","parenthood","parer","parhelion", +"pariah","paring","parish","parishioner","parisian", +"parity","park","parka","parkin","parking", +"parkland","parky","parlance","parley","parliament", +"parliamentarian","parliamentary","parlor","parlour","parlous", +"parmesan","parochial","parodist","parody","parole", +"paroxysm","parquet","parr","parricide","parrot", +"parry","parse","parsee","parsi","parsimonious", +"parsimony","parsley","parsnip","parson","parsonage", +"part","partake","parterre","parthenogenesis","partial", +"partiality","partially","participant","participate","participation", +"participial","participle","particle","particular","particularise", +"particularity","particularize","particularly","particulars","parting", +"partisan","partita","partition","partitive","partizan", +"partly","partner","partnership","partook","partridge", +"parts","parturition","party","parvenu","paschal", +"pasha","pass","passable","passage","passageway", +"passbook","passenger","passerby","passim","passing", +"passion","passionate","passionately","passionflower","passive", +"passivity","passivize","passkey","passover","passport", +"password","past","pasta","paste","pasteboard", +"pastel","pastern","pasteurise","pasteurize","pastiche", +"pastille","pastime","pasting","pastor","pastoral", +"pastorale","pastorate","pastrami","pastry","pasturage", +"pasture","pasty","pat","patch","patchouli", +"patchwork","patchy","patella","patent","patentee", 
+"patently","pater","paterfamilias","paternal","paternalism", +"paternity","paternoster","path","pathan","pathetic", +"pathfinder","pathological","pathologist","pathology","pathos", +"pathway","patience","patient","patina","patio", +"patisserie","patois","patrial","patriarch","patriarchal", +"patriarchate","patriarchy","patrician","patricide","patrimony", +"patriot","patriotic","patriotism","patrol","patrolman", +"patron","patronage","patroness","patronise","patronize", +"patronymic","patten","patter","pattern","patty", +"paucity","paunch","paunchy","pauper","pauperise", +"pauperism","pauperize","pause","pavan","pavane", +"pave","paved","pavement","pavilion","paving", +"paw","pawky","pawl","pawn","pawnbroker", +"pawnshop","pawpaw","pay","payable","payday", +"payee","payer","payload","paymaster","payment", +"paynim","payoff","payola","payroll","pea", +"peace","peaceable","peaceful","peacekeeping","peacemaker", +"peacetime","peach","peachick","peacock","peafowl", +"peahen","peak","peaked","peaky","peal", +"peanut","peanuts","pear","pearl","pearly", +"pearmain","peasant","peasantry","peashooter","peat", +"pebble","pebbledash","pebbly","pecan","peccadillo", +"peccary","peck","pecker","peckish","pectic", +"pectin","pectoral","peculate","peculiar","peculiarity", +"peculiarly","pecuniary","pedagogue","pedagogy","pedal", +}; +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java new file mode 100644 index 00000000000..636af5f8251 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java @@ -0,0 +1,715 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. 
+ */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData6 { + private KStemData6() { + } + static String[] data = { +"pedant","pedantic","pedantry","peddle","peddler", +"pederast","pederasty","pedestal","pedestrian","pediatrician", +"pediatrics","pedicab","pedicel","pedicure","pedigree", +"pediment","pedlar","pedometer","pee","peek", +"peekaboo","peel","peeler","peelings","peep", +"peeper","peephole","peepul","peer","peerage", +"peeress","peerless","peeve","peevish","peewit", +"peg","pejorative","pekinese","pekingese","pekoe", +"pelagic","pelf","pelican","pellagra","pellet", +"pellucid","pelmet","pelota","pelt","pelvic", +"pelvis","pemican","pemmican","pen","penal", +"penalise","penalize","penalty","penance","pence", +"penchant","pencil","pendant","pendent","pending", +"pendulous","pendulum","penetrate","penetrating","penetration", +"penetrative","penguin","penicillin","peninsula","penis", +"penitent","penitential","penitentiary","penknife","penmanship", +"pennant","penniless","pennon","penny","pennyweight", +"pennywort","penology","pension","pensionable","pensioner", +"pensive","pentagon","pentagram","pentameter","pentateuch", +"pentathlon","pentecost","penthouse","penultimate","penumbra", +"penurious","penury","peon","peony","people", +"pep","pepper","peppercorn","peppermint","peppery", +"pepsin","peptic","per","peradventure","perambulate", +"perambulator","perceive","percentage","percentile","perceptible", +"perception","perceptive","perch","perchance","percipient", +"percolate","percolator","percussion","percussionist","perdition", +"peregrination","peremptory","perennial","perfect","perfectible", +"perfection","perfectionist","perfectly","perfidious","perfidy", +"perforate","perforation","perforce","perform","performance", +"performer","perfume","perfumier","perfunctory","pergola", +"perhaps","perigee","perihelion","peril","perilous", +"perimeter","period","periodic","periodical","periods", 
+"peripatetic","peripheral","periphery","periphrasis","periphrastic", +"periscope","perish","perishable","perisher","perishing", +"peristyle","peritonitis","periwig","periwinkle","perjure", +"perjurer","perjury","perk","perky","perm", +"permafrost","permanence","permanency","permanent","permanganate", +"permeable","permeate","permissible","permission","permissive", +"permit","permutation","permute","pernicious","pernickety", +"pernod","peroration","peroxide","perpendicular","perpetrate", +"perpetual","perpetuate","perpetuity","perplex","perplexed", +"perplexity","perquisite","perry","persecute","persecution", +"perseverance","persevere","persevering","persian","persiflage", +"persimmon","persist","persistence","persistent","persnickety", +"person","persona","personable","personage","personal", +"personalise","personalities","personality","personalize","personally", +"personification","personify","personnel","perspective","perspex", +"perspicacious","perspiration","perspire","persuade","persuasion", +"persuasive","pert","pertain","pertinacious","pertinent", +"perturb","perturbation","peruke","peruse","pervade", +"pervasive","perverse","perversion","perversity","pervert", +"peseta","pesky","peso","pessary","pessimism", +"pessimist","pest","pester","pesticide","pestiferous", +"pestilence","pestilent","pestle","pet","petal", +"petaled","petalled","petard","peterman","petite", +"petition","petitioner","petrel","petrifaction","petrify", +"petrochemical","petrol","petroleum","petrology","petticoat", +"pettifogging","pettish","petty","petulant","petunia", +"pew","pewit","pewter","peyote","pfennig", +"phaeton","phagocyte","phalanx","phalarope","phallic", +"phallus","phantasmagoria","phantasmal","phantasy","phantom", +"pharaoh","pharisaic","pharisee","pharmaceutical","pharmacist", +"pharmacology","pharmacopoeia","pharmacy","pharyngitis","pharynx", +"phase","phd","pheasant","phenobarbitone","phenol", +"phenomenal","phenomenally","phenomenon","phew","phi", 
+"phial","philander","philanthropic","philanthropist","philanthropy", +"philatelist","philately","philharmonic","philhellene","philippic", +"philistine","philological","philologist","philology","philosopher", +"philosophical","philosophise","philosophize","philosophy","philter", +"philtre","phizog","phlebitis","phlebotomy","phlegm", +"phlegmatic","phlox","phobia","phoenician","phoenix", +"phone","phoneme","phonemic","phonemics","phonetic", +"phonetician","phonetics","phoney","phonic","phonics", +"phonograph","phonology","phony","phooey","phosphate", +"phosphorescence","phosphorescent","phosphoric","phosphorus","photo", +"photocopier","photocopy","photoelectric","photogenic","photograph", +"photographer","photographic","photography","photosensitive","photosensitize", +"photostat","photosynthesis","phototsensitise","phrasal","phrase", +"phrasebook","phraseology","phrenetic","phrenology","phthisis", +"phut","phylloxera","phylum","physic","physical", +"physically","physician","physicist","physics","physio", +"physiognomy","physiology","physiotherapy","physique","pianissimo", +"pianist","piano","pianola","piaster","piastre", +"piazza","pibroch","picador","picaresque","piccalilli", +"piccaninny","piccolo","pick","pickaback","pickaninny", +"pickax","pickaxe","picked","picker","pickerel", +"picket","pickings","pickle","pickled","pickpocket", +"picky","picnic","picnicker","pictorial","picture", +"pictures","picturesque","piddle","piddling","pidgin", +"pie","piebald","piece","piecemeal","pieces", +"piecework","piecrust","pied","pier","pierce", +"piercing","pierrot","piety","piezoelectric","piffle", +"piffling","pig","pigeon","pigeonhole","piggery", +"piggish","piggy","piggyback","piggybank","pigheaded", +"piglet","pigment","pigmentation","pigmy","pignut", +"pigskin","pigsticking","pigsty","pigswill","pigtail", +"pike","pikestaff","pilaster","pilau","pilchard", +"pile","piles","pileup","pilfer","pilferage", +"pilgrim","pilgrimage","pill","pillage","pillar", 
+"pillbox","pillion","pillock","pillory","pillow", +"pillowcase","pilot","pimento","pimp","pimpernel", +"pimple","pin","pinafore","pincer","pincers", +"pinch","pinchbeck","pinched","pinchpenny","pincushion", +"pine","pineal","pineapple","pinecone","pinewood", +"piney","ping","pinhead","pinion","pink", +"pinkeye","pinkie","pinkish","pinko","pinky", +"pinnace","pinnacle","pinnate","pinny","pinpoint", +"pinprick","pinstripe","pint","pinta","pintable", +"pinup","pinwheel","piny","pioneer","pious", +"piousness","pip","pipal","pipe","pipeline", +"piper","pipes","pipette","piping","pipit", +"pippin","pipsqueak","piquant","pique","piquet", +"piracy","piranha","pirate","pirouette","piscatorial", +"pish","piss","pissed","pistachio","pistil", +"pistol","piston","pit","pitch","pitchblende", +"pitcher","pitchfork","piteous","pitfall","pith", +"pithead","pithy","pitiable","pitiful","pitiless", +"pitman","piton","pittance","pituitary","pity", +"pivot","pivotal","pixie","pixilated","pixy", +"pizza","pizzicato","placard","placate","place", +"placebo","placed","placekick","placement","placenta", +"placid","placket","plagarise","plagarize","plagiarism", +"plague","plaguey","plaice","plaid","plain", +"plainly","plainsman","plainsong","plainspoken","plaint", +"plaintiff","plaintive","plait","plan","planchette", +"planet","planetarium","planetary","plangent","plank", +"planking","plankton","planner","plant","plantain", +"plantation","planter","plaque","plash","plasma", +"plaster","plasterboard","plastered","plasterer","plastering", +"plastic","plasticine","plasticity","plastics","plastron", +"plate","plateau","platelayer","platform","plating", +"platinum","platitude","platonic","platoon","platter", +"platypus","plaudit","plausible","play","playable", +"playback","playbill","playboy","player","playful", +"playgoer","playground","playgroup","playhouse","playmate", +"playpen","playroom","playsuit","plaything","playtime", +"playwright","plaza","plea","pleach","plead", 
+"pleading","pleadings","pleasant","pleasantry","please", +"pleased","pleasing","pleasurable","pleasure","pleat", +"pleb","plebeian","plebiscite","plectrum","pled", +"pledge","pleistocene","plenary","plenipotentiary","plenitude", +"plenteous","plentiful","plenty","pleonasm","plethora", +"pleurisy","plexus","pliable","pliant","pliers", +"plight","plimsoll","plinth","pliocene","plod", +"plodder","plonk","plop","plosive","plot", +"plough","ploughboy","ploughman","ploughshare","plover", +"plow","plowboy","plowman","plowshare","ploy", +"pluck","plucky","plug","plughole","plum", +"plumage","plumb","plumbago","plumber","plumbing", +"plume","plumed","plummet","plummy","plump", +"plunder","plunge","plunger","plunk","pluperfect", +"plural","pluralism","plurality","pluribus","plus", +"plush","plushy","pluto","plutocracy","plutocrat", +"plutonium","ply","plywood","pneumatic","pneumoconiosis", +"pneumonia","poach","poacher","pock","pocked", +"pocket","pocketbook","pocketful","pocketknife","pockmark", +"pockmarked","pod","podgy","podiatry","podium", +"poem","poesy","poet","poetaster","poetess", +"poetic","poetical","poetry","pogrom","poignancy", +"poignant","poinsettia","point","pointed","pointer", +"pointillism","pointless","points","pointsman","poise", +"poised","poison","poisonous","poke","poker", +"pokerwork","poky","polack","polar","polarisation", +"polarise","polarity","polarization","polarize","polaroid", +"polaroids","polder","pole","poleax","poleaxe", +"polecat","polemic","polemical","polemics","police", +"policeman","policewoman","policy","polio","polish", +"polisher","politburo","polite","politic","politicalise", +"politicalize","politician","politicise","politicize","politicking", +"politico","politics","polity","polka","poll", +"pollard","pollen","pollinate","polling","pollster", +"pollutant","pollute","pollution","polly","pollyanna", +"polo","polonaise","polony","poltergeist","poltroon", +"poly","polyandrous","polyandry","polyanthus","polyester", 
+"polyethylene","polygamist","polygamous","polygamy","polyglot", +"polygon","polymath","polymer","polymorphous","polyp", +"polyphony","polypus","polystyrene","polysyllable","polytechnic", +"polytheism","polythene","polyurethane","pomade","pomander", +"pomegranate","pomeranian","pommel","pommy","pomp", +"pompom","pomposity","pompous","ponce","poncho", +"poncy","pond","ponder","ponderous","pone", +"pong","poniard","pontiff","pontifical","pontificals", +"pontificate","pontoon","pony","ponytail","pooch", +"poodle","poof","pooh","pool","poolroom", +"pools","poop","pooped","poor","poorhouse", +"poorly","poorness","poove","pop","popadam", +"popadum","popcorn","popery","popgun","popinjay", +"popish","poplar","poplin","poppa","popper", +"poppet","poppy","poppycock","popshop","popsy", +"populace","popular","popularise","popularity","popularize", +"popularly","populate","population","populism","populist", +"populous","porcelain","porch","porcine","porcupine", +"pore","pork","porker","porky","porn", +"pornography","porosity","porous","porphyry","porpoise", +"porridge","porringer","port","portable","portage", +"portal","portals","portcullis","portend","portent", +"portentous","porter","porterage","porterhouse","portfolio", +"porthole","portico","portion","portly","portmanteau", +"portrait","portraitist","portraiture","portray","portrayal", +"pose","poser","poseur","posh","posit", +"position","positional","positive","positively","positiveness", +"positivism","positron","posse","possess","possessed", +"possession","possessive","possessor","posset","possibility", +"possible","possibly","possum","post","postage", +"postal","postbag","postbox","postcard","postcode", +"postdate","poster","posterior","posterity","postern", +"postgraduate","posthaste","posthumous","postilion","postillion", +"posting","postman","postmark","postmaster","postmortem", +"postpaid","postpone","postprandial","postscript","postulant", +"postulate","posture","postwar","posy","pot", 
+"potable","potash","potassium","potation","potato", +"potbellied","potbelly","potboiler","potbound","poteen", +"potency","potent","potentate","potential","potentiality", +"pothead","pother","potherb","pothole","potholing", +"pothouse","pothunter","potion","potluck","potpourri", +"potsherd","potshot","pottage","potted","potter", +"potteries","pottery","potty","pouch","pouf", +"pouffe","poulterer","poultice","poultry","pounce", +"pound","poundage","pounding","pour","pout", +"poverty","powder","powdered","powdery","power", +"powerboat","powerful","powerhouse","powerless","powers", +"powwow","pox","pps","practicable","practical", +"practicality","practically","practice","practiced","practise", +"practised","practitioner","praesidium","praetor","praetorian", +"pragmatic","pragmatism","prairie","praise","praises", +"praiseworthy","praline","pram","prance","prank", +"prankster","prat","prate","pratfall","prattle", +"prawn","praxis","pray","prayer","preach", +"preachify","preamble","prearrange","prebend","prebendary", +"precarious","precast","precaution","precede","precedence", +"precedent","preceding","precentor","precept","preceptor", +"precession","precinct","precincts","preciosity","precious", +"precipice","precipitate","precipitation","precipitous","precise", +"precisely","precision","preclude","precocious","precognition", +"preconceived","preconception","precondition","precook","precursor", +"predator","predatory","predecease","predecessor","predestinate", +"predestination","predestine","predetermine","predeterminer","predicament", +"predicate","predicative","predict","predictable","prediction", +"predigest","predilection","predispose","predisposition","predominance", +"predominant","predominantly","predominate","preeminent","preeminently", +"preempt","preemption","preemptive","preen","preexist", +"preexistence","prefab","prefabricate","prefabricated","preface", +"prefatory","prefect","prefecture","prefer","preferable", 
+"preference","preferential","preferment","prefigure","prefix", +"pregnancy","pregnant","preheat","prehensile","prehistoric", +"prehistory","prejudge","prejudice","prejudiced","prejudicial", +"prelacy","prelate","prelim","preliminary","prelims", +"preliterate","prelude","premarital","premature","premeditate", +"premeditated","premier","premise","premises","premiss", +"premium","premonition","premonitory","prenatal","prentice", +"preoccupation","preoccupied","preoccupy","preordain","prep", +"prepack","preparation","preparatory","prepare","prepared", +"preparedness","prepay","preponderance","preponderant","preponderate", +"preposition","prepositional","prepossessed","prepossessing","prepossession", +"preposterous","prepuce","prerecord","prerequisite","prerogative", +"presage","presbyter","presbyterian","presbytery","preschool", +"prescient","prescribe","prescribed","prescript","prescription", +"prescriptive","presence","present","presentable","presentation", +"presenter","presentiment","presently","presents","preservable", +"preservation","preservative","preserve","preserver","preset", +"preshrunk","preside","presidency","president","presidential", +"presidium","press","pressed","pressgang","pressing", +"pressman","pressmark","pressure","pressurise","pressurize", +"prestidigitation","prestige","prestigious","prestissimo","presto", +"prestressed","presumable","presume","presumption","presumptive", +"presumptuous","presuppose","presupposition","pretence","pretend", +"pretended","pretender","pretense","pretension","pretentious", +"pretentiousness","preterit","preterite","preternatural","pretext", +"pretor","pretorian","prettify","prettily","pretty", +"pretzel","prevail","prevailing","prevalent","prevaricate", +"prevent","prevention","preventive","preview","previous", +"prevision","prewar","prey","price","priceless", +"pricey","prick","prickle","prickly","pricy", +"pride","priest","priesthood","priestly","prig", +"priggish","prim","primacy","primaeval","primal", 
+"primarily","primary","primate","prime","primer", +"primeval","priming","primitive","primogeniture","primordial", +"primp","primrose","primula","primus","prince", +"princedom","princely","princess","principal","principality", +"principally","principle","principled","principles","prink", +"print","printable","printer","printing","printout", +"prior","priority","priory","prise","prism", +"prismatic","prison","prisoner","prissy","pristine", +"prithee","privacy","private","privateer","privation", +"privet","privilege","privileged","privily","privy", +"prize","prizefight","prizeman","pro","probability", +"probable","probably","probate","probation","probationer", +"probe","probity","problem","problematic","proboscis", +"procedural","procedure","proceed","proceeding","proceedings", +"proceeds","process","procession","processional","proclaim", +"proclamation","proclivity","proconsul","proconsulate","procrastinate", +"procreate","proctor","procure","procurer","prod", +"prodigal","prodigious","prodigy","produce","producer", +"product","production","productive","productivity","proem", +"prof","profanation","profane","profanity","profess", +"professed","professedly","profession","professional","professionalism", +"professor","professorial","professorship","proffer","proficient", +"profile","profit","profitable","profiteer","profligacy", +"profligate","profound","profundity","profuse","profusion", +"progenitor","progeny","progesterone","prognathous","prognosis", +"prognostic","prognosticate","prognostication","program","programer", +"programmer","progress","progression","progressive","prohibit", +"prohibition","prohibitionist","prohibitive","prohibitory","project", +"projectile","projection","projectionist","projector","prolapse", +"prole","prolegomena","proletarian","proletariat","proliferate", +"proliferation","prolific","prolix","prolog","prologue", +"prolong","prolongation","prolonged","prom","promenade", +"promenader","prominence","prominent","promiscuity","promiscuous", 
+"promise","promising","promontory","promote","promoter", +"promotion","prompt","prompter","promptness","promulgate", +"pron","prone","prong","pronominal","pronoun", +"pronounce","pronounceable","pronounced","pronouncement","pronto", +"pronunciamento","pronunciation","proof","proofread","prop", +"propaganda","propagandise","propagandist","propagandize","propagate", +"propagation","propane","propel","propellant","propellent", +"propeller","propensity","proper","properly","propertied", +"property","prophecy","prophesy","prophet","prophetess", +"prophetic","prophets","prophylactic","prophylaxis","propinquity", +"propitiate","propitiatory","propitious","propjet","proponent", +"proportion","proportional","proportionate","proportions","proposal", +"propose","proposition","propound","proprietary","proprieties", +"proprietor","proprietress","propriety","propulsion","propulsive", +"propylene","prorogation","prorogue","prosaic","proscenium", +"proscribe","proscription","prose","prosecute","prosecution", +"prosecutor","proselyte","proselytise","proselytize","prosody", +"prospect","prospective","prospector","prospects","prospectus", +"prosper","prosperity","prosperous","prostate","prosthesis", +"prostitute","prostitution","prostrate","prostration","prosy", +"protagonist","protean","protect","protection","protectionism", +"protective","protector","protectorate","protein","protest", +"protestant","protestation","protocol","proton","protoplasm", +"prototype","protozoa","protozoan","protozoon","protract", +"protraction","protractor","protrude","protrusion","protrusive", +"protuberance","protuberant","proud","provable","prove", +"proven","provenance","provender","proverb","proverbial", +"proverbially","proverbs","provide","provided","providence", +"provident","providential","provider","providing","province", +"provinces","provincial","provision","provisional","provisions", +"proviso","provocation","provocative","provoke","provoking", +"provost","prow","prowess","prowl","prowler", 
+"prox","proximal","proximate","proximity","proximo", +"proxy","prude","prudence","prudent","prudential", +"prudery","prudish","prune","pruning","prurience", +"prurient","pruritus","prussian","pry","psalm", +"psalmist","psalmody","psalms","psalter","psaltery", +"psephology","pseud","pseudonym","pseudonymous","pshaw", +"psittacosis","psoriasis","psst","psyche","psychedelic", +"psychiatric","psychiatrist","psychiatry","psychic","psycho", +"psychoanalyse","psychoanalysis","psychoanalyst","psychoanalytic","psychoanalyze", +"psychokinesis","psychological","psychologist","psychology","psychopath", +"psychosis","psychosomatic","psychotherapy","psychotic","pta", +"ptarmigan","pterodactyl","pto","ptomaine","pub", +"puberty","pubic","public","publican","publication", +"publicise","publicist","publicity","publicize","publish", +"publisher","publishing","puce","puck","pucker", +"puckish","pud","pudding","puddle","pudendum", +"pudgy","pueblo","puerile","puerility","puerperal", +"puff","puffball","puffed","puffer","puffin", +"puffy","pug","pugilism","pugilist","pugnacious", +"pugnacity","puissance","puissant","puke","pukka", +"pulchritude","pulchritudinous","pule","pull","pullback", +"pullet","pulley","pullman","pullout","pullover", +"pullthrough","pullulate","pulmonary","pulp","pulpit", +"pulsar","pulsate","pulsation","pulse","pulverise", +"pulverize","puma","pumice","pummel","pump", +"pumpernickel","pumpkin","pun","punch","punchy", +"punctilio","punctilious","punctual","punctuate","punctuation", +"puncture","pundit","pungent","punic","punish", +"punishable","punishing","punishment","punitive","punjabi", +"punk","punkah","punnet","punster","punt", +"puny","pup","pupa","pupate","pupil", +"puppet","puppeteer","puppy","purblind","purchase", +"purchaser","purdah","pure","pureblooded","purebred", +"puree","purely","pureness","purgation","purgative", +"purgatory","purge","purification","purify","purist", +"puritan","puritanical","purity","purl","purler", 
+"purlieus","purloin","purple","purplish","purport", +"purpose","purposeful","purposeless","purposely","purposive", +"purr","purse","purser","pursuance","pursue", +"pursuer","pursuit","purulent","purvey","purveyance", +"purveyor","purview","pus","push","pushbike", +"pushcart","pushchair","pushed","pusher","pushover", +"pushy","pusillanimous","puss","pussy","pussycat", +"pussyfoot","pustule","put","putative","putrefaction", +"putrefactive","putrefy","putrescent","putrid","putsch", +"putt","puttee","putter","putto","putty", +"puzzle","puzzlement","puzzler","pvc","pygmy", +"pyjama","pyjamas","pylon","pyorrhea","pyorrhoea", +"pyramid","pyre","pyrex","pyrexia","pyrites", +"pyromania","pyromaniac","pyrotechnic","pyrotechnics","python", +"pyx","qed","qty","qua","quack", +"quackery","quad","quadragesima","quadrangle","quadrangular", +"quadrant","quadrilateral","quadrille","quadrillion","quadroon", +"quadruped","quadruple","quadruplet","quadruplicate","quaff", +"quagga","quagmire","quail","quaint","quake", +"quaker","qualification","qualifications","qualified","qualifier", +"qualify","qualitative","quality","qualm","quandary", +"quantify","quantitative","quantity","quantum","quarantine", +"quark","quarrel","quarrelsome","quarry","quart", +"quarter","quarterdeck","quarterfinal","quartering","quarterly", +"quartermaster","quarters","quarterstaff","quartet","quartette", +"quarto","quartz","quasar","quash","quatercentenary", +"quatrain","quaver","quay","quean","queasy", +"queen","queenly","queer","quell","quench", +"quenchless","querulous","query","quest","question", +"questionable","questioner","questioning","questionnaire","quetzal", +"queue","quibble","quick","quicken","quickie", +"quicklime","quicksand","quicksilver","quickstep","quid", +"quiescent","quiet","quieten","quietism","quietude", +"quietus","quiff","quill","quilt","quilted", +"quin","quince","quinine","quinquagesima","quinsy", +"quintal","quintessence","quintet","quintette","quintuplet", 
+"quip","quire","quirk","quisling","quit", +"quits","quittance","quitter","quiver","quixotic", +"quiz","quizmaster","quizzical","quod","quoit", +"quoits","quondam","quorum","quota","quotable", +"quotation","quote","quoth","quotidian","quotient", +"rabbi","rabbinical","rabbit","rabble","rabelaisian", +"rabid","rabies","rac","raccoon","race", +"racecourse","racehorse","raceme","racer","races", +"racetrack","racial","racialism","racially","racing", +"rack","racket","racketeer","racketeering","rackets", +"raconteur","racoon","racquet","racquets","racy", +"radar","radial","radiance","radiant","radiate", +"radiation","radiator","radical","radicalise","radicalism", +"radicalize","radicle","radii","radio","radioactive", +"radioactivity","radiogram","radiograph","radiographer","radiography", +"radioisotope","radiolocation","radiology","radiotherapist","radiotherapy", +"radish","radium","radius","raffia","raffish", +"raffle","raft","rafter","raftered","raftsman", +"rag","raga","ragamuffin","ragbag","rage", +"ragged","raglan","ragout","ragtag","ragtime", +"raid","raider","rail","railhead","railing", +"raillery","railroad","rails","railway","raiment", +"rain","rainbow","raincoat","raindrop","rainfall", +"rainproof","rains","rainstorm","rainwater","rainy", +"raise","raisin","raj","raja","rajah", +"rake","rakish","rallentando","rally","ram", +"ramadan","ramble","rambler","rambling","rambunctious", +"ramekin","ramification","ramify","ramjet","ramp", +"rampage","rampant","rampart","ramrod","ramshackle", +"ran","ranch","rancher","rancid","rancor", +"rancorous","rancour","rand","random","randy", +"ranee","rang","range","ranger","rani", +"rank","ranker","ranking","rankle","ranks", +"ransack","ransom","rant","rap","rapacious", +"rapacity","rape","rapid","rapids","rapier", +"rapine","rapist","rapport","rapprochement","rapscallion", +"rapt","rapture","rapturous","rare","rarebit", +"rarefied","rarefy","rarely","raring","rarity", +"rascal","rascally","rash","rasher","rasp", 
+"raspberry","rat","ratable","ratchet","rate", +"rateable","ratepayer","rather","ratify","rating", +"ratio","ratiocination","ration","rational","rationale", +"rationalise","rationalism","rationalist","rationalize","rations", +"ratlin","ratline","rats","rattan","ratter", +"rattle","rattlebrained","rattlesnake","rattletrap","rattling", +"ratty","raucous","raunchy","ravage","ravages", +"rave","ravel","raven","ravening","ravenous", +"raver","ravine","raving","ravings","ravioli", +"ravish","ravishing","ravishment","raw","rawhide", +"ray","rayon","raze","razor","razorback", +"razzle","reach","react","reaction","reactionary", +"reactivate","reactive","reactor","read","readable", +"readdress","reader","readership","readily","readiness", +"reading","readjust","readout","ready","reafforest", +"reagent","real","realign","realisable","realisation", +"realise","realism","realist","realistic","reality", +"realizable","realization","realize","really","realm", +"realpolitik","realtor","realty","ream","reanimate", +"reap","reaper","reappear","reappraisal","rear", +"rearguard","rearm","rearmament","rearmost","rearrange", +"rearward","rearwards","reason","reasonable","reasonably", +"reasoned","reasoning","reassure","rebarbative","rebate", +"rebel","rebellion","rebellious","rebind","rebirth", +"reborn","rebound","rebuff","rebuild","rebuke", +"rebus","rebut","rebuttal","recalcitrance","recalcitrant", +"recall","recant","recap","recapitulate","recapitulation", +"recapture","recast","recce","recd","recede", +"receipt","receipts","receivable","receive","received", +"receiver","receivership","receiving","recent","recently", +"receptacle","reception","receptionist","receptive","recess", +"recession","recessional","recessive","recharge","recidivist", +"recipe","recipient","reciprocal","reciprocate","reciprocity", +"recital","recitation","recitative","recite","reck", +"reckless","reckon","reckoner","reckoning","reclaim", +"reclamation","recline","recluse","recognise","recognition", 
+"recognizance","recognize","recoil","recollect","recollection", +"recommend","recommendation","recompense","reconcile","reconciliation", +"recondite","recondition","reconnaissance","reconnoiter","reconnoitre", +"reconsider","reconstitute","reconstruct","reconstruction","record", +"recorder","recording","recordkeeping","recount","recoup", +"recourse","recover","recovery","recreant","recreate", +"recreation","recreational","recriminate","recrimination","recrudescence", +"recruit","rectal","rectangle","rectangular","rectification", +"rectifier","rectify","rectilinear","rectitude","recto", +"rector","rectory","rectum","recumbent","recuperate", +"recuperative","recur","recurrence","recurrent","recurved", +"recusant","recycle","red","redbreast","redbrick", +"redcap","redcoat","redcurrant","redden","reddish", +"redecorate","redeem","redeemer","redemption","redemptive", +"redeploy","redhead","rediffusion","redirect","redistribute", +"redo","redolence","redolent","redouble","redoubt", +"redoubtable","redound","redress","redskin","reduce", +"reduction","redundancy","redundant","reduplicate","redwing", +"redwood","reecho","reed","reeds","reeducate", +"reedy","reef","reefer","reek","reel", +"reentry","reeve","ref","reface","refashion", +"refectory","refer","referee","reference","referendum", +"refill","refine","refined","refinement","refiner", +"refinery","refit","reflate","reflation","reflect", +"reflection","reflective","reflector","reflex","reflexes", +"reflexive","refloat","refoot","reforest","reform", +"reformation","reformatory","refract","refractory","refrain", +"refresh","refresher","refreshing","refreshment","refreshments", +"refrigerant","refrigerate","refrigeration","refrigerator","reft", +"refuel","refuge","refugee","refulgence","refulgent", +"refund","refurbish","refusal","refuse","refutable", +"refutation","refute","regain","regal","regale", +"regalia","regard","regardful","regarding","regardless", +"regards","regatta","regency","regenerate","regent", 
+"reggae","regicide","regime","regimen","regiment", +"regimental","regimentals","regina","region","regional", +"regions","register","registrar","registration","registry", +"regnant","regress","regressive","regret","regrets", +"regrettable","regrettably","regroup","regular","regularise", +"regularity","regularize","regularly","regulate","regulation", +"regulator","regulo","regurgitate","rehabilitate","rehash", +"rehear","rehearsal","rehearse","rehouse","reich", +"reification","reify","reign","reimburse","reimbursement", +"rein","reincarnate","reincarnation","reindeer","reinforce", +"reinforcement","reinforcements","reins","reinstate","reinsure", +"reissue","reiterate","reject","rejection","rejoice", +"rejoicing","rejoicings","rejoin","rejoinder","rejuvenate", +"rekindle","relaid","relapse","relate","related", +"relation","relational","relations","relationship","relative", +"relatively","relativism","relativistic","relativity","relax", +"relaxation","relaxing","relay","release","relegate", +"relent","relentless","relevance","relevant","reliability", +"reliable","reliance","reliant","relic","relics", +"relict","relief","relieve","relieved","religion", +"religious","religiously","reline","relinquish","reliquary", +"relish","relive","reload","relocate","reluctance", +"reluctant","reluctantly","rely","remain","remainder", +"remains","remake","remand","remark","remarkable", +"remarkably","remarry","remediable","remedial","remedy", +"remember","remembrance","remilitarise","remilitarize","remind", +"reminder","reminisce","reminiscence","reminiscences","reminiscent", +"remiss","remission","remit","remittance","remittent", +"remnant","remodel","remold","remonstrance","remonstrate", +"remorse","remorseful","remote","remotely","remould", +"remount","removal","remove","remover","remunerate", +"remunerative","renaissance","renal","rename","renascent", +"rend","render","rendering","rendezvous","rendition", +"renegade","renege","renegue","renew","renewable", 
+"renewal","rennet","renounce","renovate","renown", +"renowned","rent","rental","renter","rentier", +"renunciation","reopen","reorganise","reorganize","rep", +"repaid","repair","reparable","reparation","reparations", +"repartee","repast","repatriate","repay","repayable", +"repayment","repeal","repeat","repeated","repeatedly", +"repeater","repeating","repel","repellent","repent", +"repentance","repentant","repercussion","repertoire","repertory", +"repetition","repetitious","repine","replace","replacement", +"replay","replenish","replete","repletion","replica", +"replicate","reply","repoint","report","reportage", +"reportedly","reporter","repose","repository","repossess", +"repot","repp","reprehend","reprehensible","represent", +"representation","representational","representations","representative","repress", +"repressed","repression","repressive","reprieve","reprimand", +"reprint","reprisal","reprise","reproach","reprobate", +"reproduce","reproducer","reproduction","reproductive","reproof", +"reprove","reproving","reptile","reptilian","republic", +"republican","republicanism","repudiate","repugnance","repugnant", +"repulse","repulsion","repulsive","reputable","reputation", +"repute","reputed","reputedly","request","requiem", +"require","requirement","requisite","requisition","requital", +"requite","reredos","rerun","rescind","rescript", +"rescue","research","reseat","resemblance","resemble", +"resent","resentment","reservation","reserve","reserved", +"reservedly","reservist","reservoir","reset","resettle", +"reshuffle","reside","residence","residency","resident", +"residential","residual","residuary","residue","resign", +"resignation","resigned","resilience","resilient","resin", +"resinated","resist","resistance","resistant","resistor", +"resole","resolute","resolution","resolvable","resolve", +"resonance","resonant","resonate","resonator","resort", +"resound","resounding","resource","resourceful","resources", 
+"respect","respectability","respectable","respecter","respectful", +"respecting","respective","respectively","respects","respiration", +"respirator","respiratory","respire","respite","resplendence", +"resplendent","respond","respondent","response","responsibility", +"responsible","responsibly","responsive","rest","restage", +"restate","restaurant","restaurateur","restful","restitution", +"restive","restless","restock","restoration","restorative", +"restore","restorer","restrain","restrained","restraint", +"restrict","restricted","restriction","restrictive","restructure", +"result","resultant","resume","resumption","resurface", +"resurgence","resurgent","resurrect","resurrection","resuscitate", +"retail","retailer","retain","retainer","retake", +"retaliate","retaliation","retaliatory","retard","retarded", +"retch","retd","retell","retention","retentive", +"rethink","reticence","reticent","reticulated","reticulation", +"reticule","retina","retinue","retire","retired", +"retirement","retiring","retort","retouch","retrace", +"retract","retractable","retractile","retraction","retread", +"retreat","retrench","retrial","retraining","retribution", +"retributive","retrieval","retrieve","retriever","retroactive", +"retroflex","retrograde","retrogress","retrogressive","retrospect", +"retrospection","retrospective","retroversion","retsina","return", +"returnable","returns","reunion","reunite","reuse", +"rev","revalue","revamp","reveal","revealing", +"reveille","revel","revelation","revelry","revenge", +"revenue","reverberant","reverberate","reverberation","revere", +"reverence","reverend","reverent","reverential","reverie", +"revers","reversal","reverse","reversion","reversionary", +"revert","revetment","review","reviewer","revile", +"revise","revision","revisionism","revitalise","revitalize", +"revival","revivalist","revive","revivify","revocable", +"revocation","revoke","revolt","revolting","revolution", +"revolutionary","revolutionise","revolutionize","revolve","revolver", 
+"revolving","revue","revulsion","reward","rewarding", +"rewards","rewire","reword","rewrite","rex", +"rhapsodise","rhapsodize","rhapsody","rhea","rhenish", +"rheostat","rhetoric","rhetorical","rhetorically","rhetorician", +"rheum","rheumatic","rheumaticky","rheumatics","rheumatism", +"rheumatoid","rhinestone","rhinoceros","rhizome","rhododendron", +"rhomboid","rhombus","rhubarb","rhyme","rhymed", +"rhymester","rhythm","rhythmic","rib","ribald", +"ribaldry","ribbed","ribbing","ribbon","riboflavin", +"rice","rich","riches","richly","richness", +"rick","rickets","rickety","ricksha","rickshaw", +"ricochet","rid","riddance","ridden","riddle", +"ride","rider","riderless","ridge","ridgepole", +"ridicule","ridiculous","riding","riesling","rife", +"riff","riffle","riffraff","rifle","rifleman", +"rifles","rifling","rift","rig","rigging", +"right","righteous","rightful","rightist","rightly", +"rights","rightward","rightwards","rigid","rigidity", +"rigmarole","rigor","rigorous","rigour","rile", +"rill","rim","rime","rind","rinderpest", +"ring","ringer","ringleader","ringlet","ringmaster", +"ringside","ringworm","rink","rinse","riot", +"riotous","rip","riparian","ripcord","ripen", +"riposte","ripple","ripsaw","riptide","rise", +"riser","risibility","risible","rising","risk", +"risky","risotto","rissole","rite","ritual", +"ritualism","ritzy","rival","rivalry","rive", +"river","riverbed","riverside","rivet","riveter", +"riveting","riviera","rivulet","rna","roach", +"road","roadbed","roadblock","roadhouse","roadman", +"roadside","roadstead","roadster","roadway","roadworthy", +"roam","roan","roar","roaring","roast", +"roaster","roasting","rob","robber","robbery", +"robe","robin","robot","robust","rock", +"rockbound","rocker","rockery","rocket","rocketry", +"rocks","rocky","rococo","rod","rode", +"rodent","rodeo","rodomontade","roe","roebuck", +"rogation","roger","rogue","roguery","roguish", +"roisterer","role","roll","roller","rollicking", 
+"rolling","rolls","romaic","roman","romance", +"romanesque","romantic","romanticise","romanticism","romanticize", +"romany","romish","romp","romper","rompers", +"rondeau","rondo","roneo","rood","roodscreen", +"roof","roofing","roofless","rooftree","rook", +"rookery","rookie","room","roomer","roommate", +"rooms","roomy","roost","rooster","root", +"rooted","rootless","roots","rope","ropedancer", +"ropes","ropewalk","ropeway","ropey","ropy", +"roquefort","rosary","rose","roseate","rosebud", +"roseleaf","rosemary","rosette","rosewater","rosewood", +"rosin","roster","rostrum","rosy","rot", +"rota","rotary","rotate","rotation","rotatory", +"rotgut","rotisserie","rotogravure","rotor","rotten", +"rottenly","rotter","rotund","rotunda","rouble", +"rouge","rough","roughage","roughcast","roughen", +"roughhouse","roughly","roughneck","roughness","roughrider", +"roughshod","roulette","round","roundabout","roundel", +"roundelay","rounders","roundhead","roundhouse","roundish", +"roundly","rounds","roundsman","roundup","roup", +"rouse","rousing","roustabout","rout","route", +"routine","roux","rove","rover","row", +"rowan","rowanberry","rowdy","rowdyism","rowel", +"rower","rowing","rowlock","royal","royalist", +"royalty","rpm","rsm","rsvp","rub", +"rubber","rubberise","rubberize","rubberneck","rubbery", +"rubbing","rubbish","rubbishy","rubble","rubdown", +"rubella","rubicon","rubicund","ruble","rubric", +"ruby","ruck","rucksack","ruckus","ruction", +"ructions","rudder","ruddle","ruddy","rude", +"rudely","rudiment","rudimentary","rudiments","rue", +"rueful","ruff","ruffian","ruffianly","ruffle", +"rug","rugby","rugged","ruin","ruination", +"ruinous","ruins","rule","rulebook","ruler", +"ruling","rum","rumba","rumble","rumbling", +"rumbustious","ruminant","ruminate","ruminative","rummage", +"rummy","rumor","rumored","rumormonger","rumour", +"rumoured","rumourmonger","rump","rumple","rumpus", +"run","runaway","rung","runnel","runner", +"running","runny","runs","runt","runway", +}; +} 
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java new file mode 100644 index 00000000000..9ac12daf8df --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java @@ -0,0 +1,715 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This algorithm is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. + */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData7 { + private KStemData7() { + } + static String[] data = { +"rupee","rupture","rural","ruritanian","ruse", +"rush","rushes","rushlight","rusk","russet", +"rust","rustic","rusticate","rustication","rustle", +"rustler","rustless","rustling","rustproof","rusty", +"rut","ruthless","rutting","rye","sabbatarian", +"sabbath","sabbatical","saber","sable","sabot", +"sabotage","saboteur","sabra","sabre","sac", +"saccharin","saccharine","sacerdotal","sacerdotalism","sachet", +"sack","sackbut","sackcloth","sacral","sacrament", +"sacramental","sacred","sacrifice","sacrificial","sacrilege", +"sacrilegious","sacristan","sacristy","sacroiliac","sacrosanct", +"sad","sadden","saddle","saddlebag","saddler", +"saddlery","sadducee","sadhu","sadism","sadly", +"sadomasochism","safari","safe","safebreaker","safeguard", +"safekeeping","safety","saffron","sag","saga", +"sagacious","sagacity","sagebrush","sago","sahib", +"said","sail","sailcloth","sailing","sailor", +"sailplane","saint","sainted","saintly","saith", +"sake","saki","salaam","salable","salacious", +"salacity","salad","salamander","salami","salaried", +"salary","sale","saleable","saleroom","sales", +"salesclerk","salesgirl","saleslady","salesman","salesmanship", 
+"salient","saliferous","salify","saline","salinometer", +"saliva","salivary","salivate","sallow","sally", +"salmon","salmonella","salon","saloon","salsify", +"salt","saltcellar","saltire","saltlick","saltpan", +"saltpeter","saltpetre","salts","saltshaker","saltwater", +"salty","salubrious","salutary","salutation","salute", +"salvage","salvation","salvationist","salve","salvedge", +"salver","salvia","salvo","samaritan","samaritans", +"samba","same","sameness","samovar","sampan", +"sample","sampler","samurai","sanatorium","sanctify", +"sanctimonious","sanction","sanctities","sanctity","sanctuary", +"sanctum","sanctus","sand","sandal","sandalwood", +"sandbag","sandbank","sandbar","sandblast","sandbox", +"sandboy","sandcastle","sander","sandglass","sandman", +"sandpaper","sandpiper","sandpit","sands","sandshoe", +"sandstone","sandstorm","sandwich","sandy","sane", +"sang","sangfroid","sangria","sanguinary","sanguine", +"sanitary","sanitation","sanitorium","sanity","sank", +"sans","sanskrit","sap","sapience","sapient", +"sapless","sapling","sapper","sapphic","sapphire", +"sappy","sapwood","saraband","sarabande","sarcasm", +"sarcastic","sarcophagus","sardine","sardonic","sarge", +"sari","sarky","sarong","sarsaparilla","sartorial", +"sash","sashay","sass","sassafras","sassy", +"sat","satan","satanic","satanism","satchel", +"sate","sateen","satellite","satiable","satiate", +"satiety","satin","satinwood","satiny","satire", +"satirical","satirise","satirize","satisfaction","satisfactory", +"satisfy","satisfying","satrap","satsuma","saturate", +"saturation","saturday","saturn","saturnalia","saturnine", +"satyr","sauce","saucepan","saucer","saucy", +"sauerkraut","sauna","saunter","saurian","sausage", +"sauterne","sauternes","savage","savagery","savanna", +"savannah","savant","save","saveloy","saver", +"saving","savings","savior","saviour","savor", +"savory","savour","savoury","savoy","savvy", +"saw","sawbones","sawbuck","sawdust","sawhorse", 
+"sawmill","sawpit","sawyer","saxifrage","saxon", +"saxophone","saxophonist","say","saying","scab", +"scabbard","scabby","scabies","scabious","scabrous", +"scads","scaffold","scaffolding","scalar","scalawag", +"scald","scalding","scale","scalene","scallion", +"scallop","scallywag","scalp","scalpel","scaly", +"scamp","scamper","scampi","scan","scandal", +"scandalise","scandalize","scandalmonger","scandalous","scandinavian", +"scanner","scansion","scant","scanty","scapegoat", +"scapegrace","scapula","scar","scarab","scarce", +"scarcely","scarcity","scare","scarecrow","scared", +"scaremonger","scarf","scarify","scarlet","scarp", +"scarper","scary","scat","scathing","scatology", +"scatter","scatterbrain","scatterbrained","scattered","scatty", +"scavenge","scavenger","scenario","scenarist","scene", +"scenery","sceneshifter","scenic","scent","scepter", +"sceptic","sceptical","scepticism","sceptre","schedule", +"schema","schematic","schematize","scheme","scherzo", +"schism","schismatic","schist","schizoid","schizophrenia", +"schizophrenic","schmaltz","schmalz","schnapps","schnitzel", +"schnorkel","scholar","scholarly","scholarship","scholastic", +"scholasticism","school","schoolboy","schoolhouse","schooling", +"schoolman","schoolmarm","schoolmaster","schoolmastering","schoolmate", +"schoolwork","schooner","schwa","sciatic","sciatica", +"science","scientific","scientist","scientology","scimitar", +"scintilla","scintillate","scion","scissor","scissors", +"sclerosis","scoff","scold","scollop","sconce", +"scone","scoop","scoot","scooter","scope", +"scorbutic","scorch","scorcher","scorching","score", +"scoreboard","scorebook","scorecard","scorekeeper","scoreless", +"scorer","scorn","scorpio","scorpion","scotch", +"scoundrel","scoundrelly","scour","scourer","scourge", +"scout","scoutmaster","scow","scowl","scrabble", +"scrag","scraggly","scraggy","scram","scramble", +"scrap","scrapbook","scrape","scraper","scrapings", +"scrappy","scraps","scratch","scratchpad","scratchy", 
+"scrawl","scrawny","scream","screamingly","scree", +"screech","screed","screen","screening","screenplay", +"screw","screwball","screwdriver","screwy","scribble", +"scribbler","scribe","scrimmage","scrimp","scrimshank", +"scrimshaw","scrip","script","scripted","scriptural", +"scripture","scriptwriter","scrivener","scrofula","scrofulous", +"scroll","scrollwork","scrooge","scrotum","scrounge", +"scrub","scrubber","scrubby","scruff","scruffy", +"scrum","scrumcap","scrumhalf","scrummage","scrumptious", +"scrumpy","scrunch","scruple","scrupulous","scrutineer", +"scrutinise","scrutinize","scrutiny","scuba","scud", +"scuff","scuffle","scull","scullery","scullion", +"sculptor","sculptural","sculpture","scum","scupper", +"scurf","scurrility","scurrilous","scurry","scurvy", +"scut","scutcheon","scuttle","scylla","scythe", +"sea","seabed","seabird","seaboard","seaborne", +"seafaring","seafood","seafront","seagirt","seagoing", +"seagull","seahorse","seakale","seal","sealer", +"sealing","sealskin","sealyham","seam","seaman", +"seamanlike","seamanship","seamstress","seamy","seaplane", +"seaport","sear","search","searching","searchlight", +"searing","seascape","seashell","seashore","seasick", +"seaside","season","seasonable","seasonal","seasoning", +"seat","seating","seawall","seaward","seawards", +"seawater","seaway","seaweed","seaworthy","sec", +"secateurs","secede","secession","seclude","secluded", +"seclusion","seclusive","second","secondary","seconds", +"secrecy","secret","secretarial","secretariat","secretary", +"secrete","secretion","secretive","sect","sectarian", +"section","sectional","sectionalism","sector","secular", +"secularise","secularism","secularize","secure","security", +"sedan","sedate","sedation","sedative","sedentary", +"sedge","sediment","sedimentary","sedimentation","sedition", +"seditious","seduce","seduction","seductive","sedulous", +"see","seed","seedbed","seedcake","seedling", +"seedsman","seedy","seeing","seek","seem", 
+"seeming","seemingly","seemly","seen","seep", +"seepage","seer","seersucker","seesaw","seethe", +"segment","segmentation","segregate","segregated","segregation", +"seigneur","seine","seismic","seismograph","seismology", +"seize","seizure","seldom","select","selection", +"selective","selector","selenium","self","selfish", +"selfless","selfsame","sell","seller","sellotape", +"selvage","selves","semantic","semantics","semaphore", +"semblance","semeiology","semen","semester","semibreve", +"semicircle","semicolon","semiconductor","semidetached","semifinal", +"semifinalist","seminal","seminar","seminarist","seminary", +"semiology","semiprecious","semiquaver","semitic","semitone", +"semitropical","semivowel","semiweekly","semolina","sempstress", +"sen","senate","senator","senatorial","send", +"sender","senescence","senescent","seneschal","senile", +"senility","senior","seniority","senna","sensation", +"sensational","sensationalism","sense","senseless","senses", +"sensibility","sensible","sensitise","sensitive","sensitivity", +"sensitize","sensor","sensory","sensual","sensualist", +"sensuality","sensuous","sent","sentence","sententious", +"sentient","sentiment","sentimental","sentimentalise","sentimentalism", +"sentimentality","sentimentalize","sentinel","sentry","sepal", +"separable","separate","separation","separatism","separator", +"sepia","sepoy","sepsis","september","septet", +"septic","septicaemia","septicemia","septuagenarian","septuagesima", +"septuagint","sepulcher","sepulchral","sepulchre","sequel", +"sequence","sequencing","sequent","sequential","sequester", +"sequestrate","sequestration","sequin","sequoia","seraglio", +"seraph","seraphic","sere","serenade","serendipity", +"serene","serf","serfdom","serge","sergeant", +"serial","serialise","serialize","seriatim","sericulture", +"series","serif","seriocomic","serious","seriously", +"sermon","sermonise","sermonize","serous","serpent", +"serpentine","serrated","serried","serum","serval", 
+"servant","serve","server","servery","service", +"serviceable","serviceman","serviette","servile","serving", +"servitor","servitude","servomechanism","servomotor","sesame", +"session","sessions","set","setback","setscrew", +"setsquare","sett","settee","setter","setting", +"settle","settled","settlement","settler","seven", +"seventeen","seventy","sever","several","severally", +"severance","severity","sew","sewage","sewer", +"sewerage","sewing","sex","sexagenarian","sexagesima", +"sexism","sexist","sexless","sextant","sextet", +"sexton","sextuplet","sexual","sexuality","sexy", +"sforzando","sgt","shabby","shack","shackle", +"shad","shade","shades","shading","shadow", +"shadowbox","shadowy","shady","shaft","shag", +"shagged","shaggy","shagreen","shah","shake", +"shakedown","shaker","shakes","shako","shaky", +"shale","shall","shallop","shallot","shallow", +"shallows","shalom","shalt","sham","shaman", +"shamble","shambles","shame","shamefaced","shameful", +"shameless","shammy","shampoo","shamrock","shandy", +"shanghai","shank","shantung","shanty","shantytown", +"shape","shaped","shapely","shard","share", +"sharecropper","shareholder","shares","shark","sharkskin", +"sharp","sharpen","sharpener","sharper","sharpshooter", +"shatter","shave","shaver","shaving","shawl", +"shay","she","sheaf","shear","shears", +"sheath","sheathe","sheathing","shebang","shebeen", +"shed","sheen","sheep","sheepdip","sheepdog", +"sheepfold","sheepish","sheepskin","sheer","sheet", +"sheeting","sheik","sheikdom","sheikh","sheikhdom", +"sheila","shekels","shelduck","shelf","shell", +"shellac","shellacking","shellfish","shellshock","shelter", +"sheltered","shelve","shelves","shelving","shenanigan", +"shepherd","shepherdess","sheraton","sherbet","sherd", +"sheriff","sherpa","sherry","shew","shh", +"shibboleth","shield","shift","shiftless","shifty", +"shilling","shimmer","shin","shinbone","shindig", +"shindy","shine","shiner","shingle","shingles", +"shining","shinny","shinto","shiny","ship", 
+"shipboard","shipbroker","shipbuilding","shipmate","shipment", +"shipper","shipping","shipshape","shipwreck","shipwright", +"shipyard","shire","shires","shirk","shirring", +"shirt","shirtfront","shirting","shirtsleeve","shirttail", +"shirtwaist","shirtwaister","shirty","shit","shits", +"shitty","shiver","shivers","shivery","shoal", +"shock","shocker","shockheaded","shocking","shockproof", +"shod","shoddy","shoe","shoeblack","shoehorn", +"shoelace","shoemaker","shoeshine","shoestring","shone", +"shoo","shook","shoot","shop","shopkeeper", +"shoplift","shopsoiled","shopworn","shore","shorn", +"short","shortage","shortbread","shortcake","shortcoming", +"shorten","shortening","shortfall","shorthand","shorthanded", +"shorthorn","shortie","shortly","shorts","shortsighted", +"shorty","shot","shotgun","should","shoulder", +"shouldst","shout","shouting","shove","shovel", +"shovelboard","show","showboat","showcase","showdown", +"shower","showery","showgirl","showing","showman", +"showmanship","shown","showpiece","showplace","showroom", +"showy","shrank","shrapnel","shred","shredder", +"shrew","shrewd","shrewish","shriek","shrift", +"shrike","shrill","shrimp","shrine","shrink", +"shrinkage","shrive","shrivel","shroud","shrub", +"shrubbery","shrug","shuck","shucks","shudder", +"shuffle","shuffleboard","shufty","shun","shunt", +"shunter","shush","shut","shutdown","shutter", +"shuttle","shuttlecock","shy","shyster","sibilant", +"sibling","sibyl","sibylline","sic","sick", +"sickbay","sickbed","sicken","sickening","sickle", +"sickly","sickness","sickroom","side","sidearm", +"sideboard","sideboards","sidecar","sidekick","sidelight", +"sideline","sidelong","sidereal","sidesaddle","sideshow", +"sideslip","sidesman","sidesplitting","sidestep","sidestroke", +"sideswipe","sidetrack","sidewalk","sideward","sidewards", +"sideways","siding","sidle","siege","sienna", +"sierra","siesta","sieve","sift","sifter", +"sigh","sight","sighted","sightless","sightly", 
+"sightscreen","sightsee","sightseer","sign","signal", +"signaler","signalise","signalize","signaller","signally", +"signalman","signatory","signature","signer","signet", +"significance","significant","signification","signify","signor", +"signora","signorina","signpost","signposted","silage", +"silence","silencer","silent","silhouette","silica", +"silicate","silicon","silicone","silicosis","silk", +"silken","silkworm","silky","sill","sillabub", +"silly","silo","silt","silvan","silver", +"silverfish","silverside","silversmith","silverware","silvery", +"simian","similar","similarity","similarly","simile", +"similitude","simmer","simony","simper","simple", +"simpleton","simplicity","simplify","simply","simulacrum", +"simulate","simulated","simulation","simulator","simultaneous", +"sin","since","sincere","sincerely","sincerity", +"sinecure","sinew","sinewy","sinful","sing", +"singe","singhalese","singing","single","singleness", +"singles","singlestick","singlet","singleton","singly", +"singsong","singular","singularly","sinhalese","sinister", +"sink","sinker","sinless","sinner","sinology", +"sinuous","sinus","sip","siphon","sir", +"sire","siren","sirloin","sirocco","sirrah", +"sis","sisal","sissy","sister","sisterhood", +"sisterly","sit","sitar","site","sitter", +"sitting","situated","situation","six","sixpence", +"sixteen","sixty","sizable","size","sizeable", +"sizzle","sizzler","skate","skateboard","skedaddle", +"skeet","skein","skeleton","skeptic","skeptical", +"skepticism","sketch","sketchpad","sketchy","skew", +"skewbald","skewer","ski","skibob","skid", +"skidlid","skidpan","skiff","skiffle","skilful", +"skill","skilled","skillet","skillful","skim", +"skimmer","skimp","skimpy","skin","skinflint", +"skinful","skinhead","skinny","skint","skip", +"skipper","skirl","skirmish","skirt","skit", +"skitter","skittish","skittle","skittles","skive", +"skivvy","skua","skulduggery","skulk","skull", +"skullcap","skullduggery","skunk","sky","skydiving", 
+"skyhook","skyjack","skylark","skylight","skyline", +"skyrocket","skyscraper","skywriting","slab","slack", +"slacken","slacker","slacks","slag","slagheap", +"slain","slake","slalom","slam","slander", +"slanderous","slang","slangy","slant","slantwise", +"slap","slapdash","slaphappy","slapstick","slash", +"slat","slate","slattern","slaty","slaughter", +"slaughterhouse","slave","slaver","slavery","slavic", +"slavish","slay","sleazy","sled","sledge", +"sledgehammer","sleek","sleep","sleeper","sleepless", +"sleepwalker","sleepy","sleepyhead","sleet","sleeve", +"sleigh","slender","slenderise","slenderize","slept", +"sleuth","slew","slewed","slice","slick", +"slicker","slide","slight","slightly","slim", +"slimy","sling","slingshot","slink","slip", +"slipcover","slipknot","slipover","slipper","slippery", +"slippy","slips","slipshod","slipstream","slipway", +"slit","slither","slithery","sliver","slivovitz", +"slob","slobber","sloe","slog","slogan", +"sloop","slop","slope","sloppy","slosh", +"sloshed","slot","sloth","slothful","slouch", +"slough","sloven","slovenly","slow","slowcoach", +"slowworm","sludge","slue","slug","sluggard", +"sluggish","sluice","sluiceway","slum","slumber", +"slumberous","slummy","slump","slung","slunk", +"slur","slurp","slurry","slush","slut", +"sly","smack","smacker","small","smallholder", +"smallholding","smallpox","smalls","smarmy","smart", +"smarten","smash","smashed","smasher","smashing", +"smattering","smear","smell","smelly","smelt", +"smile","smirch","smirk","smite","smith", +"smithereens","smithy","smitten","smock","smocking", +"smog","smoke","smoker","smokescreen","smokestack", +"smoking","smoky","smolder","smooch","smooth", +"smoothie","smoothy","smorgasbord","smote","smother", +"smoulder","smudge","smug","smuggle","smut", +"smutty","snack","snaffle","snag","snail", +"snake","snakebite","snaky","snap","snapdragon", +"snapper","snappish","snappy","snapshot","snare", +"snarl","snatch","snazzy","sneak","sneaker", 
+"sneaking","sneaky","sneer","sneeze","snick", +"snicker","snide","sniff","sniffle","sniffles", +"sniffy","snifter","snigger","snip","snippet", +"snips","snitch","snivel","snob","snobbery", +"snobbish","snog","snood","snook","snooker", +"snoop","snooper","snoot","snooty","snooze", +"snore","snorkel","snort","snorter","snot", +"snotty","snout","snow","snowball","snowberry", +"snowbound","snowdrift","snowdrop","snowfall","snowfield", +"snowflake","snowline","snowman","snowplough","snowplow", +"snowshoe","snowstorm","snowy","snr","snub", +"snuff","snuffer","snuffle","snug","snuggle", +"soak","soaked","soaking","soap","soapbox", +"soapstone","soapsuds","soapy","soar","sob", +"sober","sobriety","sobriquet","soccer","sociable", +"social","socialise","socialism","socialist","socialite", +"socialize","society","sociology","sock","socket", +"sod","soda","sodden","sodium","sodomite", +"sodomy","soever","sofa","soft","softball", +"soften","softhearted","softie","software","softwood", +"softy","soggy","soigne","soignee","soil", +"sojourn","sol","solace","solar","solarium", +"sold","solder","soldier","soldierly","soldiery", +"sole","solecism","solely","solemn","solemnise", +"solemnity","solemnize","solicit","solicitor","solicitous", +"solicitude","solid","solidarity","solidify","solidity", +"solidus","soliloquise","soliloquize","soliloquy","solipsism", +"solitaire","solitary","solitude","solo","soloist", +"solstice","soluble","solution","solve","solvency", +"solvent","somber","sombre","sombrero","some", +"somebody","someday","somehow","somersault","something", +"sometime","sometimes","someway","somewhat","somewhere", +"somnambulism","somnolent","son","sonar","sonata", +"song","songbird","songbook","songster","sonic", +"sonnet","sonny","sonority","sonorous","sonsy", +"soon","soot","soothe","soothsayer","sop", +"sophism","sophisticate","sophisticated","sophistication","sophistry", +"sophomore","soporific","sopping","soppy","soprano", 
+"sorbet","sorcerer","sorcery","sordid","sore", +"sorehead","sorely","sorghum","sorority","sorrel", +"sorrow","sorry","sort","sortie","sos", +"sot","sottish","sou","soubrette","soubriquet", +"sough","sought","soul","soulful","soulless", +"sound","soundings","soundproof","soundtrack","soup", +"sour","source","sourdough","sourpuss","sousaphone", +"souse","soused","south","southbound","southeast", +"southeaster","southeasterly","southeastern","southeastward","southeastwards", +"southerly","southern","southerner","southernmost","southpaw", +"southward","southwards","southwest","southwester","southwesterly", +"southwestern","southwestward","southwestwards","souvenir","sovereign", +"sovereignty","soviet","sow","sox","soy", +"soybean","sozzled","spa","space","spacecraft", +"spaceship","spacesuit","spacing","spacious","spade", +"spadework","spaghetti","spake","spam","span", +"spangle","spaniel","spank","spanking","spanner", +"spar","spare","spareribs","sparing","spark", +"sparkle","sparkler","sparks","sparrow","sparse", +"spartan","spasm","spasmodic","spastic","spat", +"spatchcock","spate","spatial","spatter","spatula", +"spavin","spawn","spay","speak","speakeasy", +"speaker","speakership","spear","spearhead","spearmint", +"spec","special","specialise","specialised","specialist", +"speciality","specialize","specialized","specially","specie", +"species","specific","specifically","specification","specifics", +"specify","specimen","specious","speck","speckle", +"spectacle","spectacled","spectacles","spectacular","spectator", +"specter","spectral","spectre","spectroscope","spectrum", +"speculate","speculation","speculative","speech","speechify", +"speechless","speed","speedboat","speeding","speedometer", +"speedway","speedwell","speedy","spelaeology","speleology", +"spell","spellbind","spelling","spend","spender", +"spendthrift","spent","sperm","spermaceti","spermatozoa", +"spew","sphagnum","sphere","spherical","spheroid", +"sphincter","sphinx","spice","spicy","spider", 
+"spidery","spiel","spigot","spike","spikenard", +"spiky","spill","spillover","spillway","spin", +"spinach","spinal","spindle","spindly","spine", +"spineless","spinet","spinnaker","spinner","spinney", +"spinster","spiny","spiral","spire","spirit", +"spirited","spiritless","spirits","spiritual","spiritualise", +"spiritualism","spirituality","spiritualize","spirituous","spirt", +"spit","spite","spitfire","spittle","spittoon", +"spiv","splash","splashy","splat","splatter", +"splay","splayfoot","spleen","splendid","splendiferous", +"splendor","splendour","splenetic","splice","splicer", +"splint","splinter","split","splits","splitting", +"splotch","splurge","splutter","spoil","spoilage", +"spoils","spoilsport","spoke","spoken","spokeshave", +"spokesman","spoliation","spondee","sponge","spongy", +"sponsor","spontaneous","spoof","spook","spooky", +"spool","spoon","spoonerism","spoonful","spoor", +"sporadic","spore","sporran","sport","sporting", +"sportive","sports","sportsman","sportsmanlike","sportsmanship", +"sporty","spot","spotless","spotlight","spotted", +"spotter","spotty","spouse","spout","sprain", +"sprang","sprat","sprawl","spray","sprayer", +"spread","spree","sprig","sprigged","sprightly", +"spring","springboard","springbok","springtime","springy", +"sprinkle","sprinkler","sprinkling","sprint","sprite", +"sprocket","sprout","spruce","sprung","spry", +"spud","spume","spun","spunk","spur", +"spurious","spurn","spurt","sputter","sputum", +"spy","spyglass","squab","squabble","squad", +"squadron","squalid","squall","squalor","squander", +"square","squash","squashy","squat","squatter", +"squaw","squawk","squeak","squeaky","squeal", +"squeamish","squeegee","squeeze","squeezer","squelch", +"squib","squid","squidgy","squiffy","squiggle", +"squint","squirarchy","squire","squirearchy","squirm", +"squirrel","squirt","squirter","sri","srn", +"ssh","stab","stabbing","stabilise","stabiliser", +"stability","stabilize","stabilizer","stable","stabling", 
+"staccato","stack","stadium","staff","stag", +"stage","stagecoach","stager","stagestruck","stagger", +"staggering","staggers","staging","stagnant","stagnate", +"stagy","staid","stain","stainless","stair", +"staircase","stairs","stairwell","stake","stakeholder", +"stakes","stalactite","stalagmite","stale","stalemate", +"stalk","stall","stallholder","stallion","stalls", +"stalwart","stamen","stamina","stammer","stamp", +"stampede","stance","stanch","stanchion","stand", +"standard","standardise","standardize","standby","standing", +"standoffish","standpipe","standpoint","standstill","stank", +"stanza","staple","stapler","star","starboard", +"starch","starchy","stardom","stardust","stare", +"starfish","stargazer","stargazing","staring","stark", +"starkers","starlet","starlight","starling","starlit", +"starry","stars","start","starter","starters", +"startle","starvation","starve","starveling","stash", +"state","statecraft","statehood","stateless","stately", +"statement","stateroom","states","stateside","statesman", +"static","statics","station","stationary","stationer", +"stationery","stationmaster","statistic","statistician","statistics", +"statuary","statue","statuesque","statuette","stature", +"status","statute","statutory","staunch","stave", +"staves","stay","stayer","stays","std", +"stead","steadfast","steady","steak","steal", +"stealth","stealthy","steam","steamboat","steamer", +"steamroller","steamship","steed","steel","steelworker", +"steelworks","steely","steelyard","steenbok","steep", +"steepen","steeple","steeplechase","steeplejack","steer", +"steerage","steerageway","steersman","stein","steinbok", +"stele","stellar","stem","stench","stencil", +"stenographer","stenography","stentorian","step","stepbrother", +"stepchild","stepladder","stepparent","steps","stepsister", +"stereo","stereoscope","stereoscopic","stereotype","sterile", +"sterilise","sterility","sterilize","sterling","stern", +"sternum","steroid","stertorous","stet","stethoscope", 
+"stetson","stevedore","stew","steward","stewardess", +"stewardship","stewed","stick","sticker","stickleback", +"stickler","stickpin","sticks","sticky","stiff", +"stiffen","stiffener","stiffening","stifle","stigma", +"stigmata","stigmatise","stigmatize","stile","stiletto", +"still","stillbirth","stillborn","stillroom","stilly", +"stilt","stilted","stilton","stimulant","stimulate", +"stimulus","sting","stinger","stingo","stingray", +"stingy","stink","stinking","stint","stipend", +"stipendiary","stipple","stipulate","stipulation","stir", +"stirrer","stirring","stirrup","stitch","stoat", +"stock","stockade","stockbreeder","stockbroker","stockcar", +"stockfish","stockholder","stockily","stockinet","stockinette", +"stocking","stockist","stockjobber","stockman","stockpile", +"stockpot","stockroom","stocks","stocktaking","stocky", +"stockyard","stodge","stodgy","stoic","stoical", +"stoicism","stoke","stokehold","stoker","stole", +"stolen","stolid","stomach","stomachache","stomachful", +"stomp","stone","stonebreaker","stonecutter","stoned", +"stoneless","stonemason","stonewall","stoneware","stonework", +"stony","stood","stooge","stool","stoolpigeon", +"stoop","stop","stopcock","stopgap","stopover", +"stoppage","stopper","stopping","stopwatch","storage", +"store","storehouse","storekeeper","storeroom","stores", +"storey","storied","stork","storm","stormbound", +"stormy","story","storybook","storyteller","stoup", +"stout","stouthearted","stove","stovepipe","stow", +"stowage","stowaway","straddle","stradivarius","strafe", +"straggle","straggly","straight","straightaway","straightedge", +"straighten","straightforward","straightway","strain","strained", +"strainer","strait","straitened","straitjacket","straitlaced", +"straits","strand","stranded","strange","stranger", +"strangle","stranglehold","strangulate","strangulation","strap", +"straphanging","strapless","strapping","strata","stratagem", +"strategic","strategist","strategy","stratification","stratify", 
+"stratosphere","stratum","straw","strawberry","strawboard", +"stray","streak","streaker","streaky","stream", +"streamer","streamline","streamlined","street","streetcar", +"streetwalker","strength","strengthen","strenuous","streptococcus", +"streptomycin","stress","stretch","stretcher","stretchy", +"strew","strewth","striated","striation","stricken", +"strict","stricture","stride","stridency","strident", +"stridulate","strife","strike","strikebound","strikebreaker", +"strikebreaking","striker","striking","string","stringency", +"stringent","strings","stringy","strip","stripe", +"striped","stripling","stripper","striptease","stripy", +"strive","strode","stroke","stroll","stroller", +"strolling","strong","strongarm","strongbox","stronghold", +"strontium","strop","strophe","stroppy","strove", +"struck","structural","structure","strudel","struggle", +"strum","strumpet","strung","strut","strychnine", +"stub","stubble","stubborn","stubby","stucco", +"stuck","stud","studbook","student","studied", +"studio","studious","study","stuff","stuffing", +"stuffy","stultify","stumble","stump","stumper", +"stumpy","stun","stung","stunk","stunner", +"stunning","stunt","stupefaction","stupefy","stupendous", +"stupid","stupidity","stupor","sturdy","sturgeon", +"stutter","sty","stye","stygian","style", +"stylise","stylish","stylist","stylistic","stylistics", +"stylize","stylus","stymie","styptic","suasion", +"suave","sub","subaltern","subatomic","subcommittee", +"subconscious","subcontinent","subcontract","subcontractor","subcutaneous", +"subdivide","subdue","subdued","subedit","subeditor", +"subheading","subhuman","subject","subjection","subjective", +"subjoin","subjugate","subjunctive","sublease","sublet", +"sublieutenant","sublimate","sublime","subliminal","submarine", +"submariner","submerge","submergence","submersible","submission", +"submissive","submit","subnormal","suborbital","subordinate", +"suborn","subplot","subpoena","subscribe","subscriber", 
+"subscription","subsequent","subservience","subservient","subside", +"subsidence","subsidiary","subsidise","subsidize","subsidy", +"subsist","subsistence","subsoil","subsonic","substance", +"substandard","substantial","substantially","substantiate","substantival", +"substantive","substation","substitute","substratum","substructure", +"subsume","subtenant","subtend","subterfuge","subterranean", +"subtitle","subtitles","subtle","subtlety","subtopia", +"subtract","subtraction","subtropical","suburb","suburban", +"suburbanite","suburbia","suburbs","subvention","subversive", +"subvert","subway","succeed","success","successful", +"succession","successive","successor","succinct","succor", +"succour","succubus","succulence","succulent","succumb", +"such","suchlike","suck","sucker","suckle", +"suckling","sucrose","suction","sudden","suds", +"sue","suet","suffer","sufferable","sufferance", +"sufferer","suffering","suffice","sufficiency","sufficient", +"suffix","suffocate","suffragan","suffrage","suffragette", +"suffuse","sugar","sugarcane","sugarcoated","sugarloaf", +"sugary","suggest","suggestible","suggestion","suggestive", +"suicidal","suicide","suit","suitability","suitable", +"suitcase","suiting","suitor","sulfate","sulfide", +"sulfur","sulfuret","sulfurous","sulk","sulks", +"sulky","sullen","sully","sulphate","sulphide", +"sulphur","sulphuret","sulphurous","sultan","sultana", +"sultanate","sultry","sum","sumac","sumach", +"summarise","summarize","summary","summat","summation", +"summer","summerhouse","summertime","summery","summit", +"summon","summons","sump","sumptuary","sumptuous", +"sun","sunbaked","sunbathe","sunbeam","sunblind", +"sunbonnet","sunburn","sunburnt","sundae","sunday", +"sundeck","sunder","sundew","sundial","sundown", +"sundowner","sundrenched","sundries","sundry","sunfish", +"sunflower","sung","sunglasses","sunk","sunken", +"sunlamp","sunless","sunlight","sunlit","sunny", +"sunray","sunrise","sunroof","sunset","sunshade", 
+"sunshine","sunspot","sunstroke","suntan","suntrap", +"sup","super","superabundance","superabundant","superannuate", +"superannuated","superannuation","superb","supercharged","supercharger", +"supercilious","superconductivity","superduper","superego","superficial", +"superficies","superfine","superfluity","superfluous","superhuman", +"superimpose","superintend","superintendent","superior","superlative", +"superlatively","superman","supermarket","supernal","supernatural", +"supernova","supernumerary","superscription","supersede","supersession", +"supersonic","superstar","superstition","superstitious","superstructure", +"supertax","supervene","supervise","supervisory","supine", +"supper","supplant","supple","supplement","supplementary", +"suppliant","supplicant","supplicate","supplier","supplies", +"supply","support","supportable","supporter","supportive", +"suppose","supposed","supposedly","supposing","supposition", +"suppository","suppress","suppression","suppressive","suppressor", +"suppurate","supranational","supremacist","supremacy","supreme", +"surcharge","surcoat","surd","sure","surefire", +"surefooted","surely","surety","surf","surface", +"surfboard","surfboat","surfeit","surfer","surge", +"surgeon","surgery","surgical","surly","surmise", +"surmount","surname","surpass","surpassing","surplice", +"surplus","surprise","surprising","surreal","surrealism", +"surrealist","surrealistic","surrender","surreptitious","surrey", +"surrogate","surround","surrounding","surroundings","surtax", +"surveillance","survey","surveyor","survival","survive", +"survivor","susceptibilities","susceptibility","susceptible","suspect", +"suspend","suspender","suspenders","suspense","suspension", +"suspicion","suspicious","sustain","sustenance","suttee", +"suture","suzerain","suzerainty","svelte","swab", +"swaddle","swag","swagger","swain","swallow", +"swallowtailed","swam","swami","swamp","swampy", +"swan","swank","swanky","swansdown","swansong", 
+"swap","sward","swarf","swarm","swarthy", +"swashbuckler","swashbuckling","swastika","swat","swatch", +"swath","swathe","swatter","sway","swayback", +"swear","swearword","sweat","sweatband","sweated", +"sweater","sweatshirt","sweatshop","sweaty","swede", +"sweep","sweeper","sweeping","sweepings","sweepstake", +"sweepstakes","sweet","sweetbread","sweetbriar","sweetbrier", +"sweeten","sweetener","sweetening","sweetheart","sweetie", +"sweetish","sweetmeat","sweets","swell","swelling", +"swelter","sweltering","swept","swerve","swift", +"swig","swill","swim","swimming","swimmingly", +"swindle","swine","swineherd","swing","swingeing", +"swinger","swinging","swinish","swipe","swirl", +"swish","switch","switchback","switchblade","switchboard", +"switchgear","switchman","swivel","swiz","swizzle", +"swollen","swoon","swoop","swop","sword", +"swordfish","swordplay","swordsman","swordsmanship","swordstick", +"swore","sworn","swot","swum","swung", +"sybarite","sybaritic","sycamore","sycophant","sycophantic", +"sylabub","syllabary","syllabic","syllabify","syllable", +"syllabub","syllabus","syllogism","syllogistic","sylph", +"sylphlike","sylvan","symbiosis","symbol","symbolic", +"symbolise","symbolism","symbolist","symbolize","symmetrical", +"symmetry","sympathetic","sympathies","sympathise","sympathize", +"sympathy","symphonic","symphony","symposium","symptom", +"symptomatic","synagogue","sync","synch","synchonise", +"synchromesh","synchronize","synchrotron","syncopate","syncope", +"syndic","syndicalism","syndicate","syndrome","synod", +"synonym","synonymous","synopsis","synoptic","syntactic", +"syntax","synthesis","synthesise","synthesiser","synthesize", +"synthesizer","synthetic","syphilis","syphilitic","syphon", +"syringe","syrup","syrupy","system","systematic", +"systematise","systematize","systemic","tab","tabard", +"tabasco","tabby","tabernacle","table","tableau", +"tablecloth","tableland","tablemat","tablespoon","tablespoonful", 
+"tablet","tableware","tabloid","taboo","tabor", +"tabular","tabulate","tabulator","tacit","taciturn", +"tack","tackiness","tackle","tacky","tact", +"tactic","tactical","tactician","tactics","tactile", +"tactual","tadpole","taffeta","taffrail","taffy", +"tag","tail","tailback","tailboard","tailcoat", +"taillight","tailor","tailpiece","tails","tailspin", +"tailwind","taint","take","takeaway","takeoff", +"takeover","taking","takings","talc","tale", +"talebearer","talent","talented","talisman","talk", +"talkative","talker","talkie","talks","tall", +"tallboy","tallow","tally","tallyho","tallyman", +"talmud","talon","tamale","tamarind","tamarisk", +"tambour","tambourine","tame","tammany","tamp", +"tamper","tampon","tan","tandem","tang", +"tangent","tangential","tangerine","tangible","tangle", +"tango","tank","tankard","tanker","tanner", +"tannery","tannin","tanning","tannoy","tansy", +"tantalise","tantalize","tantalus","tantamount","tantrum", +"taoism","tap","tape","taper","tapestry", +"tapeworm","tapioca","tapir","tappet","taproom", +"taproot","taps","tar","tarantella","tarantula", +"tarboosh","tardy","target","tariff","tarmac", +"tarn","tarnish","taro","tarot","tarpaulin", +"tarragon","tarry","tarsal","tarsus","tart", +"tartan","tartar","task","taskmaster","tassel", +"taste","tasteful","tasteless","taster","tasty", +"tat","tatas","tatter","tattered","tatters", +"tatting","tattle","tattoo","tattooist","tatty", +"taught","taunt","taurus","taut","tautological", +"tautology","tavern","tawdry","tawny","tawse", +"tax","taxation","taxi","taxidermist","taxidermy", +"taximeter","taxonomy","tea","teabag","teacake", +"teach","teacher","teaching","teacup","teacupful", +"teagarden","teahouse","teak","teakettle","teal", +"tealeaf","team","teamster","teamwork","teapot", +"tear","tearaway","teardrop","tearful","teargas", +"tearjerker","tearless","tearoom","tease","teasel", +"teaser","teaspoon","teaspoonful","teat","teatime", +"teazle","tech","technical","technicality","technician", 
+"technique","technocracy","technocrat","technological","technologist", +"technology","techy","tedious","tedium","tee", +"teem","teeming","teenage","teenager","teens", +"teenybopper","teeter","teeth","teethe","teetotal", +"teetotaler","teetotaller","teflon","tegument","tele", +"telecast","telecommunications","telegram","telegraph","telegrapher", +"telegraphese","telegraphic","telemarketing","telemeter","telemetry", +"teleology","telepathic","telepathist","telepathy","telephone", +"telephonist","telephony","telephotograph","telephotography","teleprinter", +"teleprompter","telescope","telescopic","televise","television", +"televisual","telex","telfer","tell","teller", +"telling","telltale","telly","telpher","telstar", +"temerity","temp","temper","tempera","temperament", +"temperamental","temperance","temperate","temperature","tempest", +"tempestuous","template","temple","templet","tempo", +"temporal","temporary","temporise","temporize","tempt", +"temptation","ten","tenable","tenacious","tenacity", +"tenancy","tenant","tenantry","tench","tend", +"tendency","tendentious","tender","tenderfoot","tenderhearted", +"tenderise","tenderize","tenderloin","tendon","tendril", +"tenement","tenet","tenner","tennis","tenon", +}; +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java new file mode 100644 index 00000000000..001a4657aa7 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java @@ -0,0 +1,614 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This algorithm is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. 
+ */ +package com.yahoo.language.simple.kstem; + +/** A list of words used by Kstem + */ +class KStemData8 { + private KStemData8() { + } + static String[] data = { +"tenor","tenpin","tense","tensile","tension", +"tent","tentacle","tentative","tenterhooks","tenuity", +"tenuous","tenure","tepee","tepid","tequila", +"tercentenary","tercentennial","term","termagant","terminable", +"terminal","terminate","termination","terminology","terminus", +"termite","terms","tern","terpsichorean","terrace", +"terracotta","terrain","terrapin","terrestrial","terrible", +"terribly","terrier","terrific","terrifically","terrify", +"territorial","territory","terror","terrorise","terrorism", +"terrorize","terrycloth","terse","tertian","tertiary", +"terylene","tessellated","test","testament","testamentary", +"testate","testator","tester","testicle","testify", +"testimonial","testimony","testis","testy","tetanus", +"tetchy","tether","teutonic","text","textbook", +"textile","textual","texture","thalidomide","than", +"thane","thank","thankful","thankless","thanks", +"thanksgiving","thankyou","that","thatch","thaw", +"the","theater","theatergoer","theatre","theatregoer", +"theatrical","theatricals","thee","theft","thegn", +"their","theirs","theism","them","theme", +"themselves","then","thence","thenceforth","theocracy", +"theocratic","theodolite","theologian","theology","theorem", +"theoretical","theoretically","theorise","theorist","theorize", +"theory","theosophy","therapeutic","therapeutics","therapist", +"therapy","there","thereabouts","thereafter","thereby", +"therefore","therein","thereinafter","thereof","thereon", +"thereto","thereunder","thereupon","therm","thermal", +"thermionic","thermionics","thermodynamics","thermometer","thermonuclear", +"thermoplastic","thermos","thermosetting","thermostat","thesaurus", +"these","thesis","thespian","thews","they", +"thick","thicken","thickener","thicket","thickheaded", +"thickness","thickset","thief","thieve","thieving", 
+"thievish","thigh","thimble","thimbleful","thin", +"thine","thing","thingamajig","thingamujig","things", +"think","thinkable","thinking","thinner","third", +"thirst","thirsty","thirteen","thirty","this", +"thistle","thistledown","thither","thole","thong", +"thorax","thorn","thorny","thorough","thoroughbred", +"thoroughfare","thoroughgoing","those","thou","though", +"thought","thoughtful","thoughtless","thousand","thraldom", +"thrall","thralldom","thrash","thrashing","thread", +"threadbare","threadlike","threat","threaten","three", +"threepence","threnody","thresh","thresher","threshold", +"threw","thrice","thrift","thrifty","thrill", +"thriller","thrive","throat","throaty","throb", +"throes","thrombosis","throne","throng","throstle", +"throttle","through","throughout","throughput","throughway", +"throw","throwaway","throwback","thru","thrum", +"thrush","thrust","thruster","thruway","thud", +"thug","thuggery","thumb","thumbnail","thumbscrew", +"thumbtack","thump","thumping","thunder","thunderbolt", +"thunderclap","thundercloud","thundering","thunderous","thunderstorm", +"thunderstruck","thundery","thurible","thursday","thus", +"thwack","thwart","thy","thyme","thyroid", +"thyself","tiara","tibia","tic","tick", +"ticker","tickertape","ticket","ticking","tickle", +"tickler","ticklish","tidal","tidbit","tiddler", +"tiddley","tiddleywinks","tiddly","tiddlywinks","tide", +"tidemark","tidewater","tideway","tidings","tidy", +"tie","tiebreaker","tiepin","tier","tiff", +"tiffin","tig","tiger","tigerish","tight", +"tighten","tightfisted","tightrope","tights","tightwad", +"tigress","tike","tilde","tile","till", +"tillage","tiller","tilt","timber","timbered", +"timberline","timbre","timbrel","time","timekeeper", +"timeless","timely","timepiece","timer","times", +"timesaving","timeserver","timeserving","timetable","timework", +"timeworn","timid","timing","timorous","timothy", +"timpani","timpanist","tin","tincture","tinder", +"tinderbox","tinfoil","ting","tingaling","tinge", 
+"tingle","tinker","tinkle","tinny","tinplate", +"tinsel","tint","tintack","tintinnabulation","tiny", +"tip","tippet","tipple","tipstaff","tipster", +"tipsy","tiptoe","tirade","tire","tired", +"tireless","tiresome","tiro","tissue","tit", +"titan","titanic","titanium","titbit","titfer", +"tithe","titillate","titivate","title","titled", +"titleholder","titmouse","titter","tittivate","tittle", +"titty","titular","tizzy","tnt","toad", +"toadstool","toady","toast","toaster","toastmaster", +"tobacco","tobacconist","toboggan","toccata","tocsin", +"tod","today","toddle","toddler","toddy", +"toe","toehold","toenail","toff","toffee", +"toffy","tog","toga","together","togetherness", +"toggle","togs","toil","toilet","toiletries", +"toiletry","toils","tokay","token","told", +"tolerable","tolerably","tolerance","tolerant","tolerate", +"toleration","toll","tollgate","tollhouse","tomahawk", +"tomato","tomb","tombola","tomboy","tombstone", +"tomcat","tome","tomfoolery","tommyrot","tomorrow", +"tomtit","ton","tonal","tonality","tone", +"toneless","tong","tongs","tongue","tonic", +"tonight","tonnage","tonne","tonsil","tonsilitis", +"tonsillitis","tonsorial","tonsure","tontine","too", +"took","tool","toot","tooth","toothache", +"toothbrush","toothcomb","toothpaste","toothpick","toothsome", +"toothy","tootle","toots","tootsie","top", +"topaz","topcoat","topdressing","topee","topgallant", +"topi","topiary","topic","topical","topicality", +"topknot","topless","topmast","topmost","topographer", +"topographical","topography","topper","topping","topple", +"tops","topsail","topside","topsoil","topspin", +"toque","tor","torch","torchlight","tore", +"toreador","torment","tormentor","torn","tornado", +"torpedo","torpid","torpor","torque","torrent", +"torrential","torrid","torsion","torso","tort", +"tortilla","tortoise","tortoiseshell","tortuous","torture", +"tory","toss","tot","total","totalisator", +"totalitarian","totalitarianism","totality","totalizator","tote", 
+"totem","totter","tottery","toucan","touch", +"touchdown","touched","touching","touchline","touchstone", +"touchy","tough","toughen","toupee","tour", +"tourism","tourist","tournament","tourney","tourniquet", +"tousle","tout","tow","towards","towel", +"toweling","towelling","tower","towering","towline", +"town","townscape","township","townsman","townspeople", +"towpath","toxaemia","toxemia","toxic","toxicologist", +"toxicology","toxin","toy","toyshop","trace", +"tracer","tracery","trachea","trachoma","tracing", +"track","trackless","tracksuit","tract","tractable", +"traction","tractor","trad","trade","trademark", +"trader","trades","tradesman","tradespeople","tradition", +"traditional","traditionalism","traduce","traffic","trafficator", +"trafficker","tragedian","tragedienne","tragedy","tragic", +"tragicomedy","trail","trailer","train","trainbearer", +"trainee","training","trainman","traipse","trait", +"traitor","traitorous","trajectory","tram","tramline", +"trammel","trammels","tramp","trample","trampoline", +"trance","tranny","tranquil","tranquiliser","tranquillise", +"tranquillize","tranquillizer","transact","transaction","transactions", +"transalpine","transatlantic","transcend","transcendence","transcendent", +"transcendental","transcendentalism","transcontinental","transcribe","transcript", +"transcription","transept","transfer","transference","transfiguration", +"transfigure","transfix","transform","transformation","transformer", +"transfuse","transgress","tranship","transience","transient", +"transistor","transistorise","transistorize","transit","transition", +"transitive","translate","translator","transliterate","translucence", +"translucent","transmigration","transmission","transmit","transmitter", +"transmogrify","transmute","transoceanic","transom","transparency", +"transparent","transpiration","transpire","transplant","transpolar", +"transport","transportation","transporter","transpose","transship", 
+"transubstantiation","transverse","transvestism","transvestite","trap", +"trapdoor","trapeze","trapezium","trapezoid","trapper", +"trappings","trappist","trapse","trapshooting","trash", +"trashcan","trashy","trauma","traumatic","travail", +"travel","traveled","traveler","travelled","traveller", +"travelog","travelogue","travels","travelsick","traverse", +"travesty","trawl","trawler","tray","treacherous", +"treachery","treacle","treacly","tread","treadle", +"treadmill","treason","treasonable","treasure","treasurer", +"treasury","treat","treatise","treatment","treaty", +"treble","tree","trefoil","trek","trellis", +"tremble","tremendous","tremolo","tremor","tremulous", +"trench","trenchant","trencher","trencherman","trend", +"trendsetter","trendy","trepan","trephine","trepidation", +"trespass","tresses","trestle","trews","triad", +"trial","triangle","triangular","tribal","tribalism", +"tribe","tribesman","tribulation","tribunal","tribune", +"tributary","tribute","trice","triceps","trichinosis", +"trick","trickery","trickle","trickster","tricky", +"tricolor","tricolour","tricycle","trident","triennial", +"trier","trifle","trifler","trifling","trigger", +"trigonometry","trike","trilateral","trilby","trilingual", +"trill","trillion","trilobite","trilogy","trim", +"trimaran","trimester","trimmer","trimming","trinitrotoluene", +"trinity","trinket","trio","trip","tripartite", +"triple","triplet","triplex","triplicate","tripod", +"tripos","tripper","tripping","triptych","tripwire", +"trireme","trisect","trite","triumph","triumphal", +"triumphant","triumvir","triumvirate","trivet","trivia", +"trivial","trivialise","triviality","trivialize","trochaic", +"trochee","trod","trodden","troglodyte","troika", +"trojan","troll","trolley","trolleybus","trollop", +"trombone","trombonist","troop","trooper","troops", +"troopship","trope","trophy","tropic","tropical", +"tropics","trot","troth","trotskyist","trotter", +"troubadour","trouble","troublemaker","troubleshooter","troublesome", 
+"trough","trounce","troupe","trouper","trouser", +"trousers","trousseau","trout","trove","trowel", +"truancy","truant","truce","truck","trucking", +"truckle","truculence","truculent","trudge","true", +"trueborn","truehearted","truelove","truffle","trug", +"truism","truly","trump","trumpery","trumpet", +"trumps","truncate","truncheon","trundle","trunk", +"trunks","truss","trust","trustee","trusteeship", +"trustful","trustworthy","trusty","truth","truthful", +"try","tryst","tsar","tsarina","tsp", +"tub","tuba","tubby","tube","tubeless", +"tuber","tubercular","tuberculosis","tubful","tubing", +"tubular","tuck","tucker","tuckerbag","tuesday", +"tuft","tug","tugboat","tuition","tulip", +"tulle","tumble","tumbledown","tumbler","tumbleweed", +"tumbrel","tumbril","tumescent","tumid","tummy", +"tumor","tumour","tumult","tumultuous","tumulus", +"tun","tuna","tundra","tune","tuneful", +"tuneless","tuner","tungsten","tunic","tunnel", +"tunny","tup","tuppence","tuppenny","turban", +"turbid","turbine","turbojet","turboprop","turbot", +"turbulence","turbulent","turd","tureen","turf", +"turgid","turkey","turmeric","turmoil","turn", +"turnabout","turncoat","turncock","turner","turning", +"turnip","turnkey","turnout","turnover","turnpike", +"turnstile","turntable","turpentine","turpitude","turquoise", +"turret","turtle","turtledove","turtleneck","tush", +"tusk","tusker","tussle","tussock","tut", +"tutelage","tutelary","tutor","tutorial","tutu", +"tuxedo","twaddle","twain","twang","twat", +"tweak","twee","tweed","tweeds","tweedy", +"tweet","tweeter","tweezers","twelfth","twelve", +"twelvemonth","twenty","twerp","twice","twiddle", +"twig","twilight","twill","twin","twinge", +"twinkle","twinkling","twirl","twirp","twist", +"twister","twit","twitch","twitter","twixt", +"two","twofaced","twopence","twopenny","twosome", +"tycoon","tyke","tympanum","type","typecast", +"typeface","typescript","typesetter","typewriter","typewritten", +"typhoid","typhoon","typhus","typical","typically", 
+"typify","typist","typographer","typographic","typography", +"tyrannical","tyrannise","tyrannize","tyrannosaurus","tyranny", +"tyrant","tyre","tyro","tzar","tzarina", +"ubiquitous","ucca","udder","ufo","ugh", +"ugly","uhf","ukulele","ulcer","ulcerate", +"ulcerous","ullage","ulna","ult","ulterior", +"ultimate","ultimately","ultimatum","ultimo","ultramarine", +"ultrasonic","ultraviolet","umber","umbrage","umbrella", +"umlaut","umpire","umpteen","unabashed","unabated", +"unable","unabridged","unaccompanied","unaccountable","unaccustomed", +"unadopted","unadulterated","unadvised","unaffected","unalloyed", +"unanimous","unannounced","unanswerable","unapproachable","unarmed", +"unasked","unassuming","unattached","unattended","unavailing", +"unawares","unbalance","unbar","unbearable","unbearably", +"unbeknown","unbelief","unbelievable","unbeliever","unbelieving", +"unbend","unbending","unbidden","unbind","unblushing", +"unborn","unbosom","unbounded","unbowed","unbridled", +"unbuckle","unburden","unbuttoned","uncanny","unceremonious", +"uncertain","uncertainty","uncharitable","uncharted","unchecked", +"unchristian","unclad","uncle","unclean","unclouded", +"uncolored","uncoloured","uncomfortable","uncommitted","uncommonly", +"uncompromising","unconcerned","unconditional","unconscionable","unconscious", +"unconsidered","uncork","uncouple","uncouth","uncover", +"uncritical","uncrowned","uncrushable","unction","unctuous", +"uncut","undaunted","undeceive","undecided","undeclared", +"undeniable","under","underact","underarm","underbelly", +"underbrush","undercarriage","undercharge","underclothes","undercoat", +"undercover","undercurrent","undercut","underdog","underdone", +"underestimate","underfelt","underfloor","underfoot","undergarment", +"undergo","undergraduate","underground","undergrowth","underhand", +"underhanded","underhung","underlay","underlie","underline", +"underling","underlying","undermanned","undermentioned","undermine", 
+"underneath","undernourish","underpants","underpass","underpin", +"underplay","underprivileged","underproof","underquote","underrate", +"underscore","undersecretary","undersell","undersexed","undershirt", +"underside","undersigned","undersized","underslung","understaffed", +"understand","understanding","understate","understatement","understudy", +"undertake","undertaker","undertaking","undertone","undertow", +"underwater","underwear","underweight","underwent","underworld", +"underwrite","underwriter","undesirable","undeveloped","undies", +"undischarged","undistinguished","undivided","undo","undoing", +"undomesticated","undone","undoubted","undress","undressed", +"undue","undulate","undulation","unduly","undying", +"unearth","unearthly","unease","uneasy","uneconomic", +"uneducated","unemployed","unemployment","unenlightened","unenviable", +"unequal","unequaled","unequalled","unequivocal","unerring", +"unesco","uneven","uneventful","unexampled","unexceptionable", +"unfailing","unfaithful","unfaltering","unfathomable","unfathomed", +"unfavorable","unfavourable","unfeeling","unfettered","unfit", +"unflagging","unflappable","unflinching","unfold","unforeseen", +"unforgettable","unfortunate","unfortunately","unfounded","unfrequented", +"unfrock","unfurl","ungainly","ungenerous","ungodly", +"ungovernable","ungracious","ungrateful","ungrudging","unguarded", +"unguent","unhallowed","unhand","unhappily","unhappy", +"unhealthy","unheard","unhinge","unholy","unhook", +"unhorse","unicef","unicorn","unidentified","unification", +"uniform","uniformed","unify","unilateral","unimpeachable", +"uninformed","uninhabitable","uninhibited","uninterested","uninterrupted", +"union","unionise","unionism","unionist","unionize", +"unique","unisex","unison","unit","unitarian", +"unite","united","unity","universal","universally", +"universe","university","unkempt","unkind","unkindly", +"unknowing","unknown","unlawful","unlearn","unleash", 
+"unleavened","unless","unlettered","unlike","unlikely", +"unload","unlock","unloose","unloosen","unmade", +"unmannerly","unmarried","unmask","unmatched","unmeasured", +"unmentionable","unmentionables","unmindful","unmistakable","unmitigated", +"unmoved","unnatural","unnecessary","unnerve","unnumbered", +"uno","unobtrusive","unofficial","unorthodox","unpack", +"unparalleled","unparliamentary","unperson","unpick","unplaced", +"unplayable","unpleasant","unplumbed","unpracticed","unpractised", +"unprecedented","unprejudiced","unpretentious","unprincipled","unprintable", +"unprofessional","unprompted","unprovoked","unqualified","unquestionable", +"unquestioning","unquiet","unquote","unravel","unreadable", +"unreal","unreasonable","unreasoning","unrelenting","unrelieved", +"unremitting","unrequited","unreserved","unrest","unrestrained", +"unrip","unrivaled","unrivalled","unroll","unruffled", +"unruly","unsaddle","unsaid","unsavory","unsavoury", +"unsay","unscathed","unschooled","unscramble","unscrew", +"unscripted","unscrupulous","unseat","unseeing","unseemly", +"unseen","unserviceable","unsettle","unsettled","unsex", +"unsexed","unshakable","unshakeable","unshod","unsightly", +"unskilled","unsociable","unsocial","unsophisticated","unsound", +"unsparing","unspeakable","unspotted","unstop","unstrung", +"unstuck","unstudied","unsullied","unsung","unswerving", +"untangle","untapped","untenable","unthinkable","unthinking", +"untie","until","untimely","untinged","untiring", +"unto","untold","untouchable","untoward","untruth", +"untruthful","untutored","unused","unusual","unusually", +"unutterable","unvarnished","unveil","unversed","unvoiced", +"unwarranted","unwed","unwell","unwieldy","unwind", +"unwitting","unwonted","unzip","upbeat","upbraid", +"upbringing","upcoming","update","upend","upgrade", +"upheaval","uphill","uphold","upholster","upholsterer", +"upholstery","upkeep","upland","uplift","upon", +"upper","uppercut","uppermost","uppish","uppity", 
+"upright","uprising","uproar","uproarious","uproot", +"upset","upshot","upstage","upstairs","upstanding", +"upstart","upstream","upsurge","upswing","uptake", +"uptight","uptown","upturn","upturned","upward", +"upwards","uranium","uranus","urban","urbane", +"urbanise","urbanize","urchin","urge","urgent", +"uric","urinal","urinary","urinate","urine", +"urn","usage","use","useful","usefulness", +"useless","user","usher","usherette","ussr", +"usual","usually","usurer","usurious","usurp", +"usury","utensil","uterine","uterus","utilise", +"utilitarian","utilitarianism","utility","utilize","utmost", +"utopia","utopian","utter","utterance","utterly", +"uvula","uvular","uxorious","vac","vacancy", +"vacant","vacate","vacation","vaccinate","vaccination", +"vaccine","vacillate","vacuity","vacuous","vacuum", +"vagabond","vagary","vagina","vaginal","vagrancy", +"vagrant","vague","vain","vainglorious","vainglory", +"valance","vale","valediction","valedictory","valency", +"valentine","valerian","valet","valetudinarian","valiant", +"valiantly","valid","validate","valise","valley", +"valor","valour","valse","valuable","valuation", +"value","valuer","valve","valvular","vamoose", +"vamp","vampire","van","vanadium","vandal", +"vandalise","vandalism","vandalize","vane","vanguard", +"vanilla","vanish","vanity","vanquish","vantagepoint", +"vapid","vapidity","vapor","vaporise","vaporize", +"vaporous","vapors","vapour","vapours","variability", +"variable","variance","variant","variation","varicolored", +"varicoloured","varicose","varied","variegated","variegation", +"variety","variform","variorum","various","variously", +"varlet","varmint","varnish","varsity","vary", +"vascular","vase","vasectomy","vaseline","vassal", +"vassalage","vast","vastly","vastness","vat", +"vatican","vaudeville","vault","vaulted","vaulting", +"vaunt","veal","vector","veer","veg", +"vegan","vegetable","vegetarian","vegetarianism","vegetate", +"vegetation","vehement","vehicle","vehicular","veil", 
+"veiled","vein","veined","veining","velar", +"velarize","veld","veldt","vellum","velocipede", +"velocity","velour","velours","velvet","velveteen", +"velvety","venal","vend","vendee","vender", +"vendetta","vendor","veneer","venerable","venerate", +"venereal","vengeance","vengeful","venial","venison", +"venom","venomous","venous","vent","ventilate", +"ventilation","ventilator","ventricle","ventriloquism","ventriloquist", +"venture","venturer","venturesome","venue","veracious", +"veracity","veranda","verandah","verb","verbal", +"verbalise","verbalize","verbally","verbatim","verbena", +"verbiage","verbose","verbosity","verdant","verdict", +"verdigris","verdure","verge","verger","verify", +"verily","verisimilitude","veritable","verity","vermicelli", +"vermiculite","vermiform","vermifuge","vermilion","vermin", +"verminous","vermouth","vernacular","vernal","veronal", +"veronica","verruca","versatile","verse","versed", +"versification","versify","version","verso","versus", +"vertebra","vertebrate","vertex","vertical","vertiginous", +"vertigo","verve","very","vesicle","vesicular", +"vesper","vespers","vessel","vest","vestibule", +"vestige","vestigial","vestment","vestry","vestryman", +"vesture","vet","vetch","veteran","veterinary", +"veto","vex","vexation","vexatious","vhf", +"via","viable","viaduct","vial","viands", +"vibes","vibrancy","vibrant","vibraphone","vibrate", +"vibration","vibrato","vibrator","vicar","vicarage", +"vicarious","vice","vicelike","viceregal","vicereine", +"viceroy","vicinity","vicious","vicissitudes","victim", +"victimise","victimize","victor","victorian","victorious", +"victory","victual","victualer","victualler","victuals", +"vicuaa","vicuana","vide","videlicet","video", +"videotape","vie","view","viewer","viewfinder", +"viewless","viewpoint","vigil","vigilance","vigilant", +"vigilante","vignette","vigor","vigorous","vigour", +"viking","vile","vilification","vilify","villa", +"village","villager","villain","villainies","villainous", 
+"villainy","villein","villeinage","villenage","vim", +"vinaigrette","vindicate","vindication","vindictive","vine", +"vinegar","vinegary","vinery","vineyard","vino", +"vinous","vintage","vintner","vinyl","viol", +"viola","violate","violence","violent","violet", +"violin","violoncello","vip","viper","virago", +"virgin","virginal","virginals","virginia","virginity", +"virgo","virgule","virile","virility","virologist", +"virology","virtu","virtual","virtually","virtue", +"virtuosity","virtuoso","virtuous","virulence","virulent", +"virus","visa","visage","viscera","visceral", +"viscosity","viscount","viscountcy","viscountess","viscous", +"vise","visibility","visible","visibly","vision", +"visionary","visit","visitant","visitation","visiting", +"visitor","visor","vista","visual","visualise", +"visualize","visually","vital","vitalise","vitality", +"vitalize","vitally","vitals","vitamin","vitiate", +"viticulture","vitreous","vitrify","vitriol","vitriolic", +"vituperate","vituperation","vituperative","vivace","vivacious", +"vivarium","vivid","viviparous","vivisect","vivisection", +"vivisectionist","vixen","vixenish","vizier","vocab", +"vocabulary","vocal","vocalise","vocalist","vocalize", +"vocation","vocational","vocative","vociferate","vociferation", +"vociferous","vodka","vogue","voice","voiceless", +"void","voile","vol","volatile","volcanic", +"volcano","vole","volition","volitional","volley", +"volleyball","volt","voltage","voluble","volume", +"volumes","voluminous","voluntary","volunteer","voluptuary", +"voluptuous","volute","vomit","voodoo","voracious", +"vortex","votary","vote","voter","votive", +"vouch","voucher","vouchsafe","vow","vowel", +"voyage","voyager","voyages","voyeur","vtol", +"vulcanise","vulcanite","vulcanize","vulgar","vulgarian", +"vulgarise","vulgarism","vulgarity","vulgarize","vulgate", +"vulnerable","vulpine","vulture","vulva","wac", +"wack","wacky","wad","wadding","waddle", +"wade","wader","wadge","wadi","wady", 
+"wafer","waffle","waft","wag","wage", +"wager","wages","waggery","waggish","waggle", +"waggon","waggoner","waggonette","wagon","wagoner", +"wagonette","wagtail","waif","wail","wain", +"wainscot","waist","waistband","waistcoat","waistline", +"wait","waiter","waits","waive","waiver", +"wake","wakeful","waken","waking","walk", +"walkabout","walkaway","walker","walking","walkout", +"walkover","wall","walla","wallaby","wallah", +"wallet","wallflower","wallop","walloping","wallow", +"wallpaper","walnut","walrus","waltz","wampum", +"wan","wand","wander","wanderer","wandering", +"wanderings","wanderlust","wane","wangle","wank", +"wanker","want","wanting","wanton","wants", +"wapiti","war","warble","warbler","ward", +"warden","warder","wardrobe","wardroom","warehouse", +"wares","warfare","warhead","warhorse","warily", +"warlike","warlock","warlord","warm","warmonger", +"warmth","warn","warning","warp","warpath", +"warrant","warrantee","warrantor","warranty","warren", +"warrior","warship","wart","warthog","wartime", +"wary","was","wash","washable","washbasin", +"washboard","washbowl","washcloth","washday","washer", +"washerwoman","washhouse","washing","washout","washroom", +"washstand","washwoman","washy","wasp","waspish", +"wassail","wast","wastage","waste","wasteful", +"waster","wastrel","watch","watchband","watchdog", +"watches","watchful","watchmaker","watchman","watchtower", +"watchword","water","waterborne","watercolor","watercolour", +"watercourse","watercress","waterfall","waterfowl","waterfront", +"waterhole","waterline","waterlogged","waterloo","waterman", +"watermark","watermelon","watermill","waterpower","waterproof", +"waters","watershed","waterside","waterspout","watertight", +"waterway","waterwheel","waterwings","waterworks","watery", +"watt","wattage","wattle","wave","wavelength", +"waver","wavy","wax","waxen","waxworks", +"waxy","way","waybill","wayfarer","wayfaring", +"waylay","ways","wayside","wayward","weak", 
+"weaken","weakling","weakness","weal","weald", +"wealth","wealthy","wean","weapon","weaponry", +"wear","wearing","wearisome","weary","weasel", +"weather","weatherboard","weathercock","weatherglass","weatherman", +"weatherproof","weathers","weave","weaver","web", +"webbed","webbing","wed","wedded","wedding", +"wedge","wedged","wedgwood","wedlock","wednesday", +"wee","weed","weeds","weedy","week", +"weekday","weekend","weekender","weekly","weeknight", +"weeny","weep","weeping","weepy","weevil", +"weft","weigh","weighbridge","weight","weighted", +"weighting","weightless","weighty","weir","weird", +"weirdie","weirdo","welch","welcome","weld", +"welder","welfare","welkin","well","wellbeing", +"wellborn","wellington","wellspring","welsh","welt", +"weltanschauung","welter","welterweight","wen","wench", +"wend","wensleydale","went","wept","were", +"werewolf","wert","wesleyan","west","westbound", +"westerly","western","westerner","westernise","westernize", +"westernmost","westward","westwards","wet","wether", +"wetting","whack","whacked","whacker","whacking", +"whale","whalebone","whaler","whaling","wham", +"wharf","what","whatever","whatnot","wheat", +"wheaten","wheedle","wheel","wheelbarrow","wheelbase", +"wheelchair","wheelhouse","wheeling","wheels","wheelwright", +"wheeze","wheezy","whelk","whelp","when", +"whence","whenever","where","whereabouts","whereas", +"whereat","whereby","wherefore","wherefores","wherein", +"whereof","whereon","wheresoever","whereto","whereupon", +"wherever","wherewithal","wherry","whet","whether", +"whetstone","whew","whey","which","whichever", +"whiff","whiffy","whig","while","whim", +"whimper","whimsey","whimsical","whimsicality","whimsy", +"whin","whine","whiner","whinny","whip", +"whipcord","whiplash","whippersnapper","whippet","whipping", +"whippoorwill","whippy","whir","whirl","whirligig", +"whirlpool","whirlwind","whirlybird","whirr","whisk", +"whisker","whiskered","whiskers","whiskey","whisky", 
+"whisper","whist","whistle","whit","white", +"whitebait","whitehall","whiten","whitening","whites", +"whitethorn","whitethroat","whitewash","whither","whiting", +"whitlow","whitsun","whitsuntide","whittle","whiz", +"whizz","who","whoa","whodunit","whoever", +"whole","wholemeal","wholesale","wholesaler","wholesome", +"wholly","whom","whoop","whoopee","whoosh", +"whop","whopper","whopping","whore","whorehouse", +"whoremonger","whorl","whortleberry","whose","whosoever", +"why","whys","wick","wicked","wicker", +"wickerwork","wicket","wide","widely","widen", +"widespread","widgeon","widow","widowed","widower", +"widowhood","width","wield","wife","wifely", +"wig","wigged","wigging","wiggle","wight", +"wigwam","wilco","wild","wildcat","wildebeest", +"wilderness","wildfire","wildfowl","wildlife","wildly", +"wile","wiles","wilful","wiliness","will", +"willful","willies","willing","willow","willowy", +"willpower","wilt","wily","wimple","wimpy", +"win","wince","winceyette","winch","wind", +"windbag","windbreak","windcheater","windfall","windily", +"winding","windjammer","windlass","windless","windmill", +"window","windowpane","windowsill","windpipe","windscreen", +"windshield","windsock","windstorm","windswept","windward", +"windy","wine","winebibbing","wineglass","winepress", +"wineskin","wing","winger","wings","wingspan", +"wink","winkers","winkle","winner","winning", +"winnings","winnow","winsome","winter","wintergreen", +"wintertime","wintry","wipe","wiper","wire", +"wirecutters","wireless","wiretap","wireworm","wiring", +"wiry","wisdom","wise","wisecrack","wish", +"wishbone","wisp","wispy","wisteria","wistful", +"wit","witch","witchcraft","witchdoctor","witchery", +"witching","with","withal","withdraw","withdrawal", +"withdrawn","withe","wither","withering","withers", +"withhold","within","without","withstand","withy", +"witless","witness","witticism","witting","witty", +"wives","wizard","wizardry","wizened","woad", +"wobble","wobbly","woe","woebegone","woeful", 
+"wog","woke","woken","wold","wolf", +"wolfhound","wolfram","wolfsbane","woman","womanhood", +"womanise","womanish","womanize","womankind","womanly", +"womb","wombat","womenfolk","won","wonder", +"wonderful","wonderland","wonderment","wonders","wondrous", +"wonky","wont","wonted","woo","wood", +"woodbine","woodblock","woodcock","woodcraft","woodcut", +"woodcutter","wooded","wooden","woodenheaded","woodland", +"woodlouse","woodpecker","woodpile","woodshed","woodsman", +"woodwind","woodwork","woodworm","woody","wooer", +"woof","woofer","wool","woolen","woolens", +"woolgather","woolgathering","woollen","woollens","woolly", +"woolsack","woozy","wop","word","wording", +"wordless","wordplay","words","wordy","wore", +"work","workable","workaday","workbag","workbasket", +"workbench","workbook","workday","worker","workhorse", +"workhouse","working","workings","workman","workmanlike", +"workmanship","workout","workpeople","workroom","works", +"workshop","worktop","world","worldly","worldshaking", +"worldwide","worm","wormhole","wormwood","wormy", +"worn","worried","worrisome","worry","worse", +"worsen","worship","worshipful","worst","worsted", +"wort","worth","worthless","worthwhile","worthy", +"wot","wotcher","would","wouldst","wound", +"wove","woven","wow","wrac","wrack", +"wraith","wrangle","wrangler","wrap","wrapper", +"wrapping","wrath","wreak","wreath","wreathe", +"wreck","wreckage","wrecker","wren","wrench", +"wrest","wrestle","wretch","wretched","wriggle", +"wright","wring","wringer","wrinkle","wrist", +"wristband","wristlet","wristwatch","wristy","writ", +"write","writer","writhe","writing","writings", +"written","wrong","wrongdoing","wrongful","wrongheaded", +"wrote","wroth","wrought","wrung","wry", +"wurst","wyvern","xenon","xenophobia","xerox", +"xylophone","yacht","yachting","yachtsman","yahoo", +"yak","yam","yammer","yang","yank", +"yankee","yap","yard","yardage","yardarm", +"yardstick","yarn","yarrow","yashmak","yaw", +"yawl","yawn","yaws","yea","yeah", 
+"year","yearbook","yearling","yearlong","yearly", +"yearn","yearning","years","yeast","yeasty", +"yell","yellow","yelp","yen","yeoman", +"yeomanry","yes","yesterday","yet","yeti", +"yew","yid","yiddish","yield","yielding", +"yin","yippee","yobbo","yodel","yoga", +"yoghurt","yogi","yogurt","yoke","yokel", +"yolk","yonder","yonks","yore","yorker", +"you","young","younger","youngster","your", +"yours","yourself","youth","youthful","yowl", +"yoyo","yucca","yule","yuletide","zany", +"zeal","zealot","zealotry","zealous","zebra", +"zebu","zed","zeitgeist","zen","zenana", +"zenith","zephyr","zeppelin","zero","zest", +"ziggurat","zigzag","zinc","zinnia","zionism", +"zip","zipper","zippy","zither","zizz", +"zodiac","zombi","zombie","zonal","zone", +"zoning","zonked","zoo","zoologist","zoology", +"zoom","zoophyte","zouave","zucchini","zulu", +}; +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java new file mode 100644 index 00000000000..9169a1d335f --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java @@ -0,0 +1,1426 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/* + * This algorithm is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed + * under the terms of the Apache License, Version 2.0, which was adapted from + * the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts + * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license. + */ +package com.yahoo.language.simple.kstem; + +/** + * A stemmer implementing the Kstem algorithm by Bob Krovetz. 
+ */ +public class KStemmer { + + static private final int MaxWordLen = 50; + + static private final String[] exceptionWords = {"aide", "bathe", "caste", + "cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage", + "grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane", + "pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune", + "sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite", + "swinge", "tare", "tine", "tope", "tripe", "twine"}; + + static private final String[][] directConflations = { {"aging", "age"}, + {"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"}, + {"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"}, + {"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"}, + {"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"}, + {"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"}, + {"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"}, + {"aeronautical", "aeronautics"}, {"mathematical", "mathematics"}, + {"political", "politics"}, {"metaphysical", "metaphysics"}, + {"cylindrical", "cylinder"}, {"nazism", "nazi"}, + {"ambiguity", "ambiguous"}, {"barbarity", "barbarous"}, + {"credulity", "credulous"}, {"generosity", "generous"}, + {"spontaneity", "spontaneous"}, {"unanimity", "unanimous"}, + {"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}}; + + static private final String[][] countryNationality = { + {"afghan", "afghanistan"}, {"african", "africa"}, + {"albanian", "albania"}, {"algerian", "algeria"}, + {"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"}, + {"arabian", "arabia"}, {"argentine", "argentina"}, + {"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"}, + {"austrian", "austria"}, {"azerbaijani", "azerbaijan"}, + {"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"}, + {"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"}, + 
{"bosnian", "bosnia"}, {"botswanan", "botswana"}, + {"brazilian", "brazil"}, {"british", "britain"}, + {"bulgarian", "bulgaria"}, {"burmese", "burma"}, + {"californian", "california"}, {"cambodian", "cambodia"}, + {"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"}, + {"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"}, + {"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"}, + {"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"}, + {"egyptian", "egypt"}, {"equadorian", "equador"}, + {"eritrean", "eritrea"}, {"estonian", "estonia"}, + {"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"}, + {"filipino", "philippines"}, {"finnish", "finland"}, + {"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"}, + {"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"}, + {"grenadan", "grenada"}, {"guamian", "guam"}, + {"guatemalan", "guatemala"}, {"guinean", "guinea"}, + {"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"}, + {"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"}, + {"icelandic", "iceland"}, {"indonesian", "indonesia"}, + {"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"}, + {"irish", "ireland"}, {"israeli", "israel"}, + {"italian", "italy"}, + {"jamaican", "jamaica"}, + {"japanese", "japan"}, + {"jordanian", "jordan"}, + {"kampuchean", "cambodia"}, + {"kenyan", "kenya"}, + {"korean", "korea"}, + {"kuwaiti", "kuwait"}, + {"lankan", "lanka"}, + {"laotian", "laos"}, + {"latvian", "latvia"}, + {"lebanese", "lebanon"}, + {"liberian", "liberia"}, + {"libyan", "libya"}, + {"lithuanian", "lithuania"}, + {"macedonian", "macedonia"}, + {"madagascan", "madagascar"}, + {"malaysian", "malaysia"}, + {"maltese", "malta"}, + {"mauritanian", "mauritania"}, + {"mexican", "mexico"}, + {"micronesian", "micronesia"}, + {"moldovan", "moldova"}, + {"monacan", "monaco"}, + {"mongolian", "mongolia"}, + {"montenegran", "montenegro"}, + {"moroccan", 
"morocco"}, + {"myanmar", "burma"}, + {"namibian", "namibia"}, + {"nepalese", "nepal"}, + // {"netherlands", "dutch"}, + {"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"}, + {"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"}, + {"panamanian", "panama"}, {"papuan", "papua"}, + {"paraguayan", "paraguay"}, {"peruvian", "peru"}, + {"portuguese", "portugal"}, {"romanian", "romania"}, + {"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"}, + {"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"}, + {"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"}, + {"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"}, + {"slovenian", "slovenia"}, {"somali", "somalia"}, + {"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"}, + {"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"}, + {"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"}, + {"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"}, + {"ukrainian", "ukraine"}, {"uruguayan", "uruguay"}, + {"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"}, + {"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"}, + {"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"}, + {"zambian", "zambia"}, {"zealander", "zealand"}, + {"zimbabwean", "zimbabwe"}}; + + static private final String[] supplementDict = {"aids", "applicator", + "capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere", + "extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph", + "toroid", "superconduct", "backscatter", "connectionism"}; + + static private final String[] properNouns = {"abrams", "achilles", + "acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred", + "algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles", + "annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher", + "ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor", + 
"barbados", "barger", "bering", "brahms", "brandeis", "brussels", + "bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker", + "charles", "cheops", "ching", "christmas", "cocos", "collins", + "columbus", "confucius", "conners", "connolly", "copernicus", "cramer", + "cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies", + "davis", "decker", "denning", "dennis", "descartes", "dickens", "doris", + "douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries", + "ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans", + "everglades", "fairbanks", "federales", "fisher", "fitzsimmons", + "fleming", "forbes", "fowler", "france", "francis", "goering", + "goodling", "goths", "grenadines", "guiness", "hades", "harding", + "harris", "hastings", "hawkes", "hawking", "hayes", "heights", + "hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras", + "hopkins", "hughes", "humphreys", "illinois", "indianapolis", + "inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james", + "jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius", + "kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds", + "levis", "leviticus", "lewis", "louis", "maccabees", "madras", + "maimonides", "maldive", "massachusetts", "matthews", "mauritius", + "memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed", + "moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing", + "nantes", "naples", "neal", "netherlands", "nevis", "nostradamus", + "oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker", + "pauling", "peking", "pershing", "peter", "peters", "philippines", + "phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais", + "ramses", "reynolds", "rhesus", "rhodes", "richards", "robins", + "rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates", + "texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius", + "wales", "warner", "wilkins", "williams", 
"wyoming", "xmas", "yonkers", + "zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares", + "aquinas", "arcturus", "ares", "artemis", "augustus", "ayers", + "barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing", + "boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius", + "cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester", + "connally", "conner", "coors", "cummings", "curtis", "daedalus", + "dionysus", "dobbs", "dolores", "edmonds"}; + + static class DictEntry { + boolean exception; + String root; + + DictEntry(String root, boolean isException) { + this.root = root; + this.exception = isException; + } + } + + private static final CharArrayMap dict_ht = initializeDictHash(); + + + private final OpenStringBuilder word = new OpenStringBuilder(); + private int j; /* index of final letter in stem (within word) */ + private int k; /* + * INDEX of final letter in word. You must add 1 to k to get + * the current length of word. When you want the length of + * word, use the method wordLength, which returns (k+1). 
+ */ + + /* + * private void initializeStemHash() { if (maxCacheSize > 0) cache = new + * CharArrayMap(maxCacheSize,false); } + ***/ + + private char finalChar() { + return word.charAt(k); + } + + private char penultChar() { + return word.charAt(k - 1); + } + + private boolean isVowel(int index) { + return !isCons(index); + } + + private boolean isCons(int index) { + char ch; + + ch = word.charAt(index); + + if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false; + if ((ch != 'y') || (index == 0)) return true; + else return (!isCons(index - 1)); + } + + private static CharArrayMap initializeDictHash() { + DictEntry defaultEntry; + DictEntry entry; + + CharArrayMap d = new CharArrayMap<>(1000, false); + for (int i = 0; i < exceptionWords.length; i++) { + if (!d.containsKey(exceptionWords[i])) { + entry = new DictEntry(exceptionWords[i], true); + d.put(exceptionWords[i], entry); + } else { + throw new RuntimeException("Warning: Entry [" + exceptionWords[i] + + "] already in dictionary 1"); + } + } + + for (int i = 0; i < directConflations.length; i++) { + if (!d.containsKey(directConflations[i][0])) { + entry = new DictEntry(directConflations[i][1], false); + d.put(directConflations[i][0], entry); + } else { + throw new RuntimeException("Warning: Entry [" + directConflations[i][0] + + "] already in dictionary 2"); + } + } + + for (int i = 0; i < countryNationality.length; i++) { + if (!d.containsKey(countryNationality[i][0])) { + entry = new DictEntry(countryNationality[i][1], false); + d.put(countryNationality[i][0], entry); + } else { + throw new RuntimeException("Warning: Entry [" + countryNationality[i][0] + + "] already in dictionary 3"); + } + } + + defaultEntry = new DictEntry(null, false); + + String[] array; + array = KStemData1.data; + + for (int i = 0; i < array.length; i++) { + if (!d.containsKey(array[i])) { + d.put(array[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + array[i] + + "] 
already in dictionary 4"); + } + } + + array = KStemData2.data; + for (int i = 0; i < array.length; i++) { + if (!d.containsKey(array[i])) { + d.put(array[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + array[i] + + "] already in dictionary 4"); + } + } + + array = KStemData3.data; + for (int i = 0; i < array.length; i++) { + if (!d.containsKey(array[i])) { + d.put(array[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + array[i] + + "] already in dictionary 4"); + } + } + + array = KStemData4.data; + for (int i = 0; i < array.length; i++) { + if (!d.containsKey(array[i])) { + d.put(array[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + array[i] + + "] already in dictionary 4"); + } + } + + array = KStemData5.data; + for (int i = 0; i < array.length; i++) { + if (!d.containsKey(array[i])) { + d.put(array[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + array[i] + + "] already in dictionary 4"); + } + } + + array = KStemData6.data; + for (int i = 0; i < array.length; i++) { + if (!d.containsKey(array[i])) { + d.put(array[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + array[i] + + "] already in dictionary 4"); + } + } + + array = KStemData7.data; + for (int i = 0; i < array.length; i++) { + if (!d.containsKey(array[i])) { + d.put(array[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + array[i] + + "] already in dictionary 4"); + } + } + + for (int i = 0; i < KStemData8.data.length; i++) { + if (!d.containsKey(KStemData8.data[i])) { + d.put(KStemData8.data[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + KStemData8.data[i] + + "] already in dictionary 4"); + } + } + + for (int i = 0; i < supplementDict.length; i++) { + if (!d.containsKey(supplementDict[i])) { + d.put(supplementDict[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry 
[" + supplementDict[i] + + "] already in dictionary 5"); + } + } + + for (int i = 0; i < properNouns.length; i++) { + if (!d.containsKey(properNouns[i])) { + d.put(properNouns[i], defaultEntry); + } else { + throw new RuntimeException("Warning: Entry [" + properNouns[i] + + "] already in dictionary 6"); + } + } + + return d; + } + + private boolean isAlpha(char ch) { + return ch >= 'a' && ch <= 'z'; // terms must be lowercased already + } + + /* length of stem within word */ + private int stemLength() { + return j + 1; + }; + + private boolean endsIn(char[] s) { + if (s.length > k) return false; + + int r = word.length() - s.length; /* length of word before this suffix */ + j = k; + for (int r1 = r, i = 0; i < s.length; i++, r1++) { + if (s[i] != word.charAt(r1)) return false; + } + j = r - 1; /* index of the character BEFORE the posfix */ + return true; + } + + private boolean endsIn(char a, char b) { + if (2 > k) return false; + // check left to right since the endings have often already matched + if (word.charAt(k - 1) == a && word.charAt(k) == b) { + j = k - 2; + return true; + } + return false; + } + + private boolean endsIn(char a, char b, char c) { + if (3 > k) return false; + if (word.charAt(k - 2) == a && word.charAt(k - 1) == b + && word.charAt(k) == c) { + j = k - 3; + return true; + } + return false; + } + + private boolean endsIn(char a, char b, char c, char d) { + if (4 > k) return false; + if (word.charAt(k - 3) == a && word.charAt(k - 2) == b + && word.charAt(k - 1) == c && word.charAt(k) == d) { + j = k - 4; + return true; + } + return false; + } + + private DictEntry wordInDict() { + /*** + * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0, + * word.size()) != matchedEntry) { + * System.out.println("Uh oh... 
cached entry doesn't match"); } return + * matchedEntry; } + ***/ + if (matchedEntry != null) return matchedEntry; + DictEntry e = dict_ht.get(word.getArray(), 0, word.length()); + if (e != null && !e.exception) { + matchedEntry = e; // only cache if it's not an exception. + } + // lookups.add(word.toString()); + return e; + } + + /* Convert plurals to singular form, and '-ies' to 'y' */ + private void plural() { + if (word.charAt(k) == 's') { + if (endsIn('i', 'e', 's')) { + word.setLength(j + 3); + k--; + if (lookup()) /* ensure calories -> calorie */ + return; + k++; + word.unsafeWrite('s'); + setSuffix("y"); + lookup(); + } else if (endsIn('e', 's')) { + /* try just removing the "s" */ + word.setLength(j + 2); + k--; + + /* + * note: don't check for exceptions here. So, `aides' -> `aide', but + * `aided' -> `aid'. The exception for double s is used to prevent + * crosses -> crosse. This is actually correct if crosses is a plural + * noun (a type of racket used in lacrosse), but the verb is much more + * common + */ + + /**** + * YCS: this was the one place where lookup was not followed by return. + * So restructure it. 
if ((j>0)&&(lookup(word.toString())) && + * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return; + *****/ + boolean tryE = j > 0 + && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's')); + if (tryE && lookup()) return; + + /* try removing the "es" */ + + word.setLength(j + 1); + k--; + if (lookup()) return; + + /* the default is to retain the "e" */ + word.unsafeWrite('e'); + k++; + + if (!tryE) lookup(); // if we didn't try the "e" ending before + return; + } else { + if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) { + /* unless the word ends in "ous" or a double "s", remove the final "s" */ + + word.setLength(k); + k--; + lookup(); + } + } + } + } + + private void setSuffix(String s) { + setSuff(s, s.length()); + } + + /* replace old suffix with s */ + private void setSuff(String s, int len) { + word.setLength(j + 1); + for (int l = 0; l < len; l++) { + word.unsafeWrite(s.charAt(l)); + } + k = j + len; + } + + /* Returns true if the word is found in the dictionary */ + // almost all uses of lookup() return immediately and are + // followed by another lookup in the dict. Store the match + // to avoid this double lookup. + DictEntry matchedEntry = null; + + private boolean lookup() { + matchedEntry = dict_ht.get(word.getArray(), 0, word.size()); + return matchedEntry != null; + } + + // Set lookups = new HashSet<>(); + + /* convert past tense (-ed) to present, and `-ied' to `y' */ + private void pastTense() { + /* + * Handle words less than 5 letters with a direct mapping This prevents + * (fled -> fl). + */ + if (word.length() <= 4) return; + + if (endsIn('i', 'e', 'd')) { + word.setLength(j + 3); + k--; + if (lookup()) /* we almost always want to convert -ied to -y, but */ + return; /* this isn't true for short words (died->die) */ + k++; /* I don't know any long words that this applies to, */ + word.unsafeWrite('d'); /* but just in case... 
*/ + setSuffix("y"); + lookup(); + return; + } + + /* the vowelInStem() is necessary so we don't stem acronyms */ + if (endsIn('e', 'd') && vowelInStem()) { + /* see if the root ends in `e' */ + word.setLength(j + 2); + k = j + 1; + + DictEntry entry = wordInDict(); + if (entry != null) if (!entry.exception) /* + * if it's in the dictionary and + * not an exception + */ + return; + + /* try removing the "ed" */ + word.setLength(j + 1); + k = j; + if (lookup()) return; + + /* + * try removing a doubled consonant. if the root isn't found in the + * dictionary, the default is to leave it doubled. This will correctly + * capture `backfilled' -> `backfill' instead of `backfill' -> + * `backfille', and seems correct most of the time + */ + + if (doubleC(k)) { + word.setLength(k); + k--; + if (lookup()) return; + word.unsafeWrite(word.charAt(k)); + k++; + lookup(); + return; + } + + /* if we have a `un-' prefix, then leave the word alone */ + /* (this will sometimes screw up with `under-', but we */ + /* will take care of that later) */ + + if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) { + word.unsafeWrite('e'); + word.unsafeWrite('d'); + k = k + 2; + // nolookup() + return; + } + + /* + * it wasn't found by just removing the `d' or the `ed', so prefer to end + * with an `e' (e.g., `microcoded' -> `microcode'). + */ + + word.setLength(j + 1); + word.unsafeWrite('e'); + k = j + 1; + // nolookup() - we already tried the "e" ending + return; + } + } + + /* return TRUE if word ends with a double consonant */ + private boolean doubleC(int i) { + if (i < 1) return false; + + if (word.charAt(i) != word.charAt(i - 1)) return false; + return (isCons(i)); + } + + private boolean vowelInStem() { + for (int i = 0; i < stemLength(); i++) { + if (isVowel(i)) return true; + } + return false; + } + + /* handle `-ing' endings */ + private void aspect() { + /* + * handle short words (aging -> age) via a direct mapping. 
This prevents + * (thing -> the) in the version of this routine that ignores inflectional + * variants that are mentioned in the dictionary (when the root is also + * present) + */ + + if (word.length() <= 5) return; + + /* the vowelinstem() is necessary so we don't stem acronyms */ + if (endsIn('i', 'n', 'g') && vowelInStem()) { + + /* try adding an `e' to the stem and check against the dictionary */ + word.setCharAt(j + 1, 'e'); + word.setLength(j + 2); + k = j + 1; + + DictEntry entry = wordInDict(); + if (entry != null) { + if (!entry.exception) /* if it's in the dictionary and not an exception */ + return; + } + + /* adding on the `e' didn't work, so remove it */ + word.setLength(k); + k--; /* note that `ing' has also been removed */ + + if (lookup()) return; + + /* if I can remove a doubled consonant and get a word, then do so */ + if (doubleC(k)) { + k--; + word.setLength(k + 1); + if (lookup()) return; + word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */ + + /* the default is to leave the consonant doubled */ + /* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */ + /* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */ + /* Without making the algorithm significantly more complicated, this */ + /* is the best I can do */ + k++; + lookup(); + return; + } + + /* + * the word wasn't in the dictionary after removing the stem, and then + * checking with and without a final `e'. The default is to add an `e' + * unless the word ends in two consonants, so `microcoding' -> + * `microcode'. The two consonants restriction wouldn't normally be + * necessary, but is needed because we don't try to deal with prefixes and + * compounds, and most of the time it is correct (e.g., footstamping -> + * footstamp, not footstampe; however, decoupled -> decoupl). 
We can + * prevent almost all of the incorrect stems if we try to do some prefix + * analysis first + */ + + if ((j > 0) && isCons(j) && isCons(j - 1)) { + k = j; + word.setLength(k + 1); + // nolookup() because we already did according to the comment + return; + } + + word.setLength(j + 1); + word.unsafeWrite('e'); + k = j + 1; + // nolookup(); we already tried an 'e' ending + return; + } + } + + /* + * this routine deals with -ity endings. It accepts -ability, -ibility, and + * -ality, even without checking the dictionary because they are so + * productive. The first two are mapped to -ble, and the -ity is remove for + * the latter + */ + private void ityEndings() { + int old_k = k; + + if (endsIn('i', 't', 'y')) { + word.setLength(j + 1); /* try just removing -ity */ + k = j; + if (lookup()) return; + word.unsafeWrite('e'); /* try removing -ity and adding -e */ + k = j + 1; + if (lookup()) return; + word.setCharAt(j + 1, 'i'); + word.append("ty"); + k = old_k; + /* + * the -ability and -ibility endings are highly productive, so just accept + * them + */ + if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) { + word.setLength(j - 1); + word.append("le"); /* convert to -ble */ + k = j; + lookup(); + return; + } + + /* ditto for -ivity */ + if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) { + word.setLength(j + 1); + word.unsafeWrite('e'); /* convert to -ive */ + k = j + 1; + lookup(); + return; + } + /* ditto for -ality */ + if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) { + word.setLength(j + 1); + k = j; + lookup(); + return; + } + + /* + * if the root isn't in the dictionary, and the variant *is* there, then + * use the variant. This allows `immunity'->`immune', but prevents + * `capacity'->`capac'. 
If neither the variant nor the root form are in + * the dictionary, then remove the ending as a default + */ + + if (lookup()) return; + + /* the default is to remove -ity altogether */ + word.setLength(j + 1); + k = j; + // nolookup(), we already did it. + return; + } + } + + /* handle -ence and -ance */ + private void nceEndings() { + int old_k = k; + char word_char; + + if (endsIn('n', 'c', 'e')) { + word_char = word.charAt(j); + if (!((word_char == 'e') || (word_char == 'a'))) return; + word.setLength(j); + word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */ + k = j; + if (lookup()) return; + word.setLength(j); /* + * try removing -e/ance altogether + * (disappearance/disappear) + */ + k = j - 1; + if (lookup()) return; + word.unsafeWrite(word_char); /* restore the original ending */ + word.append("nce"); + k = old_k; + // nolookup() because we restored the original ending + } + return; + } + + /* handle -ness */ + private void nessEndings() { + if (endsIn('n', 'e', 's', 's')) { /* + * this is a very productive endings, so + * just accept it + */ + word.setLength(j + 1); + k = j; + if (word.charAt(j) == 'i') word.setCharAt(j, 'y'); + lookup(); + } + return; + } + + /* handle -ism */ + private void ismEndings() { + if (endsIn('i', 's', 'm')) { /* + * this is a very productive ending, so just + * accept it + */ + word.setLength(j + 1); + k = j; + lookup(); + } + return; + } + + /* this routine deals with -ment endings. */ + private void mentEndings() { + int old_k = k; + + if (endsIn('m', 'e', 'n', 't')) { + word.setLength(j + 1); + k = j; + if (lookup()) return; + word.append("ment"); + k = old_k; + // nolookup + } + return; + } + + /* this routine deals with -ize endings. 
*/ + private void izeEndings() { + int old_k = k; + + if (endsIn('i', 'z', 'e')) { + word.setLength(j + 1); /* try removing -ize entirely */ + k = j; + if (lookup()) return; + word.unsafeWrite('i'); + + if (doubleC(j)) { /* allow for a doubled consonant */ + word.setLength(j); + k = j - 1; + if (lookup()) return; + word.unsafeWrite(word.charAt(j - 1)); + } + + word.setLength(j + 1); + word.unsafeWrite('e'); /* try removing -ize and adding -e */ + k = j + 1; + if (lookup()) return; + word.setLength(j + 1); + word.append("ize"); + k = old_k; + // nolookup() + } + return; + } + + /* handle -ency and -ancy */ + private void ncyEndings() { + if (endsIn('n', 'c', 'y')) { + if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return; + word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */ + word.setLength(j + 3); + k = j + 2; + + if (lookup()) return; + + word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */ + word.unsafeWrite('e'); + k = j + 3; + lookup(); + } + return; + } + + /* handle -able and -ible */ + private void bleEndings() { + int old_k = k; + char word_char; + + if (endsIn('b', 'l', 'e')) { + if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return; + word_char = word.charAt(j); + word.setLength(j); /* try just removing the ending */ + k = j - 1; + if (lookup()) return; + if (doubleC(k)) { /* allow for a doubled consonant */ + word.setLength(k); + k--; + if (lookup()) return; + k++; + word.unsafeWrite(word.charAt(k - 1)); + } + word.setLength(j); + word.unsafeWrite('e'); /* try removing -a/ible and adding -e */ + k = j; + if (lookup()) return; + word.setLength(j); + word.append("ate"); /* try removing -able and adding -ate */ + /* (e.g., compensable/compensate) */ + k = j + 2; + if (lookup()) return; + word.setLength(j); + word.unsafeWrite(word_char); /* restore the original values */ + word.append("ble"); + k = old_k; + // nolookup() + } + return; + } + + /* + * handle -ic endings. 
This is fairly straightforward, but this is also the + * only place we try *expanding* an ending, -ic -> -ical. This is to handle + * cases like `canonic' -> `canonical' + */ + private void icEndings() { + if (endsIn('i', 'c')) { + word.setLength(j + 3); + word.append("al"); /* try converting -ic to -ical */ + k = j + 4; + if (lookup()) return; + + word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */ + word.setLength(j + 2); + k = j + 1; + if (lookup()) return; + + word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */ + if (lookup()) return; + + word.setLength(j + 1); /* try removing -ic altogether */ + k = j; + if (lookup()) return; + word.append("ic"); /* restore the original ending */ + k = j + 2; + // nolookup() + } + return; + } + + private static char[] ization = "ization".toCharArray(); + private static char[] ition = "ition".toCharArray(); + private static char[] ation = "ation".toCharArray(); + private static char[] ication = "ication".toCharArray(); + + /* handle some derivational endings */ + /* + * this routine deals with -ion, -ition, -ation, -ization, and -ication. 
The
+     * -ization ending is always converted to -ize. For the other endings each
+     * candidate root is checked against the dictionary and the original word is
+     * restored before the next, shorter ending is tried.
+     */
+    private void ionEndings() {
+        int old_k = k;
+        if (!endsIn('i', 'o', 'n')) {
+            return;
+        }
+
+        /* the -ize ending is very productive, so simply accept it as the root */
+        if (endsIn(ization)) {
+            word.setLength(j + 3);
+            word.unsafeWrite('e');
+            k = j + 3;
+            lookup();
+            return;
+        }
+
+        if (endsIn(ition)) {
+            word.setLength(j + 1);
+            word.unsafeWrite('e');
+            k = j + 1;
+            /*
+             * remove -ition and add `e', and check against the dictionary
+             * (e.g., definition->define, opposition->oppose)
+             */
+            if (lookup())
+                return;
+
+            /* restore original values */
+            word.setLength(j + 1);
+            word.append("ition");
+            k = old_k;
+            // nolookup()
+        } else if (endsIn(ation)) {
+            word.setLength(j + 3);
+            word.unsafeWrite('e');
+            k = j + 3;
+            /*
+             * remove -ion and add `e', and check against the dictionary
+             * (elimination -> eliminate)
+             */
+            if (lookup())
+                return;
+
+            /* remove -ation and add `e', and check against the dictionary */
+            word.setLength(j + 1);
+            word.unsafeWrite('e');
+            k = j + 1;
+            if (lookup()) return;
+
+            /* just remove -ation (resignation->resign) and check dictionary */
+            word.setLength(j + 1);
+            k = j;
+            if (lookup()) return;
+
+            /* restore original values */
+            word.setLength(j + 1);
+            word.append("ation");
+            k = old_k;
+            // nolookup()
+
+        }
+
+        /*
+         * test -ication after -ation is attempted (e.g., `complication->complicate'
+         * rather than `complication->comply')
+         */
+
+        if (endsIn(ication)) {
+            word.setLength(j + 1);
+            word.unsafeWrite('y');
+            k = j + 1;
+            /*
+             * remove -ication and add `y', and check against the dictionary
+             * (e.g., amplification -> amplify)
+             */
+            if (lookup())
+                return;
+
+            /* restore original values */
+            word.setLength(j + 1);
+            word.append("ication");
+            k = old_k;
+            // nolookup()
+        }
+
+        // if (endsIn(ion)) {
+        if (true) { // we checked for this earlier... just need to set "j"
+            j = k - 3; // YCS
+
+            word.setLength(j + 1);
+            word.unsafeWrite('e');
+            k = j + 1;
+            /* remove -ion and add `e', and check against the dictionary */
+            if (lookup())
+                return;
+
+            word.setLength(j + 1);
+            k = j;
+            /* remove -ion, and if it's found, treat that as the root */
+            if (lookup())
+                return;
+
+            /* restore original values */
+            word.setLength(j + 1);
+            word.append("ion");
+            k = old_k;
+            // nolookup()
+        }
+
+        // nolookup(); all of the other paths restored original values
+        return;
+    }
+
+    /*
+     * this routine deals with -er, -or, -ier, and -eer. The -izer ending is
+     * always converted to -ize
+     */
+    private void erAndOrEndings() {
+        int old_k = k;
+
+        /* cheap pre-check: every ending handled here finishes with 'r' */
+        if (word.charAt(k) != 'r') return; // YCS
+
+        char word_char; /* so we can remember if it was -er or -or */
+
+        /* -ize is very productive, so accept it as the root */
+        if (endsIn('i', 'z', 'e', 'r')) {
+            word.setLength(j + 4);
+            k = j + 3;
+            lookup();
+            return;
+        }
+
+        if (endsIn('e', 'r') || endsIn('o', 'r')) {
+            word_char = word.charAt(j + 1);
+            /* try the stem with one half of a doubled consonant removed */
+            if (doubleC(j)) {
+                word.setLength(j);
+                k = j - 1;
+                if (lookup()) return;
+                word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
+            }
+
+            if (word.charAt(j) == 'i') { /* do we have a -ier ending? */
+                word.setCharAt(j, 'y');
+                word.setLength(j + 1);
+                k = j;
+                /* yes, so check against the dictionary */
+                if (lookup())
+                    return;
+                word.setCharAt(j, 'i'); /* restore the endings */
+                word.unsafeWrite('e');
+            }
+
+            if (word.charAt(j) == 'e') { /* handle -eer */
+                word.setLength(j);
+                k = j - 1;
+                if (lookup()) return;
+                word.unsafeWrite('e');
+            }
+
+            word.setLength(j + 2); /* remove the -r ending */
+            k = j + 1;
+            if (lookup()) return;
+            word.setLength(j + 1); /* try removing -er/-or */
+            k = j;
+            if (lookup()) return;
+            word.unsafeWrite('e'); /* try removing -or and adding -e */
+            k = j + 1;
+            if (lookup()) return;
+            word.setLength(j + 1);
+            word.unsafeWrite(word_char);
+            word.unsafeWrite('r'); /* restore the word to the way it was */
+            k = old_k;
+            // nolookup()
+        }
+
+    }
+
+    /*
+     * this routine deals with -ly endings. The -ally ending is always converted
+     * to -al. Sometimes this will temporarily leave us with a non-word (e.g.,
+     * heuristically maps to heuristical), but then the -al is removed in the next
+     * step.
+     */
+    private void lyEndings() {
+        int old_k = k;
+
+        if (endsIn('l', 'y')) {
+
+            word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */
+
+            if (lookup()) return;
+            word.setCharAt(j + 2, 'y');
+
+            word.setLength(j + 1); /* try just removing the -ly */
+            k = j;
+
+            if (lookup()) return;
+
+            /* always convert -ally to -al */
+            if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l'))
+                return;
+            word.append("ly");
+            k = old_k;
+
+            if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) { // always convert 'ably' to 'able'
+                word.setCharAt(j + 2, 'e');
+                k = j + 2;
+                return;
+            }
+
+            if (word.charAt(j) == 'i') { /* e.g., militarily -> military */
+                word.setLength(j);
+                word.unsafeWrite('y');
+                k = j;
+                if (lookup()) return;
+                word.setLength(j);
+                word.append("ily");
+                k = old_k;
+            }
+
+            word.setLength(j + 1); /* the default is to remove -ly */
+
+            k = j;
+            // nolookup()... we already tried removing the "ly" variant
+        }
+        return;
+    }
+
+    /*
+     * this routine deals with -al endings. Some of the endings from the previous
+     * routine are finished up here.
+     */
+    private void alEndings() {
+        int old_k = k;
+
+        /* words shorter than four characters are left alone */
+        if (word.length() < 4) return;
+        if (endsIn('a', 'l')) {
+            word.setLength(j + 1);
+            k = j;
+            /* try just removing the -al */
+            if (lookup())
+                return;
+
+            if (doubleC(j)) { /* allow for a doubled consonant */
+                word.setLength(j);
+                k = j - 1;
+                if (lookup()) return;
+                word.unsafeWrite(word.charAt(j - 1));
+            }
+
+            word.setLength(j + 1);
+            word.unsafeWrite('e'); /* try removing the -al and adding -e */
+            k = j + 1;
+            if (lookup()) return;
+
+            word.setLength(j + 1);
+            word.append("um"); /* try converting -al to -um */
+            /* (e.g., optimal -> optimum) */
+            k = j + 2;
+            if (lookup()) return;
+
+            word.setLength(j + 1);
+            word.append("al"); /* restore the ending to the way it was */
+            k = old_k;
+
+            if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) {
+                word.setLength(j - 1); /* try removing -ical */
+                k = j - 2;
+                if (lookup()) return;
+
+                word.setLength(j - 1);
+                word.unsafeWrite('y'); /* try turning -ical to -y (e.g., bibliographical) */
+                k = j - 1;
+                if (lookup()) return;
+
+                word.setLength(j - 1);
+                word.append("ic"); /* the default is to convert -ical to -ic */
+                k = j;
+                // NOTE(review): an earlier remark here argued that no lookup is needed
+                // because converting -ical to -ic equals removing "al", which was
+                // already tried; that remark was then marked as an ERROR and the
+                // lookup below is performed.
+                lookup();
+                return;
+            }
+
+            if (word.charAt(j) == 'i') { /* sometimes -ial endings should be removed */
+                word.setLength(j); /* (sometimes it gets turned into -y, but we */
+                k = j - 1; /* aren't dealing with that case for now) */
+                if (lookup()) return;
+                word.append("ial");
+                k = old_k;
+                lookup();
+            }
+
+        }
+        return;
+    }
+
+    /*
+     * this routine deals with -ive endings. It normalizes some of the -ative
+     * endings directly, and also maps some -ive endings to -ion.
+ */ + private void iveEndings() { + int old_k = k; + + if (endsIn('i', 'v', 'e')) { + word.setLength(j + 1); /* try removing -ive entirely */ + k = j; + if (lookup()) return; + + word.unsafeWrite('e'); /* try removing -ive and adding -e */ + k = j + 1; + if (lookup()) return; + word.setLength(j + 1); + word.append("ive"); + if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) { + word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */ + word.setLength(j); /* (e.g., determinative -> determine) */ + k = j - 1; + if (lookup()) return; + word.setLength(j - 1); /* try just removing -ative */ + if (lookup()) return; + + word.append("ative"); + k = old_k; + } + + /* try mapping -ive to -ion (e.g., injunctive/injunction) */ + word.setCharAt(j + 2, 'o'); + word.setCharAt(j + 3, 'n'); + if (lookup()) return; + + word.setCharAt(j + 2, 'v'); /* restore the original values */ + word.setCharAt(j + 3, 'e'); + k = old_k; + // nolookup() + } + return; + } + + public KStemmer() {} + + public String stem(String term) { + boolean changed = stem(term.toCharArray(), term.length()); + if (!changed) return term; + return asString(); + } + + /** + * Returns the result of the stem (assuming the word was changed) as a String. + */ + String asString() { + String s = getString(); + if (s != null) return s; + return word.toString(); + } + + CharSequence asCharSequence() { + return result != null ? result : word; + } + + String getString() { + return result; + } + + char[] getChars() { + return word.getArray(); + } + + int getLength() { + return word.length(); + } + + String result; + + private boolean matched() { + /*** + * if (!lookups.contains(word.toString())) { throw new + * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup); + * } + ***/ + // lookup(); + return matchedEntry != null; + } + + /** + * Stems the text in the token. Returns true if changed. 
+     *
+     * NOTE(review): "changed" is approximate. False is returned for terms that
+     * are too short or too long, contain a character outside a-z, or are
+     * dictionary words without a root. Every other term yields true, even when
+     * the suffix routines leave the text as it was. After a true return the stem
+     * is in result when that is non-null, otherwise in the word buffer.
+     */
+    boolean stem(char[] term, int len) {
+
+        result = null;
+
+        k = len - 1;
+        if ((k <= 1) || (k >= MaxWordLen - 1)) {
+            return false; // don't stem
+        }
+
+        // first check the stemmer dictionaries, and avoid using the
+        // cache if it's in there.
+        DictEntry entry = dict_ht.get(term, 0, len);
+        if (entry != null) {
+            if (entry.root != null) {
+                result = entry.root;
+                return true;
+            }
+            return false;
+        }
+
+        /***
+         * caching off is normally faster if (cache == null) initializeStemHash();
+         *
+         * // now check the cache, before we copy chars to "word" if (cache != null)
+         * { String val = cache.get(term, 0, len); if (val != null) { if (val !=
+         * SAME) { result = val; return true; } return false; } }
+         ***/
+
+        word.reset();
+        // allocate enough space so that an expansion is never needed
+        word.reserve(len + 10);
+        for (int i = 0; i < len; i++) {
+            char ch = term[i];
+            if (!isAlpha(ch)) return false; // don't stem
+            // don't lowercase... it's a requirement that lowercase filter be
+            // used before this stemmer.
+            word.unsafeWrite(ch);
+        }
+
+        matchedEntry = null;
+        /***
+         * lookups.clear(); lookups.add(word.toString());
+         ***/
+
+        /*
+         * This while loop will never be executed more than one time; it is here
+         * only to allow the break statement to be used to escape as soon as a word
+         * is recognized
+         */
+        while (true) {
+            // YCS: extra lookup()s were inserted so we don't need to
+            // do an extra wordInDict() here.
+            plural();
+            if (matched()) break;
+            pastTense();
+            if (matched()) break;
+            aspect();
+            if (matched()) break;
+            ityEndings();
+            if (matched()) break;
+            nessEndings();
+            if (matched()) break;
+            ionEndings();
+            if (matched()) break;
+            erAndOrEndings();
+            if (matched()) break;
+            lyEndings();
+            if (matched()) break;
+            alEndings();
+            if (matched()) break;
+            // the value assigned here is overwritten after the loop; the call
+            // matters because wordInDict() stores a non-exception hit in
+            // matchedEntry, which the matched() checks below then see
+            entry = wordInDict();
+            iveEndings();
+            if (matched()) break;
+            izeEndings();
+            if (matched()) break;
+            mentEndings();
+            if (matched()) break;
+            bleEndings();
+            if (matched()) break;
+            ismEndings();
+            if (matched()) break;
+            icEndings();
+            if (matched()) break;
+            ncyEndings();
+            if (matched()) break;
+            nceEndings();
+            matched();
+            break;
+        }
+
+        /*
+         * try for a direct mapping (allows for cases like `Italian'->`Italy' and
+         * `Italians'->`Italy')
+         */
+        entry = matchedEntry;
+        if (entry != null) {
+            result = entry.root; // may be null, which means that "word" is the stem
+        }
+
+        // no entry matched means result is "word"
+        return true;
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java
new file mode 100644
index 00000000000..4f10c058424
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java
@@ -0,0 +1,136 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
+ * under the terms of the Apache License, Version 2.0.
+ */
+package com.yahoo.language.simple.kstem;
+
+/**
+ * A StringBuilder that allows one to access the array.
+ */ +public class OpenStringBuilder implements Appendable, CharSequence { + + protected char[] buf; + protected int len; + + public OpenStringBuilder() { + this(32); + } + + public OpenStringBuilder(int size) { + buf = new char[size]; + } + + public void setLength(int len) { this.len = len; } + + public void set(char[] arr, int end) { + this.buf = arr; + this.len = end; + } + + public char[] getArray() { return buf; } + public int size() { return len; } + @Override + public int length() { return len; } + public int capacity() { return buf.length; } + + @Override + public Appendable append(CharSequence csq) { + return append(csq, 0, csq.length()); + } + + @Override + public Appendable append(CharSequence csq, int start, int end) { + reserve(end-start); + for (int i=start; i buf.length) resize(len + num); + } + + public void write(char b) { + if (len >= buf.length) { + resize(len +1); + } + unsafeWrite(b); + } + + public void write(int b) { write((char)b); } + + public final void write(char[] b) { + write(b,0,b.length); + } + + public void write(char b[], int off, int len) { + reserve(len); + unsafeWrite(b, off, len); + } + + public final void write(OpenStringBuilder arr) { + write(arr.buf, 0, len); + } + + public void write(String s) { + reserve(s.length()); + s.getChars(0,s.length(),buf, len); + len +=s.length(); + } + + public void flush() { + } + + public final void reset() { + len =0; + } + + public char[] toCharArray() { + char newbuf[] = new char[size()]; + System.arraycopy(buf, 0, newbuf, 0, size()); + return newbuf; + } + + @Override + public String toString() { + return new String(buf, 0, size()); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/package-info.java b/linguistics/src/main/java/com/yahoo/language/simple/package-info.java new file mode 100644 index 00000000000..722002d6bcc --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/package-info.java @@ -0,0 +1,9 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +@ExportPackage +package com.yahoo.language.simple; + +import com.yahoo.osgi.annotation.ExportPackage; + +/** + * A set of simple dependency-free linguistics processors suitable for testing. + */ diff --git a/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java b/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java new file mode 100644 index 00000000000..c99c4009c4c --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java @@ -0,0 +1,107 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +/** + * @author Rich Pito + */ +public class LanguageTestCase { + + @Test + public void requireThatSpecificLanguagesAreCjk() { + List cjk = Arrays.asList(Language.CHINESE_SIMPLIFIED, + Language.CHINESE_TRADITIONAL, + Language.JAPANESE, + Language.KOREAN, + Language.THAI); + for (Language language : cjk) { + assertTrue(language.toString(), language.isCjk()); + } + for (Language language : Language.values()) { + if (cjk.contains(language)) { + continue; + } + assertFalse(language.toString(), language.isCjk()); + } + } + + @Test + public void requireThatLanguageTagsAreRecognized() { + assertLanguage(Language.ARABIC, "ar"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-hans"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-Hans"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-foo-CN"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-CN"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-foo"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-hant"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant"); + assertLanguage(Language.CHINESE_TRADITIONAL, 
"zh-Hant-TW"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant-HK"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-foo-TW"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-TW"); + assertLanguage(Language.CROATIAN, "hr"); + assertLanguage(Language.DANISH, "da"); + assertLanguage(Language.DUTCH, "nl"); + assertLanguage(Language.ENGLISH, "en"); + assertLanguage(Language.ENGLISH, "en-CA"); + assertLanguage(Language.ENGLISH, "en-GB"); + assertLanguage(Language.ENGLISH, "en-US"); + assertLanguage(Language.ENGLISH, "en-Latn-i-oed-1992"); + assertLanguage(Language.FINNISH, "fi"); + assertLanguage(Language.FRENCH, "fr"); + assertLanguage(Language.FRENCH, "fr-FR"); + assertLanguage(Language.GERMAN, "de"); + assertLanguage(Language.GERMAN, "de-DE"); + assertLanguage(Language.GREEK, "el"); + assertLanguage(Language.ITALIAN, "it"); + assertLanguage(Language.ITALIAN, "it-IT"); + assertLanguage(Language.JAPANESE, "ja"); + assertLanguage(Language.KOREAN, "ko"); + assertLanguage(Language.NORWEGIAN_BOKMAL, "no"); + assertLanguage(Language.NORWEGIAN_BOKMAL, "nb"); + assertLanguage(Language.POLISH, "pl"); + assertLanguage(Language.PORTUGUESE, "pt"); + assertLanguage(Language.ROMANIAN, "ro"); + assertLanguage(Language.RUSSIAN, "ru"); + assertLanguage(Language.SPANISH, "es"); + assertLanguage(Language.SPANISH, "es-ES"); + assertLanguage(Language.SPANISH, "es-419"); + assertLanguage(Language.SWEDISH, "sv"); + assertLanguage(Language.THAI, "th"); + assertLanguage(Language.TURKISH, "tr"); + assertLanguage(Language.VIETNAMESE, "vi"); + + assertLanguage(Language.UNKNOWN, null); + assertLanguage(Language.UNKNOWN, ""); + assertLanguage(Language.UNKNOWN, "und"); + assertLanguage(Language.UNKNOWN, "z-foo"); + assertLanguage(Language.UNKNOWN, "ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe"); + assertLanguage(Language.UNKNOWN, "#$_^@#$_@%#$)%@$%^--@&&&#-%^_^%"); + } + + @Test + public void requireThatLanguageIsGuessedCorrectlyFromEncodings() { + assertSame(Language.UNKNOWN, 
Language.fromEncoding(null)); + assertSame(Language.UNKNOWN, Language.fromEncoding("lkij")); + assertSame(Language.UNKNOWN, Language.fromEncoding("(/)(###)")); + + assertSame(Language.CHINESE_SIMPLIFIED, Language.fromEncoding("GB2312")); + assertSame(Language.CHINESE_TRADITIONAL, Language.fromEncoding("BIG5")); + assertSame(Language.JAPANESE, Language.fromEncoding("EUC-jp")); + assertSame(Language.JAPANESE, Language.fromEncoding("ISO-2022-jp")); + assertSame(Language.JAPANESE, Language.fromEncoding("Shift-JIS")); + assertSame(Language.KOREAN, Language.fromEncoding("EUC-kr")); + } + + private static void assertLanguage(Language expected, String str) { + assertSame(expected, Language.fromLanguageTag(str)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java b/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java new file mode 100644 index 00000000000..910627584ce --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java @@ -0,0 +1,52 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language; + +import org.junit.Test; + +import java.util.Locale; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +/** + * @author Simon Thoresen + */ +public class LocaleFactoryTestCase { + + @Test + public void requireThatLocaleCanBeCreatedFromLanguageTag() { + assertLocale("zh", "zh", "", ""); + assertLocale("zh-CN", "zh", "", "CN"); + assertLocale("zh-foo-CN", "zh", "", "CN"); + assertLocale("zh-Hans", "zh", "Hans", ""); + assertLocale("zh-TW", "zh", "", "TW"); + assertLocale("zh-foo-TW", "zh", "", "TW"); + assertLocale("zh-Hant", "zh", "Hant", ""); + assertLocale("ja", "ja", "", ""); + assertLocale("ko", "ko", "", ""); + assertLocale("en", "en", "", ""); + assertLocale("en-NO", "en", "", "NO"); + assertLocale("de", "de", "", ""); + assertLocale("es", "es", "", ""); + assertLocale("es-419", "es", "", "419"); + + try { + LocaleFactory.fromLanguageTag(null); + fail(); + } catch (NullPointerException e) { + + } + + assertLocale("", "", "", ""); + assertLocale("z-foo", "", "", ""); + assertLocale("ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe", "", "", ""); + } + + private static void assertLocale(String tag, String language, String variant, String country) { + Locale locale = LocaleFactory.fromLanguageTag(tag); + assertEquals(language, locale.getLanguage()); + assertEquals(country, locale.getCountry()); + assertEquals(variant, locale.getVariant()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java new file mode 100644 index 00000000000..aa8102fe9f2 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java @@ -0,0 +1,61 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.detect; + +import com.yahoo.language.Language; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.nio.charset.Charset; + +import static org.junit.Assert.*; + +/** + * @author Simon Thoresen Hult + */ +public class AbstractDetectorTestCase { + + private static final Detection DETECTION = new Detection(Language.ARABIC, "encoding", true); + private static final Charset UTF8 = Charset.forName("UTF-8"); + + @Test + public void requireThatDetectStringForwardsUtf8Bytes() { + Hint hint = Hint.newCountryHint("no"); + MyDetector detector = new MyDetector(); + Detection detection = detector.detect("69", hint); + assertSame(DETECTION, detection); + assertArrayEquals("69".getBytes(UTF8), detector.input); + assertEquals(0, detector.offset); + assertEquals(2, detector.length); + assertSame(hint, detector.hint); + } + + @Test + public void requireThatDetectByteBufferForwardsUtf8Bytes() { + byte[] buf = new byte[] { 6, 9 }; + Hint hint = Hint.newCountryHint("no"); + MyDetector detector = new MyDetector(); + Detection detection = detector.detect(ByteBuffer.wrap(buf), hint); + assertSame(DETECTION, detection); + assertArrayEquals(buf, detector.input); + assertEquals(0, detector.offset); + assertEquals(2, detector.length); + assertSame(hint, detector.hint); + } + + private static class MyDetector extends AbstractDetector { + + byte[] input; + int offset; + int length; + Hint hint; + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + this.input = input; + this.offset = offset; + this.length = length; + this.hint = hint; + return DETECTION; + } + } +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java new file mode 100644 index 00000000000..3cb82572976 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java @@ -0,0 +1,66 @@ 
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen Hult + */ +public abstract class AbstractTokenizerTestCase { + + private boolean accentDrop = false; + private Language language = Language.ENGLISH; + private Linguistics linguistics; + private StemMode stemMode = StemMode.NONE; + + public void assertTokenStrings(String input, List expectedTokenStrings) { + List actual = new ArrayList<>(); + for (Token token : tokenize(input)) { + findTokenStrings(token, actual); + } + assertEquals(expectedTokenStrings, actual); + } + + public List findTokenStrings(Token token, List out) { + int numComponents = token.getNumComponents(); + if (token.isSpecialToken() || numComponents == 0) { + out.add(token.getTokenString()); + } else { + for (int i = 0; i < numComponents; ++i) { + findTokenStrings(token.getComponent(i), out); + } + } + return out; + } + + public Iterable tokenize(String input) { + return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop); + } + + public AbstractTokenizerTestCase setAccentDrop(boolean accentDrop) { + this.accentDrop = accentDrop; + return this; + } + + public AbstractTokenizerTestCase setLanguage(Language language) { + this.language = language; + return this; + } + + public AbstractTokenizerTestCase setLinguistics(Linguistics linguistics) { + this.linguistics = linguistics; + return this; + } + + public AbstractTokenizerTestCase setStemMode(StemMode stemMode) { + this.stemMode = stemMode; + return this; + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java new file mode 100644 index 
00000000000..8233ef1b8f0 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java @@ -0,0 +1,150 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.simple.SimpleLinguistics; +import org.junit.Test; + +import java.util.Iterator; + +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.*; + +/** + * @author bratseth + */ +public class GramSplitterTestCase { + + private static final GramSplitter gramSplitter = new SimpleLinguistics().getGramSplitter(); + + @Test + public void testNoSpaces() { + // no spaces + assertGramSplit("engulbillesang", 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]"); + assertGramSplit("engulbillesang", 2, "[en, ng, gu, ul, lb, bi, il, ll, le, es, sa, an, ng]"); + assertGramSplit("engulbillesang", 3, "[eng, ngu, gul, ulb, lbi, bil, ill, lle, les, esa, san, ang]"); + } + + @Test + public void testWithSpaces() { + // with spaces + assertGramSplit("en gul bille sang", 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]"); + assertGramSplit("en gul bille sang", 2, "[en, gu, ul, bi, il, ll, le, sa, an, ng]"); + assertGramSplit("en gul bille sang", 3, "[en, gul, bil, ill, lle, san, ang]"); + } + + @Test + public void testCornerCases() { + // corner cases + assertGramSplit("", 1, "[]"); + assertGramSplit("", 2, "[]"); + assertGramSplit("e", 1, "[e]"); + assertGramSplit("e", 2, "[e]"); + assertGramSplit("en", 1, "[e, n]"); + assertGramSplit("en", 2, "[en]"); + assertGramSplit("en", 3, "[en]"); + } + + @Test + public void testSpaceCornerCases() { + // space corner cases + assertGramSplit("e en e", 1, "[e, e, n, e]"); + assertGramSplit("e en e", 2, "[e, en, e]"); + assertGramSplit("e en e", 3, "[e, en, e]"); + assertGramSplit(" e en e ", 1, "[e, e, n, e]"); + assertGramSplit(" e en e ", 2, "[e, en, e]"); + assertGramSplit(" e en e ", 3, "[e, en, e]"); + 
assertGramSplit(" e en e ", 1, "[e, e, n, e]"); + assertGramSplit(" e en e ", 2, "[e, en, e]"); + assertGramSplit(" e en e ", 3, "[e, en, e]"); + assertGramSplit("a b c", 4, "[a, b, c]"); + } + + @Test + public void testWithCasing() { + assertGramSplit("This is the Black Eyed Peas", 2, + "[Th, hi, is, is, th, he, Bl, la, ac, ck, Ey, ye, ed, Pe, ea, as]"); + assertGramSplit("This is the Black Eyed Peas", 3, + "[Thi, his, is, the, Bla, lac, ack, Eye, yed, Pea, eas]"); + assertGramSplit("This is the Black Eyed Peas", 4, + "[This, is, the, Blac, lack, Eyed, Peas]"); + assertGramSplit("This is the Black Eyed Peas", 5, + "[This, is, the, Black, Eyed, Peas]"); + assertGramSplit("This is the Black Eyed Peas", 6, + "[This, is, the, Black, Eyed, Peas]"); + } + + @Test + public void testWithPunctuation() { + assertGramSplit("this is, in a sense, more than the sum of parts!", 2, + "[th, hi, is, is, in, a, se, en, ns, se, mo, or, re, th, ha, an, th, he, su, um, of, pa, ar, rt, ts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 3, + "[thi, his, is, in, a, sen, ens, nse, mor, ore, tha, han, the, sum, of, par, art, rts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 4, + "[this, is, in, a, sens, ense, more, than, the, sum, of, part, arts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 5, + "[this, is, in, a, sense, more, than, the, sum, of, parts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 6, + "[this, is, in, a, sense, more, than, the, sum, of, parts]"); + } + + @Test + public void testAccents() { + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 2, "[ca, af, f\u00e9, de, l, h\u00f4, \u00f4t, te, el]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 3, "[caf, af\u00e9, de, l, h\u00f4t, \u00f4te, tel]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 4, "[caf\u00e9, de, l, h\u00f4te, \u00f4tel]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 5, "[caf\u00e9, de, l, 
h\u00f4tel]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 6, "[caf\u00e9, de, l, h\u00f4tel]"); + } + + @Test + public void testChinese() { + String input = "\u77f3\u5ba4\u8a69\u58eb\u65bd\u6c0f\uff0c\u55dc\u7345\uff0c\u8a93\u98df\u5341\u7345\u3002" + + "\u65bd\u6c0f\u6642\u6642\u9069\u5e02\u8996\u7345\uff0c\u5341\u6642\uff0c\u9069\u5341\u7345" + + "\u9069\u5e02\u3002"; + assertGramSplit(input, 2, "[\u77f3\u5ba4, \u5ba4\u8a69, \u8a69\u58eb, \u58eb\u65bd, \u65bd\u6c0f, " + + "\u55dc\u7345, \u8a93\u98df, \u98df\u5341, \u5341\u7345, \u65bd\u6c0f, " + + "\u6c0f\u6642, \u6642\u6642, \u6642\u9069, \u9069\u5e02, \u5e02\u8996, " + + "\u8996\u7345, \u5341\u6642, \u9069\u5341, \u5341\u7345, \u7345\u9069, " + + "\u9069\u5e02]"); + assertGramSplit(input, 3, "[\u77f3\u5ba4\u8a69, \u5ba4\u8a69\u58eb, \u8a69\u58eb\u65bd, \u58eb\u65bd\u6c0f, " + + "\u55dc\u7345, \u8a93\u98df\u5341, \u98df\u5341\u7345, \u65bd\u6c0f\u6642, " + + "\u6c0f\u6642\u6642, \u6642\u6642\u9069, \u6642\u9069\u5e02, \u9069\u5e02\u8996, " + + "\u5e02\u8996\u7345, \u5341\u6642, \u9069\u5341\u7345, \u5341\u7345\u9069, " + + "\u7345\u9069\u5e02]"); + } + + @Test(expected = IllegalArgumentException.class) + public void testInvalidSplitSize() { + gramSplitter.split("en", 0); + } + + @Test(expected = NullPointerException.class) + public void testInvalidSplitNull() { + gramSplitter.split(null, 1); + } + + @Test + public void testUnusualIteratorUse() { + String text = "en gul bille sang"; + Iterator grams = gramSplitter.split(text, 3); + + assertThat(grams.next().extractFrom(text), is("en")); + assertTrue(grams.hasNext()); + assertTrue(grams.hasNext()); + assertThat(grams.next().extractFrom(text), is("gul")); + assertThat(grams.next().extractFrom(text), is("bil")); + assertThat(grams.next().extractFrom(text), is("ill")); + assertThat(grams.next().extractFrom(text), is("lle")); + assertTrue(grams.hasNext()); + assertTrue(grams.hasNext()); + assertThat(grams.next().extractFrom(text), is("san")); + 
assertThat(grams.next().extractFrom(text), is("ang")); + assertFalse(grams.hasNext()); + assertFalse(grams.hasNext()); + } + + private void assertGramSplit(String input, int gramSize, String expected) { + assertThat(gramSplitter.split(input, gramSize).toExtractedList().toString(), is(expected)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java new file mode 100644 index 00000000000..771487d0e71 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java @@ -0,0 +1,35 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.simple.SimpleLinguistics; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author Mathias Mølster Lidal + */ +public class NormalizationTestCase { + + private final Normalizer normalizer = new SimpleLinguistics().getNormalizer(); + + @Test + public void testEmptyStringNormalization() { + assertEquals("", normalizer.normalize("")); + } + + @Test + public void testDoubleWidthAscii() { + assertNormalize("\uff41\uff42\uff43\uff44\uff45\uff46\uff47\uff48\uff49", "abcdefghi"); + } + + @Test + public void testLigature() { + assertNormalize("\uFB01nance", "finance"); + } + + private void assertNormalize(String input, String exp) { + assertEquals(exp, normalizer.normalize(input)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java new file mode 100644 index 00000000000..a70a3dc24c5 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java @@ -0,0 +1,27 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. +package com.yahoo.language.process; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertSame; + +/** + * @author Simon Thoresen Hult + */ +public class ProcessingExceptionTestCase { + + @Test + public void requireThatMessageCanBeSet() { + assertEquals("foo", new ProcessingException("foo").getMessage()); + } + + @Test + public void requireThatMessageAndCauseCanBeSet() { + Throwable t = new Throwable(); + ProcessingException e = new ProcessingException("bar", t); + assertEquals("bar", e.getMessage()); + assertSame(t, e.getCause()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java new file mode 100644 index 00000000000..8e7e52358f9 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java @@ -0,0 +1,43 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleTokenizer; +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen Hult + */ +public class SegmenterImplTestCase { + + private final static Segmenter SEGMENTER = new SegmenterImpl(new SimpleTokenizer(new SimpleNormalizer())); + + @Test + public void requireThatNonIndexableCharactersAreDelimiters() { + assertSegments("i've", Arrays.asList("i", "ve")); + assertSegments("foo bar. 
baz", Arrays.asList("foo", "bar", "baz")); + assertSegments("1,2, 3 4", Arrays.asList("1", "2", "3", "4")); + } + + @Test + public void requireThatAdjacentIndexableTokenTypesAreNotSplit() { + assertSegments("a1,2b,c3,4d", Arrays.asList("a1", "2b", "c3", "4d")); + } + + @Test + public void requireThatSegmentationReturnsOriginalForm() { + assertSegments("a\u030A", Arrays.asList("a\u030A")); + assertSegments("FOO BAR", Arrays.asList("FOO", "BAR")); + } + + private static void assertSegments(String input, List expectedSegments) { + assertEquals(expectedSegments, SEGMENTER.segment(input, Language.ENGLISH)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java new file mode 100644 index 00000000000..9a592781998 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java @@ -0,0 +1,73 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import static org.junit.Assert.*; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Functional testing of StemList. 
+ * + * @author steinar + */ +public class StemListTestCase { + + private StemList stems; + + @Before + public void setUp() throws Exception { + stems = new StemList(); + } + + @After + public void tearDown() throws Exception { + stems = null; + } + + @Test + public void testSize() { + assertEquals(0, stems.size()); + stems.add("a"); + stems.add("b"); + stems.add("a"); + assertEquals(2, stems.size()); + } + + @Test + public void testSet() { + stems.add("a"); + stems.add("b"); + stems.add("c"); + stems.add("d"); + assertEquals("a", stems.set(2, "a")); + assertEquals("c", stems.get(2)); + assertEquals("c", stems.set(2, "z")); + assertEquals("z", stems.get(2)); + } + + @Test + public void testAdd() { + stems.add("a"); + stems.add("b"); + stems.add("c"); + stems.add("d"); + assertEquals(4, stems.size()); + stems.add("a"); + assertEquals(4, stems.size()); + stems.add("z"); + assertEquals(5, stems.size()); + } + + @Test + public void testremove() { + stems.add("a"); + stems.add("b"); + stems.add("c"); + stems.add("d"); + assertEquals("c", stems.remove(2)); + assertEquals(3, stems.size()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java new file mode 100644 index 00000000000..13cd8a82e36 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java @@ -0,0 +1,27 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen Hult + */ +public class StemModeTestCase { + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfWorks() { + for (StemMode mode : StemMode.values()) { + assertEquals(mode, StemMode.valueOf(mode.getValue())); + } + } + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfUnknownIsNone() { + assertEquals(StemMode.NONE, StemMode.valueOf(-1)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java new file mode 100644 index 00000000000..d81aaaafcc8 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java @@ -0,0 +1,68 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleToken; +import com.yahoo.language.simple.SimpleTokenizer; +import org.junit.Test; +import org.mockito.Mockito; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen Hult + */ +public class StemmerImplTestCase { + + @Test + public void requireThatStemIsNormalizedAndLowerCased() { + assertStem("FOO", Arrays.asList("foo")); + assertStem("a\u030A", Arrays.asList("\u00E5")); + } + + @Test + public void requireThatOnlyIndexableTokensAreReturned() { + assertStem("foo. 
(bar)!", Arrays.asList("foo", "bar")); + } + + @Test + public void requireThatSpecialTokensAreNotDecompounded() { + SimpleToken token = new SimpleToken("c++").setType(TokenType.ALPHABETIC) + .setTokenString("c++") + .addComponent(new SimpleToken("c").setType(TokenType.ALPHABETIC) + .setTokenString("c")) + .addComponent(new SimpleToken("p").setType(TokenType.ALPHABETIC) + .setTokenString("p")) + .addComponent(new SimpleToken("p").setType(TokenType.ALPHABETIC) + .setTokenString("p")); + Tokenizer tokenizer = Mockito.mock(Tokenizer.class); + Mockito.when(tokenizer.tokenize(Mockito.anyString(), Mockito.any(), Mockito.any(), + Mockito.anyBoolean())) + .thenReturn(Arrays.asList(token)); + Stemmer stemmer = new StemmerImpl(tokenizer); + + token.setSpecialToken(false); + assertEquals(Arrays.asList(new StemList("c"), + new StemList("p"), + new StemList("p")), + stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH)); + + token.setSpecialToken(true); + assertEquals(Arrays.asList(new StemList("c++")), + stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH)); + } + + private static void assertStem(String input, List expectedStems) { + Stemmer stemmer = new StemmerImpl(new SimpleTokenizer(new SimpleNormalizer())); + List got = new ArrayList<>(); + for (StemList word : stemmer.stem(input, StemMode.ALL, Language.ENGLISH)) { + got.add(word.get(0)); + } + assertEquals(expectedStems, got); + } +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java new file mode 100644 index 00000000000..1a92f5a750e --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java @@ -0,0 +1,38 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + * @author Simon Thoresen Hult + */ +public class TokenTypeTestCase { + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfWorks() { + for (TokenType type : TokenType.values()) { + assertEquals(type, TokenType.valueOf(type.getValue())); + } + } + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfUnknownIsUnknown() { + assertEquals(TokenType.UNKNOWN, TokenType.valueOf(-1)); + } + + @Test + public void requireThatOnlyAlphaNumericsAreIndexable() { + for (TokenType type : TokenType.values()) { + if (type == TokenType.ALPHABETIC || type == TokenType.NUMERIC) { + assertTrue(type.isIndexable()); + } else { + assertFalse(type.isIndexable()); + } + } + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java new file mode 100644 index 00000000000..6506b41fc79 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java @@ -0,0 +1,233 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.simple.SimpleTokenizer; +import org.junit.Test; + +import java.util.*; + +import static com.yahoo.language.LinguisticsCase.toLowerCase; +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.*; + +/** + * Test of tokenization, with stemming and accent removal + * + * @author Mathias Mølster Lidal + */ +public class TokenizationTestCase { + + private final Tokenizer tokenizer = new SimpleTokenizer(); + + @Test + public void testTokenizer() { + assertTokenize("This is a test, 123", + Arrays.asList("this", "is", "a", "test", "123"), + Arrays.asList("This", " ", "is", " ", "a", " ", "test", ",", " ", "123")); + } + + @Test + public void testUnderScoreTokenization() { + assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, Arrays.asList("ugcapi", "1"), null); + } + + @Test + public void testPhrasesWithPunctuation() { + assertTokenize("PHY_101.html a space/time or space-time course", Language.ENGLISH, StemMode.NONE, + false, + Arrays.asList("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"), + null); + assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("phy", "101"), null); + assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("101", "3"), null); + } + + @Test + public void testDoubleWidthTokenization() { + // "sony" + assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false, + Arrays.asList("sony"), null); + assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false, + Arrays.asList("sony"), null); + // "SONY" + assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false, + Arrays.asList("sony"), null); + assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false, + Arrays.asList("sony"), null); + // "on" + assertTokenize("\uFF4F\uFF4E", 
Language.ENGLISH, StemMode.NONE, false, + Arrays.asList("on"), null); + assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false, + Arrays.asList("on"), null); + // "ON" + assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false, + Arrays.asList("on"), null); + assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false, + Arrays.asList("on"), null); + } + + @Test + public void testLargeTextTokenization() { + StringBuilder sb = new StringBuilder(); + String s = "teststring "; + for (int i = 0; i < 100000; i++) { + sb.append(s); + } + + String input = sb.toString(); + + int numTokens = 0; + List pos = new ArrayList<>(); + for (Token t : tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false)) { + numTokens++; + if ((numTokens % 100) == 0) { + pos.add(t.getOffset()); + } + } + + assertEquals("Check that all tokens have been tokenized", numTokens, 200000); + assertTrue("Increasing token pos", assertMonoIncr(pos)); + } + + @Test + public void testLargeTokenGuard() { + StringBuilder str = new StringBuilder(); + for (int i = 0; i < 128 * 256; i++) { + str.append("ab"); + } + Iterator it = tokenizer.tokenize(str.toString(), Language.ENGLISH, StemMode.NONE, false).iterator(); + assertTrue(it.hasNext()); + assertNotNull(it.next().getTokenString()); + assertFalse(it.hasNext()); + } + + @Test + public void testTokenIterator() { + Iterator it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator(); + assertFalse(it.hasNext()); + try { + it.next(); + fail(); + } catch (NoSuchElementException e) { + // success + } + + it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator(); + assertFalse(it.hasNext()); + + it = tokenizer.tokenize("one two three", Language.ENGLISH, StemMode.NONE, false).iterator(); + assertNotNull(it.next()); + assertNotNull(it.next()); + assertNotNull(it.next()); + assertNotNull(it.next()); + assertNotNull(it.next()); + assertFalse(it.hasNext()); + } + + @Test + 
public void testGetOffsetLength() { + String input = "Deka-Chef Weber r\u00e4umt Kommunikationsfehler ein"; + long[] expOffset = { 0, 4, 5, 9, 10, 15, 16, 21, 22, 42, 43 }; + int[] len = { 4, 1, 4, 1, 5, 1, 5, 1, 20, 1, 3 }; + + int idx = 0; + for (Token token : tokenizer.tokenize(input, Language.GERMAN, StemMode.SHORTEST, false)) { + assertThat("Token offset for token #" + idx, token.getOffset(), is(expOffset[idx])); + assertThat("Token len for token #" + idx, token.getOrig().length(), is(len[idx])); + idx++; + } + } + + @Test + public void testRecursiveDecompose() { + for (Token t : tokenizer.tokenize("\u00a510%", Language.ENGLISH, StemMode.SHORTEST, false)) { + recurseDecompose(t); + } + } + + @Test + public void testIndexability() { + String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652"; + for (StemMode stemMode : new StemMode[] { StemMode.NONE, + StemMode.SHORTEST }) { + for (Language l : new Language[] { Language.INDONESIAN, + Language.ENGLISH, Language.ARABIC }) { + for (boolean accentDrop : new boolean[] { true, false }) { + for (Token token : tokenizer.tokenize(input, + l, stemMode, accentDrop)) { + if (token.getTokenString().length() == 0) { + assertFalse(token.isIndexable()); + } + } + } + } + } + } + + private void recurseDecompose(Token t) { + assertTrue(t.getOffset() >= 0); + assertTrue(t.getOrig().length() >= 0); + + int numComp = t.getNumComponents(); + for (int i = 0; i < numComp; i++) { + Token comp = t.getComponent(i); + recurseDecompose(comp); + } + } + + private boolean assertMonoIncr(Iterable n) { + long trailing = -1; + for (long i : n) { + if (i < trailing) { + return false; + } + trailing = i; + } + return true; + } + + private void assertTokenize(String input, List indexed, List orig) { + assertTokenize(input, Language.ENGLISH, StemMode.NONE, false, indexed, orig); + } + + /** + *

Compare the results of running an input string through the tokenizer with an "index" truth, and an optional + * "orig" truth.

+ * + * @param input The text to process, passed to tokenizer. + * @param language The language tag, passed to tokenizer. + * @param stemMode If stemMode != NONE, test will silently succeed if tokenizer does not do stemming. + * @param accentDrop Passed to the tokenizer. + * @param indexed Compared to the "TokenString" result from the tokenizer. + * @param orig Compared to the "Orig" result from the tokenizer. + */ + private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop, + List indexed, List orig) { + int i = 0; + int j = 0; + for (Token token : tokenizer.tokenize(input, language, stemMode, accentDrop)) { + // System.err.println("got token orig '"+token.getOrig()+"'"); + // System.err.println("got token stem '"+token.getTokenString(stemMode)+"'"); + if (token.getNumComponents() > 0) { + for (int comp = 0; comp < token.getNumComponents(); comp++) { + Token t = token.getComponent(comp); + if (t.getType().isIndexable()) { + assertThat("comp index: " + i, toLowerCase(t.getTokenString()), is(indexed.get(i++))); + } + } + } else { + if (token.getType().isIndexable()) { + assertThat("exp index: " + i, toLowerCase(token.getTokenString()), is(indexed.get(i++))); + } + } + if (orig != null) { + assertThat("orig index: " + j, token.getOrig(), is(orig.get(j++))); + } + } + assertThat("indexed length", i, is(indexed.size())); + if (orig != null) { + assertThat("orig length", j, is(orig.size())); + } + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java new file mode 100644 index 00000000000..66eee3f73d4 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java @@ -0,0 +1,89 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.simple; + +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detection; +import com.yahoo.text.Utf8; +import org.junit.Test; + +import java.nio.charset.Charset; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen + */ +public class SimpleDetectorTestCase { + + @Test + public void requireThatLanguageCanDetected() { + assertLanguage(Language.UNKNOWN, "Hello!"); + + // "Chinese language" + assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input + "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002"); + + // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)". + assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_TRADITIONAL input + "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u50B7\u8EAB\u9AD4\u3002"); + + // four katakana characters from this web page: http://www.japanese-online.com/language/lessons/katakana.htm + assertLanguage(Language.JAPANESE, "\u30ab\u30bf\u30ab\u30ca"); + + // four hiragana characters gotton from web page: http://www.japanese-online.com/language/lessons/hiragana.htm + assertLanguage(Language.JAPANESE, "\u3072\u3089\u304c\u306a"); + + // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)". + // This is a good test because this string contains not only japanese but chinese characters, so we need to look + // through it to find the japanese ones. + assertLanguage(Language.JAPANESE, + "\u79c1\u306f\u30ac\u30e9\u30b9\u3092\u98df\u3079\u3089\u308c\u307e\u3059" + + "\u3002\u305d\u308c\u306f\u79c1\u3092\u50b7\u3064\u3051\u307e\u305b\u3093" + + "\u3002"); + + // an introduction on an adobe web page. What it measn I don't know. 
+ assertLanguage(Language.KOREAN, "\ud55c\uae00\uacfc"); + + // for the sound of "A" + assertLanguage(Language.KOREAN, "\u314f"); + + // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)". + assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " + + "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694"); + } + + @Test + public void testEncodingGuess() { + // just some arbitrary data above 127 which is not valid as UTF-8 + byte[] b = new byte[] { (byte)196, (byte)197, (byte)198 }; + Detection d = new SimpleDetector().detect(b, 0, b.length, null); + assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding()); + + // a string from http://www.columbia.edu/kermit/utf8.html that says + // "I can eat glass (and it doesn't hurt me)". + b = Utf8.toBytes("\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " + + "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694"); + d = new SimpleDetector().detect(b, 0, b.length, null); + assertEquals(Utf8.getCharset(), d.getEncoding()); + + // arbitrary ascii + b = new byte[] { 31, 32, 33 }; + d = new SimpleDetector().detect(b, 0, b.length, null); + assertEquals(Charset.forName("US-ASCII"), d.getEncoding()); + + // character which is not valid in UTF-8 + b = new byte[] { -1 }; + d = new SimpleDetector().detect(b, 0, b.length, null); + assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding()); + + // UTF-8 which requires more bytes than available + b = new byte[] { Utf8.toBytes("\u00E5")[0] }; + d = new SimpleDetector().detect(b, 0, b.length, null); + assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding()); + } + + private static void assertLanguage(Language language, String input) { + assertEquals(language, new SimpleDetector().detect(input, null).getLanguage()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java 
b/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java new file mode 100644 index 00000000000..9c9c8b8fcc5 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java @@ -0,0 +1,34 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.process.Normalizer; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen Hult + */ +public class SimpleNormalizerTestCase { + + private static final Normalizer NORMALIZER = new SimpleNormalizer(); + + @Test + public void requireThatInputIsNfkcNormalized() { + assertNormalize("\u212B", "\u00C5"); + assertNormalize("\u2126", "\u03A9"); + assertNormalize("\u00C5", "\u00C5"); + assertNormalize("\u00F4", "\u00F4"); + assertNormalize("\u1E69", "\u1E69"); + assertNormalize("\u1E0B\u0323", "\u1E0D\u0307"); + assertNormalize("\u0071\u0307\u0323", "q\u0323\u0307"); + assertNormalize("\uFB01", "fi"); + assertNormalize("\u0032\u2075", "25"); + assertNormalize("\u1E9B\u0323", "\u1E69"); + } + + private static void assertNormalize(String input, String expectedNormalForm) { + assertEquals(expectedNormalForm, NORMALIZER.normalize(input)); + } + +} \ No newline at end of file diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java new file mode 100644 index 00000000000..b27b70b4dc9 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java @@ -0,0 +1,194 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.simple; + +import com.yahoo.language.process.TokenScript; +import com.yahoo.language.process.TokenType; +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + * @author Simon Thoresen Hult + */ +public class SimpleTokenTestCase { + + @Test + public void requireThatOrigAccessorsWork() { + SimpleToken token = new SimpleToken("foo"); + assertEquals("foo", token.getOrig()); + + assertEquals(token, new SimpleToken("foo")); + assertFalse(token.equals(new SimpleToken("bar"))); + } + + @Test + public void requireThatComponentAccessorsWork() { + SimpleToken token = new SimpleToken("foo"); + assertEquals(0, token.getNumComponents()); + SimpleToken bar = new SimpleToken("bar"); + SimpleToken baz = new SimpleToken("baz"); + token.addComponent(bar); + token.addComponent(baz); + assertEquals(2, token.getNumComponents()); + assertSame(bar, token.getComponent(0)); + assertSame(baz, token.getComponent(1)); + + SimpleToken other = new SimpleToken("foo"); + assertFalse(token.equals(other)); + other.addComponent(bar); + assertFalse(token.equals(other)); + other.addComponent(baz); + assertEquals(token, other); + + other = new SimpleToken("foo"); + other.addComponent(baz); + other.addComponent(bar); + assertFalse(token.equals(other)); + } + + @Test + public void requireThatStemAccessorsWork() { + SimpleToken token = new SimpleToken("foo"); + assertEquals(0, token.getNumStems()); + assertNull(token.getStem(0)); + token.setTokenString("bar"); + assertEquals(1, token.getNumStems()); + assertEquals("bar", token.getStem(0)); + } + + @Test + public void requireThatTokenStringAccessorsWork() { + SimpleToken token = new SimpleToken("foo"); + assertNull(token.getTokenString()); + token.setTokenString("bar"); + assertEquals("bar", token.getTokenString()); + SimpleToken other = new SimpleToken("foo"); + assertFalse(token.equals(other)); + other.setTokenString("bar"); + assertEquals(token, other); + } + + @Test + public void requireThatTypeAccessorsWork() 
{ + SimpleToken token = new SimpleToken("foo"); + assertEquals(TokenType.UNKNOWN, token.getType()); + for (TokenType type : TokenType.values()) { + token.setType(type); + assertEquals(type, token.getType()); + } + + SimpleToken other = new SimpleToken("foo"); + for (TokenType type : TokenType.values()) { + other.setType(type); + if (type == token.getType()) { + assertEquals(token, other); + } else { + assertFalse(token.equals(other)); + } + } + } + + @Test + public void requireThatScriptAccessorsWork() { + SimpleToken token = new SimpleToken("foo"); + assertEquals(TokenScript.UNKNOWN, token.getScript()); + for (TokenScript script : TokenScript.values()) { + token.setScript(script); + assertEquals(script, token.getScript()); + } + + SimpleToken other = new SimpleToken("foo"); + for (TokenScript script : TokenScript.values()) { + other.setScript(script); + if (script == token.getScript()) { + assertEquals(token, other); + } else { + assertFalse(token.equals(other)); + } + } + } + + @Test + public void requireThatSpecialTokenAccessorsWork() { + SimpleToken token = new SimpleToken("foo"); + assertFalse(token.isSpecialToken()); + token.setSpecialToken(true); + assertTrue(token.isSpecialToken()); + token.setSpecialToken(false); + assertFalse(token.isSpecialToken()); + + SimpleToken other = new SimpleToken("foo"); + other.setSpecialToken(true); + assertFalse(token.equals(other)); + other.setSpecialToken(false); + assertEquals(token, other); + } + + @Test + public void requireThatOffsetAccessorsWork() { + SimpleToken token = new SimpleToken("foo"); + assertEquals(0, token.getOffset()); + token.setOffset(69); + assertEquals(69, token.getOffset()); + + SimpleToken other = new SimpleToken("foo"); + assertFalse(token.equals(other)); + other.setOffset(69); + assertEquals(token, other); + } + + @Test + public void requireThatToStringIsExpressive() { + SimpleToken token = new SimpleToken("my_orig"); + token.addComponent(new SimpleToken("my_component_1")); + token.addComponent(new 
SimpleToken("my_component_2")); + token.setTokenString("my_token_string"); + token.setType(TokenType.ALPHABETIC); + token.setScript(TokenScript.ARABIC); + token.setOffset(1); + + String expected = "token : SimpleToken {\n" + + " components : {\n" + + " [0] : SimpleToken {\n" + + " components : {\n" + + " }\n" + + " offset : 0\n" + + " orig : 'my_component_1'\n" + + " script : UNKNOWN\n" + + " special : false\n" + + " token string : null\n" + + " type : UNKNOWN\n" + + " }\n" + + " [1] : SimpleToken {\n" + + " components : {\n" + + " }\n" + + " offset : 0\n" + + " orig : 'my_component_2'\n" + + " script : UNKNOWN\n" + + " special : false\n" + + " token string : null\n" + + " type : UNKNOWN\n" + + " }\n" + + " }\n" + + " offset : 1\n" + + " orig : 'my_orig'\n" + + " script : ARABIC\n" + + " special : false\n" + + " token string : 'my_token_string'\n" + + " type : ALPHABETIC\n" + + "}"; + assertEquals(expected, token.toString()); + } + + @Test + public void requireThatHashCodeIsImplemented() { + assertEquals(new SimpleToken("foo").hashCode(), new SimpleToken("foo").hashCode()); + } + + @Test + public void requireThatEqualsIsImplemented() { + assertFalse(new SimpleToken("foo").equals(new Object())); + assertEquals(new SimpleToken("foo"), new SimpleToken("foo")); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java new file mode 100644 index 00000000000..2d258be7af0 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java @@ -0,0 +1,43 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.process.TokenType; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * Check simple token types. 
+ * + * @author Steinar Knutsen + */ +public class SimpleTokenTypeTestCase { + + @Test + public final void test() { + assertEquals(TokenType.ALPHABETIC, tokenType('a')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u02c1')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u02c1')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u01c0')); + assertEquals(TokenType.SYMBOL, tokenType('\u20dd')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u0912')); + assertEquals(TokenType.NUMERIC, tokenType('1')); + assertEquals(TokenType.PUNCTUATION, tokenType('.')); + assertEquals(TokenType.PUNCTUATION, tokenType('\u0f3b')); + assertEquals(TokenType.PUNCTUATION, tokenType('\u0f3c')); + assertEquals(TokenType.PUNCTUATION, tokenType('\u203f')); + assertEquals(TokenType.SYMBOL, tokenType('\u2044')); + assertEquals(TokenType.SYMBOL, tokenType('$')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u2132')); + assertEquals(TokenType.ALPHABETIC, tokenType('\uD800', '\uDFC8')); + } + + private static TokenType tokenType(char c) { + return SimpleTokenType.valueOf(c); + } + + private static TokenType tokenType(char high, char low) { + return SimpleTokenType.valueOf(Character.toCodePoint(high, low)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java new file mode 100644 index 00000000000..8760da56415 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java @@ -0,0 +1,36 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.simple; + +import com.yahoo.language.process.AbstractTokenizerTestCase; +import com.yahoo.language.process.StemMode; +import org.junit.Test; + +/** + * @author Steinar Knutsen + * @author bratseth + */ +public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase { + + @Test + public void testTokenizingNoStemming() { + TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.NONE); + tester.assertTokens("a\u030a tralalala n4lle. \uD800\uDFC8 (old Persian sign Auramazda, sorry if " + + "anyone 1s offended by ancien7 gods.Running)", + "\u00E5", " ", "tralalala"," ","n4lle", ".", " ","\uD800\uDFC8", " ", "(", + "old", " ", "persian", " ", "sign", " ", "auramazda", ",", " ", "sorry", " ", + "if", " ", "anyone", " ", "1s", " ", "offended", " ", "by", " ", "ancien7", + " ", "gods", ".", "running", ")"); + } + + @Test + public void testTokenizingStemming() { + TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL); + tester.assertTokens("a\u030a tralalala n4lle. \uD800\uDFC8 (old Persian sign Auramazda, sorry if " + + "anyone 1s offended by ancien7 gods.Running)", + "\u00E5", " ", "tralalala"," ","n4lle", ".", " ","\uD800\uDFC8", " ", "(", + "old", " ", "persian", " ", "sign", " ", "auramazda", ",", " ", "sorry", " ", + "if", " ", "anyone", " ", "1s", " ", "offend", " ", "by", " ", "ancien7", + " ", "gods", ".", "running", ")"); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java new file mode 100644 index 00000000000..ea4b85e4bd1 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java @@ -0,0 +1,40 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.simple; + +import com.yahoo.language.Language; +import com.yahoo.language.process.Transformer; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen Hult + */ +public class SimpleTransformerTestCase { + + private final static Transformer TRANSFORMER = new SimpleTransformer(); + + @Test + public void requireThatNonAccentsRemain() { + assertTransform("foo", "foo"); + } + + @Test + public void requireThatTransformerRemovesAccents() { + assertTransform("\u212B", "A"); + assertTransform("\u2126", "\u03A9"); + assertTransform("\u00C5", "A"); + assertTransform("\u00F4", "o"); + assertTransform("\u1E69", "s"); + assertTransform("\u1E0B\u0323", "d"); + assertTransform("\u0071\u0307\u0323", "q"); + assertTransform("\uFB01", "\uFB01"); + assertTransform("2\u2075", "2\u2075"); + assertTransform("\u1E9B\u0323", "\u017F"); + } + + private static void assertTransform(String input, String expectedTransform) { + assertEquals(expectedTransform, TRANSFORMER.accentDrop(input, Language.ENGLISH)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java new file mode 100644 index 00000000000..bb59788b26e --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java @@ -0,0 +1,69 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.simple; + +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * @author bratseth + */ +public class TokenizerTester { + + private boolean accentDrop = false; + private Language language = Language.ENGLISH; + private Linguistics linguistics = new SimpleLinguistics(); + private StemMode stemMode = StemMode.NONE; + + public void assertTokens(String input, String ... expectedTokenStrings) { + List actual = new ArrayList<>(); + for (Token token : tokenize(input)) { + findTokenStrings(token, actual); + } + assertEquals(Arrays.asList(expectedTokenStrings), actual); + } + + public List findTokenStrings(Token token, List out) { + int numComponents = token.getNumComponents(); + if (token.isSpecialToken() || numComponents == 0) { + out.add(token.getTokenString()); + } else { + for (int i = 0; i < numComponents; ++i) { + findTokenStrings(token.getComponent(i), out); + } + } + return out; + } + + public Iterable tokenize(String input) { + return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop); + } + + public TokenizerTester setAccentDrop(boolean accentDrop) { + this.accentDrop = accentDrop; + return this; + } + + public TokenizerTester setLanguage(Language language) { + this.language = language; + return this; + } + + public TokenizerTester setLinguistics(Linguistics linguistics) { + this.linguistics = linguistics; + return this; + } + + public TokenizerTester setStemMode(StemMode stemMode) { + this.stemMode = stemMode; + return this; + } + +} -- cgit v1.2.3