summaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
committerJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
commit72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics
Publish
Diffstat (limited to 'linguistics')
-rw-r--r--linguistics/.gitignore5
-rw-r--r--linguistics/OWNERS2
-rw-r--r--linguistics/etc/icaneatglass.txt151
-rw-r--r--linguistics/pom.xml86
-rw-r--r--linguistics/src/main/java/com/yahoo/language/Language.java615
-rw-r--r--linguistics/src/main/java/com/yahoo/language/Linguistics.java101
-rw-r--r--linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java31
-rw-r--r--linguistics/src/main/java/com/yahoo/language/LocaleFactory.java57
-rw-r--r--linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java25
-rw-r--r--linguistics/src/main/java/com/yahoo/language/detect/Detection.java47
-rw-r--r--linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java14
-rw-r--r--linguistics/src/main/java/com/yahoo/language/detect/Detector.java44
-rw-r--r--linguistics/src/main/java/com/yahoo/language/detect/Hint.java38
-rw-r--r--linguistics/src/main/java/com/yahoo/language/detect/package-info.java7
-rw-r--r--linguistics/src/main/java/com/yahoo/language/package-info.java7
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java55
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java222
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Normalizer.java19
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java18
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Segmenter.java29
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java45
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/StemList.java61
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/StemMode.java45
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Stemmer.java26
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java46
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Token.java56
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/TokenScript.java77
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/TokenType.java51
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java38
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Transformer.java23
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/package-info.java7
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java179
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java61
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java16
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java188
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java68
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java76
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java25
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java661
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java184
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java375
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java716
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java715
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java715
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java715
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java715
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java715
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java715
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java614
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java1426
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java136
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/package-info.java9
-rw-r--r--linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java107
-rw-r--r--linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java52
-rw-r--r--linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java61
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java66
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java150
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java35
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java27
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java43
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java73
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java27
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java68
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java38
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java233
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java89
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java34
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java194
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java43
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java36
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java40
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java69
72 files changed, 12557 insertions, 0 deletions
diff --git a/linguistics/.gitignore b/linguistics/.gitignore
new file mode 100644
index 00000000000..8b990078588
--- /dev/null
+++ b/linguistics/.gitignore
@@ -0,0 +1,5 @@
+target
+*.iml
+*.ipr
+*.iws
+/pom.xml.build
diff --git a/linguistics/OWNERS b/linguistics/OWNERS
new file mode 100644
index 00000000000..cd50f7a263a
--- /dev/null
+++ b/linguistics/OWNERS
@@ -0,0 +1,2 @@
+bratseth
+arnej27959
diff --git a/linguistics/etc/icaneatglass.txt b/linguistics/etc/icaneatglass.txt
new file mode 100644
index 00000000000..6de0dd8ba38
--- /dev/null
+++ b/linguistics/etc/icaneatglass.txt
@@ -0,0 +1,151 @@
+Afrikaans: Ek kan glas eet, maar dit doen my nie skade nie.
+Albanian: Un\u00EB mund t\u00EB ha qelq dhe nuk m\u00EB gjen gj\u00EB.
+Allemannisch: I kaun Gloos essen, es tuat ma ned weh.
+Anglo-Saxon (Latin): Ic m\u00E6g gl\u00E6s eotan ond hit ne hearmia\u00F0 me.
+Anglo-Saxon (Runes): \u16C1\u16B3\u16EB\u16D7\u16A8\u16B7\u16EB\u16B7\u16DA\u16A8\u16CB\u16EB\u16D6\u16A9\u16CF\u16AA\u16BE\u16EB\u16A9\u16BE\u16DE\u16EB\u16BB\u16C1\u16CF\u16EB\u16BE\u16D6\u16EB\u16BB\u16D6\u16AA\u16B1\u16D7\u16C1\u16AA\u16A7\u16EB\u16D7\u16D6\u16EC
+Arabic: \u0623\u0646\u0627 \u0642\u0627\u062F\u0631 \u0639\u0644\u0649 \u0623\u0643\u0644 \u0627\u0644\u0632\u062C\u0627\u062C \u0648 \u0647\u0630\u0627 \u0644\u0627 \u064A\u0624\u0644\u0645\u0646\u064A.
+Aragon\u00E9s: Puedo minchar beire, no me'n fa mal .
+Armenian: \u053F\u0580\u0576\u0561\u0574 \u0561\u057A\u0561\u056F\u056B \u0578\u0582\u057F\u0565\u056C \u0587 \u056B\u0576\u056E\u056B \u0561\u0576\u0570\u0561\u0576\u0563\u056B\u057D\u057F \u0579\u0568\u0576\u0565\u0580\u0589
+Bangla / Bengali: \u0986\u09AE\u09BF \u0995\u09BE\u0981\u099A \u0996\u09C7\u09A4\u09C7 \u09AA\u09BE\u09B0\u09BF, \u09A4\u09BE\u09A4\u09C7 \u0986\u09AE\u09BE\u09B0 \u0995\u09CB\u09A8\u09CB \u0995\u09CD\u09B7\u09A4\u09BF \u09B9\u09AF\u09BC \u09A8\u09BE\u0964
+Basque: Kristala jan dezaket, ez dit minik ematen.
+Bayrisch / Bavarian: I koh Glos esa, und es duard ma ned wei.
+Belarusian (Cyrillic): \u042F \u043C\u0430\u0433\u0443 \u0435\u0441\u0446\u0456 \u0448\u043A\u043B\u043E, \u044F\u043D\u043E \u043C\u043D\u0435 \u043D\u0435 \u0448\u043A\u043E\u0434\u0437\u0456\u0446\u044C.
+Belarusian (Lacinka): Ja mahu je\u015Bci \u0161k\u0142o, jano mne ne \u0161kodzi\u0107.
+Bislama: Mi save kakae glas, hemi no save katem mi.
+Brazilian Portuguese : Posso comer vidro, n\u00E3o me machuca.
+Bulgarian: \u041C\u043E\u0433\u0430 \u0434\u0430 \u044F\u043C \u0441\u0442\u044A\u043A\u043B\u043E, \u0442\u043E \u043D\u0435 \u043C\u0438 \u0432\u0440\u0435\u0434\u0438.
+Burmese: \u1000\u1039\u101A\u1039\u101D\u1014\u1039\u200C\u1010\u1031\u102C\u1039\u200C\u104A\u1000\u1039\u101A\u1039\u101D\u1014\u1039\u200C\u1019 \u1019\u1039\u101A\u1000\u1039\u200C\u1005\u102C\u1038\u1014\u102F\u102D\u1004\u1039\u200C\u101E\u100A\u1039\u200C\u104B \u104E\u1000\u1039\u101B\u1031\u102C\u1004\u1039\u200C\u1037 \u1011\u102D\u1001\u102F\u102D\u1000\u1039\u200C\u1019\u1039\u101F\u102F \u1019\u101B\u1039\u101F\u102D\u1015\u102C\u104B
+Caboverdiano/Kabuverdianu (Cape Verde): M' pod\u00EA cum\u00EA vidru, ca ta magu\u00E2-m'.
+Catalan / Catal\u00E0: Puc menjar vidre, que no em fa mal.
+Chamorro: Si\u00F1a yo' chumocho krestat, ti ha na'lalamen yo'.
+Chinese (Traditional): \u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u50B7\u8EAB\u9AD4\u3002
+Chinese: \u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002
+Chinook Jargon: Naika m\u0259km\u0259k kaksh\u0259t labutay, pi weyk ukuk munk-sik nay.
+Classical Greek: \u1F55\u03B1\u03BB\u03BF\u03BD \u03D5\u03B1\u03B3\u03B5\u1FD6\u03BD \u03B4\u03CD\u03BD\u03B1\u03BC\u03B1\u03B9\u00B7 \u03C4\u03BF\u1FE6\u03C4\u03BF \u03BF\u1F54 \u03BC\u03B5 \u03B2\u03BB\u03AC\u03C0\u03C4\u03B5\u03B9.
+Cornish: M\u00FD a yl dybry gw\u00E9der hag \u00E9f ny wra ow ankenya.
+Croatian: Ja mogu jesti staklo i ne boli me.
+Czech: Mohu j\u00EDst sklo, neubl\u00ED\u017E\u00ED mi.
+Dansk / Danish: Jeg kan spise glas, det g\u00F8r ikke ondt p\u00E5 mig.
+Deutsch (Voralberg): I ka glas eassa, ohne dass mar weh tuat.
+Deutsch / German: Ich kann Glas essen, ohne mir zu schaden.
+English (Braille): \u280A\u2800\u2809\u2801\u281D\u2800\u2811\u2801\u281E\u2800\u281B\u2807\u2801\u280E\u280E\u2800\u2801\u281D\u2819\u2800\u280A\u281E\u2800\u2819\u2815\u2811\u280E\u281D\u281E\u2800\u2813\u2825\u2817\u281E\u2800\u280D\u2811
+English (IPA): [a\u026A k\u00E6n i\u02D0t gl\u0251\u02D0s \u00E6nd \u026At d\u0250z n\u0252t h\u025C\u02D0t mi\u02D0] (Received Pronunciation)
+English: I can eat glass and it doesn't hurt me.
+Erzian: \u041C\u043E\u043D \u044F\u0440\u0441\u0430\u043D \u0441\u0443\u043B\u0438\u043A\u0430\u0434\u043E, \u0434\u044B \u0437\u044B\u044F\u043D \u044D\u0439\u0441\u0442\u044D\u043D\u0437\u044D \u0430 \u0443\u043B\u0438.
+Esperanto: Mi povas man\u011Di vitron, \u011Di ne dama\u011Das min.
+Estonian: Ma v\u00F5in klaasi s\u00FC\u00FCa, see ei tee mulle midagi.
+European Portuguese: Posso comer vidro, n\u00E3o me faz mal.
+F\u00F8royskt / Faroese: Eg kann eta glas, ska\u00F0aleysur.
+Farsi / Persian: .\u0645\u0646 \u0645\u06CC \u062A\u0648\u0627\u0646\u0645 \u0628\u062F\u0648\u0646\u0650 \u0627\u062D\u0633\u0627\u0633 \u062F\u0631\u062F \u0634\u064A\u0634\u0647 \u0628\u062E\u0648\u0631\u0645
+Fijian: Au rawa ni kana iloilo, ia au sega ni vakacacani kina.
+French: Je peux manger du verre, \u00E7a ne me fait pas mal.
+Frysk / Frisian: Ik kin gl\u00EAs ite, it docht me net sear.
+Galician: Eu podo xantar cristais e non cortarme.
+Georgian: \u10DB\u10D8\u10DC\u10D0\u10E1 \u10D5\u10ED\u10D0\u10DB \u10D3\u10D0 \u10D0\u10E0\u10D0 \u10DB\u10E2\u10D9\u10D8\u10D5\u10D0.
+Gothic : \uD800\uDF3C\uD800\uDF30\uD800\uDF32 \uD800\uDF32\uD800\uDF3B\uD800\uDF34\uD800\uDF43 \uD800\uDF39\u0308\uD800\uDF44\uD800\uDF30\uD800\uDF3D, \uD800\uDF3D\uD800\uDF39 \uD800\uDF3C\uD800\uDF39\uD800\uDF43 \uD800\uDF45\uD800\uDF3F \uD800\uDF3D\uD800\uDF33\uD800\uDF30\uD800\uDF3D \uD800\uDF31\uD800\uDF42\uD800\uDF39\uD800\uDF32\uD800\uDF32\uD800\uDF39\uD800\uDF38.
+Greek (monotonic): \u039C\u03C0\u03BF\u03C1\u03CE \u03BD\u03B1 \u03C6\u03AC\u03C9 \u03C3\u03C0\u03B1\u03C3\u03BC\u03AD\u03BD\u03B1 \u03B3\u03C5\u03B1\u03BB\u03B9\u03AC \u03C7\u03C9\u03C1\u03AF\u03C2 \u03BD\u03B1 \u03C0\u03AC\u03B8\u03C9 \u03C4\u03AF\u03C0\u03BF\u03C4\u03B1.
+Greek (polytonic): \u039C\u03C0\u03BF\u03C1\u1FF6 \u03BD\u1F70 \u03C6\u03AC\u03C9 \u03C3\u03C0\u03B1\u03C3\u03BC\u03AD\u03BD\u03B1 \u03B3\u03C5\u03B1\u03BB\u03B9\u1F70 \u03C7\u03C9\u03C1\u1F76\u03C2 \u03BD\u1F70 \u03C0\u03AC\u03B8\u03C9 \u03C4\u03AF\u03C0\u03BF\u03C4\u03B1.
+Hausa (Ajami) : \u0625\u0650\u0646\u0627 \u0625\u0650\u0649\u064E \u062A\u064E\u0648\u0646\u064E\u0631 \u063A\u0650\u0644\u064E\u0627\u0634\u0650 \u0643\u064F\u0645\u064E \u0625\u0650\u0646 \u063A\u064E\u0645\u064E\u0627 \u0644\u064E\u0627\u0641\u0650\u0649\u064E\u0627
+Hausa (Latin): In\u0101 iya taunar gil\u0101shi kuma in gam\u0101 l\u0101fiy\u0101.
+Hawaiian: Hiki ia\u02BBu ke \u02BBai i ke aniani; \u02BBa\u02BBole n\u014D l\u0101 au e \u02BBeha.
+Hebrew: \u05D0\u05E0\u05D9 \u05D9\u05DB\u05D5\u05DC \u05DC\u05D0\u05DB\u05D5\u05DC \u05D6\u05DB\u05D5\u05DB\u05D9\u05EA \u05D5\u05D6\u05D4 \u05DC\u05D0 \u05DE\u05D6\u05D9\u05E7 \u05DC\u05D9.
+Hindi: \u092E\u0948\u0902 \u0915\u093E\u0901\u091A \u0916\u093E \u0938\u0915\u0924\u093E \u0939\u0942\u0901 \u0914\u0930 \u092E\u0941\u091D\u0947 \u0909\u0938\u0938\u0947 \u0915\u094B\u0908 \u091A\u094B\u091F \u0928\u0939\u0940\u0902 \u092A\u0939\u0941\u0902\u091A\u0924\u0940.
+Hungarian: Meg tudom enni az \u00FCveget, nem lesz t\u0151le bajom.
+Inuktitut (10): \u140A\u14D5\u148D\u1585 \u14C2\u1546\u152D\u154C\u1593\u1483\u146F \u14F1\u154B\u1671\u1466\u1450\u14D0\u14C7\u1585\u1450\u1593
+Icelandic: \u00C9g get eti\u00F0 gler \u00E1n \u00FEess a\u00F0 mei\u00F0a mig.
+Irish: Is f\u00E9idir liom gloinne a ithe. N\u00ED dh\u00E9anann s\u00ED dochar ar bith dom.
+Italian: Posso mangiare il vetro e non mi fa male.
+Jamaican: Mi kian niam glas han i neba hot mi.
+Japanese: \u79C1\u306F\u30AC\u30E9\u30B9\u3092\u98DF\u3079\u3089\u308C\u307E\u3059\u3002\u305D\u308C\u306F\u79C1\u3092\u50B7\u3064\u3051\u307E\u305B\u3093\u3002
+Javanese: Aku isa mangan beling tanpa lara.
+Kannada: \u0CA8\u0CA8\u0C97\u0CC6 \u0CB9\u0CBE\u0CA8\u0CBF \u0C86\u0C97\u0CA6\u0CC6, \u0CA8\u0CBE\u0CA8\u0CC1 \u0C97\u0C9C\u0CA8\u0CCD\u0CA8\u0CC1 \u0CA4\u0CBF\u0CA8\u0CAC\u0CB9\u0CC1\u0CA6\u0CC1
+Khmer: \u1781\u17D2\u1789\u17BB\u17C6\u17A2\u17B6\u1785\u1789\u17BB\u17C6\u1780\u1789\u17D2\u1785\u1780\u17CB\u1794\u17B6\u1793 \u178A\u17C4\u1799\u1782\u17D2\u1798\u17B6\u1793\u1794\u1789\u17D2\u17A0\u17B6\u179A
+Kirchr\u00F6adsj/B\u00F4chesserplat: Iech ken glaas \u00E8\u00E8se, mer 't deet miech jing pieng.
+Kiswahili (Swahili): Naweza kula bilauri na sikunyui.
+Korean: \uB098\uB294 \uC720\uB9AC\uB97C \uBA39\uC744 \uC218 \uC788\uC5B4\uC694. \uADF8\uB798\uB3C4 \uC544\uD504\uC9C0 \uC54A\uC544\uC694
+Krey\u00F2l Ayisyen (Hait\u00EF): Mwen kap manje v\u00E8, li pa blese'm.
+L\u00EBtzebuergescht / Luxemburgish: Ech kan Glas iessen, daat deet mir n\u00EBt wei.
+Lalland Scots / Doric: Ah can eat gless, it disnae hurt us.
+Langenfelder Platt: Isch kann Jlaas kimmeln, uuhne datt mich datt weh d\u00E4\u00E4d.
+Lao: \u0E82\u0EAD\u0EC9\u0E8D\u0E81\u0EB4\u0E99\u0EC1\u0E81\u0EC9\u0EA7\u0EC4\u0E94\u0EC9\u0EC2\u0E94\u0E8D\u0E97\u0EB5\u0EC8\u0EA1\u0EB1\u0E99\u0E9A\u0ECD\u0EC8\u0EC4\u0E94\u0EC9\u0EC0\u0EAE\u0EB1\u0E94\u0EC3\u0EAB\u0EC9\u0E82\u0EAD\u0EC9\u0E8D\u0EC0\u0E88\u0EB1\u0E9A.
+Latin: Vitrum edere possum; mihi non nocet.
+Latvian: Es varu \u0113st stiklu, tas man nekait\u0113.
+Lausitzer Mundart ("Lusatian"): Ich koann Gloos assn und doas dudd merr ni wii.
+Lingala: Nakok\u00ED kol\u00EDya bit\u00E9ni bya milungi, ekos\u00E1la ng\u00E1\u00ED mab\u00E9 t\u025B\u0301.
+Lithuanian: A\u0161 galiu valgyti stikl\u0105 ir jis man\u0119s ne\u017Eeid\u017Eia
+Lojban: mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi
+Macedonian: \u041C\u043E\u0436\u0430\u043C \u0434\u0430 \u0458\u0430\u0434\u0430\u043C \u0441\u0442\u0430\u043A\u043B\u043E, \u0430 \u043D\u0435 \u043C\u0435 \u0448\u0442\u0435\u0442\u0430.
+Malay: Saya boleh makan kaca dan ia tidak mencederakan saya.
+Maltese: Nista' niekol il-\u0127\u0121ie\u0121 u ma jag\u0127milli xejn.
+Manx Gaelic: Foddym gee glonney agh cha jean eh gortaghey mee.
+Marathi: \u092E\u0940 \u0915\u093E\u091A \u0916\u093E\u090A \u0936\u0915\u0924\u094B, \u092E\u0932\u093E \u0924\u0947 \u0926\u0941\u0916\u0924 \u0928\u093E\u0939\u0940.
+Marquesan: E ko\u02BBana e kai i te karahi, mea \u02BB\u0101, \u02BBa\u02BBe hauhau.
+Middle English: Ich canne glas eten and hit hirti\u00FE me nou\u021Dt.
+Milanese: S\u00F4n b\u00F4n de magn\u00E0 el v\u00E9der, el me fa minga mal.
+Mongolian (Classic) : \u182A\u1822 \u1830\u1822\u182F\u1822 \u1822\u1833\u1821\u1836\u1826 \u1834\u1822\u1833\u1820\u1828\u1820 \u1802 \u1828\u1820\u1833\u1824\u1837 \u182C\u1823\u1824\u1837\u1820\u1833\u1820\u1822 \u182A\u1822\u1830\u1822
+Mongolian (Cyrillic): \u0411\u0438 \u0448\u0438\u043B \u0438\u0434\u044D\u0439 \u0447\u0430\u0434\u043D\u0430, \u043D\u0430\u0434\u0430\u0434 \u0445\u043E\u0440\u0442\u043E\u0439 \u0431\u0438\u0448
+N\u00F3rdicg: Lj\u0153r ye caudran cr\u00E9ne\u00FE \u00FD jor c\u1E83ran.
+Napoletano: M' pozz magna' o'vetr, e nun m' fa mal.
+Navajo: Ts\u00E9s\u01EB\u02BC yish\u0105\u0301\u0105go b\u00ED\u00EDn\u00EDshghah d\u00F3\u00F3 doo shi\u0142 neezgai da.
+Nederlands / Dutch: Ik kan glas eten, het doet m\u0133 geen kwaad.
+Nepali: \uFEFF\u092E \u0915\u093E\u0901\u091A \u0916\u093E\u0928 \u0938\u0915\u094D\u091B\u0942 \u0930 \u092E\u0932\u093E\u0908 \u0915\u0947\u0939\u093F \u0928\u0940 \u0939\u0941\u0928\u094D\u200D\u0928\u094D \u0964
+Norsk / Norwegian (Bokm\u00E5l): Jeg kan spise glass uten \u00E5 skade meg.
+Norsk / Norwegian (Nynorsk): Eg kan eta glas utan \u00E5 skada meg.
+Northern Karelian: Mie voin syvv\u00E4 lasie ta minla ei ole kipie.
+Odenw\u00E4lderisch: Iech konn glaasch voschbachteln ohne dass es mir ebbs daun doun dud.
+Old French: Je puis mangier del voirre. Ne me nuit.
+Old Irish (Latin): Con\u00B7iccim ithi nglano. N\u00EDm\u00B7g\u00E9na.
+Old Irish (Ogham): \u169B\u169B\u1689\u1691\u1685\u1694\u1689\u1689\u1694\u168B\u1680\u1694\u1688\u1694\u1680\u168D\u1682\u1690\u1685\u1691\u1680\u1685\u1694\u168B\u168C\u1693\u1685\u1690\u169C
+Old Norse (Latin): Ek get eti\u00F0 gler \u00E1n \u00FEess a\u00F0 ver\u00F0a s\u00E1r.
+Old Norse (Runes): \u16D6\u16B4 \u16B7\u16D6\u16CF \u16D6\u16CF\u16C1 \u16A7 \u16B7\u16DA\u16D6\u16B1 \u16D8\u16BE \u16A6\u16D6\u16CB\u16CB \u16A8\u16A7 \u16A1\u16D6 \u16B1\u16A7\u16A8 \u16CB\u16A8\u16B1
+Papiamentu: Ami por kome glas anto e no ta hasimi da\u00F1o.
+Pashto: \u0632\u0647 \u0634\u064A\u0634\u0647 \u062E\u0648\u0693\u0644\u06D0 \u0634\u0645\u060C \u0647\u063A\u0647 \u0645\u0627 \u0646\u0647 \u062E\u0648\u0696\u0648\u064A
+Pf\u00E4lzisch: Isch konn Glass fresse ohne dasses mer ebbes ausmache dud.
+Picard: Ch'peux mingi du verre, cha m'fo\u00E9 mie n'ma.
+Polska / Polish: Mog\u0119 je\u015B\u0107 szk\u0142o i mi nie szkodzi.
+Proven\u00E7al / Occitan: P\u00F2di manjar de veire, me nafrari\u00E1 pas.
+Qu\u00E9b\u00E9cois: J'peux manger d'la vitre, \u00E7a m'fa pas mal.
+Roman: Me posso magna' er vetro, e nun me fa male.
+Romanian: Pot s\u0103 m\u0103n\u00E2nc sticl\u0103 \u0219i ea nu m\u0103 r\u0103ne\u0219te.
+Romansch (Grischun): Jau sai mangiar vaider, senza che quai fa donn a mai.
+Ruhrdeutsch: Ich kann Glas verkasematuckeln, ohne dattet mich wat jucken tut.
+Russian: \u042F \u043C\u043E\u0433\u0443 \u0435\u0441\u0442\u044C \u0441\u0442\u0435\u043A\u043B\u043E, \u043E\u043D\u043E \u043C\u043D\u0435 \u043D\u0435 \u0432\u0440\u0435\u0434\u0438\u0442.
+S\u00E4chsisch / Saxon: 'sch kann Glos essn, ohne dass'sch mer wehtue.
+S\u00F8nderjysk: \u00C6 ka \u00E6e glass uhen at det go m\u00E6 naue.
+Sami (Northern): S\u00E1ht\u00E1n borrat l\u00E1sa, dat ii leat b\u00E1v\u010D\u010Das.
+Sanskrit (standard transcription): k\u0101ca\u1E43 \u015Baknomyattum; nopahinasti m\u0101m.
+Sanskrit: \uFEFF\u0915\u093E\u091A\u0902 \u0936\u0915\u094D\u0928\u094B\u092E\u094D\u092F\u0924\u094D\u0924\u0941\u092E\u094D \u0964 \u0928\u094B\u092A\u0939\u093F\u0928\u0938\u094D\u0924\u093F \u092E\u093E\u092E\u094D \u0965
+Schw\u00E4bisch / Swabian: I k\u00E5 Glas fr\u00E4ssa, ond des macht mr nix!
+Schwyzerd\u00FCtsch (Luzern): Ech cha Gl\u00E2s \u00E4sse, das schadt mer ned.
+Schwyzerd\u00FCtsch (Z\u00FCrich): Ich chan Glaas \u00E4sse, das schadt mir n\u00F6d.
+Scottish Gaelic: S urrainn dhomh gloinne ithe; cha ghoirtich i mi.
+Serbian (Cyrillic): \u0408\u0430 \u043C\u043E\u0433\u0443 \u0434\u0430 \u0458\u0435\u0434\u0435\u043C \u0441\u0442\u0430\u043A\u043B\u043E.
+Serbian (Latin): Ja mogu da jedem staklo.
+Sicilian: Puotsu mangiari u vitru, nun mi fa mali.
+Sinhalese: \u0DB8\u0DA7 \u0DC0\u0DD3\u0DAF\u0DD4\u0DBB\u0DD4 \u0D9A\u0DD1\u0DB8\u0DA7 \u0DC4\u0DD0\u0D9A\u0DD2\u0DBA\u0DD2. \u0D91\u0DBA\u0DD2\u0DB1\u0DCA \u0DB8\u0DA7 \u0D9A\u0DD2\u0DC3\u0DD2 \u0DC4\u0DCF\u0DB1\u0DD2\u0DBA\u0D9A\u0DCA \u0DC3\u0DD2\u0DAF\u0DD4 \u0DB1\u0DDC\u0DC0\u0DDA.
+Slovak: M\u00F4\u017Eem jes\u0165 sklo. Nezran\u00ED ma.
+Slovenian: Lahko jem steklo, ne da bi mi \u0161kodovalo.
+Southern Karelian: Min\u00E4 voin syvv\u00E4 st'oklua dai minule ei ole kibie.
+Spanish: Puedo comer vidrio, no me hace da\u00F1o.
+Suomi / Finnish: Voin sy\u00F6d\u00E4 lasia, se ei vahingoita minua.
+Svenska / Swedish: Jag kan \u00E4ta glas utan att skada mig.
+Tagalog: Kaya kong kumain nang bubog at hindi ako masaktan.
+Taiwanese: G\u00F3a \u0113-t\u00E0ng chia\u030Dh po-l\u00EA, m\u0101 b\u0113 tio\u030Dh-siong.
+Tamil: \u0BA8\u0BBE\u0BA9\u0BCD \u0B95\u0BA3\u0BCD\u0BA3\u0BBE\u0B9F\u0BBF \u0B9A\u0BBE\u0BAA\u0BCD\u0BAA\u0BBF\u0B9F\u0BC1\u0BB5\u0BC7\u0BA9\u0BCD, \u0B85\u0BA4\u0BA9\u0BBE\u0BB2\u0BCD \u0B8E\u0BA9\u0B95\u0BCD\u0B95\u0BC1 \u0B92\u0BB0\u0BC1 \u0B95\u0BC7\u0B9F\u0BC1\u0BAE\u0BCD \u0BB5\u0BB0\u0BBE\u0BA4\u0BC1.
+Telugu: \u0C28\u0C47\u0C28\u0C41 \u0C17\u0C3E\u0C1C\u0C41 \u0C24\u0C3F\u0C28\u0C17\u0C32\u0C28\u0C41 \u0C2E\u0C30\u0C3F\u0C2F\u0C41 \u0C05\u0C32\u0C3E \u0C1A\u0C47\u0C38\u0C3F\u0C28\u0C3E \u0C28\u0C3E\u0C15\u0C41 \u0C0F\u0C2E\u0C3F \u0C07\u0C2C\u0C4D\u0C2C\u0C02\u0C26\u0C3F \u0C32\u0C47\u0C26\u0C41
+Thai: \u0E09\u0E31\u0E19\u0E01\u0E34\u0E19\u0E01\u0E23\u0E30\u0E08\u0E01\u0E44\u0E14\u0E49 \u0E41\u0E15\u0E48\u0E21\u0E31\u0E19\u0E44\u0E21\u0E48\u0E17\u0E33\u0E43\u0E2B\u0E49\u0E09\u0E31\u0E19\u0E40\u0E08\u0E47\u0E1A
+Tibetan: \u0F64\u0F7A\u0F63\u0F0B\u0F66\u0F92\u0F7C\u0F0B\u0F5F\u0F0B\u0F53\u0F66\u0F0B\u0F44\u0F0B\u0F53\u0F0B\u0F42\u0F72\u0F0B\u0F58\u0F0B\u0F62\u0F7A\u0F51\u0F0D
+Turkish (Ottoman): \u062C\u0627\u0645 \u064A\u064A\u0647 \u0628\u0644\u0648\u0631\u0645 \u0628\u06AD\u0627 \u0636\u0631\u0631\u0649 \u0637\u0648\u0642\u0648\u0646\u0645\u0632
+Turkish: Cam yiyebilirim, bana zarar\u0131 dokunmaz.
+Twi: Metumi awe tumpan, \u025Cny\u025C me hwee.
+Ukrainian: \u042F \u043C\u043E\u0436\u0443 \u0457\u0441\u0442\u0438 \u0441\u043A\u043B\u043E, \u0456 \u0432\u043E\u043D\u043E \u043C\u0435\u043D\u0456 \u043D\u0435 \u0437\u0430\u0448\u043A\u043E\u0434\u0438\u0442\u044C.
+Ulster Gaelic: Ithim-sa gloine agus n\u00ED miste damh \u00E9.
+Urdu: \u0645\u06CC\u06BA \u06A9\u0627\u0646\u0686 \u06A9\u06BE\u0627 \u0633\u06A9\u062A\u0627 \u06C1\u0648\u06BA \u0627\u0648\u0631 \u0645\u062C\u06BE\u06D2 \u062A\u06A9\u0644\u06CC\u0641 \u0646\u06C1\u06CC\u06BA \u06C1\u0648\u062A\u06CC \u06D4
+Venetian: Mi posso magnare el vetro, no'l me fa mae.
+Vietnamese (n\u00F4m) : \u4E9B \uD84C\uDF8F \u4E16 \u54B9 \u6C34 \u6676 \uD859\uDCE1 \u7A7A \uD84C\uDF8F \u5BB3 \u54A6
+Vietnamese (qu\u1ED1c ng\u1EEF): T\u00F4i c\u00F3 th\u1EC3 \u0103n th\u1EE7y tinh m\u00E0 kh\u00F4ng h\u1EA1i g\u00EC.
+Walloon: Dji pou magn\u00EE do v\u00EAre, \u00E7oula m' freut n\u00E9n m\u00E5.
+Welsh: Dw i'n gallu bwyta gwydr, 'dyw e ddim yn gwneud dolur i mi.
+Yiddish: \u05D0\u05D9\u05DA \u05E7\u05E2\u05DF \u05E2\u05E1\u05DF \u05D2\u05DC\u05D0\u05B8\u05D6 \u05D0\u05D5\u05DF \u05E2\u05E1 \u05D8\u05D5\u05D8 \u05DE\u05D9\u05E8 \u05E0\u05D9\u05E9\u05D8 \u05F0\u05F2.
+Yoruba: Mo l\u00E8 je\u0329 d\u00EDg\u00ED, k\u00F2 n\u00ED pa m\u00ED l\u00E1ra.
+Zeneise (Genovese): P\u00F2sso mangi\u00E2 o veddro e o no me f\u00E0 m\u00E2.
diff --git a/linguistics/pom.xml b/linguistics/pom.xml
new file mode 100644
index 00000000000..baeb2457c76
--- /dev/null
+++ b/linguistics/pom.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0"?>
+<!-- Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>parent</artifactId>
+ <version>6-SNAPSHOT</version>
+ <relativePath>../parent/pom.xml</relativePath>
+ </parent>
+ <artifactId>linguistics</artifactId>
+ <packaging>container-plugin</packaging>
+ <version>6-SNAPSHOT</version>
+ <dependencies>
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-library</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>component</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>config-bundle</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>annotations</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>configdefinitions</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>vespajlib</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.inject</groupId>
+ <artifactId>guice</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <compilerArgs>
+ <arg>-Xlint:rawtypes</arg>
+ <arg>-Xlint:unchecked</arg>
+ <arg>-Xlint:deprecation</arg>
+ <arg>-Werror</arg>
+ </compilerArgs>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/linguistics/src/main/java/com/yahoo/language/Language.java b/linguistics/src/main/java/com/yahoo/language/Language.java
new file mode 100644
index 00000000000..0fade0d7299
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/Language.java
@@ -0,0 +1,615 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language;
+
+import com.yahoo.text.Lowercase;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * @author rpito
+ */
+public enum Language {
+
+ /** Language tag "un". */
+ UNKNOWN("un"),
+
+ /** Language tag "ab". */
+ ABKHAZIAN("ab"),
+
+ /** Language tag "aa". */
+ AFAR("aa"),
+
+ /** Language tag "af". */
+ AFRIKAANS("af"),
+
+ /** Language tag "sq". */
+ ALBANIAN("sq"),
+
+ /** Language tag "am". */
+ AMHARIC("am"),
+
+ /** Language tag "ar". */
+ ARABIC("ar"),
+
+ /** Language tag "hy". */
+ ARMENIAN("hy"),
+
+ /** Language tag "as". */
+ ASSAMESE("as"),
+
+ /** Language tag "ay". */
+ AYMARA("ay"),
+
+ /** Language tag "az". */
+ AZERBAIJANI("az"),
+
+ /** Language tag "ba". */
+ BASHKIR("ba"),
+
+ /** Language tag "eu". */
+ BASQUE("eu"),
+
+ /** Language tag "bn". */
+ BENGALI("bn"),
+
+ /** Language tag "dz". */
+ BHUTANI("dz"),
+
+ /** Language tag "bh". */
+ BIHARI("bh"),
+
+ /** Language tag "bi". */
+ BISLAMA("bi"),
+
+ /** Language tag "br". */
+ BRETON("br"),
+
+ /** Language tag "bug". */
+ BUGINESE("bug"),
+
+ /** Language tag "bg". */
+ BULGARIAN("bg"),
+
+ /** Language tag "my". */
+ BURMESE("my"),
+
+ /** Language tag "be". */
+ BYELORUSSIAN("be"),
+
+ /** Language tag "km". */
+ CAMBODIAN("km"),
+
+ /** Language tag "ca". */
+ CATALAN("ca"),
+
+ /** Language tag "chr". */
+ CHEROKEE("chr"),
+
+ /**
+ * Language tag "zh-hans".
+ *
+ * @see #fromLocale(Locale)
+ */
+ CHINESE_SIMPLIFIED("zh-hans"),
+
+ /**
+ * Language tag "zh-hant".
+ *
+ * @see #fromLocale(Locale)
+ */
+ CHINESE_TRADITIONAL("zh-hant"),
+
+ /** Language tag "cop". */
+ COPTIC("cop"),
+
+ /** Language tag "co". */
+ CORSICAN("co"),
+
+ /** Language tag "hr". */
+ CROATIAN("hr"),
+
+ /** Language tag "cs". */
+ CZECH("cs"),
+
+ /** Language tag "da". */
+ DANISH("da"),
+
+ /** Language tag "div". */
+ DIVEHI("div"),
+
+ /** Language tag "nl". */
+ DUTCH("nl"),
+
+ /** Language tag "en". */
+ ENGLISH("en"),
+
+ /** Language tag "eo". */
+ ESPERANTO("eo"),
+
+ /** Language tag "et". */
+ ESTONIAN("et"),
+
+ /** Language tag "fo". */
+ FAROESE("fo"),
+
+ /** Language tag "fj". */
+ FIJI("fj"),
+
+ /** Language tag "fi". */
+ FINNISH("fi"),
+
+ /** Language tag "fr". */
+ FRENCH("fr"),
+
+ /** Language tag "fy". */
+ FRISIAN("fy"),
+
+ /** Language tag "gl". */
+ GALICIAN("gl"),
+
+ /** Language tag "ka". */
+ GEORGIAN("ka"),
+
+ /** Language tag "de". */
+ GERMAN("de"),
+
+ /** Language tag "got". */
+ GOTHIC("got"),
+
+ /** Language tag "el". */
+ GREEK("el"),
+
+ /** Language tag "kl". */
+ GREENLANDIC("kl"),
+
+ /** Language tag "gn". */
+ GUARANI("gn"),
+
+ /** Language tag "gu". */
+ GUJARATI("gu"),
+
+ /** Language tag "ha". */
+ HAUSA("ha"),
+
+ /**
+ * Language tag "he".
+ *
+ * @see #fromLocale(Locale)
+ */
+ HEBREW("he"),
+
+ /** Language tag "hi". */
+ HINDI("hi"),
+
+ /** Language tag "hu". */
+ HUNGARIAN("hu"),
+
+ /** Language tag "is". */
+ ICELANDIC("is"),
+
+ /**
+ * Language tag "id".
+ *
+ * @see #fromLocale(Locale)
+ */
+ INDONESIAN("id"),
+
+ /** Language tag "ia". */
+ INTERLINGUA("ia"),
+
+ /** Language tag "ie". */
+ INTERLINGUE("ie"),
+
+ /** Language tag "iu". */
+ INUKTITUT("iu"),
+
+ /** Language tag "ik". */
+ INUPIAK("ik"),
+
+ /** Language tag "ga". */
+ IRISH("ga"),
+
+ /** Language tag "it". */
+ ITALIAN("it"),
+
+ /** Language tag "ja". */
+ JAPANESE("ja"),
+
+ /** Language tag "jw". */
+ JAVANESE("jw"),
+
+ /** Language tag "kn". */
+ KANNADA("kn"),
+
+ /** Language tag "ks". */
+ KASHMIRI("ks"),
+
+ /** Language tag "kk". */
+ KAZAKH("kk"),
+
+ /** Language tag "rw". */
+ KINYARWANDA("rw"),
+
+ /** Language tag "ky". */
+ KIRGHIZ("ky"),
+
+ /** Language tag "rn". */
+ KIRUNDI("rn"),
+
+ /** Language tag "ko". */
+ KOREAN("ko"),
+
+ /** Language tag "ku". */
+ KURDISH("ku"),
+
+ /** Language tag "lo". */
+ LAOTHIAN("lo"),
+
+ /** Language tag "la". */
+ LATIN("la"),
+
+ /** Language tag "lv". */
+ LATVIAN("lv"),
+
+ /** Language tag "ln". */
+ LINGALA("ln"),
+
+ /** Language tag "lt". */
+ LITHUANIAN("lt"),
+
+ /** Language tag "mk". */
+ MACEDONIAN("mk"),
+
+ /** Language tag "mg". */
+ MALAGASY("mg"),
+
+ /** Language tag "ms". */
+ MALAY("ms"),
+
+ /** Language tag "ml". */
+ MALAYALAM("ml"),
+
+ /** Language tag "mt". */
+ MALTESE("mt"),
+
+ /** Language tag "mni". */
+ MANIPURI("mni"),
+
+ /** Language tag "mi". */
+ MAORI("mi"),
+
+ /** Language tag "mr". */
+ MARATHI("mr"),
+
+ /** Language tag "mo". */
+ MOLDAVIAN("mo"),
+
+ /** Language tag "mn". */
+ MONGOLIAN("mn"),
+
+ /** Language tag "mun". */
+ MUNDA("mun"),
+
+ /** Language tag "na". */
+ NAURU("na"),
+
+ /** Language tag "ne". */
+ NEPALI("ne"),
+
+ /**
+ * Language tag "nb".
+ *
+ * @see #fromLocale(Locale)
+ */
+ NORWEGIAN_BOKMAL("nb"),
+
+ /** Language tag "nn". */
+ NORWEGIAN_NYNORSK("nn"),
+
+ /** Language tag "oc". */
+ OCCITAN("oc"),
+
+ /** Language tag "or". */
+ ORIYA("or"),
+
+ /** Language tag "om". */
+ OROMO("om"),
+
+ /** Language tag "ps". */
+ PASHTO("ps"),
+
+ /** Language tag "fa". */
+ PERSIAN("fa"),
+
+ /** Language tag "pl". */
+ POLISH("pl"),
+
+ /** Language tag "pt". */
+ PORTUGUESE("pt"),
+
+ /** Language tag "pa". */
+ PUNJABI("pa"),
+
+ /** Language tag "qu". */
+ QUECHUA("qu"),
+
+ /** Language tag "rm". */
+ RHAETO_ROMANCE("rm"),
+
+ /** Language tag "ro". */
+ ROMANIAN("ro"),
+
+ /** Language tag "ru". */
+ RUSSIAN("ru"),
+
+ /** Language tag "sm". */
+ SAMOAN("sm"),
+
+ /** Language tag "sg". */
+ SANGHO("sg"),
+
+ /** Language tag "sa". */
+ SANSKRIT("sa"),
+
+ /** Language tag "gd". */
+ SCOTS_GAELIC("gd"),
+
+ /** Language tag "sr". */
+ SERBIAN("sr"),
+
+ /** Language tag "sh". */
+ SERBO_CROATIAN("sh"),
+
+ /** Language tag "st". */
+ SESOTHO("st"),
+
+ /** Language tag "tn". */
+ SETSWANA("tn"),
+
+ /** Language tag "sn". */
+ SHONA("sn"),
+
+ /** Language tag "ii". */
+ SICHUAN_YI("ii"),
+
+ /** Language tag "sd". */
+ SINDHI("sd"),
+
+ /** Language tag "si". */
+ SINHALESE("si"),
+
+ /** Language tag "ss". */
+ SISWATI("ss"),
+
+ /** Language tag "sk". */
+ SLOVAK("sk"),
+
+ /** Language tag "sl". */
+ SLOVENIAN("sl"),
+
+ /** Language tag "so". */
+ SOMALI("so"),
+
+ /** Language tag "es". */
+ SPANISH("es"),
+
+ /** Language tag "su". */
+ SUNDANESE("su"),
+
+ /** Language tag "sw". */
+ SWAHILI("sw"),
+
+ /** Language tag "sv". */
+ SWEDISH("sv"),
+
+ /** Language tag "syr". */
+ SYRIAC("syr"),
+
+ /** Language tag "fil". */
+ TAGALOG("fil"),
+
+ /** Language tag "tg". */
+ TAJIK("tg"),
+
+ /** Language tag "ta". */
+ TAMIL("ta"),
+
+ /** Language tag "tt". */
+ TATAR("tt"),
+
+ /** Language tag "te". */
+ TELUGU("te"),
+
+ /** Language tag "th". */
+ THAI("th"),
+
+ /** Language tag "bo". */
+ TIBETAN("bo"),
+
+ /** Language tag "ti". */
+ TIGRINYA("ti"),
+
+ /** Language tag "to". */
+ TONGA("to"),
+
+ /** Language tag "ts". */
+ TSONGA("ts"),
+
+ /** Language tag "tr". */
+ TURKISH("tr"),
+
+ /** Language tag "tk". */
+ TURKMEN("tk"),
+
+ /** Language tag "tw". */
+ TWI("tw"),
+
+ /** Language tag "uga". */
+ UGARITIC("uga"),
+
+ /** Language tag "ug". */
+ UIGHUR("ug"),
+
+ /** Language tag "uk". */
+ UKRAINIAN("uk"),
+
+ /** Language tag "ur". */
+ URDU("ur"),
+
+ /** Language tag "uz". */
+ UZBEK("uz"),
+
+ /** Language tag "vi". */
+ VIETNAMESE("vi"),
+
+ /** Language tag "vo". */
+ VOLAPUK("vo"),
+
+ /** Language tag "cy". */
+ WELSH("cy"),
+
+ /** Language tag "wo". */
+ WOLOF("wo"),
+
+ /** Language tag "xh". */
+ XHOSA("xh"),
+
+ /**
+ * Language tag "yi".
+ *
+ * @see #fromLocale(Locale)
+ */
+ YIDDISH("yi"),
+
+ /** Language tag "yo". */
+ YORUBA("yo"),
+
+ /** Language tag "za". */
+ ZHUANG("za"),
+
+ /** Language tag "zu". */
+ ZULU("zu");
+
+ // Lookup table from language code to enum constant, filled by the static initializer below.
+ private static final Map<String, Language> index = new HashMap<>();
+ private final String code;
+
+ // Static initializers of an enum run after all of its constants have been constructed,
+ // so values() is complete here and every constant's code can be registered.
+ static {
+ for (Language language : values()) {
+ index.put(language.code, language);
+ }
+ }
+
+ // Stores the language code of this constant; the static initializer uses it as the key in the lookup index.
+ private Language(String code) {
+ this.code = code;
+ }
+
+ /** Returns the language code this constant was constructed with. */
+ public String languageCode() {
+ return code;
+ }
+
+ /**
+ * Returns whether this is a "cjk" language. CJK is here not a linguistic term, it is basically whether the language
+ * has loose word order and a non-rigid use of space.
+ *
+ * @return True if this is a CJK language.
+ */
+ public boolean isCjk() {
+     // Thai is grouped with the Chinese variants, Japanese and Korean here.
+     return this == CHINESE_SIMPLIFIED
+             || this == CHINESE_TRADITIONAL
+             || this == JAPANESE
+             || this == KOREAN
+             || this == THAI;
+ }
+
+ /**
+ * <p>Convenience method for calling <tt>fromLocale(LocaleFactory.fromLanguageTag(languageTag))</tt>.</p>
+ *
+ * @param languageTag The language tag for which the <tt>Language</tt> to return.
+ * @return The corresponding <tt>Language</tt>, or {@link #UNKNOWN} if not known.
+ */
+ public static Language fromLanguageTag(String languageTag) {
+     // A null tag is reported as unknown instead of being handed to the tag parser.
+     return languageTag == null
+             ? UNKNOWN
+             : fromLocale(LocaleFactory.fromLanguageTag(languageTag));
+ }
+
+ /**
+ * <p>Returns the <tt>Language</tt> whose {@link #languageCode()} is equal to <tt>locale.getLanguage()</tt>, with
+ * the following additions:</p>
+ * <ul>
+ * <li>Language code "in" translates to {@link #INDONESIAN}</li>
+ * <li>Language code "iw" translates to {@link #HEBREW}</li>
+ * <li>Language code "ji" translates to {@link #YIDDISH}</li>
+ * <li>Language code "no" translates to {@link #NORWEGIAN_BOKMAL}</li>
+ * <li>Language code "zh" translates to {@link #CHINESE_TRADITIONAL}, unless country code is "cn" or variant code
+ * is "hans", in which case it translates to {@link #CHINESE_SIMPLIFIED}.</li>
+ * </ul>
+ *
+ * @param locale The locale for which the <tt>Language</tt> to return.
+ * @return The corresponding <tt>Language</tt>, or {@link #UNKNOWN} if not known.
+ */
+ public static Language fromLocale(Locale locale) {
+ String str = locale.getLanguage();
+ if (str.equals("in")) {
+ return INDONESIAN; // Locale converts 'id' to 'in'
+ }
+ if (str.equals("iw")) {
+ return HEBREW; // Locale converts 'he' to 'iw'
+ }
+ if (str.equals("ji")) {
+ return YIDDISH; // Locale converts 'yi' to 'ji'
+ }
+ if (str.equals("no")) {
+ return NORWEGIAN_BOKMAL; // alias for 'nb'
+ }
+ if (str.equals("zh")) {
+ if (locale.getCountry().equalsIgnoreCase("cn") ||
+ locale.getVariant().equalsIgnoreCase("hans")) {
+ return CHINESE_SIMPLIFIED;
+ }
+ return CHINESE_TRADITIONAL;
+ }
+ Language ret = index.get(str);
+ return ret != null ? ret : UNKNOWN;
+ }
+
+ /**
+ * Returns the language from an encoding, or {@link #UNKNOWN} if it cannot be determined.
+ *
+ * @param encoding The name of the encoding to derive the <tt>Language</tt> from.
+ * @return the language given by the encoding, or {@link #UNKNOWN} if not determined.
+ */
+ public static Language fromEncoding(String encoding) {
+     // Encoding names are matched in lower case; a missing name cannot identify a language.
+     return encoding == null
+             ? UNKNOWN
+             : fromLowerCasedEncoding(Lowercase.toLowerCase(encoding));
+ }
+
+ private static Language fromLowerCasedEncoding(String encoding) {
+     // Only encodings tied to a single language are mapped; everything else is unknown.
+     switch (encoding) {
+         case "gb2312":
+             return CHINESE_SIMPLIFIED;
+         case "big5":
+             return CHINESE_TRADITIONAL;
+         case "euc-jp":
+         case "iso-2022-jp":
+         case "shift-jis":
+             return JAPANESE;
+         case "euc-kr":
+             return KOREAN;
+         default:
+             return UNKNOWN;
+     }
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
new file mode 100644
index 00000000000..7a6e224b221
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
@@ -0,0 +1,101 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language;
+
+import com.yahoo.collections.Tuple2;
+import com.yahoo.component.Version;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.process.CharacterClasses;
+import com.yahoo.language.process.GramSplitter;
+import com.yahoo.language.process.Normalizer;
+import com.yahoo.language.process.Segmenter;
+import com.yahoo.language.process.Stemmer;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.process.Transformer;
+import com.yahoo.language.simple.SimpleLinguistics;
+
+/**
+ * <p>Factory of linguistic processors. For technical reasons this provides more flexibility to provide separate
+ * components for different operations than is needed in many cases; in particular the tokenizer should typically
+ * stem, transform and normalize using the same operations as provided directly by this. A set of adaptors is
+ * provided that makes this easy to achieve. Refer to the {@link com.yahoo.language.simple.SimpleLinguistics}
+ * implementation to set this up.</p>
+ *
+ * <p>Thread safety: Instances of this factory type must be thread safe but the processors
+ * returned by the factory methods need not be. Clients should request separate processor instances
+ * for each thread.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Moelster Lidal</a>
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ * @author bratseth
+ */
+public interface Linguistics {
+
+ /** Identifies the kinds of processor components, as accepted by {@link #getVersion(Component)}. */
+ enum Component {
+ STEMMER,
+ TOKENIZER,
+ NORMALIZER,
+ TRANSFORMER,
+ SEGMENTER,
+ DETECTOR,
+ GRAM_SPLITTER,
+ CHARACTER_CLASSES
+ }
+
+ /** The same as new com.yahoo.language.simple.SimpleLinguistics(). Prefer using that directly. */
+ Linguistics SIMPLE = new SimpleLinguistics();
+
+ /**
+ * Returns a thread-unsafe stemmer or lemmatizer.
+ * This is used at query time to do stemming of search terms to indexes which contain text tokenized
+ * with stemming turned on.
+ */
+ Stemmer getStemmer();
+
+ /**
+ * Returns a thread-unsafe tokenizer.
+ * This is used at indexing time to produce an optionally stemmed and
+ * transformed (accent normalized) stream of indexable tokens.
+ */
+ Tokenizer getTokenizer();
+
+ /** Returns a thread-unsafe normalizer. This is used at query time to cjk normalize query text. */
+ Normalizer getNormalizer();
+
+ /**
+ * Returns a thread-unsafe transformer.
+ * This is used at query time to do accent normalization of search terms to indexes which contain text
+ * tokenized with accent normalization turned on.
+ */
+ Transformer getTransformer();
+
+ /**
+ * Returns a thread-unsafe segmenter.
+ * This is used at query time to find the individual semantic components of search terms to indexes
+ * tokenized with segmentation.
+ */
+ Segmenter getSegmenter();
+
+ /**
+ * Returns a thread-unsafe detector.
+ * The language of the text is a parameter to other linguistic operations.
+ * This is used to determine the language of a query or document field when not specified explicitly.
+ */
+ Detector getDetector();
+
+ /**
+ * Returns a thread-unsafe gram splitter.
+ * This is used to split query or document text into fixed-length grams which allows matching without needing
+ * or using segmented tokens.
+ */
+ GramSplitter getGramSplitter();
+
+ /** Returns a thread-unsafe character classes instance. */
+ CharacterClasses getCharacterClasses();
+
+ /**
+ * Returns the name and version of a processor component returned by
+ * this instance.
+ *
+ * @param component the kind of component to return the name and version of
+ * @return a tuple of the component name and its version
+ */
+ Tuple2<String, Version> getVersion(Linguistics.Component component);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
new file mode 100644
index 00000000000..a34ec9386b0
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -0,0 +1,31 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language;
+
+import com.yahoo.text.Lowercase;
+
+import java.util.Locale;
+
+/**
+ * This class provides a case normalization operation to be used e.g. when
+ * document search should be case insensitive.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class LinguisticsCase {
+
+ /**
+ * <p>The lower casing method to use in Vespa when doing language independent processing of natural language data.
+ * It is placed in a single place to ensure symmetry between e.g. query processing and indexing.</p>
+ * <p>Returns a lowercased version of the given string. Since this is language independent, this is more of a case
+ * normalization operation than lowercasing.</p>
+ *
+ * @param in The string to lowercase.
+ * @return A string containing only lowercase characters.
+ */
+ public static String toLowerCase(String in) {
+ // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
+ // Also, at the time of writing, English is the default language for queries
+ // The actual work is delegated to com.yahoo.text.Lowercase.
+ return Lowercase.toLowerCase(in);
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
new file mode 100644
index 00000000000..2610550dfd2
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/LocaleFactory.java
@@ -0,0 +1,57 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language;
+
+import java.util.Locale;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public final class LocaleFactory {
+
+    /** The locale returned when no language can be extracted from a tag. */
+    private static final Locale UNKNOWN = new Locale("", "", "");
+
+    private LocaleFactory() {
+        // static utility class, not to be instantiated
+    }
+
+    /**
+     * <p>Implements a simple parser for RFC5646 language tags. The language tag is parsed into a Locale.</p>
+     *
+     * <p>The tag is trimmed and split on '-'. The first subtag becomes the language if it is 2 or 3 characters
+     * long. In the second and third subtags, a subtag of 2 or 3 characters becomes the region and a subtag of
+     * 4 characters becomes the script. Any further subtags are ignored. The script is passed as the
+     * <em>variant</em> of the returned Locale.</p>
+     *
+     * @param tag The language tag to parse.
+     * @return The corresponding Locale, or a Locale with empty language, country and variant if the tag is
+     *         blank or does not start with a 2 or 3 character language subtag.
+     * @throws NullPointerException if tag is null
+     */
+    public static Locale fromLanguageTag(String tag) {
+        // TODO: Should be replaced by return Locale.forLanguageTag(tag); ?
+
+        if (tag == null) {
+            throw new NullPointerException("tag cannot be null");
+        }
+        tag = tag.trim();
+        if (tag.isEmpty()) {
+            return UNKNOWN;
+        }
+        String language = "";
+        String region = "";
+        String script = "";
+        String[] parts = tag.split("-");
+        for (int partIdx = 0; partIdx < parts.length; ++partIdx) {
+            String part = parts[partIdx];
+            int partLen = part.length();
+            if (partIdx == 0) {
+                if (partLen == 2 || partLen == 3) {
+                    language = part;
+                }
+            } else if (partIdx == 1 || partIdx == 2) {
+                // Region and script may come in either order, so they are told apart by length.
+                if (partLen == 2 || partLen == 3) {
+                    region = part;
+                } else if (partLen == 4) {
+                    script = part;
+                }
+            }
+        }
+        if (language.isEmpty()) {
+            return UNKNOWN;
+        }
+        // The third Locale constructor argument is the variant; the script is carried there.
+        return new Locale(language, region, script);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java b/linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java
new file mode 100644
index 00000000000..f80f876d248
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/detect/AbstractDetector.java
@@ -0,0 +1,25 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.detect;
+
+import com.yahoo.text.Utf8;
+
+import java.nio.ByteBuffer;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public abstract class AbstractDetector implements Detector {
+
+ // Converts the string to bytes with Utf8.toBytes and delegates to the byte array variant,
+ // covering the whole array.
+ @Override
+ public final Detection detect(String input, Hint hint) {
+ byte[] buf = Utf8.toBytes(input);
+ return detect(buf, 0, buf.length, hint);
+ }
+
+ // Copies the remaining bytes out of the buffer and delegates to the byte array variant.
+ // The relative bulk get advances the buffer's position, so the buffer is consumed by this call.
+ @Override
+ public final Detection detect(ByteBuffer input, Hint hint) {
+ byte[] buf = new byte[input.remaining()];
+ input.get(buf, 0, buf.length);
+ return detect(buf, 0, buf.length, hint);
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Detection.java b/linguistics/src/main/java/com/yahoo/language/detect/Detection.java
new file mode 100644
index 00000000000..e70d70425d4
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/detect/Detection.java
@@ -0,0 +1,47 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.detect;
+
+import com.yahoo.language.Language;
+
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+
+/**
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ */
+public class Detection {
+
+    private final Language language;
+    private final String encodingName;
+    private final boolean local;
+
+    /**
+     * Creates a detection result.
+     *
+     * @param language     the detected language
+     * @param encodingName the name of the detected encoding, may be null
+     * @param local        the value to return from {@link #isLocal()}
+     */
+    public Detection(Language language, String encodingName, boolean local) {
+        this.language = language;
+        this.encodingName = encodingName;
+        this.local = local;
+    }
+
+    /** Returns the language given at construction. */
+    public Language getLanguage() {
+        return language;
+    }
+
+    /**
+     * Resolves the encoding name to a charset.
+     *
+     * @return the charset with the name returned by {@link #getEncodingName()}, or null if the name is null,
+     *         is not a legal charset name, or names a charset this Java runtime does not support
+     */
+    public Charset getEncoding() {
+        if (encodingName == null) {
+            return null;
+        }
+        try {
+            return Charset.forName(encodingName);
+        } catch (IllegalArgumentException ignored) {
+            // Charset.forName throws IllegalCharsetNameException for malformed names and
+            // UnsupportedCharsetException for unknown ones; both are IllegalArgumentExceptions
+            // and both mean that no charset can be returned.
+            return null;
+        }
+    }
+
+    /** Returns the encoding name given at construction, which may be null. */
+    public String getEncodingName() {
+        return encodingName;
+    }
+
+    /** Returns the local flag given at construction. */
+    public boolean isLocal() {
+        return local;
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java b/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java
new file mode 100644
index 00000000000..c97895387fe
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/detect/DetectionException.java
@@ -0,0 +1,14 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.detect;
+
+/**
+ * Exception that is thrown when detection fails.
+ *
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ */
+public final class DetectionException extends RuntimeException {
+
+ public DetectionException(String str) {
+ super(str);
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Detector.java b/linguistics/src/main/java/com/yahoo/language/detect/Detector.java
new file mode 100644
index 00000000000..4962d761a5a
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/detect/Detector.java
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.detect;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Interface implemented by all detectors used for language and encoding detection.
+ *
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ */
+public interface Detector {
+
+ /**
+ * Detects language and encoding of the supplied byte array, possibly using a language/encoding hint.
+ *
+ * @param input the buffer that is to be inspected
+ * @param offset the offset to detect from
+ * @param length the size to detect from
+ * @param hint a hint to the detector, or null for no hint
+ * @return the detection result, holding the detected language and encoding name
+ * @throws DetectionException if detection fails
+ */
+ public abstract Detection detect(byte[] input, int offset, int length, Hint hint);
+
+ /**
+ * Detects language and encoding of the supplied ByteBuffer, possibly using a language/encoding hint.
+ *
+ * @param input the buffer that is to be inspected, from its current position to its limit
+ * @param hint a hint to the detector, or null for no hint
+ * @return the detection result, holding the detected language and encoding name
+ * @throws DetectionException if detection fails
+ */
+ public abstract Detection detect(ByteBuffer input, Hint hint);
+
+ /**
+ * Detects language of the supplied String, possibly using a language hint.
+ *
+ * @param input the string that is to be inspected
+ * @param hint a hint to the detector, or null for no hint
+ * @return the detection result, holding the detected language and encoding name
+ * @throws DetectionException if detection fails
+ */
+ public abstract Detection detect(String input, Hint hint);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/detect/Hint.java b/linguistics/src/main/java/com/yahoo/language/detect/Hint.java
new file mode 100644
index 00000000000..c3fad8bc260
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/detect/Hint.java
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.detect;
+
+/**
+ * <p>A hint that can be given to a {@link Detector}.</p>
+ *
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ */
+public class Hint {
+
+ // Either value may be null, depending on which factory method created this hint.
+ private final String market;
+ private final String country;
+
+ // Instances are created through the static factory methods below.
+ private Hint(String market, String country) {
+ this.market = market;
+ this.country = country;
+ }
+
+ /** Returns the market of this hint, or null if it was created by {@link #newCountryHint(String)}. */
+ public String getMarket() {
+ return market;
+ }
+
+ /** Returns the country of this hint, or null if it was created by {@link #newMarketHint(String)}. */
+ public String getCountry() {
+ return country;
+ }
+
+ /** Creates a hint carrying the given market and a null country. */
+ public static Hint newMarketHint(String market) {
+ return new Hint(market, null);
+ }
+
+ /** Creates a hint carrying the given country and a null market. */
+ public static Hint newCountryHint(String country) {
+ return new Hint(null, country);
+ }
+
+ /** Creates a hint carrying both the given market and the given country. */
+ public static Hint newInstance(String market, String country) {
+ return new Hint(market, country);
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/detect/package-info.java b/linguistics/src/main/java/com/yahoo/language/detect/package-info.java
new file mode 100644
index 00000000000..3ab6309e9e2
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/detect/package-info.java
@@ -0,0 +1,7 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language.detect;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;
diff --git a/linguistics/src/main/java/com/yahoo/language/package-info.java b/linguistics/src/main/java/com/yahoo/language/package-info.java
new file mode 100644
index 00000000000..2f5638d6b70
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/package-info.java
@@ -0,0 +1,7 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
new file mode 100644
index 00000000000..0e1327aabcf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -0,0 +1,55 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * Determines the class of a given character. Use this rather than java.lang.Character.
+ *
+ * @author bratseth
+ */
+public class CharacterClasses {
+
+    /**
+     * Returns true for code points which are letters in unicode 3 or 4, plus some additional characters
+     * which are useful to view as letters even though not defined as such in unicode.
+     *
+     * @param c the code point to classify
+     * @return true if the code point is a letter, a digit outside the basic latin block, one of the CJK
+     *         brackets U+3008 to U+3011, or a non-spacing, combining spacing or enclosing mark
+     */
+    public boolean isLetter(int c) {
+        if (Character.isLetter(c)) return true;
+        if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
+        // if (c == '_') return true;
+
+        // Ticket 3864695, some CJK punctuation YST defined as word characters
+        if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
+            c == '\u300c' || c == '\u300d' || c == '\u300e' ||
+            c == '\u300f' || c == '\u3010' || c == '\u3011') {
+            return true;
+        }
+        int type = Character.getType(c);
+        return type == Character.NON_SPACING_MARK ||
+               type == Character.COMBINING_SPACING_MARK ||
+               type == Character.ENCLOSING_MARK;
+    }
+
+    /**
+     * Returns true for code points which should be considered digits - same as java.lang.Character.isDigit
+     */
+    public boolean isDigit(int c) {
+        return Character.isDigit(c);
+    }
+
+    /** Returns true if this is a latin digit (other digits are not consistently parsed into numbers by Java) */
+    public boolean isLatinDigit(int c) {
+        return Character.isDigit(c) && isLatin(c);
+    }
+
+    /**
+     * Returns true if this is a latin character, that is a code point in the basic latin unicode block.
+     * Code points which do not belong to any unicode block are not latin.
+     */
+    public boolean isLatin(int c) {
+        // UnicodeBlock.of returns null for code points outside every defined block,
+        // so the constant must be the receiver of equals.
+        return Character.UnicodeBlock.BASIC_LATIN.equals(Character.UnicodeBlock.of(c));
+    }
+
+    /**
+     * Convenience, returns isLetter(c) || isDigit(c)
+     */
+    public boolean isLetterOrDigit(int c) {
+        return isLetter(c) || isDigit(c);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
new file mode 100644
index 00000000000..0672582d732
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -0,0 +1,222 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import java.util.*;
+
+/**
+ * A class which splits consecutive word character sequences into overlapping character n-grams.
+ * For example "en gul bille sang" split into 2-grams becomes
+ * "en gu ul bi il ll le sa an ng", and split into 3-grams becomes "en gul bil ill lle san ang".
+ * <p>
+ * This class is multithread safe.
+ *
+ * @author bratseth
+ */
+public class GramSplitter {
+
+ private final CharacterClasses characterClasses;
+
+ public GramSplitter(CharacterClasses characterClasses) {
+ this.characterClasses = characterClasses;
+ }
+
+ /**
+ * Splits the input into grams of size n and returns an iterator over grams represented as [start index,length]
+ * pairs into the input string.
+ * <p>
+ * The iterator is implemented as a sliding view over the input string rather than being backed by a
+ * list, which makes this space efficient for large strings.
+ *
+ * @param input the input string to be split, cannot be null
+ * @param n the gram size, a positive integer
+ * @return a read only iterator over the resulting grams
+ * @throws NullPointerException if input==null
+ * @throws IllegalArgumentException if n is less than 1
+ */
+ public GramSplitterIterator split(String input, int n) {
+ if (input == null) {
+ throw new NullPointerException("input cannot be null");
+ }
+ if (n < 1) {
+ throw new IllegalArgumentException("n (gram size) cannot be smaller than 1, was " + n);
+ }
+ return new GramSplitterIterator(input, n, characterClasses);
+ }
+
+ public static class GramSplitterIterator implements Iterator<Gram> {
+
+ private final CharacterClasses characterClasses;
+
+ /**
+ * Text to split
+ */
+ private final String input;
+
+ /**
+ * Gram size
+ */
+ private final int n;
+
+ /**
+ * Current index
+ */
+ private int i = 0;
+
+ /**
+ * Whether the last thing that happened was being on a separator (including the start of the string)
+ */
+ private boolean isFirstAfterSeparator = true;
+
+ /**
+ * The next gram or null if not determined yet
+ */
+ private Gram nextGram = null;
+
+ public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) {
+ this.input = input;
+ this.n = n;
+ this.characterClasses = characterClasses;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (nextGram != null) {
+ return true;
+ }
+ nextGram = findNext();
+ return nextGram != null;
+ }
+
+ @Override
+ public Gram next() {
+ Gram currentGram = nextGram;
+ if (currentGram == null) {
+ currentGram = findNext();
+ }
+ if (currentGram == null) {
+ throw new NoSuchElementException("No next gram at position " + i);
+ }
+ nextGram = null;
+ return currentGram;
+ }
+
+ // Advances the cursor i to the next gram and returns it, or returns null when the input is exhausted.
+ // NOTE(review): i advances one char at a time while characters are classified with codePointAt(i),
+ // so supplementary characters (surrogate pairs) may not be handled as intended - confirm.
+ private Gram findNext() {
+ // Skip to next word character
+ while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) {
+ i++;
+ isFirstAfterSeparator = true;
+ }
+ if (i >= input.length()) {
+ return null;
+ }
+
+ // Take up to n chars from the cursor, then cut the candidate at the first non-word character
+ String gram = input.substring(i, Math.min(i + n, input.length()));
+ int nonWordChar = indexOfNonWordChar(gram);
+ if (nonWordChar == 0) {
+ // Cannot happen: the loop above left the cursor on a word character
+ throw new RuntimeException("Programming error");
+ }
+ if (nonWordChar > 0) {
+ gram = gram.substring(0, nonWordChar);
+ }
+
+ if (gram.length() == n) { // normal case: got a full length gram
+ i++;
+ isFirstAfterSeparator = false;
+ return new Gram(i - 1, gram.length());
+ } else { // gram is too short due either to a non-word separator or end of string
+ if (isFirstAfterSeparator) { // make a gram anyway
+ // A word shorter than n still yields one gram covering the whole word
+ i++;
+ isFirstAfterSeparator = false;
+ return new Gram(i - 1, gram.length());
+ } else { // skip to next
+ // The tail of this word is already covered by the previous gram: jump past it and its terminator
+ i += gram.length() + 1;
+ isFirstAfterSeparator = true;
+ return findNext();
+ }
+ }
+ }
+
+ private int indexOfNonWordChar(String s) {
+ for (int i = 0; i < s.length(); i++) {
+ if (!characterClasses.isLetterOrDigit(s.codePointAt(i))) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException("This iterator is read only");
+ }
+
+ /**
+ * Convenience list which splits the remaining items in this iterator into a list of gram strings
+ *
+ * @return an immutable list of extracted grams
+ */
+ public List<String> toExtractedList() {
+ List<String> gramList = new ArrayList<>();
+ while (hasNext()) {
+ gramList.add(next().extractFrom(input));
+ }
+ return Collections.unmodifiableList(gramList);
+ }
+ }
+
+ /**
+ * An immutable start index and length pair
+ */
+ public static final class Gram {
+
+     /** Index into the input string of the first char of this gram. */
+     private final int start;
+
+     /** Number of chars in this gram. */
+     private final int length;
+
+     /**
+      * Creates a gram.
+      *
+      * @param start  the index of the first char of the gram in the input string
+      * @param length the number of chars in the gram
+      */
+     public Gram(int start, int length) {
+         this.start = start;
+         this.length = length;
+     }
+
+     /** Returns the index of the first char of this gram in the input string. */
+     public int getStart() {
+         return start;
+     }
+
+     /** Returns the number of chars in this gram. */
+     public int getLength() {
+         return length;
+     }
+
+     /**
+      * Returns this gram as a string from the input string
+      *
+      * @param input the string this gram was produced from
+      * @return the substring of input starting at start and spanning length chars
+      */
+     public String extractFrom(String input) {
+         return input.substring(start, start + length);
+     }
+
+     /** Two grams are equal if they have the same start index and the same length. */
+     @Override
+     public boolean equals(Object o) {
+         if (this == o) {
+             return true;
+         }
+         if (!(o instanceof Gram)) {
+             return false;
+         }
+         Gram other = (Gram)o;
+         return start == other.start && length == other.length;
+     }
+
+     @Override
+     public int hashCode() {
+         int result = start;
+         result = 31 * result + length;
+         return result;
+     }
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
new file mode 100644
index 00000000000..f4e1ccc9feb
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Normalizer.java
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * <p>This interface provides NFKC normalization of Strings through the underlying linguistics library.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias M\u00F8lster Lidal</a>
+ */
+public interface Normalizer {
+
+ /**
+ * <p>NFKC normalizes a String.</p>
+ *
+ * @param input String to normalize.
+ * @return The normalized String.
+ * @throws ProcessingException If underlying library throws an Exception.
+ */
+ public String normalize(String input);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
new file mode 100644
index 00000000000..ce8b455707c
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/ProcessingException.java
@@ -0,0 +1,18 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * <p>Exception class indicating that a fatal error occurred during linguistic processing.</p>
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class ProcessingException extends RuntimeException {
+
+ public ProcessingException(String message) {
+ super(message);
+ }
+
+ public ProcessingException(String message, Throwable cause) {
+ super(message, cause);
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
new file mode 100644
index 00000000000..73764e06ef6
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.List;
+
+/**
+ * <p>Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
+ * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
+ * processing).</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Segmenter {
+
+ /**
+ * Splits the input string into tokens, and returns a list of tokens in unprocessed form (i.e. lowercased, normalized
+ * and stemmed if applicable, see {@link StemMode} for the list of stemming options). It is assumed that the input only
+ * contains word characters; any punctuation and spacing tokens will be removed.
+ * NOTE(review): "unprocessed form" and the parenthesis above seem to contradict each other - confirm which
+ * form implementations are expected to return.
+ *
+ * @param input the text to segment.
+ * @param language language of input text.
+ * @return the list of segments.
+ * @throws ProcessingException if an exception is encountered during processing
+ */
+ List<String> segment(String input, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
new file mode 100644
index 00000000000..146d65cb7e2
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A {@link Segmenter} which delegates to a {@link Tokenizer} and collects the original form of
+ * each indexable leaf token it produces.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SegmenterImpl implements Segmenter {
+
+    private final Tokenizer tokenizer;
+
+    public SegmenterImpl(Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    /**
+     * Tokenizes the input with stemming and accent removal turned off, and returns the original
+     * strings of the indexable tokens found. If there are none, the input itself is returned as
+     * the only segment.
+     */
+    @Override
+    public List<String> segment(String input, Language language) {
+        List<String> result = new ArrayList<>();
+        Iterable<Token> tokens = tokenizer.tokenize(input, language, StemMode.NONE, false);
+        for (Token current : tokens) {
+            findSegments(current, result);
+        }
+        if (result.isEmpty()) {
+            result.add(input); // no segments, return original string
+        }
+        return result;
+    }
+
+    /** Appends the segments of the given token to out, recursing into the components of compound tokens. */
+    private void findSegments(Token token, List<String> out) {
+        // special tokens are never split, so their components are not consulted
+        int componentCount = token.isSpecialToken() ? 0 : token.getNumComponents();
+        if (componentCount != 0) {
+            for (int index = 0; index < componentCount; index++) {
+                findSegments(token.getComponent(index), out);
+            }
+            return;
+        }
+        if (token.isIndexable()) {
+            out.add(token.getOrig());
+        }
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemList.java b/linguistics/src/main/java/com/yahoo/language/process/StemList.java
new file mode 100644
index 00000000000..d355af87f08
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemList.java
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import java.util.AbstractList;
+import java.util.ArrayList;
+
+/**
+ * A list of stems in which each string is stored at most once: Attempts to insert a string
+ * which is already present are silently ignored.
+ *
+ * @author steinar
+ */
+public class StemList extends AbstractList<String> {
+
+    private final ArrayList<String> stems;
+
+    /** Creates an empty stem list. */
+    public StemList() {
+        this(new String[0]);
+    }
+
+    /** Creates a stem list holding the given stems in order, with repeated stems dropped. */
+    public StemList(String... stems) {
+        super();
+        int initialCapacity = Math.max(stems.length, 3);
+        this.stems = new ArrayList<>(initialCapacity);
+        for (int n = 0; n < stems.length; n++) {
+            add(stems[n]);
+        }
+    }
+
+    @Override
+    public String get(int i) {
+        return stems.get(i);
+    }
+
+    @Override
+    public int size() {
+        return stems.size();
+    }
+
+    /**
+     * Replaces the stem at position i, unless the new stem is already present at another position.
+     *
+     * @return the previous stem at position i, or the argument itself if it was rejected as a duplicate
+     */
+    @Override
+    public String set(int i, String element) {
+        int position = stems.indexOf(element);
+        boolean presentElsewhere = position >= 0 && position != i;
+        if ( ! presentElsewhere) {
+            return stems.set(i, element);
+        }
+        // the element already exists
+        return element;
+    }
+
+    /** Inserts the stem at position i, or does nothing if it is already in this list. */
+    @Override
+    public void add(int i, String element) {
+        if (stems.contains(element)) {
+            return;
+        }
+        stems.add(i, element);
+    }
+
+    @Override
+    public String remove(int i) {
+        return stems.remove(i);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
new file mode 100644
index 00000000000..269b08dcdf7
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * An enum of the stemming modes which can be requested.
+ * Stemming implementation may support a smaller number of modes by mapping a mode to a more
+ * inclusive alternative.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum StemMode {
+
+ NONE(0),
+ DEFAULT(1),
+ ALL(2),
+ SHORTEST(4),
+ BEST(5);
+
+ private final int value;
+
+ StemMode(int value) {
+ this.value = value;
+ }
+
+ /**
+ * Returns the stem mode as an int
+ *
+ * @deprecated do not use
+ */
+ @Deprecated
+ public int getValue() {
+ return value;
+ }
+
+ @Deprecated
+ public static StemMode valueOf(int value) {
+ for (StemMode mode : values()) {
+ if (mode.value == value) {
+ return mode;
+ }
+ }
+ return NONE;
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java
new file mode 100644
index 00000000000..739fd1d9e96
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java
@@ -0,0 +1,26 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.List;
+
+/**
+ * <p>Interface providing stemming of single words.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Stemmer {
+
+ /**
+ * Stems the input according to the specified stemming mode.
+ *
+ * @param input the string to stem
+ * @param mode the stemming mode
+ * @param language the language to use for stemming
+ * @return the list of possible stems, empty if none
+ * @throws ProcessingException if there is an exception stemming this input
+ */
+ List<StemList> stem(String input, StemMode mode, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java
new file mode 100644
index 00000000000..0d175a2bf3e
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java
@@ -0,0 +1,46 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A {@link Stemmer} which delegates to a {@link Tokenizer} and collects the stems of each
+ * indexable leaf token it produces.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class StemmerImpl implements Stemmer {
+
+    private final Tokenizer tokenizer;
+
+    public StemmerImpl(Tokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    /**
+     * Tokenizes the input using the given stem mode, with accent removal turned off, and returns
+     * one stem list for each indexable leaf token found.
+     */
+    @Override
+    public List<StemList> stem(String input, StemMode stemMode, Language language) {
+        List<StemList> result = new ArrayList<>();
+        Iterable<Token> tokens = tokenizer.tokenize(input, language, stemMode, false);
+        for (Token current : tokens) {
+            findStems(current, result);
+        }
+        return result;
+    }
+
+    /** Appends the stems of the given token to out, recursing into the components of compound tokens. */
+    private void findStems(Token token, List<StemList> out) {
+        // special tokens are never split, so their components are not consulted
+        int componentCount = token.isSpecialToken() ? 0 : token.getNumComponents();
+        if (componentCount != 0) {
+            for (int index = 0; index < componentCount; index++) {
+                findStems(token.getComponent(index), out);
+            }
+            return;
+        }
+        if ( ! token.isIndexable()) {
+            return;
+        }
+        StemList word = new StemList();
+        word.add(token.getTokenString()); // takes care of getStem(0)
+        int stemIndex = 1;
+        while (stemIndex < token.getNumStems()) {
+            word.add(token.getStem(stemIndex));
+            stemIndex++;
+        }
+        out.add(word);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java
new file mode 100644
index 00000000000..f1dc6639e11
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * Interface providing access to a single token produced by the tokenizer.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Token {
+
+ /** Returns the type of this token - word, space or punctuation etc. */
+ TokenType getType();
+
+ /** Returns the original form of this token */
+ String getOrig();
+
+ /** Returns the number of stem forms available for this token. */
+ int getNumStems();
+
+ /** Returns the stem at position i */
+ String getStem(int i);
+
+ /**
+ * Returns the number of components if this token is a compound word
+ * (e.g. German "kommunikationsfehler"), and 0 otherwise.
+ *
+ * @return number of components, or 0 if none
+ */
+ int getNumComponents();
+
+ /** Returns the component token of this at position i */
+ Token getComponent(int i);
+
+ /** Returns the offset position of this token */
+ long getOffset();
+
+ /** Returns the script of this token */
+ TokenScript getScript();
+
+ /**
+ * Returns the token string in a form suitable for indexing: The
+ * most lowercased variant of the most processed token form available.
+ * If called on a compound token this returns a lowercased form of the
+ * entire word.
+ *
+ * @return token string value
+ */
+ String getTokenString();
+
+ /** Returns whether this is an instance of a declared special token (e.g. c++) */
+ boolean isSpecialToken();
+
+ /** Returns whether this token should be indexed */
+ boolean isIndexable();
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java
new file mode 100644
index 00000000000..ba0ad89b454
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/TokenScript.java
@@ -0,0 +1,77 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * The token scripts (e.g. latin, japanese, chinese, etc.) which may warrant different
+ * linguistics treatment. UNKNOWN is the last constant of the list.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum TokenScript {
+
+ COMMON,
+ LATIN,
+ GREEK,
+ CYRILLIC,
+ ARMENIAN,
+ HEBREW,
+ ARABIC,
+ SYRIAC,
+ THAANA,
+ DEVANAGARI,
+ BENGALI,
+ GURMUKHI,
+ GUJARATI,
+ ORIYA,
+ TAMIL,
+ TELUGU,
+ KANNADA,
+ MALAYALAM,
+ SINHALA,
+ THAI,
+ LAO,
+ TIBETAN,
+ MYANMAR,
+ GEORGIAN,
+ HANGUL,
+ ETHIOPIC,
+ CHEROKEE,
+ CANADIAN,
+ OGHAM,
+ RUNIC,
+ KHMER,
+ MONGOLIAN,
+ HIRAGANA,
+ KATAKANA,
+ CHINESE,
+ HAN,
+ YI,
+ OLDITALIC,
+ GOTHIC,
+ DESERET,
+ INHERITED,
+ TAGALOG,
+ HANUNOO,
+ BUHID,
+ TAGBANWA,
+ LIMBU,
+ TAILE,
+ LINEARB,
+ UGARITIC,
+ SHAVIAN,
+ OSMANYA,
+ CYPRIOT,
+ BRAILLE,
+ ASCII,
+ BUGINESE,
+ COPTIC,
+ GLAGOLITIC,
+ KHAROSHTHI,
+ OLDPERSIAN,
+ SYLOTINAGRI,
+ TAILUE,
+ TIFINAGH,
+ VIETNAMESE,
+ UNKNOWN;
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
new file mode 100644
index 00000000000..7d880440f1e
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
@@ -0,0 +1,51 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+/**
+ * An enumeration of token types.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public enum TokenType {
+
+ UNKNOWN(0),
+ SPACE(1),
+ PUNCTUATION(2),
+ SYMBOL(3),
+ ALPHABETIC(4),
+ NUMERIC(5),
+ MARKER(255);
+
+ private final int value;
+
+ TokenType(int value) {
+ this.value = value;
+ }
+
+ /** Returns an int code for this type */
+ public int getValue() { return value; }
+
+ /**
+ * Marker for whether this type of token can be indexed for search.
+ * Note that a Token can be excluded from an index, even though the token type marks
+ * it as indexable.
+ *
+ * @see com.yahoo.language.process.Token#isIndexable()
+ * @return whether this type of token can be indexed
+ */
+ public boolean isIndexable() {
+ switch (this) {
+ case ALPHABETIC: case NUMERIC: return true;
+ default: return false;
+ }
+ }
+
+ /** Translates this from the int code representation returned from {@link #getValue} */
+ public static TokenType valueOf(int value) {
+ for (TokenType type : values()) {
+ if (value == type.value) return type;
+ }
+ return UNKNOWN;
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
new file mode 100644
index 00000000000..d7d1e210de4
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+/**
+ * Language-sensitive tokenization of a text string.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Tokenizer {
+
+ /**
+ * Returns the tokens produced from an input string under the rules of the given Language and additional options
+ *
+ * @param input the string to tokenize. May be arbitrarily large.
+ * @param language the language of the input string.
+ * @param stemMode the stem mode applied on the returned tokens
+ * @param removeAccents if true accents and similar are removed from the returned tokens
+ * @return the tokens of the input String.
+ * @throws ProcessingException if the underlying library throws an exception
+ */
+ Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents);
+
+ /**
+ * Returns a replacement for an input token string.
+ * This accepts strings returned by {@link Token#getTokenString}
+ * and returns a replacement which will be used as the index token.
+ * The input token string is returned if there is no replacement.
+ * <p>
+ * This default implementation always returns the input token string.
+ *
+ * @param tokenString the token string of the term to look up a replacement for
+ * @return the replacement, if any, or the argument token string if not
+ */
+ default String getReplacementTerm(String tokenString) { return tokenString; }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Transformer.java b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
new file mode 100644
index 00000000000..4d288aafaca
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Transformer.java
@@ -0,0 +1,23 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+/**
+ * Interface for providers of text transformations such as accent removal.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Transformer {
+
+ /**
+ * Removes accents from the input text.
+ *
+ * @param input text to transform.
+ * @param language language of input text.
+ * @return text with accents removed, or the input text if the feature is unavailable
+ * @throws ProcessingException if there is an exception transforming this input
+ */
+ String accentDrop(String input, Language language);
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/package-info.java b/linguistics/src/main/java/com/yahoo/language/process/package-info.java
new file mode 100644
index 00000000000..de8d82fcf36
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/package-info.java
@@ -0,0 +1,7 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language.process;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
new file mode 100644
index 00000000000..eca35772296
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -0,0 +1,179 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.detect.Hint;
+import com.yahoo.text.Utf8;
+
+import java.nio.ByteBuffer;
+
+/**
+ * <p>Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese,
+ * Japanese, Korean and Thai are detected. There are two ways to guess a String's langCode, by encoding and by character
+ * set. If the encoding is available this is a very good indication of the langCode. If the encoding is not available,
+ * then the actual characters in the string can be used to make an educated guess at the String's langCode. Recall a
+ * String in Java is unicode. Therefore, we can simply look at the unicode blocks of the characters in the string.
+ * Unfortunately, it's not 100% fool-proof. From what I've been able to determine, Korean characters do not overlap with
+ * Japanese or Chinese characters, so their presence is a good indication of Korean. If a string contains phonetic
+ * japanese, this is a good indication of Japanese. However, Japanese and Chinese characters occupy many of the same
+ * character blocks, so if there are no definitive signs of Japanese then it is assumed that the String is Chinese.</p>
+ *
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ */
+public class SimpleDetector implements Detector {
+
+    /**
+     * Guesses the language and encoding of the given region of a byte array. The hint is not used.
+     *
+     * @param input the array containing the bytes to inspect
+     * @param offset the index of the first byte to inspect
+     * @param length the number of bytes to inspect
+     */
+    @Override
+    public Detection detect(byte[] input, int offset, int length, Hint hint) {
+        // both guesses must look at the same region: the encoding guess used to scan the entire array
+        return new Detection(guessLanguage(input, offset, length), guessEncoding(input, offset, length), false);
+    }
+
+    /** Guesses the language and encoding of the remaining bytes of the buffer, which are consumed by this. */
+    @Override
+    public Detection detect(ByteBuffer input, Hint hint) {
+        byte[] buf = new byte[input.remaining()];
+        input.get(buf, 0, buf.length);
+        return detect(buf, 0, buf.length, hint);
+    }
+
+    /** Guesses the language of the given string. The hint is not used. */
+    @Override
+    public Detection detect(String input, Hint hint) {
+        return new Detection(guessLanguage(input), Utf8.getCharset().name(), false);
+    }
+
+    /** Decodes the given region using {@link Utf8} and guesses the language of the resulting string. */
+    public static Language guessLanguage(byte[] buf, int offset, int length) {
+        return guessLanguage(Utf8.toString(buf, offset, length));
+    }
+
+    /**
+     * Guesses the language of the input from the unicode blocks of its characters.
+     *
+     * @param input the text to inspect, may be null
+     * @return KOREAN, JAPANESE, CHINESE_TRADITIONAL or THAI if characters indicating one of these
+     *         are found, UNKNOWN otherwise (including for null and empty input)
+     */
+    public static Language guessLanguage(String input) {
+        if (input == null || input.length() == 0) {
+            return Language.UNKNOWN;
+        }
+
+        // used to record the current theory of language guess, in case of ambiguous characters, such as Chinese
+        Language soFar = Language.UNKNOWN;
+        // Iterate by code point rather than by char: some of the CJK blocks checked below lie outside
+        // the basic multilingual plane and are never reported for individual surrogate chars
+        for (int i = 0; i < input.length(); ) {
+            int c = input.codePointAt(i);
+            i += Character.charCount(c);
+            Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
+
+            // Check some special cases for Korean. Korean doesn't
+            // overlap with Japanese or Chinese, so this is a good test.
+            if ((c >= 0x3200 && c < 0x3220) || // parenthesized hangul
+                (c >= 0x3260 && c < 0x3280) || // circled hangul
+                (c >= 0xFFA0 && c < 0xFFE0) || // halfwidth hangul
+                (c == 0x302E || c == 0x302F) || // hangul tone mark
+
+                // standard Hangul character blocks
+                block == Character.UnicodeBlock.HANGUL_SYLLABLES ||
+                block == Character.UnicodeBlock.HANGUL_JAMO ||
+                block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) {
+                return Language.KOREAN;
+            }
+            // Katakana phonetic extensions, see http://www.unicode.org/charts/PDF/U31F0.pdf
+            // This range is tested explicitly as it is not part of the KATAKANA block checked below.
+            if ((0x31f0 <= c && c <= 0x31ff) ||
+                // these are standard character blocks for japanese characters.
+                block == Character.UnicodeBlock.HIRAGANA ||
+                block == Character.UnicodeBlock.KATAKANA ||
+                block == Character.UnicodeBlock.KANBUN) {
+                return Language.JAPANESE;
+            }
+            if (block == Character.UnicodeBlock.CJK_COMPATIBILITY ||
+                block == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS ||
+                block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS ||
+                block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT ||
+                block == Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT ||
+                block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION ||
+                block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
+                block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A ||
+                block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B) {
+                // seeing one of these chars, we assume that the text is Chinese, until more concrete evidence is found
+                soFar = Language.CHINESE_TRADITIONAL;
+            }
+            if (block == Character.UnicodeBlock.BOPOMOFO ||
+                block == Character.UnicodeBlock.BOPOMOFO_EXTENDED) {
+                return Language.CHINESE_TRADITIONAL;
+            }
+            if (block == Character.UnicodeBlock.THAI) {
+                return Language.THAI;
+            }
+        }
+        // got to the end, so return the current best guess
+        return soFar;
+    }
+
+    /** Returns whether the two highest bits of the given byte are 10, as in an UTF-8 trailing octet. */
+    private static boolean isTrailingOctet(byte i) {
+        return ((i >>> 6) & 3) == 2;
+    }
+
+    /**
+     * If UTF-8, how many trailing octets are expected?
+     *
+     * @return 0 if the highest bit is unset, 1 to 5 for the lead byte patterns 110xxxxx to 1111110x,
+     *         and -1 for any other byte
+     */
+    private static int isLeadingFor(byte c) {
+        int i = c & 0xff;
+        if ((i & 0x80) == 0) {
+            return 0;
+        } else if ((i & 0xE0) == 0xC0) {
+            return 1;
+        } else if ((i & 0xF0) == 0xE0) {
+            return 2;
+        } else if ((i & 0xF8) == 0xF0) {
+            return 3;
+        } else if ((i & 0xFC) == 0xF8) {
+            return 4;
+        } else if ((i & 0xFE) == 0xFC) {
+            return 5;
+        } else {
+            return -1;
+        }
+    }
+
+    /**
+     * Guesses the encoding of the given region of a byte array.
+     *
+     * @return "US-ASCII" if no byte has its highest bit set, the charset name of {@link Utf8} if
+     *         all such bytes form complete lead/trailing octet sequences, and "ISO-8859-1" otherwise
+     */
+    private static String guessEncoding(byte[] input, int offset, int length) {
+        int end = offset + length;
+        boolean isUtf8 = true;
+        boolean hasHighs = false;
+        scan:
+        for (int i = offset; i < end; i++) {
+            int trailing = isLeadingFor(input[i]);
+            if (trailing < 0 || i + trailing >= end) {
+                // not a valid lead byte, or the sequence is cut off by the end of the region
+                hasHighs = true;
+                isUtf8 = false;
+                break;
+            }
+            if (trailing == 0) {
+                continue;
+            }
+            hasHighs = true;
+            for (int k = 0; k < trailing; k++) {
+                if ( ! isTrailingOctet(input[++i])) {
+                    isUtf8 = false;
+                    break scan;
+                }
+            }
+        }
+        if (hasHighs && isUtf8) {
+            return Utf8.getCharset().name();
+        } else if (!hasHighs) {
+            return "US-ASCII";
+        } else {
+            return "ISO-8859-1";
+        }
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
new file mode 100644
index 00000000000..857964d5d35
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.collections.Tuple2;
+import com.yahoo.component.Version;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.process.CharacterClasses;
+import com.yahoo.language.process.GramSplitter;
+import com.yahoo.language.process.Normalizer;
+import com.yahoo.language.process.Segmenter;
+import com.yahoo.language.process.SegmenterImpl;
+import com.yahoo.language.process.Stemmer;
+import com.yahoo.language.process.StemmerImpl;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.process.Transformer;
+
+/**
+ * Factory of pure Java linguistic processor implementations.
+ *
+ * @author bratseth
+ */
+public class SimpleLinguistics implements Linguistics {
+
+ // Threadsafe instances
+ private final static Normalizer normalizer = new SimpleNormalizer();
+ private final static Transformer transformer = new SimpleTransformer();
+ private final static Detector detector = new SimpleDetector();
+ private final static CharacterClasses characterClasses = new CharacterClasses();
+ private final static GramSplitter gramSplitter = new GramSplitter(characterClasses);
+
+ @Override
+ public Stemmer getStemmer() { return new StemmerImpl(getTokenizer()); }
+
+ @Override
+ public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer); }
+
+ @Override
+ public Normalizer getNormalizer() { return normalizer; }
+
+ @Override
+ public Transformer getTransformer() { return transformer; }
+
+ @Override
+ public Segmenter getSegmenter() { return new SegmenterImpl(getTokenizer()); }
+
+ @Override
+ public Detector getDetector() { return detector; }
+
+ @Override
+ public GramSplitter getGramSplitter() { return gramSplitter; }
+
+ @Override
+ public CharacterClasses getCharacterClasses() { return characterClasses; }
+
+ @Override
+ public Tuple2<String, Version> getVersion(Component component) {
+ return new Tuple2<>("yahoo", new Version(1, 0));
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java
new file mode 100644
index 00000000000..bfc6f813452
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleNormalizer.java
@@ -0,0 +1,16 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.process.Normalizer;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class SimpleNormalizer implements Normalizer {
+
+ @Override
+ public String normalize(String input) {
+ return java.text.Normalizer.normalize(input, java.text.Normalizer.Form.NFKC);
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
new file mode 100644
index 00000000000..1cf707bf5be
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -0,0 +1,188 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenScript;
+import com.yahoo.language.process.TokenType;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A mutable {@link Token} implementation whose properties are assigned through chainable setters.
+ * The original string may be null: this is tolerated by all methods of this class.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public class SimpleToken implements Token {
+
+    private final List<Token> components = new ArrayList<>();
+    private final String orig;
+    private TokenType type = TokenType.UNKNOWN;
+    private TokenScript script = TokenScript.UNKNOWN;
+    private String tokenString = null;
+    private boolean specialToken = false;
+    private long offset = 0;
+
+    /** Creates a token of unknown type and script, with no token string, from its original form */
+    public SimpleToken(String orig) {
+        this.orig = orig;
+    }
+
+    @Override
+    public String getOrig() {
+        return orig;
+    }
+
+    /** Returns 1 if a token string is set and 0 otherwise */
+    @Override
+    public int getNumStems() {
+        return tokenString != null ? 1 : 0;
+    }
+
+    /** Returns the token string (which may be null), regardless of the index given */
+    @Override
+    public String getStem(int i) {
+        return tokenString;
+    }
+
+    @Override
+    public int getNumComponents() {
+        return components.size();
+    }
+
+    @Override
+    public Token getComponent(int i) {
+        return components.get(i);
+    }
+
+    /** Appends a component to this token and returns this for chaining */
+    public SimpleToken addComponent(Token token) {
+        components.add(token);
+        return this;
+    }
+
+    @Override
+    public String getTokenString() {
+        return tokenString;
+    }
+
+    public SimpleToken setTokenString(String str) {
+        tokenString = str;
+        return this;
+    }
+
+    @Override
+    public TokenType getType() {
+        return type;
+    }
+
+    public SimpleToken setType(TokenType type) {
+        this.type = type;
+        return this;
+    }
+
+    @Override
+    public TokenScript getScript() {
+        return script;
+    }
+
+    public SimpleToken setScript(TokenScript script) {
+        this.script = script;
+        return this;
+    }
+
+    @Override
+    public boolean isSpecialToken() {
+        return specialToken;
+    }
+
+    public SimpleToken setSpecialToken(boolean specialToken) {
+        this.specialToken = specialToken;
+        return this;
+    }
+
+    @Override
+    public long getOffset() {
+        return offset;
+    }
+
+    public SimpleToken setOffset(long offset) {
+        this.offset = offset;
+        return this;
+    }
+
+    /** Returns the hash code of the original string, or 0 if it is null */
+    @Override
+    public int hashCode() {
+        // equals accepts a null original string, so hashing must not fail on it either
+        return orig != null ? orig.hashCode() : 0;
+    }
+
+    /**
+     * Returns whether the argument is a Token (of any implementation) with the same type, original
+     * string, offset, script, token string, special token flag and components as this.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (!(obj instanceof Token)) {
+            return false;
+        }
+        Token rhs = (Token)obj;
+        if (!getType().equals(rhs.getType())) {
+            return false;
+        }
+        if (!equalsOpt(getOrig(), rhs.getOrig())) {
+            return false;
+        }
+        if (getOffset() != rhs.getOffset()) {
+            return false;
+        }
+        if (!equalsOpt(getScript(), rhs.getScript())) {
+            return false;
+        }
+        if (!equalsOpt(getTokenString(), rhs.getTokenString())) {
+            return false;
+        }
+        if (isSpecialToken() != rhs.isSpecialToken()) {
+            return false;
+        }
+        if (getNumComponents() != rhs.getNumComponents()) {
+            return false;
+        }
+        for (int i = 0, len = getNumComponents(); i < len; ++i) {
+            if (!equalsOpt(getComponent(i), rhs.getComponent(i))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Null-tolerant equality: Two nulls are equal, a null never equals a non-null */
+    private static boolean equalsOpt(Object lhs, Object rhs) {
+        if (lhs == null || rhs == null) {
+            return lhs == rhs;
+        }
+        return lhs.equals(rhs);
+    }
+
+    @Override
+    public String toString() {
+        return "token : " + getClass().getSimpleName() + " {\n" + toString(this, " ") + "}";
+    }
+
+    /** Returns a multi-line description of the given token, with each line prefixed by the given indent */
+    private static String toString(Token token, String indent) {
+        StringBuilder builder = new StringBuilder();
+        builder.append(indent).append("components : {\n");
+        for (int i = 0, len = token.getNumComponents(); i < len; ++i) {
+            Token comp = token.getComponent(i);
+            builder.append(indent).append(" [").append(i).append("] : ").append(comp.getClass().getSimpleName());
+            builder.append(" {\n").append(toString(comp, indent + " "));
+            builder.append(indent).append(" }\n");
+        }
+        builder.append(indent).append("}\n");
+        builder.append(indent).append("offset : ").append(token.getOffset()).append("\n");
+        builder.append(indent).append("orig : ").append(quoteString(token.getOrig())).append("\n");
+        builder.append(indent).append("script : ").append(token.getScript()).append("\n");
+        builder.append(indent).append("special : ").append(token.isSpecialToken()).append("\n");
+        builder.append(indent).append("token string : ").append(quoteString(token.getTokenString())).append("\n");
+        builder.append(indent).append("type : ").append(token.getType()).append("\n");
+        return builder.toString();
+    }
+
+    /** Returns the string in single quotes, or null if it is null */
+    private static String quoteString(String str) {
+        return str != null ? "'" + str + "'" : null;
+    }
+
+    /** Returns whether the type of this is indexable and the original string is neither null nor empty */
+    @Override
+    public boolean isIndexable() {
+        // a token without an original string has nothing to index, rather than being an error
+        return getType().isIndexable() && getOrig() != null && getOrig().length() > 0;
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
new file mode 100644
index 00000000000..9d1a6a5dbb8
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
@@ -0,0 +1,68 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.process.TokenType;
+
+/**
+ * Maps the Unicode general category of a code point onto a {@link TokenType}.
+ *
+ * @author arnej27959
+ */
+class SimpleTokenType {
+
+    /**
+     * Returns the token type of a code point, decided solely by {@link Character#getType(int)}.
+     *
+     * @param codePoint the Unicode code point to classify
+     * @return the token type that the code point's general category maps to
+     * @throws UnsupportedOperationException if the category is none of those handled below
+     */
+    public static TokenType valueOf(int codePoint) {
+        final int category = Character.getType(codePoint);
+        switch (category) {
+            case Character.UPPERCASE_LETTER:
+            case Character.LOWERCASE_LETTER:
+            case Character.TITLECASE_LETTER:
+            case Character.MODIFIER_LETTER:
+            case Character.OTHER_LETTER:
+            case Character.LETTER_NUMBER:
+                // "SMALL ROMAN NUMERAL SIX" etc (letter-like)
+            case Character.NON_SPACING_MARK:
+                // "combining grave accent" and "DEVANAGARI VOWEL SIGN SHORT E" etc (letter-like)
+            case Character.COMBINING_SPACING_MARK:
+                // "DEVANAGARI VOWEL SIGN SHORT O" and similar (letter-like)
+                return TokenType.ALPHABETIC;
+
+            case Character.MATH_SYMBOL:
+            case Character.CURRENCY_SYMBOL:
+            case Character.MODIFIER_SYMBOL:
+            case Character.OTHER_SYMBOL:
+            case Character.ENCLOSING_MARK:
+                // "enclosing circle" etc is symbol-like
+                return TokenType.SYMBOL;
+
+            case Character.DECIMAL_DIGIT_NUMBER:
+            case Character.OTHER_NUMBER:
+                // "SUPERSCRIPT TWO", "DINGBAT CIRCLED SANS-SERIF DIGIT THREE"
+                // and more numbers that should mostly normalize to digits
+                return TokenType.NUMERIC;
+
+            case Character.SPACE_SEPARATOR:
+            case Character.LINE_SEPARATOR:
+            case Character.PARAGRAPH_SEPARATOR:
+                return TokenType.SPACE;
+
+            case Character.DASH_PUNCTUATION:
+            case Character.START_PUNCTUATION:
+            case Character.END_PUNCTUATION:
+            case Character.CONNECTOR_PUNCTUATION:
+            case Character.OTHER_PUNCTUATION:
+            case Character.INITIAL_QUOTE_PUNCTUATION:
+            case Character.FINAL_QUOTE_PUNCTUATION:
+                return TokenType.PUNCTUATION;
+
+            case Character.CONTROL:
+            case Character.FORMAT:
+            case Character.SURROGATE:
+            case Character.PRIVATE_USE:
+            case Character.UNASSIGNED:
+                return TokenType.UNKNOWN;
+
+            default:
+                // A category this class does not know about; report its numeric value.
+                throw new UnsupportedOperationException(String.valueOf(category));
+        }
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
new file mode 100644
index 00000000000..48a12c54e86
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -0,0 +1,76 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.LinguisticsCase;
+import com.yahoo.language.process.*;
+import com.yahoo.language.simple.kstem.KStemmer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * <p>A tokenizer which splits the input at every code point whose token type is not indexable,
+ * normalizes and transforms the pieces using the given implementations and stems them using
+ * the kstem algorithm.</p>
+ *
+ * <p>This is not multithread safe.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ * @author bratseth
+ */
+public class SimpleTokenizer implements Tokenizer {
+
+    /** Code point assumed to follow the last character of the input, so the final token is emitted. */
+    private final static int SPACE_CODE = 32;
+    private final Normalizer normalizer;
+    private final Transformer transformer;
+    private final KStemmer stemmer = new KStemmer();
+
+    /** Creates a tokenizer using a {@link SimpleNormalizer} and a {@link SimpleTransformer}. */
+    public SimpleTokenizer() {
+        this(new SimpleNormalizer(), new SimpleTransformer());
+    }
+
+    /** Creates a tokenizer using the given normalizer and a {@link SimpleTransformer}. */
+    public SimpleTokenizer(Normalizer normalizer) {
+        this(normalizer, new SimpleTransformer());
+    }
+
+    /** Creates a tokenizer using the given normalizer and transformer. */
+    public SimpleTokenizer(Normalizer normalizer, Transformer transformer) {
+        this.normalizer = normalizer;
+        this.transformer = transformer;
+    }
+
+    /**
+     * Splits the input into tokens. A run of consecutive indexable code points becomes one token,
+     * and every non-indexable code point becomes a token of its own. Each token records its char
+     * offset into the input, the type of its first code point and its processed token string.
+     *
+     * @param input         the text to tokenize
+     * @param language      passed on to the transformer when accents are removed
+     * @param stemMode      any value other than {@link StemMode#NONE} turns stemming on
+     * @param removeAccents whether to run the transformer's accent dropping on each token
+     * @return the tokens in input order; an empty list for empty input
+     */
+    @Override
+    public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+        if (input.isEmpty()) return Collections.emptyList();
+
+        List<Token> tokens = new ArrayList<>();
+        int codePoint = input.codePointAt(0);
+        TokenType prevType = SimpleTokenType.valueOf(codePoint);
+        int prev = 0;
+        int next = Character.charCount(codePoint);
+        while (next <= input.length()) {
+            // One step past the end we pretend to read a space, which closes the last token.
+            codePoint = (next < input.length()) ? input.codePointAt(next) : SPACE_CODE;
+            TokenType nextType = SimpleTokenType.valueOf(codePoint);
+            boolean insideIndexableRun = prevType.isIndexable() && nextType.isIndexable();
+            if (!insideIndexableRun) {
+                String original = input.substring(prev, next);
+                String token = processToken(original, language, stemMode, removeAccents);
+                tokens.add(new SimpleToken(original).setOffset(prev)
+                                                    .setType(prevType)
+                                                    .setTokenString(token));
+                prev = next;
+                prevType = nextType;
+            }
+            next += Character.charCount(codePoint);
+        }
+        return tokens;
+    }
+
+    /** Normalizes and lowercases a token, then optionally drops accents and stems it, in that order. */
+    private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
+        String processed = LinguisticsCase.toLowerCase(normalizer.normalize(token));
+        if (removeAccents) {
+            processed = transformer.accentDrop(processed, language);
+        }
+        return (stemMode == StemMode.NONE) ? processed : stemmer.stem(processed);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java
new file mode 100644
index 00000000000..409ef44986e
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java
@@ -0,0 +1,25 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.process.Transformer;
+
+import java.text.Normalizer;
+import java.util.regex.Pattern;
+
+/**
+ * Converts all accented characters into their de-accented counterparts followed by their combining diacritics, then
+ * strips off the diacritics using a regex.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class SimpleTransformer implements Transformer {
+
+ private final static Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
+
+ @Override
+ public String accentDrop(String input, Language language) {
+ return pattern.matcher(Normalizer.normalize(input, Normalizer.Form.NFD)).replaceAll("");
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java
new file mode 100644
index 00000000000..355acf41525
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArrayMap.java
@@ -0,0 +1,661 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
+ * under the terms of the Apache License, Version 2.0.
+ */
+package com.yahoo.language.simple.kstem;
+
+
+import java.util.AbstractMap;
+import java.util.AbstractSet;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+/**
+ * A simple class that stores key Strings as char[]'s in a
+ * hash table. Note that this is not a general purpose
+ * class. For example, it cannot remove items from the
+ * map, nor does it resize its hash table to be smaller,
+ * etc. It is designed to be quick to retrieve items
+ * by char[] keys without the necessity of converting
+ * to a String first.
+ */
+public class CharArrayMap<V> extends AbstractMap<Object,V> {
+
+    private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();
+
+    private final static int INIT_SIZE = 8;
+    private final CharacterUtils charUtils;
+    private boolean ignoreCase;
+    private int count;
+    char[][] keys; // hash table of keys, null marks a free slot; replaced as a whole by rehash() and copy()
+    V[] values; // values[i] belongs to keys[i]; replaced as a whole by rehash() and copy()
+
+    /**
+     * Create map with enough capacity to hold startSize terms
+     *
+     * @param startSize
+     *          the initial capacity
+     * @param ignoreCase
+     *          <code>false</code> if and only if the map should be case sensitive
+     *          otherwise <code>true</code>.
+     */
+    @SuppressWarnings("unchecked")
+    public CharArrayMap(int startSize, boolean ignoreCase) {
+        this.ignoreCase = ignoreCase;
+        // The table size stays a power of two (getSlot masks with keys.length-1)
+        // and leaves 25% headroom over the requested capacity.
+        int size = INIT_SIZE;
+        while(startSize + (startSize>>2) > size)
+            size <<= 1;
+        keys = new char[size][];
+        values = (V[]) new Object[size];
+        this.charUtils = CharacterUtils.getInstance();
+    }
+
+    /**
+     * Creates a map from the mappings in another map.
+     *
+     * @param c
+     *          a map whose mappings are to be copied
+     * @param ignoreCase
+     *          <code>false</code> if and only if the map should be case sensitive
+     *          otherwise <code>true</code>.
+     */
+    public CharArrayMap(Map<?,? extends V> c, boolean ignoreCase) {
+        this(c.size(), ignoreCase);
+        putAll(c);
+    }
+
+    /**
+     * Creates a map which shares the key and value arrays of the supplied map
+     * (used internally for read-only maps and by {@link #copy}).
+     */
+    private CharArrayMap(CharArrayMap<V> toCopy){
+        this.keys = toCopy.keys;
+        this.values = toCopy.values;
+        this.ignoreCase = toCopy.ignoreCase;
+        this.count = toCopy.count;
+        this.charUtils = toCopy.charUtils;
+    }
+
+    /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
+    @Override
+    public void clear() {
+        count = 0;
+        Arrays.fill(keys, null);
+        Arrays.fill(values, null);
+    }
+
+    /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+     * are in the {@link #keySet()} */
+    public boolean containsKey(char[] text, int off, int len) {
+        return keys[getSlot(text, off, len)] != null;
+    }
+
+    /** true if the <code>CharSequence</code> is in the {@link #keySet()} */
+    public boolean containsKey(CharSequence cs) {
+        return keys[getSlot(cs)] != null;
+    }
+
+    /** true if the given key is in the map; any key which is not a char[] is looked up by its string representation */
+    @Override
+    public boolean containsKey(Object o) {
+        if (o instanceof char[]) {
+            final char[] text = (char[])o;
+            return containsKey(text, 0, text.length);
+        }
+        return containsKey(o.toString());
+    }
+
+    /** returns the value of the mapping of <code>len</code> chars of <code>text</code>
+     * starting at <code>off</code> */
+    public V get(char[] text, int off, int len) {
+        return values[getSlot(text, off, len)];
+    }
+
+    /** returns the value of the mapping of the chars inside this {@code CharSequence} */
+    public V get(CharSequence cs) {
+        return values[getSlot(cs)];
+    }
+
+    /** returns the value mapped to the given key; any key which is not a char[] is looked up by its string representation */
+    @Override
+    public V get(Object o) {
+        if (o instanceof char[]) {
+            final char[] text = (char[])o;
+            return get(text, 0, text.length);
+        }
+        return get(o.toString());
+    }
+
+    /**
+     * Returns the index of the slot holding the given chars, or if they are not
+     * in the table, the index of the free slot where they would be stored.
+     */
+    private int getSlot(char[] text, int off, int len) {
+        int code = getHashCode(text, off, len);
+        int pos = code & (keys.length-1);
+        char[] text2 = keys[pos];
+        if (text2 != null && !equals(text, off, len, text2)) {
+            // Collision: keep stepping by an odd increment derived from the hash
+            // until the key or a free slot is found.
+            final int inc = ((code>>8)+code)|1;
+            do {
+                code += inc;
+                pos = code & (keys.length-1);
+                text2 = keys[pos];
+            } while (text2 != null && !equals(text, off, len, text2));
+        }
+        return pos;
+    }
+
+    /**
+     * Returns the index of the slot holding the given text, or if it is not
+     * in the table, the index of the free slot where it would be stored.
+     */
+    private int getSlot(CharSequence text) {
+        int code = getHashCode(text);
+        int pos = code & (keys.length-1);
+        char[] text2 = keys[pos];
+        if (text2 != null && !equals(text, text2)) {
+            // Same probing scheme as in getSlot(char[], int, int)
+            final int inc = ((code>>8)+code)|1;
+            do {
+                code += inc;
+                pos = code & (keys.length-1);
+                text2 = keys[pos];
+            } while (text2 != null && !equals(text, text2));
+        }
+        return pos;
+    }
+
+    /** Add the given mapping. */
+    public V put(CharSequence text, V value) {
+        return put(text.toString(), value); // could be more efficient
+    }
+
+    /** Add the given mapping; any key which is not a char[] is stored by its string representation. */
+    @Override
+    public V put(Object o, V value) {
+        if (o instanceof char[]) {
+            return put((char[])o, value);
+        }
+        return put(o.toString(), value);
+    }
+
+    /** Add the given mapping. */
+    public V put(String text, V value) {
+        return put(text.toCharArray(), value);
+    }
+
+    /** Add the given mapping.
+     * If ignoreCase is true for this map, the text array will be directly modified.
+     * The user should never modify this text array after calling this method.
+     *
+     * @return the value previously stored in the slot of this key, or null if the key was not present
+     */
+    public V put(char[] text, V value) {
+        if (ignoreCase) {
+            charUtils.toLowerCase(text, 0, text.length);
+        }
+        int slot = getSlot(text, 0, text.length);
+        if (keys[slot] != null) {
+            final V oldValue = values[slot];
+            values[slot] = value;
+            return oldValue;
+        }
+        keys[slot] = text;
+        values[slot] = value;
+        count++;
+
+        // Grow before the table gets fuller than 80%, so probing always finds a free slot
+        if (count + (count>>2) > keys.length) {
+            rehash();
+        }
+
+        return null;
+    }
+
+    /** Doubles the size of the table and reinserts all existing entries. */
+    @SuppressWarnings("unchecked")
+    private void rehash() {
+        assert keys.length == values.length;
+        final int newSize = 2*keys.length;
+        final char[][] oldkeys = keys;
+        final V[] oldvalues = values;
+        keys = new char[newSize][];
+        values = (V[]) new Object[newSize];
+
+        for(int i=0; i<oldkeys.length; i++) {
+            char[] text = oldkeys[i];
+            if (text != null) {
+                // todo: could be faster... no need to compare strings on collision
+                final int slot = getSlot(text,0,text.length);
+                keys[slot] = text;
+                values[slot] = oldvalues[i];
+            }
+        }
+    }
+
+    /**
+     * Compares a range of text1 to the stored key text2. When ignoring case, the code points of
+     * text1 are lowercased before comparing; text2 is compared as it is.
+     */
+    private boolean equals(char[] text1, int off, int len, char[] text2) {
+        if (len != text2.length)
+            return false;
+        final int limit = off+len;
+        if (ignoreCase) {
+            for(int i=0;i<len;) {
+                final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
+                if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
+                    return false;
+                i += Character.charCount(codePointAt);
+            }
+        } else {
+            for(int i=0;i<len;i++) {
+                if (text1[off+i] != text2[i])
+                    return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Compares text1 to the stored key text2. When ignoring case, the code points of
+     * text1 are lowercased before comparing; text2 is compared as it is.
+     */
+    private boolean equals(CharSequence text1, char[] text2) {
+        int len = text1.length();
+        if (len != text2.length)
+            return false;
+        if (ignoreCase) {
+            for(int i=0;i<len;) {
+                final int codePointAt = charUtils.codePointAt(text1, i);
+                if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
+                    return false;
+                i += Character.charCount(codePointAt);
+            }
+        } else {
+            for(int i=0;i<len;i++) {
+                if (text1.charAt(i) != text2[i])
+                    return false;
+            }
+        }
+        return true;
+    }
+
+    /** Computes a 31-multiplier hash over the given range, over lowercased code points when ignoring case. */
+    private int getHashCode(char[] text, int offset, int len) {
+        if (text == null)
+            throw new NullPointerException();
+        int code = 0;
+        final int stop = offset + len;
+        if (ignoreCase) {
+            for (int i=offset; i<stop;) {
+                final int codePointAt = charUtils.codePointAt(text, i, stop);
+                code = code*31 + Character.toLowerCase(codePointAt);
+                i += Character.charCount(codePointAt);
+            }
+        } else {
+            for (int i=offset; i<stop; i++) {
+                code = code*31 + text[i];
+            }
+        }
+        return code;
+    }
+
+    /** Computes a 31-multiplier hash over the given text, over lowercased code points when ignoring case. */
+    private int getHashCode(CharSequence text) {
+        if (text == null)
+            throw new NullPointerException();
+        int code = 0;
+        int len = text.length();
+        if (ignoreCase) {
+            for (int i=0; i<len;) {
+                int codePointAt = charUtils.codePointAt(text, i);
+                code = code*31 + Character.toLowerCase(codePointAt);
+                i += Character.charCount(codePointAt);
+            }
+        } else {
+            for (int i=0; i<len; i++) {
+                code = code*31 + text.charAt(i);
+            }
+        }
+        return code;
+    }
+
+    /** Not supported: this map cannot remove entries. */
+    @Override
+    public V remove(Object key) {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int size() {
+        return count;
+    }
+
+    @Override
+    public String toString() {
+        final StringBuilder sb = new StringBuilder("{");
+        for (Map.Entry<Object,V> entry : entrySet()) {
+            if (sb.length()>1) sb.append(", ");
+            sb.append(entry);
+        }
+        return sb.append('}').toString();
+    }
+
+    private EntrySet entrySet = null;
+    private CharArraySet keySet = null;
+
+    /** Creates the entry set view; overridden by the unmodifiable map to return a read-only one. */
+    EntrySet createEntrySet() {
+        return new EntrySet(true);
+    }
+
+    /** Returns the entry set view of this map, created on first use. */
+    @Override
+    public final EntrySet entrySet() {
+        if (entrySet == null) {
+            entrySet = createEntrySet();
+        }
+        return entrySet;
+    }
+
+    // helper for CharArraySet to not produce endless recursion
+    final Set<Object> originalKeySet() {
+        return super.keySet();
+    }
+
+    /** Returns a {@link CharArraySet} view on the map's keys.
+     * Keys cannot be added through the returned set. */
+    @Override @SuppressWarnings({"unchecked","rawtypes"})
+    public final CharArraySet keySet() {
+        if (keySet == null) {
+            // prevent adding of entries
+            keySet = new CharArraySet((CharArrayMap) this) {
+                @Override
+                public boolean add(Object o) {
+                    throw new UnsupportedOperationException();
+                }
+                @Override
+                public boolean add(CharSequence text) {
+                    throw new UnsupportedOperationException();
+                }
+                @Override
+                public boolean add(String text) {
+                    throw new UnsupportedOperationException();
+                }
+                @Override
+                public boolean add(char[] text) {
+                    throw new UnsupportedOperationException();
+                }
+            };
+        }
+        return keySet;
+    }
+
+    /** public iterator class so efficient methods are exposed to users */
+    public class EntryIterator implements Iterator<Map.Entry<Object,V>> {
+        private int pos=-1;      // slot of the entry returned by the next call to next()/nextKey()
+        private int lastPos;     // slot of the entry returned by the last call to next()/nextKey()
+        private final boolean allowModify;
+
+        private EntryIterator(boolean allowModify) {
+            this.allowModify = allowModify;
+            goNext();
+        }
+
+        /** Remembers the current slot as the last one returned and advances to the next occupied slot. */
+        private void goNext() {
+            lastPos = pos;
+            pos++;
+            while (pos < keys.length && keys[pos] == null) pos++;
+        }
+
+        @Override
+        public boolean hasNext() {
+            return pos < keys.length;
+        }
+
+        /**
+         * gets the next key... do not modify the returned char[]
+         *
+         * @throws NoSuchElementException if the iteration has no more elements
+         */
+        public char[] nextKey() {
+            if (!hasNext())
+                throw new NoSuchElementException();
+            goNext();
+            return keys[lastPos];
+        }
+
+        /** gets the next key as a newly created String object */
+        public String nextKeyString() {
+            return new String(nextKey());
+        }
+
+        /** returns the value associated with the last key returned */
+        public V currentValue() {
+            return values[lastPos];
+        }
+
+        /** sets the value associated with the last key returned */
+        public V setValue(V value) {
+            if (!allowModify)
+                throw new UnsupportedOperationException();
+            V old = values[lastPos];
+            values[lastPos] = value;
+            return old;
+        }
+
+        /**
+         * use nextKey() + currentValue() for better efficiency.
+         *
+         * @throws NoSuchElementException if the iteration has no more elements
+         */
+        @Override
+        public Map.Entry<Object,V> next() {
+            // Without this check an entry pointing outside of the table would be handed out
+            if (!hasNext())
+                throw new NoSuchElementException();
+            goNext();
+            return new MapEntry(lastPos, allowModify);
+        }
+
+        @Override
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+    }
+
+    /** An entry which reads and writes the slot at a fixed position of the table. */
+    private final class MapEntry implements Map.Entry<Object,V> {
+        private final int pos;
+        private final boolean allowModify;
+
+        private MapEntry(int pos, boolean allowModify) {
+            this.pos = pos;
+            this.allowModify = allowModify;
+        }
+
+        @Override
+        public Object getKey() {
+            // we must clone here, as putAll to another CharArrayMap
+            // with other case sensitivity flag would corrupt the keys
+            return keys[pos].clone();
+        }
+
+        @Override
+        public V getValue() {
+            return values[pos];
+        }
+
+        @Override
+        public V setValue(V value) {
+            if (!allowModify)
+                throw new UnsupportedOperationException();
+            final V old = values[pos];
+            values[pos] = value;
+            return old;
+        }
+
+        @Override
+        public String toString() {
+            return new StringBuilder().append(keys[pos]).append('=')
+                    .append((values[pos] == CharArrayMap.this) ? "(this Map)" : values[pos])
+                    .toString();
+        }
+    }
+
+    /** public EntrySet class so efficient methods are exposed to users */
+    public final class EntrySet extends AbstractSet<Map.Entry<Object,V>> {
+        private final boolean allowModify;
+
+        private EntrySet(boolean allowModify) {
+            this.allowModify = allowModify;
+        }
+
+        @Override
+        public EntryIterator iterator() {
+            return new EntryIterator(allowModify);
+        }
+
+        @Override
+        @SuppressWarnings("unchecked")
+        public boolean contains(Object o) {
+            if (!(o instanceof Map.Entry))
+                return false;
+            final Map.Entry<Object,V> e = (Map.Entry<Object,V>)o;
+            final Object key = e.getKey();
+            final Object val = e.getValue();
+            final Object v = get(key);
+            return v == null ? val == null : v.equals(val);
+        }
+
+        @Override
+        public boolean remove(Object o) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public int size() {
+            return count;
+        }
+
+        @Override
+        public void clear() {
+            if (!allowModify)
+                throw new UnsupportedOperationException();
+            CharArrayMap.this.clear();
+        }
+    }
+
+    /**
+     * Returns an unmodifiable {@link CharArrayMap}. This allows to provide
+     * unmodifiable views of internal map for "read-only" use.
+     * The returned map shares the key and value arrays which the given map
+     * has at the time of the call.
+     *
+     * @param map
+     *          a map for which the unmodifiable map is returned.
+     * @return a new unmodifiable {@link CharArrayMap}, the empty map if the given map
+     *         is empty, or the given map itself if it is already unmodifiable.
+     * @throws NullPointerException
+     *           if the given map is <code>null</code>.
+     */
+    public static <V> CharArrayMap<V> unmodifiableMap(CharArrayMap<V> map) {
+        if (map == null)
+            throw new NullPointerException("Given map is null");
+        if (map == emptyMap() || map.isEmpty())
+            return emptyMap();
+        if (map instanceof UnmodifiableCharArrayMap)
+            return map;
+        return new UnmodifiableCharArrayMap<>(map);
+    }
+
+    /**
+     * Returns a copy of the given map as a {@link CharArrayMap}. If the given map
+     * is a {@link CharArrayMap} the ignoreCase property will be preserved.
+     *
+     * @param map
+     *          a map to copy
+     * @return a copy of the given map as a {@link CharArrayMap}. If the given map
+     *         is a {@link CharArrayMap} the ignoreCase property of the given map
+     *         is preserved, otherwise the copy is case sensitive.
+     */
+    @SuppressWarnings("unchecked")
+    public static <V> CharArrayMap<V> copy(final Map<?,? extends V> map) {
+        if(map == EMPTY_MAP)
+            return emptyMap();
+        if(map instanceof CharArrayMap) {
+            CharArrayMap<V> m = (CharArrayMap<V>) map;
+            // use fast path instead of iterating all values
+            // this is even on very small sets ~10 times faster than iterating
+            final char[][] keys = new char[m.keys.length][];
+            System.arraycopy(m.keys, 0, keys, 0, keys.length);
+            final V[] values = (V[]) new Object[m.values.length];
+            System.arraycopy(m.values, 0, values, 0, values.length);
+            m = new CharArrayMap<>(m);
+            m.keys = keys;
+            m.values = values;
+            return m;
+        }
+        // In jdk-9b54 or later, a plain diamond causes compile error with "-source 1.7":
+        return new CharArrayMap<V>(map, false);
+    }
+
+    /** Returns an empty, unmodifiable map. */
+    @SuppressWarnings("unchecked")
+    public static <V> CharArrayMap<V> emptyMap() {
+        return (CharArrayMap<V>) EMPTY_MAP;
+    }
+
+    // package private CharArraySet instanceof check in CharArraySet
+    static class UnmodifiableCharArrayMap<V> extends CharArrayMap<V> {
+
+        UnmodifiableCharArrayMap(CharArrayMap<V> map) {
+            super(map);
+        }
+
+        @Override
+        public void clear() {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public V put(Object o, V val){
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public V put(char[] text, V val) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public V put(CharSequence text, V val) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public V put(String text, V val) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public V remove(Object key) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        EntrySet createEntrySet() {
+            return new EntrySet(false);
+        }
+    }
+
+    /**
+     * Empty array map optimized for speed.
+     * Contains checks will always return <code>false</code> or throw
+     * NPE if necessary.
+     */
+    private static final class EmptyCharArrayMap<V> extends UnmodifiableCharArrayMap<V> {
+        EmptyCharArrayMap() {
+            super(new CharArrayMap<V>(0, false));
+        }
+
+        @Override
+        public boolean containsKey(char[] text, int off, int len) {
+            if(text == null)
+                throw new NullPointerException();
+            return false;
+        }
+
+        @Override
+        public boolean containsKey(CharSequence cs) {
+            if(cs == null)
+                throw new NullPointerException();
+            return false;
+        }
+
+        @Override
+        public boolean containsKey(Object o) {
+            if(o == null)
+                throw new NullPointerException();
+            return false;
+        }
+
+        @Override
+        public V get(char[] text, int off, int len) {
+            if(text == null)
+                throw new NullPointerException();
+            return null;
+        }
+
+        @Override
+        public V get(CharSequence cs) {
+            if(cs == null)
+                throw new NullPointerException();
+            return null;
+        }
+
+        @Override
+        public V get(Object o) {
+            if(o == null)
+                throw new NullPointerException();
+            return null;
+        }
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java
new file mode 100644
index 00000000000..df7dc32070b
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharArraySet.java
@@ -0,0 +1,184 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
+ * under the terms of the Apache License, Version 2.0.
+ */
+package com.yahoo.language.simple.kstem;
+
+
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * A simple class that stores Strings as char[]'s in a
+ * hash table. Note that this is not a general purpose
+ * class. For example, it cannot remove items from the
+ * set, nor does it resize its hash table to be smaller,
+ * etc. It is designed to be quick to test if a char[]
+ * is in the set without the necessity of converting it
+ * to a String first.
+ *
+ * <P>
+ * <em>Please note:</em> This class implements {@link java.util.Set Set} but
+ * does not behave like it should in all cases. The generic type is
+ * {@code Set<Object>}, because you can add any object to it,
+ * that has a string representation. The add methods will use
+ * {@link Object#toString} and store the result using a {@code char[]}
+ * buffer. The {@code contains()} methods behave the same way.
+ * The {@link #iterator()} is declared as an {@code Iterator<Object>}
+ * whose elements are {@code char[]} instances.
+ */
+public class CharArraySet extends AbstractSet<Object> {
+
+ /** The shared empty set, backed by {@link CharArrayMap#emptyMap()}. */
+ public static final CharArraySet EMPTY_SET = new CharArraySet(CharArrayMap.<Object>emptyMap());
+ // The value stored in the backing map for every element of the set
+ private static final Object PLACEHOLDER = new Object();
+
+ // The elements of this set are the keys of this map
+ private final CharArrayMap<Object> map;
+
+ /**
+ * Create set with enough capacity to hold startSize terms
+ *
+ * @param startSize
+ * the initial capacity
+ * @param ignoreCase
+ * <code>false</code> if and only if the set should be case sensitive
+ * otherwise <code>true</code>.
+ */
+ public CharArraySet(int startSize, boolean ignoreCase) {
+ this(new CharArrayMap<>(startSize, ignoreCase));
+ }
+
+ /**
+ * Creates a set from a Collection of objects.
+ *
+ * @param c
+ * a collection whose elements are to be placed into the set
+ * @param ignoreCase
+ * <code>false</code> if and only if the set should be case sensitive
+ * otherwise <code>true</code>.
+ */
+ public CharArraySet(Collection<?> c, boolean ignoreCase) {
+ this(c.size(), ignoreCase);
+ addAll(c);
+ }
+
+ /** Create set from the specified map (internal only), used also by {@link CharArrayMap#keySet()} */
+ CharArraySet(final CharArrayMap<Object> map){
+ this.map = map;
+ }
+
+ /** Clears all entries in this set. This method is supported for reusing, but not {@link Set#remove}. */
+ @Override
+ public void clear() {
+ map.clear();
+ }
+
+ /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
+ * are in the set */
+ public boolean contains(char[] text, int off, int len) {
+ return map.containsKey(text, off, len);
+ }
+
+ /** true if the <code>CharSequence</code> is in the set */
+ public boolean contains(CharSequence cs) {
+ return map.containsKey(cs);
+ }
+
+ /** true if the given object is a key of the backing map */
+ @Override
+ public boolean contains(Object o) {
+ return map.containsKey(o);
+ }
+
+ /** Add this object into the set; returns true if the backing map had no value for it */
+ @Override
+ public boolean add(Object o) {
+ return map.put(o, PLACEHOLDER) == null;
+ }
+
+ /** Add this CharSequence into the set */
+ public boolean add(CharSequence text) {
+ return map.put(text, PLACEHOLDER) == null;
+ }
+
+ /** Add this String into the set */
+ public boolean add(String text) {
+ return map.put(text, PLACEHOLDER) == null;
+ }
+
+ /** Add this char[] directly to the set.
+ * If ignoreCase is true for this Set, the text array will be directly modified.
+ * The user should never modify this text array after calling this method.
+ */
+ public boolean add(char[] text) {
+ return map.put(text, PLACEHOLDER) == null;
+ }
+
+ @Override
+ public int size() {
+ return map.size();
+ }
+
+ /**
+ * Returns an unmodifiable {@link CharArraySet}. This allows to provide
+ * unmodifiable views of internal sets for "read-only" use.
+ *
+ * @param set
+ * a set for which the unmodifiable set is returned.
+ * @return a new unmodifiable {@link CharArraySet}, or the given set itself
+ * if it is {@link #EMPTY_SET} or already backed by an unmodifiable map.
+ * @throws NullPointerException
+ * if the given set is <code>null</code>.
+ */
+ public static CharArraySet unmodifiableSet(CharArraySet set) {
+ if (set == null)
+ throw new NullPointerException("Given set is null");
+ if (set == EMPTY_SET)
+ return EMPTY_SET;
+ if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap)
+ return set;
+ return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
+ }
+
+ /**
+ * Returns a copy of the given set as a {@link CharArraySet}. If the given set
+ * is a {@link CharArraySet} the ignoreCase property will be preserved.
+ *
+ * @param set
+ * a set to copy
+ * @return a copy of the given set as a {@link CharArraySet}. If the given set
+ * is a {@link CharArraySet} the ignoreCase property of the given set
+ * is preserved, otherwise the copy is case sensitive.
+ */
+ public static CharArraySet copy(final Set<?> set) {
+ if(set == EMPTY_SET)
+ return EMPTY_SET;
+ if(set instanceof CharArraySet) {
+ final CharArraySet source = (CharArraySet) set;
+ return new CharArraySet(CharArrayMap.copy(source.map));
+ }
+ return new CharArraySet(set, false);
+ }
+
+ /**
+ * Returns an {@link Iterator} for {@code char[]} instances in this set.
+ */
+ @Override @SuppressWarnings("unchecked")
+ public Iterator<Object> iterator() {
+ // use the AbstractMap#keySet()'s iterator (to not produce endless recursion)
+ return map.originalKeySet().iterator();
+ }
+
+ /** Renders the elements as a comma separated list in square brackets, char[] elements as their text. */
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder("[");
+ for (Object item : this) {
+ if (sb.length()>1) sb.append(", ");
+ if (item instanceof char[]) {
+ sb.append((char[]) item);
+ } else {
+ sb.append(item);
+ }
+ }
+ return sb.append(']').toString();
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
new file mode 100644
index 00000000000..91bd6286b28
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
@@ -0,0 +1,375 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
+ * under the terms of the Apache License, Version 2.0.
+ */
+package com.yahoo.language.simple.kstem;
+
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * {@link CharacterUtils} provides a unified interface to Character-related
+ * operations to implement backwards compatible character operations.
+ */
+public abstract class CharacterUtils {
+
+ private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils(); // shared instance that treats each char as one code point
+ private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils(); // shared instance that delegates to java.lang.Character
+
+ /**
+ * Returns the default {@link CharacterUtils} implementation, which delegates
+ * code point handling to the methods of {@link Character}.
+ *
+ * @return the shared default instance
+ */
+ public static CharacterUtils getInstance() {
+ return JAVA_5;
+ }
+
+ /**
+ * Explicitly returns a version matching Java 4 semantics, in which every
+ * {@code char} counts as one code point and surrogate pairs are not combined.
+ *
+ * @return the shared Java 4 compatible instance
+ * @deprecated Only for n-gram backwards compat
+ */
+ @Deprecated
+ public static CharacterUtils getJava4Instance() {
+ return JAVA_4;
+ }
+
+ /**
+ * Returns the code point at the given index of the {@link CharSequence}.
+ *
+ * @param seq
+ * a character sequence
+ * @param offset
+ * the index of the char in the sequence at which the code point starts
+ *
+ * @return the Unicode code point at the given index
+ * @throws NullPointerException
+ * - if the sequence is null.
+ * @throws IndexOutOfBoundsException
+ * - if the value offset is negative or not less than the length of
+ * the character sequence.
+ */
+ public abstract int codePointAt(final CharSequence seq, final int offset);
+
+ /**
+ * Returns the code point at the given index of the char array where only elements
+ * with index less than the limit are used.
+ *
+ * @param chars
+ * a character array
+ * @param offset
+ * the index of the char in the array at which the code point starts
+ * @param limit the index after the last element that may be used to calculate
+ * the code point.
+ *
+ * @return the Unicode code point at the given index
+ * @throws NullPointerException
+ * - if the array is null.
+ * @throws IndexOutOfBoundsException
+ * - if the value offset is negative or not less than the limit.
+ */
+ public abstract int codePointAt(final char[] chars, final int offset, final int limit);
+
+ /**
+ * Returns the number of code points in <code>seq</code>, counted according to
+ * the semantics of the concrete implementation.
+ *
+ * @param seq the character sequence to count
+ * @return the number of code points in the sequence
+ */
+ public abstract int codePointCount(CharSequence seq);
+
+ /**
+  * Allocates a <code>char[]</code> of the requested size and wraps it in a fresh,
+  * empty {@link CharacterBuffer} (offset and length both zero).
+  *
+  * @param bufferSize
+  *          the internal char buffer size, must be <code>&gt;= 2</code>
+  * @return a new {@link CharacterBuffer} instance.
+  * @throws IllegalArgumentException
+  *           if <code>bufferSize</code> is smaller than 2
+  */
+ public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
+   if (bufferSize >= 2) {
+     final char[] storage = new char[bufferSize];
+     return new CharacterBuffer(storage, 0, 0);
+   }
+   throw new IllegalArgumentException("buffersize must be >= 2");
+ }
+
+
+ /**
+  * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)},
+  * in place, starting at the given offset.
+  * <p>
+  * Each code point is read with {@link #codePointAt(char[], int, int)}, its lowercase
+  * form is written back at the same index, and the loop advances by the number of
+  * chars written.
+  * </p>
+  *
+  * @param buffer the char buffer to lowercase
+  * @param offset the index of the first char to convert; must be within
+  *          <code>0..buffer.length</code>
+  * @param limit the index after the last char to convert; must not exceed
+  *          <code>buffer.length</code>
+  */
+ public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
+   assert buffer.length >= limit;
+   // The offset must be non-negative; any position inside the buffer is a valid start.
+   assert offset >= 0 && offset <= buffer.length;
+   for (int i = offset; i < limit;) {
+     final int lowered = Character.toLowerCase(codePointAt(buffer, i, limit));
+     i += Character.toChars(lowered, buffer, i);
+   }
+ }
+
+ /**
+  * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)},
+  * in place, starting at the given offset.
+  * <p>
+  * Each code point is read with {@link #codePointAt(char[], int, int)}, its uppercase
+  * form is written back at the same index, and the loop advances by the number of
+  * chars written.
+  * </p>
+  *
+  * @param buffer the char buffer to UPPERCASE
+  * @param offset the index of the first char to convert; must be within
+  *          <code>0..buffer.length</code>
+  * @param limit the index after the last char to convert; must not exceed
+  *          <code>buffer.length</code>
+  */
+ public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
+   assert buffer.length >= limit;
+   // The offset must be non-negative; any position inside the buffer is a valid start.
+   assert offset >= 0 && offset <= buffer.length;
+   for (int i = offset; i < limit;) {
+     final int raised = Character.toUpperCase(codePointAt(buffer, i, limit));
+     i += Character.toChars(raised, buffer, i);
+   }
+ }
+
+ /**
+  * Decodes <code>srcLen</code> chars of <code>src</code>, starting at <code>srcOff</code>,
+  * into code points stored in <code>dest</code> from <code>destOff</code> onwards.
+  *
+  * @return the number of code points written to the destination buffer
+  * @throws IllegalArgumentException if <code>srcLen</code> is negative
+  */
+ public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
+   if (srcLen < 0) {
+     throw new IllegalArgumentException("srcLen must be >= 0");
+   }
+   final int limit = srcOff + srcLen;
+   int consumed = 0;
+   int produced = 0;
+   while (consumed < srcLen) {
+     final int codePoint = codePointAt(src, srcOff + consumed, limit);
+     dest[destOff + produced] = codePoint;
+     produced++;
+     // Step over one or two chars, depending on how many the code point occupies.
+     consumed += Character.charCount(codePoint);
+   }
+   return produced;
+ }
+
+ /**
+  * Encodes <code>srcLen</code> code points of <code>src</code>, starting at
+  * <code>srcOff</code>, as chars stored in <code>dest</code> from <code>destOff</code> onwards.
+  *
+  * @return the number of chars written to the destination buffer
+  * @throws IllegalArgumentException if <code>srcLen</code> is negative
+  */
+ public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
+   if (srcLen < 0) {
+     throw new IllegalArgumentException("srcLen must be >= 0");
+   }
+   int total = 0;
+   int index = 0;
+   while (index < srcLen) {
+     final int codePoint = src[srcOff + index];
+     total += Character.toChars(codePoint, dest, destOff + total);
+     index++;
+   }
+   return total;
+ }
+
+ /**
+ * Fills the {@link CharacterBuffer} with characters read from the given
+ * reader {@link Reader}. This method tries to read <code>numChars</code>
+ * characters into the {@link CharacterBuffer}, each call to fill will start
+ * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
+ * In case code points can span across 2 java characters, this method may
+ * only fill <code>numChars - 1</code> characters in order not to split in
+ * the middle of a surrogate pair, even if there are remaining characters in
+ * the {@link Reader}.
+ * <p>
+ * This method guarantees
+ * that the given {@link CharacterBuffer} will never contain a high surrogate
+ * character as the last element in the buffer unless it is the last available
+ * character in the reader. In other words, high and low surrogate pairs will
+ * always be preserved across buffer borders.
+ * </p>
+ * <p>
+ * A return value of <code>false</code> means that this method call exhausted
+ * the reader, but there may be some chars which have been read, which can be
+ * verified by checking whether <code>buffer.getLength() &gt; 0</code>.
+ * </p>
+ *
+ * @param buffer
+ * the buffer to fill.
+ * @param reader
+ * the reader to read characters from.
+ * @param numChars
+ * the number of chars to read
+ * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
+ * @throws IllegalArgumentException
+ * if <code>numChars</code> is below the minimum of the implementation
+ * or larger than the size of the internal buffer.
+ * @throws IOException
+ * if the reader throws an {@link IOException}.
+ */
+ public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
+
+ /**
+ * Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>,
+ * i.e. asks for as many chars as the internal array of the buffer can hold.
+ *
+ * @param buffer the buffer to fill
+ * @param reader the reader to read characters from
+ * @return the result of {@link #fill(CharacterBuffer, Reader, int)}
+ * @throws IOException if the reader throws an {@link IOException}
+ */
+ public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+ return fill(buffer, reader, buffer.buffer.length);
+ }
+
+ /**
+ * Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
+ * code points from <code>index</code>.
+ *
+ * @param buf the char array
+ * @param start the index of the first char of the sub-array
+ * @param count the length of the sub-array in chars
+ * @param index the index to offset from
+ * @param offset the number of code points to move by
+ * @return the resulting index
+ */
+ public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
+
+ /**
+  * Reads from the reader until <code>len</code> chars have been stored in
+  * <code>dest</code> (starting at <code>offset</code>) or the reader reports end of
+  * stream by returning -1.
+  *
+  * @return the number of chars actually read, which is less than <code>len</code>
+  *         only if the end of the stream was reached
+  */
+ static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
+   int total = 0;
+   for (int remaining = len; remaining > 0; ) {
+     final int count = reader.read(dest, offset + total, remaining);
+     if (count == -1) {
+       return total;
+     }
+     total += count;
+     remaining -= count;
+   }
+   return total;
+ }
+
+ private static final class Java5CharacterUtils extends CharacterUtils {
+ Java5CharacterUtils() {
+ }
+
+ @Override
+ public int codePointAt(final CharSequence seq, final int offset) {
+ return Character.codePointAt(seq, offset);
+ }
+
+ @Override
+ public int codePointAt(final char[] chars, final int offset, final int limit) {
+ return Character.codePointAt(chars, offset, limit);
+ }
+
+ @Override
+ public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
+ assert buffer.buffer.length >= 2;
+ if (numChars < 2 || numChars > buffer.buffer.length) {
+ throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+ }
+ final char[] charBuffer = buffer.buffer;
+ buffer.offset = 0;
+ final int offset;
+
+ // Install the previously saved ending high surrogate:
+ if (buffer.lastTrailingHighSurrogate != 0) {
+ charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ buffer.lastTrailingHighSurrogate = 0;
+ offset = 1;
+ } else {
+ offset = 0;
+ }
+
+ final int read = readFully(reader, charBuffer, offset, numChars - offset);
+
+ buffer.length = offset + read;
+ final boolean result = buffer.length == numChars;
+ if (buffer.length < numChars) {
+ // We failed to fill the buffer. Even if the last char is a high
+ // surrogate, there is nothing we can do
+ return result;
+ }
+
+ if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+ buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+ }
+ return result;
+ }
+
+ @Override
+ public int codePointCount(CharSequence seq) {
+ return Character.codePointCount(seq, 0, seq.length());
+ }
+
+ @Override
+ public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+ return Character.offsetByCodePoints(buf, start, count, index, offset);
+ }
+ }
+
+ /**
+  * Implementation with Java 4 semantics: every <code>char</code> is treated as one
+  * code point and surrogate pairs are never combined.
+  */
+ private static final class Java4CharacterUtils extends CharacterUtils {
+   Java4CharacterUtils() {
+   }
+
+   /** Returns the char at <code>offset</code> as the code point. */
+   @Override
+   public int codePointAt(final CharSequence seq, final int offset) {
+     return seq.charAt(offset);
+   }
+
+   /** Returns the char at <code>offset</code>, which must lie below <code>limit</code>. */
+   @Override
+   public int codePointAt(final char[] chars, final int offset, final int limit) {
+     if (offset < limit) {
+       return chars[offset];
+     }
+     throw new IndexOutOfBoundsException("offset must be less than limit");
+   }
+
+   /**
+    * Reads up to <code>numChars</code> chars into the buffer starting at index 0.
+    * No surrogate handling takes place; the saved high surrogate is always cleared.
+    */
+   @Override
+   public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
+       throws IOException {
+     assert buffer.buffer.length >= 1;
+     if (!(numChars >= 1 && numChars <= buffer.buffer.length)) {
+       throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
+     }
+     buffer.offset = 0;
+     final int count = readFully(reader, buffer.buffer, 0, numChars);
+     buffer.length = count;
+     buffer.lastTrailingHighSurrogate = 0;
+     final boolean filled = count == numChars;
+     return filled;
+   }
+
+   /** Returns the length of the sequence, since each char counts as one code point. */
+   @Override
+   public int codePointCount(CharSequence seq) {
+     return seq.length();
+   }
+
+   /**
+    * Returns <code>index + offset</code>.
+    * NOTE(review): the bounds check compares the result against
+    * <code>[0, count]</code> and ignores <code>start</code>; confirm that callers
+    * always pass <code>start == 0</code>.
+    */
+   @Override
+   public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
+     final int target = index + offset;
+     if (target >= 0 && target <= count) {
+       return target;
+     }
+     throw new IndexOutOfBoundsException();
+   }
+
+ }
+
+ /**
+  * A simple IO buffer to use with
+  * {@link CharacterUtils#fill(CharacterBuffer, Reader)}. It holds a char array
+  * together with the offset and length of the data currently stored in it.
+  */
+ public static final class CharacterBuffer {
+
+   private final char[] buffer;
+   private int offset;
+   private int length;
+   // NOTE: deliberately not private, so the outer class can access it
+   // without $access methods.
+   char lastTrailingHighSurrogate;
+
+   CharacterBuffer(char[] buffer, int offset, int length) {
+     this.length = length;
+     this.offset = offset;
+     this.buffer = buffer;
+   }
+
+   /**
+    * Gives access to the char array backing this buffer.
+    *
+    * @return the buffer
+    */
+   public char[] getBuffer() {
+     return this.buffer;
+   }
+
+   /**
+    * Tells where the data starts in the internal buffer.
+    *
+    * @return the offset
+    */
+   public int getOffset() {
+     return this.offset;
+   }
+
+   /**
+    * Tells how many chars of data the internal buffer holds, counted from
+    * {@link #getOffset()}.
+    *
+    * @return the length
+    */
+   public int getLength() {
+     return this.length;
+   }
+
+   /**
+    * Resets the CharacterBuffer: offset, length and the saved trailing high
+    * surrogate are all set back to zero. The char array itself is not cleared.
+    */
+   public void reset() {
+     lastTrailingHighSurrogate = 0;
+     length = 0;
+     offset = 0;
+   }
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java
new file mode 100644
index 00000000000..abdde0d619b
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData1.java
@@ -0,0 +1,716 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** A list of words used by Kstem
+ */
+class KStemData1 {
+ private KStemData1() {
+ }
+// KStemData1 ... KStemData8 are created from "head_word_list.txt"
+ static String[] data = {
+"aback","abacus","abandon","abandoned","abase",
+"abash","abate","abattoir","abbess","abbey",
+"abbot","abbreviate","abbreviation","abc","abdicate",
+"abdomen","abduct","abed","aberrant","aberration",
+"abet","abeyance","abhor","abhorrent","abide",
+"abiding","abilities","ability","abject","abjure",
+"ablative","ablaut","ablaze","able","ablution",
+"ablutions","ably","abnegation","abnormal","abo",
+"aboard","abode","abolish","abolition","abominable",
+"abominate","abomination","aboriginal","aborigine","abort",
+"abortion","abortionist","abortive","abound","about",
+"above","aboveboard","abracadabra","abrade","abrasion",
+"abrasive","abreast","abridge","abridgement","abridgment",
+"abroad","abrogate","abrupt","abscess","abscond",
+"absence","absent","absentee","absenteeism","absently",
+"absinth","absinthe","absolute","absolutely","absolution",
+"absolutism","absolve","absorb","absorbent","absorbing",
+"absorption","abstain","abstemious","abstention","abstinence",
+"abstract","abstracted","abstraction","abstruse","absurd",
+"abundance","abundant","abuse","abusive","abut",
+"abutment","abysmal","abyss","acacia","academic",
+"academician","academy","accede","accelerate","acceleration",
+"accelerator","accent","accentuate","accept","acceptable",
+"acceptance","access","accessible","accession","accessory",
+"accidence","accident","accidental","acclaim","acclamation",
+"acclimatize","acclivity","accolade","accommodate","accommodating",
+"accommodation","accommodations","accompaniment","accompanist","accompany",
+"accomplice","accomplish","accomplished","accomplishment","accord",
+"accordance","according","accordingly","accordion","accost",
+"account","accountable","accountancy","accountant","accoutrements",
+"accredit","accretion","accrue","accumulate","accumulation",
+"accumulative","accumulator","accuracy","accurate","accursed",
+"accusation","accusative","accuse","accused","accustom",
+"accustomed","ace","acerbity","acetate","acetic",
+"acetylene","ache","achieve","achievement","achoo",
+"acid","acidify","acidity","acidulated","acidulous",
+"acknowledge","acknowledgement","acknowledgment","acme","acne",
+"acolyte","aconite","acorn","acoustic","acoustics",
+"acquaint","acquaintance","acquaintanceship","acquiesce","acquiescent",
+"acquire","acquisition","acquisitive","acquit","acquittal",
+"acre","acreage","acrid","acrimony","acrobat",
+"acrobatic","acrobatics","acronym","across","acrostic",
+"act","acting","actinism","action","actionable",
+"activate","active","activist","activity","actor",
+"actress","acts","actual","actuality","actually",
+"actuary","actuate","acuity","acumen","acupuncture",
+"acute","adage","adagio","adam","adamant",
+"adapt","adaptable","adaptation","adapter","adaptor",
+"adc","add","addendum","adder","addict",
+"addiction","addictive","addition","additional","additive",
+"addle","address","addressee","adduce","adenoidal",
+"adenoids","adept","adequate","adhere","adherence",
+"adherent","adhesion","adhesive","adieu","adipose",
+"adj","adjacent","adjective","adjoin","adjourn",
+"adjudge","adjudicate","adjunct","adjure","adjust",
+"adjutant","adman","admass","administer","administration",
+"administrative","administrator","admirable","admiral","admiralty",
+"admiration","admire","admirer","admissible","admission",
+"admit","admittance","admitted","admittedly","admixture",
+"admonish","admonition","admonitory","ado","adobe",
+"adolescent","adopt","adoption","adoptive","adorable",
+"adoration","adore","adorn","adornment","adrenalin",
+"adrift","adroit","adulate","adulation","adult",
+"adulterate","adulterer","adultery","adumbrate","adv",
+"advance","advanced","advancement","advances","advantage",
+"advantageous","advent","adventist","adventitious","adventure",
+"adventurer","adventuress","adventurous","adverb","adverbial",
+"adversary","adverse","adversity","advert","advertise",
+"advertisement","advertising","advice","advisable","advise",
+"advisedly","adviser","advisor","advisory","advocacy",
+"advocate","adz","adze","aegis","aeon",
+"aerate","aerial","aerie","aerobatic","aerobatics",
+"aerodrome","aerodynamic","aerodynamics","aeronautics","aeroplane",
+"aerosol","aerospace","aertex","aery","aesthete",
+"aesthetic","aesthetics","aether","aethereal","aetiology",
+"afar","affable","affair","affect","affectation",
+"affected","affecting","affection","affectionate","affiance",
+"affidavit","affiliate","affiliation","affinity","affirm",
+"affirmative","affix","afflict","affliction","affluent",
+"afford","afforest","affray","affricate","affront",
+"aficionado","afield","afire","aflame","afloat",
+"afoot","aforesaid","aforethought","afraid","afresh",
+"afrikaans","afrikaner","afro","aft","after",
+"afterbirth","aftercare","aftereffect","afterglow","afterlife",
+"aftermath","afternoon","afternoons","afters","aftershave",
+"aftertaste","afterthought","afterwards","again","against",
+"agape","agate","age","ageing","ageless",
+"agency","agenda","agent","agglomerate","agglutination",
+"agglutinative","aggrandisement","aggrandizement","aggravate","aggravation",
+"aggregate","aggregation","aggression","aggressive","aggressor",
+"aggrieved","aggro","aghast","agile","agitate",
+"agitation","agitator","aglow","agnostic","ago",
+"agog","agonise","agonised","agonising","agonize",
+"agonized","agonizing","agony","agoraphobia","agoraphobic",
+"agrarian","agree","agreeable","agreeably","agreement",
+"agriculture","agronomy","aground","ague","aha",
+"ahead","ahem","ahoy","aid","ail",
+"aileron","ailment","aim","aimless","air",
+"airbase","airbed","airbladder","airborne","airbrake",
+"airbrick","airbus","aircraft","aircraftman","aircrew",
+"aircushion","airdrop","airedale","airfield","airflow",
+"airforce","airgun","airhole","airhostess","airily",
+"airing","airlane","airless","airletter","airlift",
+"airline","airliner","airlock","airmail","airman",
+"airplane","airpocket","airport","airs","airshaft",
+"airship","airsick","airspace","airspeed","airstrip",
+"airtight","airway","airwoman","airworthy","airy",
+"aisle","aitch","ajar","akimbo","akin",
+"alabaster","alack","alacrity","alarm","alarmist",
+"alas","albatross","albeit","albino","album",
+"albumen","alchemist","alchemy","alcohol","alcoholic",
+"alcoholism","alcove","alder","alderman","ale",
+"alehouse","alert","alfalfa","alfresco","algae",
+"algebra","algorithm","alias","alibi","alien",
+"alienate","alienation","alienist","alight","align",
+"alignment","alike","alimentary","alimony","aline",
+"alinement","alive","alkali","alkaline","all",
+"allah","allay","allegation","allege","allegedly",
+"allegiance","allegorical","allegory","allegretto","allegro",
+"alleluia","allergic","allergy","alleviate","alley",
+"alleyway","alliance","allied","alligator","alliteration",
+"alliterative","allocate","allocation","allopathy","allot",
+"allotment","allow","allowable","allowance","alloy",
+"allspice","allude","allure","allurement","allusion",
+"alluvial","alluvium","ally","almanac","almanack",
+"almighty","almond","almoner","almost","alms",
+"aloe","aloft","alone","along","alongside",
+"aloof","alopecia","aloud","alpaca","alpenhorn",
+"alpenstock","alpha","alphabet","alphabetical","alpine",
+"already","alright","alsatian","also","altar",
+"altarpiece","alter","alteration","altercation","alternate",
+"alternative","alternator","although","altimeter","altitude",
+"alto","altogether","altruism","altruist","alum",
+"aluminium","alumna","alumnus","alveolar","always",
+"alyssum","amalgam","amalgamate","amanuensis","amass",
+"amateur","amateurish","amatory","amaze","amazing",
+"amazon","ambassador","ambassadorial","amber","ambergris",
+"ambidextrous","ambience","ambient","ambiguous","ambit",
+"ambition","ambitious","ambivalent","amble","ambrosia",
+"ambulance","ambush","ame","ameba","ameliorate",
+"amen","amenable","amend","amendment","amends",
+"amenity","americanise","americanism","americanize","amethyst",
+"amiable","amicable","amid","amidships","amir",
+"amiss","amity","ammeter","ammo","ammonia",
+"ammonite","ammunition","amnesia","amnesty","amoeba",
+"amoebic","amok","among","amoral","amorous",
+"amorphous","amortise","amortize","amount","amour",
+"amp","amperage","ampersand","amphetamine","amphibian",
+"amphibious","amphitheater","amphitheatre","amphora","ample",
+"amplifier","amplify","amplitude","ampoule","amputate",
+"amputee","amuck","amulet","amuse","amusement",
+"anachronism","anaconda","anaemia","anaemic","anaesthesia",
+"anaesthetic","anaesthetist","anagram","anal","analgesia",
+"analgesic","analog","analogize","analogous","analogue",
+"analogy","analyse","analysis","analyst","analytic",
+"anapaest","anarchic","anarchism","anarchist","anarchy",
+"anathema","anathematize","anatomical","anatomist","anatomy",
+"ancestor","ancestral","ancestry","anchor","anchorage",
+"anchorite","anchovy","ancient","ancients","ancillary",
+"and","andante","andiron","androgynous","anecdotal",
+"anecdote","anemia","anemometer","anemone","anesthesia",
+"anesthetise","anesthetize","anew","angel","angelica",
+"angelus","anger","angle","anglican","anglicise",
+"anglicism","anglicize","angling","anglophile","anglophilia",
+"anglophobe","anglophobia","angora","angostura","angry",
+"angst","anguish","anguished","angular","aniline",
+"animadversion","animadvert","animal","animalcule","animalism",
+"animate","animation","animism","animosity","animus",
+"anis","anise","aniseed","ankle","anklet",
+"annals","anneal","annex","annexation","annexe",
+"annihilate","anniversary","annotate","annotation","announce",
+"announcement","announcer","annoy","annoyance","annual",
+"annuity","annul","annular","annunciation","anode",
+"anodyne","anoint","anomalous","anomaly","anon",
+"anonymity","anonymous","anopheles","anorak","anorexia",
+"another","answer","answerable","ant","antacid",
+"antagonism","antagonist","antagonize","antarctic","ante",
+"anteater","antecedence","antecedent","antecedents","antechamber",
+"antedate","antediluvian","antelope","antenatal","antenna",
+"antepenultimate","anterior","anteroom","anthem","anther",
+"anthill","anthology","anthracite","anthrax","anthropocentric",
+"anthropoid","anthropologist","anthropology","anthropomorphic","anthropomorphism",
+"anthropophagous","anthropophagy","antiaircraft","antibiotic","antibody",
+"antic","anticipate","anticipation","anticipatory","anticlerical",
+"anticlimax","anticlockwise","antics","anticyclone","antidote",
+"antifreeze","antigen","antihero","antihistamine","antiknock",
+"antilogarithm","antimacassar","antimatter","antimony","antipathetic",
+"antipathy","antipersonnel","antipodal","antipodes","antiquarian",
+"antiquary","antiquated","antique","antiquity","antirrhinum",
+"antiseptic","antisocial","antithesis","antithetic","antitoxin",
+"antler","antonym","anus","anvil","anxiety",
+"anxious","any","anybody","anyhow","anyplace",
+"anyroad","anything","anyway","anywhere","aorta",
+"apace","apanage","apart","apartheid","apartment",
+"apartments","apathetic","apathy","ape","aperient",
+"aperitif","aperture","apex","aphasia","aphasic",
+"aphid","aphorism","aphoristic","aphrodisiac","apiarist",
+"apiary","apices","apiculture","apiece","apish",
+"aplomb","apocalypse","apocalyptic","apocrypha","apocryphal",
+"apogee","apologetic","apologetics","apologia","apologise",
+"apologist","apologize","apology","apophthegm","apoplectic",
+"apoplexy","apostasy","apostate","apostatise","apostatize",
+"apostle","apostolic","apostrophe","apostrophize","apothecary",
+"apothegm","apotheosis","appal","appall","appalling",
+"appanage","apparatus","apparel","apparent","apparently",
+"apparition","appeal","appealing","appear","appearance",
+"appearances","appease","appeasement","appellant","appellate",
+"appellation","append","appendage","appendectomy","appendicitis",
+"appendix","appertain","appetite","appetizer","appetizing",
+"applaud","applause","apple","applejack","appliance",
+"applicable","applicant","application","applied","apply",
+"appoint","appointment","appointments","apportion","apposite",
+"apposition","appraisal","appraise","appreciable","appreciate",
+"appreciation","appreciative","apprehend","apprehension","apprehensive",
+"apprentice","apprenticeship","apprise","appro","approach",
+"approachable","approbation","approbatory","appropriate","appropriation",
+"approval","approve","approx","approximate","approximation",
+"appurtenance","apricot","april","apron","apropos",
+"apse","apt","aptitude","aqualung","aquamarine",
+"aquaplane","aquarium","aquatic","aquatint","aqueduct",
+"aqueous","aquiline","arab","arabesque","arabic",
+"arable","arachnid","arak","arbiter","arbitrary",
+"arbitrate","arbitration","arbitrator","arbor","arboreal",
+"arboretum","arbour","arc","arcade","arcadia",
+"arcane","arch","archaeology","archaic","archaism",
+"archangel","archbishop","archbishopric","archdeacon","archdeaconry",
+"archdiocese","archduke","archeology","archer","archery",
+"archetype","archimandrite","archipelago","architect","architecture",
+"archive","archway","arctic","ardent","ardor",
+"ardour","arduous","are","area","areca",
+"arena","argent","argon","argot","arguable",
+"argue","argument","argumentative","aria","arid",
+"aries","aright","arise","aristocracy","aristocrat",
+"aristocratic","arithmetic","arithmetician","ark","arm",
+"armada","armadillo","armament","armature","armband",
+"armchair","armed","armful","armhole","armistice",
+"armlet","armor","armorer","armorial","armory",
+"armour","armoured","armourer","armoury","armpit",
+"arms","army","aroma","aromatic","arose",
+"around","arouse","arpeggio","arquebus","arrack",
+"arraign","arrange","arrangement","arrant","arras",
+"array","arrears","arrest","arrival","arrive",
+"arrogance","arrogant","arrogate","arrow","arrowhead",
+"arrowroot","arse","arsenal","arsenic","arson",
+"art","artefact","arterial","arteriosclerosis","artery",
+"artful","arthritis","artichoke","article","articles",
+"articulate","articulated","articulateness","articulation","artifact",
+"artifice","artificer","artificial","artillery","artisan",
+"artist","artiste","artistic","artistry","artless",
+"arts","arty","arum","asbestos","ascend",
+"ascendancy","ascendant","ascendency","ascendent","ascension",
+"ascent","ascertain","ascetic","ascribe","ascription",
+"asepsis","aseptic","asexual","ash","ashamed",
+"ashbin","ashcan","ashen","ashes","ashore",
+"ashtray","ashy","aside","asinine","ask",
+"askance","askew","aslant","asleep","asp",
+"asparagus","aspect","aspectual","aspen","asperity",
+"aspersion","asphalt","asphodel","asphyxia","asphyxiate",
+"aspic","aspidistra","aspirant","aspirate","aspiration",
+"aspire","aspirin","ass","assagai","assail",
+"assailant","assassin","assassinate","assault","assay",
+"assegai","assemblage","assemble","assembly","assemblyman",
+"assent","assert","assertion","assertive","assess",
+"assessment","assessor","asset","asseverate","assiduity",
+"assiduous","assign","assignation","assignment","assimilate",
+"assimilation","assist","assistance","assistant","assize",
+"assizes","associate","association","assonance","assort",
+"assorted","assortment","asst","assuage","assume",
+"assumption","assurance","assure","assured","aster",
+"asterisk","astern","asteroid","asthma","astigmatic",
+"astigmatism","astir","astonish","astonishment","astound",
+"astrakhan","astral","astray","astride","astringent",
+"astrolabe","astrologer","astrology","astronaut","astronautics",
+"astronomer","astronomical","astronomy","astrophysics","astute",
+"asunder","asylum","asymmetric","atavism","atchoo",
+"ate","atelier","atheism","atheist","athlete",
+"athletic","athletics","athwart","atishoo","atlas",
+"atmosphere","atmospheric","atmospherics","atoll","atom",
+"atomic","atomise","atomize","atonal","atonality",
+"atone","atop","atrocious","atrocity","atrophy",
+"attach","attachment","attack","attain","attainder",
+"attainment","attar","attempt","attend","attendance",
+"attendant","attention","attentive","attenuate","attest",
+"attestation","attested","attic","attire","attitude",
+"attitudinise","attitudinize","attorney","attract","attraction",
+"attractive","attributable","attribute","attribution","attributive",
+"attrition","attune","atypical","aubergine","aubrietia",
+"auburn","auction","auctioneer","audacious","audacity",
+"audible","audience","audio","audiometer","audit",
+"audition","auditor","auditorium","auditory","auger",
+"aught","augment","augmentation","augur","augury",
+"august","auk","aunt","aura","aural",
+"aureole","auricle","auricular","auriferous","aurora",
+"auscultation","auspices","auspicious","aussie","austere",
+"austerity","australasian","autarchy","autarky","authentic",
+"authenticate","authenticity","author","authoress","authorisation",
+"authorise","authoritarian","authoritative","authority","authorization",
+"authorize","authorship","autism","autistic","auto",
+"autobahn","autobiographical","autobiography","autocracy","autocrat",
+"autoeroticism","autograph","automat","automate","automatic",
+"automation","automatism","automaton","automobile","autonomous",
+"autonomy","autopsy","autostrada","autosuggestion","autumn",
+"autumnal","auxiliary","avail","available","avalanche",
+"avarice","avaricious","avatar","avaunt","avenge",
+"avenue","aver","average","averse","aversion",
+"aversive","avert","aviary","aviation","aviator",
+"avid","avocado","avocation","avocet","avoid",
+"avoidance","avoirdupois","avow","avowal","avowed",
+"avuncular","await","awake","awaken","awakening",
+"award","aware","awash","away","awe",
+"awesome","awestruck","awful","awfully","awhile",
+"awkward","awl","awning","awoke","awoken",
+"awry","axe","axiom","axiomatic","axis",
+"axle","axolotl","ayah","aye","azalea",
+"azimuth","azure","baa","babble","babbler",
+"babe","babel","baboo","baboon","babu",
+"baby","babyhood","babyish","baccalaureate","baccara",
+"baccarat","bacchanal","baccy","bachelor","bacillus",
+"back","backache","backbench","backbite","backbone",
+"backbreaking","backchat","backcloth","backcomb","backdate",
+"backdrop","backer","backfire","backgammon","background",
+"backhand","backhanded","backhander","backing","backlash",
+"backlog","backmost","backpedal","backside","backslide",
+"backspace","backstage","backstairs","backstay","backstroke",
+"backtrack","backup","backward","backwards","backwash",
+"backwater","backwoods","backwoodsman","backyard","bacon",
+"bacteria","bacteriology","bactrian","bad","bade",
+"badge","badger","badinage","badly","badminton",
+"baffle","baffling","bag","bagatelle","bagful",
+"baggage","baggy","bagpipes","bags","bah",
+"bail","bailey","bailiff","bairn","bait",
+"baize","bake","bakelite","baker","bakery",
+"baksheesh","balaclava","balalaika","balance","balanced",
+"balcony","bald","balderdash","balding","baldly",
+"baldric","bale","baleful","balk","ball",
+"ballad","ballade","ballast","ballcock","ballerina",
+"ballet","ballistic","ballistics","ballocks","balloon",
+"ballooning","balloonist","ballot","ballpoint","ballroom",
+"balls","bally","ballyhoo","balm","balmy",
+"baloney","balsa","balsam","balustrade","bamboo",
+"bamboozle","ban","banal","banana","band",
+"bandage","bandana","bandanna","bandbox","bandeau",
+"bandit","banditry","bandmaster","bandoleer","bandolier",
+"bandsman","bandstand","bandwagon","bandy","bane",
+"baneful","bang","banger","bangle","banian",
+"banish","banister","banjo","bank","bankbook",
+"banker","banking","bankrupt","bankruptcy","banner",
+"bannock","banns","banquet","banshee","bantam",
+"bantamweight","banter","banyan","baobab","baptise",
+"baptism","baptist","baptize","bar","barb",
+"barbarian","barbaric","barbarise","barbarism","barbarize",
+"barbarous","barbecue","barbed","barbel","barber",
+"barbican","barbiturate","barcarole","barcarolle","bard",
+"bare","bareback","barebacked","barefaced","barefoot",
+"bareheaded","barelegged","barely","bargain","barge",
+"bargee","baritone","barium","bark","barker",
+"barley","barleycorn","barmaid","barman","barmy",
+"barn","barnacle","barnstorm","barnyard","barograph",
+"barometer","baron","baroness","baronet","baronetcy",
+"baronial","barony","baroque","barque","barrack",
+"barracks","barracuda","barrage","barred","barrel",
+"barren","barricade","barricades","barrier","barring",
+"barrister","barrow","bartender","barter","basalt",
+"base","baseball","baseboard","baseless","baseline",
+"basement","bases","bash","bashful","basic",
+"basically","basics","basil","basilica","basilisk",
+"basin","basis","bask","basket","basketball",
+"basketful","basketry","basketwork","bass","basset",
+"bassinet","bassoon","bast","bastard","bastardise",
+"bastardize","bastardy","baste","bastinado","bastion",
+"bat","batch","bated","bath","bathing",
+"bathos","bathrobe","bathroom","baths","bathtub",
+"bathysphere","batik","batiste","batman","baton",
+"bats","batsman","battalion","batten","batter",
+"battery","battle","battleax","battleaxe","battlefield",
+"battlements","battleship","batty","bauble","baulk",
+"bauxite","bawd","bawdy","bawl","bay",
+"bayonet","bayou","bazaar","bazooka","bbc",
+"beach","beachcomber","beachhead","beachwear","beacon",
+"bead","beading","beadle","beady","beagle",
+"beagling","beak","beaker","beam","bean",
+"beanpole","beanstalk","bear","bearable","beard",
+"bearded","bearer","bearing","bearings","bearish",
+"bearskin","beast","beastly","beat","beaten",
+"beater","beatific","beatification","beatify","beating",
+"beatitude","beatitudes","beatnik","beau","beaujolais",
+"beaut","beauteous","beautician","beautiful","beautify",
+"beauty","beaver","bebop","becalmed","because",
+"beck","beckon","become","becoming","bed",
+"bedaub","bedbug","bedclothes","bedding","bedeck",
+"bedevil","bedewed","bedfellow","bedimmed","bedlam",
+"bedouin","bedpan","bedpost","bedraggled","bedridden",
+"bedrock","bedroom","bedside","bedsore","bedspread",
+"bedstead","bedtime","bee","beech","beef",
+"beefcake","beefeater","beefsteak","beefy","beehive",
+"beeline","been","beer","beery","beeswax",
+"beet","beetle","beetling","beetroot","beeves",
+"befall","befit","befitting","before","beforehand",
+"befriend","befuddle","beg","beget","beggar",
+"beggarly","beggary","begin","beginner","beginning",
+"begone","begonia","begorra","begot","begotten",
+"begrudge","beguile","begum","begun","behalf",
+"behave","behavior","behaviorism","behaviour","behaviourism",
+"behead","behemoth","behest","behind","behindhand",
+"behold","beholden","behove","beige","being",
+"belabor","belabour","belated","belay","belch",
+"beleaguer","belfry","belie","belief","believable",
+"believe","believer","belittle","bell","belladonna",
+"bellboy","belle","bellflower","bellicose","belligerency",
+"belligerent","bellow","bellows","belly","bellyache",
+"bellyful","belong","belongings","beloved","below",
+"belt","belted","belting","beltway","bemoan",
+"bemused","ben","bench","bencher","bend",
+"bended","bends","beneath","benedictine","benediction",
+"benedictus","benefaction","benefactor","benefice","beneficent",
+"beneficial","beneficiary","benefit","benevolence","benevolent",
+"benighted","benign","benignity","bent","benumbed",
+"benzedrine","benzene","benzine","bequeath","bequest",
+"berate","bereave","bereaved","bereavement","bereft",
+"beret","beriberi","berk","berry","berserk",
+"berth","beryl","beseech","beseem","beset",
+"besetting","beside","besides","besiege","besmear",
+"besmirch","besom","besotted","besought","bespattered",
+"bespeak","bespoke","best","bestial","bestiality",
+"bestiary","bestir","bestow","bestrew","bestride",
+"bet","beta","betake","betel","bethel",
+"bethink","betide","betimes","betoken","betray",
+"betrayal","betroth","betrothal","betrothed","better",
+"betterment","betters","bettor","between","betwixt",
+"bevel","beverage","bevy","bewail","beware",
+"bewilder","bewitch","bey","beyond","bezique",
+"bhang","bias","bib","bible","biblical",
+"bibliographer","bibliography","bibliophile","bibulous","bicarb",
+"bicarbonate","bicentenary","bicentennial","biceps","bicker",
+"bicycle","bid","biddable","bidding","bide",
+"bidet","biennial","bier","biff","bifocals",
+"bifurcate","big","bigamist","bigamous","bigamy",
+"bighead","bight","bigot","bigoted","bigotry",
+"bigwig","bijou","bike","bikini","bilabial",
+"bilateral","bilberry","bile","bilge","bilingual",
+"bilious","bilk","bill","billboard","billet",
+"billfold","billhook","billiard","billiards","billion",
+"billow","billposter","billy","biltong","bimetallic",
+"bimetallism","bimonthly","bin","binary","bind",
+"binder","bindery","binding","bindweed","binge",
+"bingo","binnacle","binocular","binoculars","binomial",
+"biochemistry","biodegradable","biographer","biographical","biography",
+"biological","biology","biomedical","bionic","biosphere",
+"biotechnology","bipartisan","bipartite","biped","biplane",
+"birch","bird","birdie","birdlime","birdseed",
+"biretta","biro","birth","birthday","birthmark",
+"birthplace","birthrate","birthright","biscuit","bisect",
+"bisexual","bishop","bishopric","bismuth","bison",
+"bisque","bistro","bit","bitch","bitchy",
+"bite","biting","bitter","bittern","bitters",
+"bittersweet","bitty","bitumen","bituminous","bivalve",
+"bivouac","biweekly","bizarre","blab","blabber",
+"blabbermouth","black","blackamoor","blackball","blackberry",
+"blackbird","blackboard","blackcurrant","blacken","blackguard",
+"blackhead","blacking","blackjack","blackleg","blacklist",
+"blackly","blackmail","blackout","blackshirt","blacksmith",
+"blackthorn","bladder","blade","blaeberry","blah",
+"blame","blameless","blameworthy","blanch","blancmange",
+"bland","blandishments","blank","blanket","blare",
+"blarney","blaspheme","blasphemous","blasphemy","blast",
+"blasted","blatant","blather","blaze","blazer",
+"blazes","blazing","blazon","blazonry","bleach",
+"bleachers","bleak","bleary","bleat","bleed",
+"bleeder","bleeding","bleep","blemish","blench",
+"blend","blender","bless","blessed","blessing",
+"blether","blew","blight","blighter","blimey",
+"blimp","blind","blinder","blinders","blindfold",
+"blink","blinkered","blinkers","blinking","blip",
+"bliss","blister","blistering","blithe","blithering",
+"blitz","blizzard","bloated","bloater","blob",
+"bloc","block","blockade","blockage","blockbuster",
+"blockhead","blockhouse","bloke","blond","blood",
+"bloodbath","bloodcurdling","bloodhound","bloodless","bloodletting",
+"bloodshed","bloodshot","bloodstain","bloodstock","bloodstream",
+"bloodsucker","bloodthirsty","bloody","bloom","bloomer",
+"bloomers","blooming","blossom","blot","blotch",
+"blotter","blotto","blouse","blow","blower",
+"blowfly","blowgun","blowhard","blowhole","blowlamp",
+"blown","blowout","blowpipe","blowsy","blowy",
+"blowzy","blubber","bludgeon","blue","bluebag",
+"bluebeard","bluebell","blueberry","bluebird","bluebottle",
+"bluecoat","bluefish","bluejacket","blueprint","blues",
+"bluestocking","bluff","blunder","blunderbuss","blunt",
+"bluntly","blur","blurb","blurt","blush",
+"bluster","blustery","boa","boar","board",
+"boarder","boarding","boardinghouse","boardroom","boards",
+"boardwalk","boast","boaster","boastful","boat",
+"boater","boathouse","boatman","boatswain","bob",
+"bobbin","bobby","bobcat","bobolink","bobsleigh",
+"bobtail","bobtailed","bock","bod","bode",
+"bodice","bodily","boding","bodkin","body",
+"bodyguard","bodywork","boer","boffin","bog",
+"bogey","boggle","boggy","bogie","bogus",
+"bohemian","boil","boiler","boisterous","bold",
+"boldface","boldfaced","bole","bolero","boll",
+"bollard","bollocks","boloney","bolshevik","bolshevism",
+"bolshy","bolster","bolt","bolthole","bomb",
+"bombard","bombardier","bombardment","bombast","bomber",
+"bombproof","bombshell","bombsight","bombsite","bonanza",
+"bonbon","bond","bondage","bonded","bondholder",
+"bonds","bone","boned","bonehead","boner",
+"bonesetter","boneshaker","bonfire","bongo","bonhomie",
+"bonito","bonkers","bonnet","bonny","bonsai",
+"bonus","bony","bonzer","boo","boob",
+"boobs","booby","boodle","boohoo","book",
+"bookable","bookbindery","bookbinding","bookcase","bookend",
+"booking","bookish","bookkeeping","booklet","bookmaker",
+"bookmark","bookmobile","bookplate","books","bookseller",
+"bookshop","bookstall","bookwork","bookworm","boom",
+"boomerang","boon","boor","boost","booster",
+"boot","bootblack","booted","bootee","booth",
+"bootlace","bootleg","bootless","boots","bootstraps",
+"booty","booze","boozer","boozy","bop",
+"bopper","boracic","borage","borax","bordeaux",
+"bordello","border","borderer","borderland","borderline",
+"bore","borealis","borehole","borer","born",
+"borne","boron","borough","borrow","borrowing",
+"borscht","borshcht","borstal","borzoi","bosh",
+"bosom","bosomy","boss","bossy","bosun",
+"botanical","botanise","botanist","botanize","botany",
+"botch","both","bother","botheration","bothersome",
+"bottle","bottleful","bottleneck","bottom","bottomless",
+"botulism","boudoir","bouffant","bougainvillaea","bougainvillea",
+"bough","bought","bouillabaisse","bouillon","boulder",
+"boulevard","bounce","bouncer","bouncing","bouncy",
+"bound","boundary","bounden","bounder","boundless",
+"bounds","bounteous","bountiful","bounty","bouquet",
+"bourbon","bourgeois","bourgeoisie","bourn","bourne",
+"bourse","bout","boutique","bouzouki","bovine",
+"bovril","bovver","bow","bowdlerise","bowdlerize",
+"bowed","bowel","bowels","bower","bowerbird",
+"bowing","bowl","bowler","bowlful","bowline",
+"bowling","bowls","bowman","bowser","bowshot",
+"bowsprit","bowwow","box","boxer","boxful",
+"boxing","boxwood","boy","boycott","boyfriend",
+"boyhood","boyish","boys","bra","brace",
+"bracelet","bracelets","braces","bracing","bracken",
+"bracket","brackish","bract","bradawl","brae",
+"brag","braggadocio","braggart","brahman","braid",
+"braille","brain","brainchild","brainless","brainpan",
+"brains","brainstorm","brainwash","brainwashing","brainwave",
+"brainy","braise","brake","bramble","bran",
+"branch","brand","brandish","brandy","brash",
+"brass","brasserie","brassiere","brassy","brat",
+"bravado","brave","bravo","bravura","brawl",
+"brawn","brawny","bray","brazen","brazier",
+"bre","breach","bread","breadbasket","breadboard",
+"breadcrumb","breaded","breadfruit","breadline","breadth",
+"breadthways","breadwinner","break","breakage","breakaway",
+"breakdown","breaker","breakfast","breakneck","breakout",
+"breakthrough","breakup","breakwater","bream","breast",
+"breastbone","breastplate","breaststroke","breastwork","breath",
+"breathalyse","breathalyser","breathe","breather","breathing",
+"breathless","breathtaking","breathy","breech","breeches",
+"breed","breeder","breeding","breeze","breezeblock",
+"breezy","brethren","breve","brevet","breviary",
+"brevity","brew","brewer","brewery","briar",
+"bribe","bribery","brick","brickbat","brickfield",
+"bricklayer","brickwork","bridal","bride","bridegroom",
+"bridesmaid","bridge","bridgehead","bridgework","bridle",
+"brie","brief","briefcase","briefing","briefs",
+"brier","brig","brigade","brigadier","brigand",
+"brigandage","brigantine","bright","brighten","brill",
+"brilliancy","brilliant","brilliantine","brim","brimful",
+"brimfull","brimstone","brindled","brine","bring",
+"brink","brinkmanship","brioche","briquet","briquette",
+"brisk","brisket","bristle","bristly","bristols",
+"brit","britches","britisher","briton","brittle",
+"broach","broad","broadcast","broadcasting","broadcloth",
+"broaden","broadloom","broadminded","broadsheet","broadside",
+"broadsword","broadways","brocade","broccoli","brochure",
+"brogue","broil","broiler","broke","broken",
+"broker","brolly","bromide","bromine","bronchial",
+"bronchitis","bronco","brontosaurus","bronze","brooch",
+"brood","broody","brook","broom","broomstick",
+"broth","brothel","brother","brotherhood","brougham",
+"brought","brouhaha","brow","browbeat","brown",
+"brownie","brownstone","browse","brucellosis","bruin",
+"bruise","bruiser","bruising","bruit","brunch",
+"brunet","brunette","brunt","brush","brushwood",
+"brushwork","brusque","brutal","brutalise","brutality",
+"brutalize","brute","brutish","bubble","bubbly",
+"buccaneer","buck","buckboard","bucked","bucket",
+"buckle","buckler","buckram","buckshee","buckshot",
+"buckskin","bucktooth","buckwheat","bucolic","bud",
+"buddhism","budding","buddy","budge","budgerigar",
+"budget","budgetary","buff","buffalo","buffer",
+"buffet","buffoon","buffoonery","bug","bugaboo",
+"bugbear","bugger","buggered","buggery","buggy",
+"bughouse","bugle","bugrake","buhl","build",
+"builder","building","buildup","bulb","bulbous",
+"bulbul","bulge","bulk","bulkhead","bulky",
+"bull","bulldog","bulldoze","bulldozer","bullet",
+"bulletin","bulletproof","bullfight","bullfighting","bullfinch",
+"bullfrog","bullheaded","bullion","bullnecked","bullock",
+"bullring","bullshit","bully","bullyboy","bulrush",
+"bulwark","bum","bumble","bumblebee","bumboat",
+"bumf","bummer","bump","bumper","bumph",
+"bumpkin","bumptious","bumpy","bun","bunch",
+"bundle","bung","bungalow","bunghole","bungle",
+"bunion","bunk","bunker","bunkered","bunkhouse",
+"bunkum","bunny","bunting","buoy","buoyancy",
+"bur","burberry","burble","burden","burdensome",
+"burdock","bureau","bureaucracy","bureaucrat","bureaucratic",
+"burg","burgeon","burgess","burgh","burgher",
+"burglar","burglary","burgle","burgomaster","burgundy",
+"burial","burlap","burlesque","burly","burn",
+"burner","burning","burnish","burnous","burnouse",
+"burnt","burp","burr","burro","burrow",
+"bursar","bursary","burst","burthen","burton",
+"bury","bus","busby","bush","bushbaby",
+"bushed","bushel","bushwhack","bushy","business",
+"businesslike","businessman","busk","busker","busman",
+"bust","bustard","buster","bustle","busy",
+"busybody","but","butane","butch","butcher",
+"butchery","butler","butt","butter","buttercup",
+"butterfingers","butterfly","buttermilk","butterscotch","buttery",
+"buttock","buttocks","button","buttonhole","buttonhook",
+"buttons","buttress","buxom","buy","buyer",
+"buzz","buzzard","buzzer","bye","byelaw",
+"bygone","bygones","bylaw","bypass","byplay",
+"byre","bystander","byway","byways","byword",
+"byzantine","cab","cabal","cabaret","cabbage",
+"cabbie","cabby","cabdriver","caber","cabin",
+"cabinet","cable","cablegram","caboodle","caboose",
+"cabriolet","cacao","cache","cachet","cachou",
+"cackle","cacophony","cactus","cad","cadaver",
+"cadaverous","caddie","caddy","cadence","cadenza",
+"cadet","cadge","cadi","cadmium","cadre",
+"caerphilly","caesura","cafeteria","caffeine","caftan",
+"cage","cagey","cahoots","caiman","caique",
+"cairn","caisson","cajole","cake","calabash",
+"calaboose","calamitous","calamity","calcify","calcination",
+"calcine","calcium","calculable","calculate","calculating",
+"calculation","calculator","calculus","caldron","calendar",
+"calender","calends","calf","calfskin","caliber",
+"calibrate","calibration","calibre","calico","caliper",
+"calipers","caliph","caliphate","calisthenic","calisthenics",
+"calk","call","calla","callboy","caller",
+"calligraphy","calling","calliper","callipers","callisthenic",
+"callisthenics","callous","callow","callus","calm",
+"calomel","calorie","calorific","calumniate","calumny",
+"calvary","calve","calves","calvinism","calypso",
+"calyx","cam","camaraderie","camber","cambric",
+"came","camel","camelhair","camellia","camembert",
+"cameo","camera","cameraman","camisole","camomile",
+"camouflage","camp","campaign","campanile","campanology",
+"campanula","camper","campfire","campground","camphor",
+"camphorated","campion","campsite","campus","camshaft",
+"can","canal","canalise","canalize","canard",
+"canary","canasta","cancan","cancel","cancellation",
+"cancer","cancerous","candela","candelabrum","candid",
+"candidate","candidature","candidly","candied","candle",
+"candlelight","candlemas","candlepower","candlestick","candlewick",
+"candor","candour","candy","candyfloss","candytuft",
+"cane","canine","canis","canister","canker",
+"canna","cannabis","canned","cannelloni","cannery",
+"cannibal","cannibalise","cannibalism","cannibalize","cannon",
+"cannonade","cannonball","cannot","canny","canoe",
+"canon","canonical","canonicals","canonise","canonize",
+"canoodle","canopy","canst","cant","cantab",
+"cantabrigian","cantaloup","cantaloupe","cantankerous","cantata",
+"canteen","canter","canticle","cantilever","canto",
+"canton","cantonment","cantor","canvas","canvass",
+"canyon","cap","capabilities","capability","capable",
+"capacious","capacity","caparison","cape","caper",
+"capillarity","capillary","capital","capitalisation","capitalise",
+"capitalism","capitalist","capitalization","capitalize","capitals",
+"capitation","capitol","capitulate","capitulation","capitulations",
+"capon","capriccio","caprice","capricious","capricorn",
+"capsicum","capsize","capstan","capsule","captain",
+"caption","captious","captivate","captive","captivity",
+"captor","capture","car","carafe","caramel",
+"carapace","carat","caravan","caravanning","caravanserai",
+"caraway","carbide","carbine","carbohydrate","carbolic",
+"carbon","carbonated","carbonation","carboniferous","carbonise",
+"carbonize","carborundum","carboy","carbuncle","carburetor",
+"carburettor","carcase","carcass","carcinogen","card",
+"cardamom","cardboard","cardiac","cardigan","cardinal",
+"cardpunch","cards","cardsharp","care","careen",
+"career","careerist","carefree","careful","careless",
+"caress","caret","caretaker","careworn","cargo",
+"caribou","caricature","caries","carillon","carious",
+"carmelite","carmine","carnage","carnal","carnation",
+"carnelian","carnival","carnivore","carnivorous","carob",
+"carol","carotid","carousal","carouse","carousel",
+"carp","carpal","carpenter","carpentry","carpet",
+"carpetbag","carpetbagger","carpeting","carport","carpus",
+"carriage","carriageway","carrier","carrion","carrot",
+"carroty","carrousel","carry","carryall","carrycot",
+"carryout","carsick","cart","cartage","cartel",
+"carter","carthorse","cartilage","cartilaginous","cartographer",
+"cartography","carton","cartoon","cartridge","cartwheel",
+"carve","carver","carving","caryatid","cascade",
+"cascara","case","casebook","casein","casework",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java
new file mode 100644
index 00000000000..bd724f048be
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData2.java
@@ -0,0 +1,715 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** A list of words used by Kstem
+ */
+class KStemData2 {
+ private KStemData2() {
+ }
+ static String[] data = {
+"cash","cashew","cashier","cashmere","casing",
+"casino","cask","casket","casque","cassava",
+"casserole","cassette","cassock","cassowary","cast",
+"castanets","castaway","castellated","caster","castigate",
+"casting","castle","castor","castrate","casual",
+"casualty","casuist","casuistry","cat","cataclysm",
+"catacomb","catafalque","catalepsy","catalog","catalogue",
+"catalpa","catalysis","catalyst","catamaran","catapult",
+"cataract","catarrh","catastrophe","catatonic","catcall",
+"catch","catcher","catching","catchpenny","catchphrase",
+"catchword","catchy","catechise","catechism","catechize",
+"categorical","categorise","categorize","category","cater",
+"caterer","caterpillar","caterwaul","catfish","catgut",
+"catharsis","cathartic","cathedral","catheter","cathode",
+"catholic","catholicism","catholicity","catkin","catnap",
+"catnip","catsup","cattle","catty","catwalk",
+"caucus","caudal","caught","caul","cauldron",
+"cauliflower","caulk","causal","causality","causation",
+"causative","cause","causeless","causeway","caustic",
+"cauterise","cauterize","caution","cautionary","cautious",
+"cavalcade","cavalier","cavalry","cavalryman","cave",
+"caveat","caveman","cavern","cavernous","caviar",
+"caviare","cavil","cavity","cavort","cavy",
+"caw","cay","cayman","cease","ceaseless",
+"cedar","cede","cedilla","ceiling","celandine",
+"celebrant","celebrate","celebrated","celebration","celebrity",
+"celerity","celery","celestial","celibacy","celibate",
+"cell","cellar","cellarage","cellist","cello",
+"cellophane","cellular","celluloid","cellulose","celsius",
+"celtic","cement","cemetery","cenotaph","censor",
+"censorious","censorship","censure","census","cent",
+"centaur","centavo","centenarian","centenary","centennial",
+"center","centerboard","centerpiece","centigrade","centigram",
+"centigramme","centime","centimeter","centimetre","centipede",
+"central","centralise","centralism","centralize","centre",
+"centreboard","centrepiece","centrifugal","centrifuge","centripetal",
+"centrist","centurion","century","cephalic","ceramic",
+"ceramics","cereal","cerebellum","cerebral","cerebration",
+"cerebrum","ceremonial","ceremonious","ceremony","cerise",
+"cert","certain","certainly","certainty","certifiable",
+"certificate","certificated","certify","certitude","cerulean",
+"cervical","cervix","cessation","cession","cesspit",
+"cetacean","chablis","chaconne","chafe","chaff",
+"chaffinch","chagrin","chain","chair","chairman",
+"chairmanship","chairperson","chairwoman","chaise","chalet",
+"chalice","chalk","chalky","challenge","challenging",
+"chamber","chamberlain","chambermaid","chambers","chameleon",
+"chamiomile","chamois","chamomile","champ","champagne",
+"champaign","champion","championship","chance","chancel",
+"chancellery","chancellor","chancery","chancy","chandelier",
+"chandler","change","changeable","changeless","changeling",
+"changeover","channel","chant","chanterelle","chanticleer",
+"chantry","chanty","chaos","chaotic","chap",
+"chapel","chapelgoer","chaperon","chaperone","chapfallen",
+"chaplain","chaplaincy","chaplet","chaps","chapter",
+"char","charabanc","character","characterise","characteristic",
+"characterization","characterize","characterless","charade","charades",
+"charcoal","chard","charge","chargeable","charged",
+"charger","chariot","charioteer","charisma","charismatic",
+"charitable","charity","charlady","charlatan","charleston",
+"charlock","charlotte","charm","charmer","charming",
+"chart","charter","chartreuse","charwoman","chary",
+"charybdis","chase","chaser","chasm","chassis",
+"chaste","chasten","chastise","chastisement","chastity",
+"chasuble","chat","chatelaine","chattel","chatter",
+"chatterbox","chatty","chauffeur","chauvinism","chauvinist",
+"cheap","cheapen","cheapskate","cheat","check",
+"checkbook","checked","checker","checkerboard","checkers",
+"checklist","checkmate","checkoff","checkout","checkpoint",
+"checkrail","checkrein","checkroom","checkup","cheddar",
+"cheek","cheekbone","cheeky","cheep","cheer",
+"cheerful","cheering","cheerio","cheerleader","cheerless",
+"cheers","cheery","cheese","cheesecake","cheesecloth",
+"cheeseparing","cheetah","chef","chem","chemical",
+"chemise","chemist","chemistry","chemotherapy","chenille",
+"cheque","chequebook","chequer","cherish","cheroot",
+"cherry","cherub","chervil","chess","chessboard",
+"chessman","chest","chesterfield","chestnut","chesty",
+"chevalier","chevron","chevvy","chevy","chew",
+"chi","chianti","chiaroscuro","chic","chicanery",
+"chicano","chichi","chick","chicken","chickenfeed",
+"chickenhearted","chickpea","chickweed","chicle","chicory",
+"chide","chief","chiefly","chieftain","chieftainship",
+"chiffon","chiffonier","chiffonnier","chigger","chignon",
+"chihuahua","chilblain","child","childbearing","childbirth",
+"childhood","childish","childlike","chile","chill",
+"chiller","chilli","chilly","chimaera","chime",
+"chimera","chimerical","chimney","chimneybreast","chimneypiece",
+"chimneypot","chimneystack","chimneysweep","chimpanzee","chin",
+"china","chinatown","chinaware","chinchilla","chine",
+"chink","chinless","chinook","chinstrap","chintz",
+"chinwag","chip","chipboard","chipmunk","chippendale",
+"chipping","chippy","chiromancy","chiropody","chiropractic",
+"chirp","chirpy","chisel","chiseler","chiseller",
+"chit","chitchat","chivalrous","chivalry","chive",
+"chivvy","chivy","chloride","chlorinate","chlorine",
+"chloroform","chlorophyll","chock","chocolate","choice",
+"choir","choirboy","choirmaster","choke","choker",
+"chokey","choky","choler","cholera","choleric",
+"cholesterol","chomp","choose","choosey","choosy",
+"chop","chopfallen","chophouse","chopper","choppers",
+"choppy","chopstick","choral","chorale","chord",
+"chore","choreographer","choreography","chorine","chorister",
+"chortle","chorus","chose","chosen","chow",
+"chowder","christ","christen","christendom","christening",
+"christian","christianity","christlike","christmastime","chromatic",
+"chrome","chromium","chromosome","chronic","chronicle",
+"chronograph","chronological","chronology","chronometer","chrysalis",
+"chrysanthemum","chub","chubby","chuck","chuckle",
+"chug","chukker","chum","chummy","chump",
+"chunk","chunky","church","churchgoer","churching",
+"churchwarden","churchyard","churl","churlish","churn",
+"chute","chutney","cia","cicada","cicatrice",
+"cicerone","cid","cider","cif","cigar",
+"cigaret","cigarette","cinch","cincture","cinder",
+"cinderella","cinders","cine","cinema","cinematograph",
+"cinematography","cinnamon","cinquefoil","cipher","circa",
+"circadian","circle","circlet","circuit","circuitous",
+"circular","circularise","circularize","circulate","circulation",
+"circumcise","circumcision","circumference","circumflex","circumlocution",
+"circumnavigate","circumscribe","circumscription","circumspect","circumstance",
+"circumstances","circumstantial","circumvent","circus","cirque",
+"cirrhosis","cirrus","cissy","cistern","citadel",
+"citation","cite","citizen","citizenry","citizenship",
+"citron","citrous","citrus","city","civet",
+"civic","civics","civies","civil","civilian",
+"civilisation","civilise","civility","civilization","civilize",
+"civilly","civvies","clack","clad","claim",
+"claimant","clairvoyance","clairvoyant","clam","clambake",
+"clamber","clammy","clamor","clamorous","clamour",
+"clamp","clampdown","clamshell","clan","clandestine",
+"clang","clanger","clangor","clangour","clank",
+"clannish","clansman","clap","clapboard","clapper",
+"clapperboard","clappers","claptrap","claque","claret",
+"clarification","clarify","clarinet","clarinetist","clarinettist",
+"clarion","clarity","clarts","clash","clasp",
+"class","classic","classical","classicism","classicist",
+"classics","classification","classified","classify","classless",
+"classmate","classroom","classy","clatter","clause",
+"claustrophobia","claustrophobic","clavichord","clavicle","claw",
+"clay","claymore","clean","cleaner","cleanliness",
+"cleanly","cleanse","cleanser","cleanup","clear",
+"clearance","clearing","clearinghouse","clearly","clearout",
+"clearway","cleat","cleavage","cleave","cleaver",
+"clef","cleft","clematis","clemency","clement",
+"clench","clerestory","clergy","clergyman","clerical",
+"clerihew","clerk","clever","clew","click",
+"client","clientele","cliff","cliffhanger","climacteric",
+"climactic","climate","climatic","climatology","climax",
+"climb","climber","clime","clinch","clincher",
+"cline","cling","clinging","clingy","clinic",
+"clinical","clink","clinker","clip","clipboard",
+"clipper","clippers","clippie","clipping","clique",
+"cliquey","cliquish","clitoris","cloaca","cloak",
+"cloakroom","clobber","cloche","clock","clockwise",
+"clockwork","clod","cloddish","clodhopper","clog",
+"cloggy","cloister","clone","clop","close",
+"closed","closedown","closefisted","closet","closure",
+"clot","cloth","clothe","clothes","clothesbasket",
+"clotheshorse","clothesline","clothier","clothing","cloture",
+"cloud","cloudbank","cloudburst","cloudless","cloudy",
+"clout","clove","cloven","clover","cloverleaf",
+"clown","clownish","cloy","club","clubbable",
+"clubfoot","clubhouse","cluck","clue","clueless",
+"clump","clumsy","clung","cluster","clutch",
+"clutches","clutter","coach","coachbuilder","coachman",
+"coachwork","coadjutor","coagulant","coagulate","coal",
+"coalbunker","coalesce","coalface","coalfield","coalhole",
+"coalhouse","coalition","coalmine","coalscuttle","coarse",
+"coarsen","coast","coastal","coaster","coastguard",
+"coastguardsman","coastline","coastwise","coat","coating",
+"coax","cob","cobalt","cobber","cobble",
+"cobbler","cobblers","cobblestone","cobra","cobweb",
+"cocaine","coccyx","cochineal","cochlea","cock",
+"cockade","cockatoo","cockchafer","cockcrow","cockerel",
+"cockeyed","cockfight","cockhorse","cockle","cockleshell",
+"cockney","cockpit","cockroach","cockscomb","cocksure",
+"cocktail","cocky","coco","cocoa","coconut",
+"cocoon","cod","coda","coddle","code",
+"codeine","codex","codger","codicil","codify",
+"codling","codpiece","codswallop","coed","coeducation",
+"coefficient","coelacanth","coequal","coerce","coercion",
+"coercive","coeternal","coeval","coexist","coexistence",
+"coffee","coffeepot","coffer","cofferdam","coffers",
+"coffin","cog","cogency","cogent","cogitate",
+"cogitation","cognac","cognate","cognition","cognitive",
+"cognizance","cognizant","cognomen","cognoscenti","cogwheel",
+"cohabit","cohere","coherence","coherent","cohesion",
+"cohesive","cohort","coif","coiffeur","coiffure",
+"coil","coin","coinage","coincide","coincidence",
+"coincident","coincidental","coir","coitus","coke",
+"col","cola","colander","cold","coleslaw",
+"coley","colic","colicky","colitis","collaborate",
+"collaboration","collaborationist","collage","collapse","collapsible",
+"collar","collarbone","collate","collateral","collation",
+"colleague","collect","collected","collection","collective",
+"collectivise","collectivism","collectivize","collector","colleen",
+"college","collegiate","collide","collie","collier",
+"colliery","collision","collocate","collocation","colloquial",
+"colloquialism","colloquy","collude","collusion","collywobbles",
+"cologne","colon","colonel","colonial","colonialism",
+"colonialist","colonies","colonise","colonist","colonize",
+"colonnade","colony","color","coloration","coloratura",
+"colored","colorfast","colorful","coloring","colorless",
+"colors","colossal","colossally","colossus","colostrum",
+"colour","coloured","colourfast","colourful","colouring",
+"colourless","colours","colt","colter","coltish",
+"columbine","column","columnist","coma","comatose",
+"comb","combat","combatant","combative","comber",
+"combination","combinations","combinatorial","combine","combo",
+"combustible","combustion","come","comeback","comecon",
+"comedian","comedienne","comedown","comedy","comely",
+"comer","comestible","comet","comfit","comfort",
+"comfortable","comforter","comfrey","comfy","comic",
+"comical","comics","cominform","coming","comintern",
+"comity","comma","command","commandant","commandeer",
+"commander","commanding","commandment","commando","commemorate",
+"commemoration","commemorative","commence","commencement","commend",
+"commendable","commendation","commendatory","commensurable","commensurate",
+"comment","commentary","commentate","commentator","commerce",
+"commercial","commercialise","commercialism","commercialize","commie",
+"commiserate","commiseration","commissar","commissariat","commissary",
+"commission","commissionaire","commissioner","commit","commitment",
+"committal","committed","committee","committeeman","commode",
+"commodious","commodity","commodore","common","commonage",
+"commonalty","commoner","commonly","commonplace","commons",
+"commonweal","commonwealth","commotion","communal","commune",
+"communicable","communicant","communicate","communication","communications",
+"communicative","communion","communism","communist","community",
+"commutable","commutation","commutative","commutator","commute",
+"commuter","compact","compacted","companion","companionable",
+"companionship","companionway","company","comparable","comparative",
+"comparatively","compare","comparison","compartment","compartmentalise",
+"compartmentalize","compass","compassion","compassionate","compatibility",
+"compatible","compatriot","compeer","compel","compendious",
+"compendium","compensate","compensation","compensatory","compere",
+"compete","competence","competent","competition","competitive",
+"competitor","compilation","compile","complacency","complacent",
+"complain","complainant","complaint","complaisance","complaisant",
+"complement","complementary","complete","completely","completion",
+"complex","complexion","complexity","compliance","compliant",
+"complicate","complicated","complication","complicity","compliment",
+"complimentary","compliments","complin","compline","comply",
+"compo","component","comport","comportment","compose",
+"composer","composite","composition","compositor","compost",
+"composure","compote","compound","comprehend","comprehensible",
+"comprehension","comprehensive","compress","compressible","compression",
+"compressor","comprise","compromise","comptometer","comptroller",
+"compulsion","compulsive","compulsory","compunction","computation",
+"compute","computer","computerize","comrade","comradeship",
+"coms","con","concatenate","concatenation","concave",
+"concavity","conceal","concealment","concede","conceit",
+"conceited","conceivable","conceive","concentrate","concentrated",
+"concentration","concentric","concept","conception","conceptual",
+"conceptualise","conceptualize","concern","concerned","concernedly",
+"concerning","concert","concerted","concertgoer","concertina",
+"concertmaster","concerto","concession","concessionaire","concessive",
+"conch","conchology","concierge","conciliate","conciliation",
+"conciliatory","concise","concision","conclave","conclude",
+"conclusion","conclusive","concoct","concoction","concomitance",
+"concomitant","concord","concordance","concordant","concordat",
+"concourse","concrete","concubinage","concubine","concupiscence",
+"concur","concurrence","concurrent","concuss","concussion",
+"condemn","condemnation","condensation","condense","condenser",
+"condescend","condescension","condign","condiment","condition",
+"conditional","conditions","condole","condolence","condom",
+"condominium","condone","condor","conduce","conducive",
+"conduct","conduction","conductive","conductivity","conductor",
+"conduit","cone","coney","confabulate","confabulation",
+"confection","confectioner","confectionery","confederacy","confederate",
+"confederation","confer","conference","confess","confessed",
+"confession","confessional","confessor","confetti","confidant",
+"confide","confidence","confident","confidential","confiding",
+"configuration","confine","confinement","confines","confirm",
+"confirmation","confirmed","confiscate","confiscatory","conflagration",
+"conflate","conflict","confluence","conform","conformable",
+"conformation","conformist","conformity","confound","confounded",
+"confraternity","confront","confrontation","confucian","confucianism",
+"confuse","confusion","confute","conga","congeal",
+"congenial","congenital","congest","congestion","conglomerate",
+"conglomeration","congrats","congratulate","congratulations","congratulatory",
+"congregate","congregation","congregational","congregationalism","congress",
+"congressional","congressman","congruent","congruity","congruous",
+"conic","conical","conifer","coniferous","conj",
+"conjectural","conjecture","conjoin","conjoint","conjugal",
+"conjugate","conjugation","conjunction","conjunctiva","conjunctive",
+"conjunctivitis","conjuncture","conjure","conjurer","conjuror",
+"conk","conker","conkers","connect","connected",
+"connection","connective","connexion","connivance","connive",
+"connoisseur","connotation","connotative","connote","connubial",
+"conquer","conquest","conquistador","consanguineous","consanguinity",
+"conscience","conscientious","conscious","consciousness","conscript",
+"conscription","consecrate","consecration","consecutive","consensus",
+"consent","consequence","consequent","consequential","consequently",
+"conservancy","conservation","conservationist","conservatism","conservative",
+"conservatoire","conservatory","conserve","consider","considerable",
+"considerably","considerate","consideration","considered","considering",
+"consign","consignee","consigner","consignment","consignor",
+"consist","consistency","consistent","consistory","consolation",
+"consolatory","console","consolidate","consols","consonance",
+"consonant","consort","consortium","conspectus","conspicuous",
+"conspiracy","conspirator","conspiratorial","conspire","constable",
+"constabulary","constancy","constant","constellation","consternation",
+"constipate","constipation","constituency","constituent","constitute",
+"constitution","constitutional","constitutionalism","constitutionally","constitutive",
+"constrain","constrained","constraint","constrict","constriction",
+"constrictor","construct","construction","constructive","constructor",
+"construe","consubstantiation","consul","consular","consulate",
+"consult","consultancy","consultant","consultation","consultative",
+"consulting","consume","consumer","consummate","consummation",
+"consumption","consumptive","contact","contagion","contagious",
+"contain","contained","container","containerise","containerize",
+"containment","contaminate","contamination","contemplate","contemplation",
+"contemplative","contemporaneous","contemporary","contempt","contemptible",
+"contemptuous","contend","contender","content","contented",
+"contention","contentious","contentment","contents","contest",
+"contestant","context","contextual","contiguity","contiguous",
+"continence","continent","continental","contingency","contingent",
+"continual","continuance","continuation","continue","continuity",
+"continuo","continuous","continuum","contort","contortion",
+"contortionist","contour","contraband","contrabass","contraception",
+"contraceptive","contract","contractile","contraction","contractor",
+"contractual","contradict","contradiction","contradictory","contradistinction",
+"contrail","contraindication","contralto","contraption","contrapuntal",
+"contrariety","contrariwise","contrary","contrast","contravene",
+"contravention","contretemps","contribute","contribution","contributor",
+"contributory","contrite","contrition","contrivance","contrive",
+"contrived","control","controller","controversial","controversy",
+"controvert","contumacious","contumacy","contumelious","contumely",
+"contuse","contusion","conundrum","conurbation","convalesce",
+"convalescence","convalescent","convection","convector","convene",
+"convener","convenience","convenient","convenor","convent",
+"conventicle","convention","conventional","conventionality","converge",
+"conversant","conversation","conversational","conversationalist","conversazione",
+"converse","conversion","convert","converter","convertible",
+"convex","convexity","convey","conveyance","conveyancer",
+"conveyancing","conveyer","conveyor","convict","conviction",
+"convince","convinced","convincing","convivial","convocation",
+"convoke","convoluted","convolution","convolvulus","convoy",
+"convulse","convulsion","convulsive","cony","coo",
+"cook","cooker","cookery","cookhouse","cookie",
+"cooking","cookout","cool","coolant","cooler",
+"coolie","coon","coop","cooper","cooperate",
+"cooperation","cooperative","coordinate","coordinates","coordination",
+"coot","cop","cope","copeck","copier",
+"copilot","coping","copingstone","copious","copper",
+"copperhead","copperplate","coppersmith","coppice","copra",
+"coptic","copula","copulate","copulative","copy",
+"copybook","copyboy","copycat","copydesk","copyhold",
+"copyist","copyright","copywriter","coquetry","coquette",
+"cor","coracle","coral","corbel","cord",
+"cordage","cordial","cordiality","cordially","cordillera",
+"cordite","cordon","cords","corduroy","core",
+"corelate","coreligionist","corer","corespondent","corgi",
+"coriander","corinthian","cork","corkage","corked",
+"corker","corkscrew","corm","cormorant","corn",
+"corncob","corncrake","cornea","cornelian","corner",
+"cornerstone","cornet","cornfield","cornflakes","cornflower",
+"cornice","cornish","cornucopia","corny","corolla",
+"corollary","corona","coronary","coronation","coroner",
+"coronet","corpora","corporal","corporate","corporation",
+"corporeal","corps","corpse","corpulence","corpulent",
+"corpus","corpuscle","corral","correct","correction",
+"correctitude","corrective","correlate","correlation","correlative",
+"correspond","correspondence","correspondent","corresponding","corridor",
+"corrie","corrigendum","corroborate","corroboration","corroborative",
+"corroboree","corrode","corrosion","corrosive","corrugate",
+"corrugation","corrupt","corruption","corsage","corsair",
+"corse","corselet","corset","cortex","cortisone",
+"corundum","coruscate","corvette","cos","cosh",
+"cosignatory","cosine","cosmetic","cosmetician","cosmic",
+"cosmogony","cosmology","cosmonaut","cosmopolitan","cosmos",
+"cosset","cost","costermonger","costive","costly",
+"costs","costume","costumier","cosy","cot",
+"cotangent","cote","coterie","coterminous","cotillion",
+"cottage","cottager","cottar","cotter","cotton",
+"cottonseed","cottontail","cotyledon","couch","couchant",
+"couchette","cougar","cough","could","couldst",
+"coulter","council","councillor","counsel","counsellor",
+"counselor","count","countable","countdown","countenance",
+"counter","counteract","counterattack","counterattraction","counterbalance",
+"counterblast","counterclaim","counterclockwise","counterespionage","counterfeit",
+"counterfoil","counterintelligence","counterirritant","countermand","countermarch",
+"countermeasure","counteroffensive","counterpane","counterpart","counterpoint",
+"counterpoise","countersign","countersink","countertenor","countervail",
+"countess","countinghouse","countless","countrified","country",
+"countryman","countryside","county","coup","couple",
+"couplet","coupling","coupon","courage","courageous",
+"courgette","courier","course","courser","coursing",
+"court","courteous","courtesan","courtesy","courthouse",
+"courtier","courting","courtly","courtroom","courtship",
+"courtyard","couscous","cousin","couture","cove",
+"coven","covenant","coventry","cover","coverage",
+"covering","coverlet","covert","covet","covetous",
+"covey","cow","coward","cowardice","cowardly",
+"cowbell","cowboy","cowcatcher","cower","cowgirl",
+"cowhand","cowheel","cowherd","cowhide","cowl",
+"cowlick","cowling","cowman","cowpat","cowpox",
+"cowrie","cowry","cowshed","cowslip","cox",
+"coxcomb","coy","coyote","coypu","cozen",
+"cozy","cpa","crab","crabbed","crabby",
+"crabgrass","crabwise","crack","crackbrained","crackdown",
+"cracked","cracker","crackers","crackle","crackleware",
+"crackling","crackpot","cracksman","crackup","cradle",
+"craft","craftsman","crafty","crag","craggy",
+"crake","cram","crammer","cramp","cramped",
+"crampon","cramps","cranberry","crane","cranial",
+"cranium","crank","crankshaft","cranky","cranny",
+"crap","crape","crappy","craps","crash",
+"crashing","crass","crate","crater","cravat",
+"crave","craven","craving","crawl","crawler",
+"crawlers","crayfish","crayon","craze","crazy",
+"creak","creaky","cream","creamer","creamery",
+"creamy","crease","create","creation","creative",
+"creativity","creator","creature","credence","credentials",
+"credibility","credible","credit","creditable","creditor",
+"credo","credulous","creed","creek","creel",
+"creep","creeper","creepers","creeps","creepy",
+"cremate","crematorium","crenelated","crenellated","creole",
+"creosote","crept","crepuscular","crescendo","crescent",
+"cress","crest","crested","crestfallen","cretaceous",
+"cretin","cretonne","crevasse","crevice","crew",
+"crewman","crib","cribbage","crick","cricket",
+"cricketer","crier","cries","crikey","crime",
+"criminal","criminology","crimp","crimplene","crimson",
+"cringe","crinkle","crinkly","crinoid","crinoline",
+"cripes","cripple","crisis","crisp","crispy",
+"crisscross","criterion","critic","critical","criticise",
+"criticism","criticize","critique","critter","croak",
+"crochet","crock","crockery","crocodile","crocus",
+"croft","crofter","croissant","cromlech","crone",
+"crony","crook","crooked","croon","crooner",
+"crop","cropper","croquet","croquette","crore",
+"crosier","cross","crossbar","crossbeam","crossbenches",
+"crossbones","crossbow","crossbred","crossbreed","crosscheck",
+"crosscurrent","crosscut","crossfire","crossing","crossover",
+"crosspatch","crosspiece","crossply","crossroad","crossroads",
+"crosstree","crosswalk","crosswind","crosswise","crossword",
+"crotch","crotchet","crotchety","crouch","croup",
+"croupier","crouton","crow","crowbar","crowd",
+"crowded","crowfoot","crown","crozier","crucial",
+"crucible","crucifix","crucifixion","cruciform","crucify",
+"crude","crudity","cruel","cruelty","cruet",
+"cruise","cruiser","crumb","crumble","crumbly",
+"crummy","crumpet","crumple","crunch","crupper",
+"crusade","cruse","crush","crust","crustacean",
+"crusty","crutch","crux","cry","crybaby",
+"crying","crypt","cryptic","cryptogram","cryptography",
+"crystal","crystalline","crystallise","crystallize","cub",
+"cubbyhole","cube","cubic","cubical","cubicle",
+"cubism","cubit","cubs","cuckold","cuckoldry",
+"cuckoo","cucumber","cud","cuddle","cuddlesome",
+"cuddly","cudgel","cue","cuff","cuffs",
+"cuirass","cuisine","culinary","cull","cullender",
+"culminate","culmination","culotte","culottes","culpable",
+"culprit","cult","cultivable","cultivate","cultivated",
+"cultivation","cultivator","cultural","culture","cultured",
+"culvert","cumber","cumbersome","cumin","cummerbund",
+"cumulative","cumulonimbus","cumulus","cuneiform","cunnilingus",
+"cunning","cunt","cup","cupbearer","cupboard",
+"cupid","cupidity","cupola","cuppa","cupping",
+"cupric","cur","curable","curacy","curate",
+"curative","curator","curb","curd","curdle",
+"cure","curettage","curfew","curia","curio",
+"curiosity","curious","curl","curler","curlew",
+"curlicue","curling","curly","curlycue","curmudgeon",
+"currant","currency","current","curriculum","currish",
+"curry","curse","cursed","cursive","cursory",
+"curt","curtail","curtain","curtains","curtsey",
+"curtsy","curvaceous","curvacious","curvature","curve",
+"cushion","cushy","cusp","cuspidor","cuss",
+"cussed","custard","custodial","custodian","custody",
+"custom","customary","customer","customs","cut",
+"cutaway","cutback","cuticle","cutlass","cutler",
+"cutlery","cutlet","cutoff","cutout","cutpurse",
+"cutter","cutthroat","cutting","cuttlefish","cutworm",
+"cwm","cwt","cyanide","cybernetics","cyclamate",
+"cyclamen","cycle","cyclic","cyclist","cyclone",
+"cyclopaedia","cyclopedia","cyclostyle","cyclotron","cyder",
+"cygnet","cylinder","cymbal","cynic","cynical",
+"cynicism","cynosure","cypher","cypress","cyrillic",
+"cyst","cystitis","cytology","czar","czarina",
+"czech","dab","dabble","dabchick","dabs",
+"dace","dachshund","dactyl","dad","daddy",
+"dado","daemon","daffodil","daft","dagger",
+"dago","daguerreotype","dahlia","daily","dainty",
+"daiquiri","dairy","dairying","dairymaid","dairyman",
+"dais","daisy","dale","dalliance","dally",
+"dalmation","dam","damage","damages","damascene",
+"damask","damn","damnable","damnation","damnedest",
+"damning","damocles","damp","dampen","damper",
+"dampish","damsel","damson","dance","dandelion",
+"dander","dandified","dandle","dandruff","dandy",
+"danger","dangerous","dangle","dank","dapper",
+"dappled","dare","daredevil","daresay","daring",
+"dark","darken","darkey","darkroom","darky",
+"darling","darn","darning","dart","dartboard",
+"dartmoor","darts","dash","dashboard","dashed",
+"dashing","data","date","dated","dateless",
+"dateline","dates","dative","daub","daughter",
+"daunt","dauntless","dauphin","davit","dawdle",
+"dawn","day","dayboy","daybreak","daydream",
+"daylight","dayroom","days","daytime","daze",
+"dazzle","ddt","deacon","dead","deaden",
+"deadline","deadlock","deadly","deadpan","deadweight",
+"deaf","deafen","deal","dealer","dealing",
+"dealings","dean","deanery","dear","dearest",
+"dearie","dearly","dearth","deary","death",
+"deathbed","deathblow","deathless","deathlike","deathly",
+"deathwatch","deb","debar","debark","debase",
+"debatable","debate","debater","debauch","debauchee",
+"debauchery","debenture","debilitate","debility","debit",
+"debonair","debone","debouch","debrief","debris",
+"debt","debtor","debug","debunk","debut",
+"debutante","decade","decadence","decadent","decalogue",
+"decamp","decant","decanter","decapitate","decathlon",
+"decay","decease","deceased","deceit","deceitful",
+"deceive","decelerate","december","decencies","decency",
+"decent","decentralise","decentralize","deception","deceptive",
+"decibel","decide","decided","decidedly","deciduous",
+"decimal","decimalise","decimalize","decimate","decipher",
+"decision","decisive","deck","deckchair","deckhand",
+"declaim","declamation","declaration","declare","declared",
+"declassify","declension","declination","decline","declivity",
+"declutch","decoction","decode","decolonise","decolonize",
+"decompose","decompress","decongestant","decontaminate","decontrol",
+"decorate","decoration","decorative","decorator","decorous",
+"decorum","decoy","decrease","decree","decrepit",
+"decrepitude","decry","dedicate","dedicated","dedication",
+"deduce","deduct","deduction","deductive","deed",
+"deem","deep","deepen","deer","deerstalker",
+"def","deface","defame","default","defeat",
+"defeatism","defecate","defect","defection","defective",
+"defence","defend","defendant","defense","defensible",
+"defensive","defer","deference","defiance","defiant",
+"deficiency","deficient","deficit","defile","define",
+"definite","definitely","definition","definitive","deflate",
+"deflation","deflationary","deflect","deflection","deflower",
+"defoliant","defoliate","deforest","deform","deformation",
+"deformity","defraud","defray","defrock","defrost",
+"deft","defunct","defuse","defy","degauss",
+"degeneracy","degenerate","degeneration","degenerative","degrade",
+"degree","dehorn","dehumanise","dehumanize","dehydrate",
+"deice","deification","deify","deign","deism",
+"deity","dejected","dejection","dekko","delay",
+"delectable","delectation","delegacy","delegate","delegation",
+"delete","deleterious","deletion","delft","deliberate",
+"deliberation","deliberative","delicacy","delicate","delicatessen",
+"delicious","delight","delightful","delimit","delineate",
+"delinquency","delinquent","deliquescent","delirious","delirium",
+"deliver","deliverance","delivery","deliveryman","dell",
+"delouse","delphic","delphinium","delta","delude",
+"deluge","delusion","delusive","delve","demagnetise",
+"demagnetize","demagogic","demagogue","demagoguery","demand",
+"demanding","demarcate","demarcation","demean","demeanor",
+"demeanour","demented","demerit","demesne","demigod",
+"demijohn","demilitarise","demilitarize","demise","demist",
+"demister","demo","demob","demobilise","demobilize",
+"democracy","democrat","democratic","democratise","democratize",
+"demography","demolish","demolition","demon","demonetise",
+"demonetize","demoniacal","demonic","demonstrable","demonstrate",
+"demonstration","demonstrative","demonstrator","demoralise","demoralize",
+"demote","demotic","demur","demure","demystify",
+"den","denationalise","denationalize","denial","denier",
+"denigrate","denim","denims","denizen","denominate",
+"denomination","denominational","denominator","denotation","denote",
+"denouement","denounce","dense","density","dent",
+"dental","dentifrice","dentist","dentistry","denture",
+"dentures","denude","denunciation","deny","deodorant",
+"deodorise","deodorize","depart","departed","department",
+"departure","depend","dependable","dependant","dependence",
+"dependency","dependent","depict","depilatory","deplete",
+"deplorable","deplore","deploy","deponent","depopulate",
+"deport","deportee","deportment","depose","deposit",
+"deposition","depositor","depository","depot","deprave",
+"depravity","deprecate","deprecatory","depreciate","depreciatory",
+"depredation","depress","depressed","depression","deprivation",
+"deprive","deprived","depth","depths","deputation",
+"depute","deputise","deputize","deputy","derail",
+"derange","derby","derelict","dereliction","deride",
+"derision","derisive","derisory","derivative","derive",
+"dermatitis","dermatology","derogate","derogatory","derrick",
+"derv","dervish","des","desalinise","desalinize",
+"descale","descant","descend","descendant","descended",
+"descent","describe","description","descriptive","descry",
+"desecrate","desegregate","desensitise","desensitize","desert",
+"deserter","desertion","deserts","deserve","deservedly",
+"deserving","desiccant","desiccate","desideratum","design",
+"designate","designation","designedly","designer","designing",
+"designs","desirable","desire","desirous","desist",
+"desk","deskwork","desolate","despair","despairing",
+"despatch","despatches","desperado","desperate","desperation",
+"despicable","despise","despite","despoil","despondent",
+"despot","despotic","despotism","dessert","dessertspoon",
+"dessertspoonful","destination","destined","destiny","destitute",
+"destroy","destroyer","destruction","destructive","desuetude",
+"desultory","detach","detached","detachedly","detachment",
+"detail","detailed","detain","detainee","detect",
+"detection","detective","detector","detention","deter",
+"detergent","deteriorate","determinant","determination","determine",
+"determined","determiner","determinism","deterrent","detest",
+"dethrone","detonate","detonation","detonator","detour",
+"detract","detractor","detrain","detriment","detritus",
+"deuce","deuced","deuteronomy","devaluation","devalue",
+"devastate","devastating","develop","developer","development",
+"developmental","deviance","deviant","deviate","deviation",
+"deviationist","device","devil","devilish","devilishly",
+"devilment","devious","devise","devitalise","devitalize",
+"devoid","devolution","devolve","devote","devoted",
+"devotee","devotion","devotional","devotions","devour",
+"devout","devoutly","dew","dewdrop","dewlap",
+"dewpond","dewy","dexterity","dexterous","dextrose",
+"dhoti","dhow","diabetes","diabetic","diabolic",
+"diabolical","diacritic","diacritical","diadem","diaeresis",
+"diagnose","diagnosis","diagnostic","diagonal","diagram",
+"dial","dialect","dialectic","dialectician","dialog",
+"dialogue","diameter","diametrically","diamond","diaper",
+"diaphanous","diaphragm","diarist","diarrhea","diarrhoea",
+"diary","diaspora","diatom","diatribe","dibble",
+"dice","dicey","dichotomy","dick","dicker",
+"dickie","dicky","dickybird","dictaphone","dictate",
+"dictation","dictator","dictatorial","dictatorship","diction",
+"dictionary","dictum","did","didactic","diddle",
+"didst","die","diehard","dieresis","diet",
+"dietary","dietetic","dietetics","dietician","dietitian",
+"differ","difference","different","differential","differentiate",
+"difficult","difficulty","diffident","diffract","diffuse",
+"diffusion","dig","digest","digestion","digestive",
+"digger","digging","diggings","digit","digital",
+"dignified","dignify","dignitary","dignity","digraph",
+"digress","digression","digs","dike","dilapidated",
+"dilapidation","dilapidations","dilate","dilatory","dildo",
+"dilemma","dilettante","diligence","diligent","dill",
+"dillydally","dilute","dilution","dim","dimension",
+"dimensions","diminish","diminuendo","diminution","diminutive",
+"dimity","dimple","dimwit","din","dinar",
+"dine","diner","dingdong","dinghy","dingle",
+"dingo","dingy","dink","dinkum","dinky",
+"dinner","dinosaur","dint","diocese","dioxide",
+"dip","diphtheria","diphthong","diploma","diplomacy",
+"diplomat","diplomatic","diplomatically","diplomatist","dipper",
+"dipsomania","dipsomaniac","dipstick","dipswitch","diptych",
+"dire","direct","direction","directional","directions",
+"directive","directly","director","directorate","directorship",
+"directory","direful","dirge","dirigible","dirk",
+"dirndl","dirt","dirty","disability","disable",
+"disabled","disabuse","disadvantage","disadvantageous","disaffected",
+"disaffection","disaffiliate","disafforest","disagree","disagreeable",
+"disagreement","disallow","disappear","disappearance","disappoint",
+"disappointed","disappointing","disappointment","disapprobation","disapproval",
+"disapprove","disarm","disarmament","disarrange","disarray",
+"disassociate","disaster","disastrous","disavow","disband",
+"disbar","disbelief","disbelieve","disburden","disburse",
+"disbursement","disc","discard","discern","discerning",
+"discernment","discharge","disciple","discipleship","disciplinarian",
+"disciplinary","discipline","disclaim","disclaimer","disclose",
+"disclosure","disco","discolor","discoloration","discolour",
+"discolouration","discomfit","discomfiture","discomfort","discommode",
+"discompose","disconcert","disconnect","disconnected","disconnection",
+"disconsolate","discontent","discontented","discontinue","discontinuity",
+"discontinuous","discord","discordance","discordant","discotheque",
+"discount","discountenance","discourage","discouragement","discourse",
+"discourteous","discourtesy","discover","discovery","discredit",
+"discreditable","discreet","discrepancy","discrete","discretion",
+"discretionary","discriminate","discriminating","discrimination","discriminatory",
+"discursive","discus","discuss","discussion","disdain",
+"disdainful","disease","disembark","disembarrass","disembodied",
+"disembowel","disembroil","disenchant","disencumber","disendow",
+"disengage","disengaged","disentangle","disequilibrium","disestablish",
+"disfavor","disfavour","disfigure","disforest","disfranchise",
+"disfrock","disgorge","disgrace","disgraceful","disgruntled",
+"disguise","disgust","dish","dishabille","disharmony",
+"dishcloth","dishearten","dishes","dishevelled","dishful",
+"dishonest","dishonesty","dishonor","dishonorable","dishonour",
+"dishonourable","dishwasher","dishwater","dishy","disillusion",
+"disillusioned","disillusionment","disincentive","disinclination","disinclined",
+"disinfect","disinfectant","disinfest","disingenuous","disinherit",
+"disintegrate","disinter","disinterested","disjoint","disjointed",
+"disjunctive","disk","dislike","dislocate","dislocation",
+"dislodge","disloyal","dismal","dismantle","dismast",
+"dismay","dismember","dismiss","dismissal","dismount",
+"disobedient","disobey","disoblige","disorder","disorderly",
+"disorganise","disorganize","disorientate","disown","disparage",
+"disparate","disparity","dispassionate","dispatch","dispatches",
+"dispel","dispensable","dispensary","dispensation","dispense",
+"dispenser","dispersal","disperse","dispersion","dispirit",
+"displace","displacement","display","displease","displeasure",
+"disport","disposable","disposal","dispose","disposed",
+"disposition","dispossess","dispossessed","disproof","disproportion",
+"disproportionate","disprove","disputable","disputant","disputation",
+"disputatious","dispute","disqualification","disqualify","disquiet",
+"disquietude","disquisition","disregard","disrelish","disremember",
+"disrepair","disreputable","disrepute","disrespect","disrobe",
+"disrupt","dissatisfaction","dissatisfy","dissect","dissection",
+"dissemble","disseminate","dissension","dissent","dissenter",
+"dissenting","dissertation","disservice","dissever","dissident",
+"dissimilar","dissimilarity","dissimulate","dissipate","dissipated",
+"dissipation","dissociate","dissoluble","dissolute","dissolution",
+"dissolve","dissonance","dissonant","dissuade","distaff",
+"distal","distance","distant","distantly","distaste",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java
new file mode 100644
index 00000000000..4c72ef87526
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData3.java
@@ -0,0 +1,715 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** A list of words used by Kstem
+ */
+class KStemData3 {
+ private KStemData3() {
+ }
+ static String[] data = {
+"distasteful","distemper","distempered","distend","distension",
+"distil","distill","distillation","distiller","distillery",
+"distinct","distinction","distinctive","distinguish","distinguishable",
+"distinguished","distort","distortion","distract","distracted",
+"distraction","distrain","distraint","distrait","distraught",
+"distress","distressing","distribute","distribution","distributive",
+"distributor","district","distrust","distrustful","disturb",
+"disturbance","disturbed","disunion","disunite","disunity",
+"disuse","disused","disyllabic","disyllable","ditch",
+"dither","dithers","ditto","ditty","diuretic",
+"diurnal","divagate","divan","dive","diver",
+"diverge","divergence","divers","diverse","diversify",
+"diversion","diversionary","diversity","divert","divertimento",
+"divertissement","divest","divide","dividend","dividers",
+"divination","divine","diviner","divingboard","divinity",
+"divisible","division","divisive","divisor","divorce",
+"divot","divulge","divvy","dixie","dixieland",
+"dizzy","djinn","dna","do","dobbin",
+"doc","docile","dock","docker","docket",
+"dockyard","doctor","doctoral","doctorate","doctrinaire",
+"doctrinal","doctrine","document","documentary","documentation",
+"dodder","doddering","doddle","dodge","dodgems",
+"dodger","dodgy","dodo","doe","doer",
+"doeskin","doff","dog","dogcart","dogcatcher",
+"dogfight","dogfish","dogged","doggerel","doggie",
+"doggo","doggone","doggy","doghouse","dogie",
+"dogleg","dogma","dogmatic","dogmatics","dogmatism",
+"dogs","dogsbody","dogtooth","dogtrot","dogwood",
+"doh","doily","doings","doldrums","dole",
+"doleful","doll","dollar","dollop","dolly",
+"dolmen","dolor","dolorous","dolour","dolphin",
+"dolt","domain","dome","domed","domestic",
+"domesticate","domesticity","domicile","domiciliary","dominance",
+"dominant","dominate","domination","domineer","dominican",
+"dominion","domino","dominoes","don","donate",
+"donation","donjon","donkey","donkeywork","donnish",
+"donor","doodle","doodlebug","doom","doomsday",
+"door","doorbell","doorframe","doorkeeper","doorknob",
+"doorknocker","doorman","doormat","doornail","doorplate",
+"doorscraper","doorstep","doorstopper","doorway","dope",
+"dopey","dopy","doric","dormant","dormer",
+"dormitory","dormouse","dorsal","dory","dosage",
+"dose","doss","dosser","dosshouse","dossier",
+"dost","dot","dotage","dote","doth",
+"doting","dottle","dotty","double","doubles",
+"doublet","doublethink","doubloon","doubly","doubt",
+"doubtful","doubtless","douche","dough","doughnut",
+"doughty","doughy","dour","douse","dove",
+"dovecote","dovetail","dowager","dowdy","dowel",
+"dower","down","downbeat","downcast","downdraft",
+"downdraught","downer","downfall","downgrade","downhearted",
+"downhill","downpour","downright","downstage","downstairs",
+"downstream","downtown","downtrodden","downward","downwards",
+"downwind","downy","dowry","dowse","doxology",
+"doyen","doyley","doze","dozen","dozy",
+"dpt","drab","drabs","drachm","drachma",
+"draconian","draft","draftee","draftsman","drafty",
+"drag","draggled","draggy","dragnet","dragoman",
+"dragon","dragonfly","dragoon","drain","drainage",
+"drainpipe","drake","dram","drama","dramatic",
+"dramatics","dramatise","dramatist","dramatize","drank",
+"drape","draper","drapery","drastic","drat",
+"draught","draughtboard","draughts","draughtsman","draughty",
+"draw","drawback","drawbridge","drawer","drawers",
+"drawing","drawl","drawn","drawstring","dray",
+"dread","dreadful","dreadfully","dreadnaught","dreadnought",
+"dream","dreamboat","dreamer","dreamland","dreamless",
+"dreamlike","dreamy","drear","dreary","dredge",
+"dredger","dregs","drench","dress","dressage",
+"dresser","dressing","dressmaker","dressy","drew",
+"dribble","driblet","dribs","drier","drift",
+"driftage","drifter","driftnet","driftwood","drill",
+"drily","drink","drinkable","drinker","drip",
+"dripping","drive","drivel","driver","driveway",
+"driving","drizzle","drogue","droll","drollery",
+"dromedary","drone","drool","droop","drop",
+"dropkick","droplet","dropout","dropper","droppings",
+"drops","dropsy","dross","drought","drove",
+"drover","drown","drowse","drowsy","drub",
+"drudge","drudgery","drug","drugget","druggist",
+"drugstore","druid","drum","drumbeat","drumfire",
+"drumhead","drummer","drumstick","drunk","drunkard",
+"drunken","drupe","dry","dryad","dryer",
+"dual","dub","dubbin","dubiety","dubious",
+"ducal","ducat","duchess","duchy","duck",
+"duckboards","duckling","ducks","duckweed","ducky",
+"duct","ductile","dud","dude","dudgeon",
+"duds","due","duel","duenna","dues",
+"duet","duff","duffel","duffer","duffle",
+"dug","dugout","duke","dukedom","dukes",
+"dulcet","dulcimer","dull","dullard","duly",
+"dumb","dumbbell","dumbfound","dumbwaiter","dumfound",
+"dummy","dump","dumper","dumpling","dumps",
+"dumpy","dun","dunce","dunderhead","dung",
+"dungaree","dungarees","dungeon","dunghill","dunk",
+"duo","duodecimal","duodenum","duologue","dupe",
+"duplex","duplicate","duplicator","duplicity","durable",
+"duration","durbar","duress","durex","during",
+"durst","dusk","dusky","dust","dustbin",
+"dustbowl","dustcart","dustcoat","duster","dustman",
+"dustpan","dustsheet","dustup","dusty","dutch",
+"dutiable","dutiful","duty","duvet","dwarf",
+"dwell","dwelling","dwindle","dyarchy","dye",
+"dyestuff","dyeworks","dyke","dynamic","dynamics",
+"dynamism","dynamite","dynamo","dynasty","dysentery",
+"dyslexia","dyspepsia","dyspeptic","each","eager",
+"eagle","eaglet","ear","earache","eardrum",
+"eared","earful","earl","earliest","earlobe",
+"early","earmark","earmuff","earn","earnest",
+"earnings","earphone","earpiece","earplug","earring",
+"earshot","earth","earthbound","earthen","earthenware",
+"earthling","earthly","earthnut","earthquake","earthshaking",
+"earthwork","earthworm","earthy","earwax","earwig",
+"ease","easel","easily","east","eastbound",
+"easter","easterly","eastern","easterner","easternmost",
+"easy","easygoing","eat","eatable","eatables",
+"eater","eats","eaves","eavesdrop","ebb",
+"ebony","ebullience","ebullient","eccentric","eccentricity",
+"ecclesiastic","ecclesiastical","ecg","echelon","echo",
+"eclectic","eclipse","ecliptic","eclogue","ecological",
+"ecologically","ecology","economic","economical","economically",
+"economics","economise","economist","economize","economy",
+"ecosystem","ecstasy","ecstatic","ect","ectoplasm",
+"ecumenical","ecumenicalism","eczema","edam","eddy",
+"edelweiss","eden","edge","edgeways","edging",
+"edgy","edible","edibles","edict","edification",
+"edifice","edify","edit","edition","editor",
+"editorial","editorialise","editorialize","educate","educated",
+"education","educational","educationist","educator","educe",
+"eec","eeg","eel","eerie","efface",
+"effect","effective","effectively","effectiveness","effectives",
+"effects","effectual","effectually","effectuate","effeminacy",
+"effeminate","effendi","effervesce","effete","efficacious",
+"efficacy","efficiency","efficient","effigy","efflorescence",
+"effluent","efflux","effort","effortless","effrontery",
+"effulgence","effulgent","effusion","effusive","eft",
+"egalitarian","egg","eggcup","egghead","eggnog",
+"eggplant","eggshell","egis","eglantine","ego",
+"egocentric","egoism","egoist","egotism","egotist",
+"egregious","egress","egret","eiderdown","eight",
+"eighteen","eightsome","eighty","eisteddfod","either",
+"ejaculate","ejaculation","eject","ejector","eke",
+"ekg","elaborate","elaboration","eland","elapse",
+"elastic","elasticity","elastoplast","elate","elated",
+"elation","elbow","elbowroom","elder","elderberry",
+"elderflower","elderly","eldest","elect","election",
+"electioneer","electioneering","elective","elector","electoral",
+"electorate","electric","electrical","electrician","electricity",
+"electrify","electrocardiogram","electrocardiograph","electrocute","electrode",
+"electroencephalogram","electroencephalograph","electrolysis","electrolyte","electron",
+"electronic","electronics","electroplate","eleemosynary","elegant",
+"elegiac","elegy","element","elemental","elementary",
+"elements","elephant","elephantiasis","elephantine","elevate",
+"elevated","elevation","elevator","eleven","elevenses",
+"elf","elfin","elfish","elicit","elide",
+"eligible","eliminate","elite","elitism","elixir",
+"elizabethan","elk","elkhound","ellipse","ellipsis",
+"elliptic","elm","elocution","elocutionary","elocutionist",
+"elongate","elongation","elope","eloquence","eloquent",
+"else","elsewhere","elucidate","elucidatory","elude",
+"elusive","elver","elves","elvish","elysian",
+"elysium","emaciate","emanate","emancipate","emancipation",
+"emasculate","embalm","embankment","embargo","embark",
+"embarkation","embarrass","embarrassment","embassy","embattled",
+"embed","embellish","ember","embezzle","embitter",
+"emblazon","emblem","emblematic","embodiment","embody",
+"embolden","embolism","embonpoint","embosomed","emboss",
+"embowered","embrace","embrasure","embrocation","embroider",
+"embroidery","embroil","embryo","embryonic","emend",
+"emendation","emerald","emerge","emergence","emergency",
+"emergent","emeritus","emery","emetic","emigrant",
+"emigrate","eminence","eminent","eminently","emir",
+"emirate","emissary","emission","emit","emmentaler",
+"emmenthaler","emollient","emolument","emote","emotion",
+"emotional","emotionalism","emotionally","emotive","empanel",
+"empathy","emperor","emphasis","emphasise","emphasize",
+"emphatic","emphatically","emphysema","empire","empirical",
+"empiricism","emplacement","emplane","employ","employable",
+"employee","employer","employment","emporium","empower",
+"empress","emptily","empty","empurpled","empyreal",
+"empyrean","emu","emulate","emulation","emulsify",
+"emulsion","enable","enabling","enact","enactment",
+"enamel","enamelware","enamored","enamoured","encamp",
+"encampment","encapsulate","encase","encaustic","encephalitis",
+"enchain","enchant","enchanter","enchanting","enchantment",
+"encipher","encircle","enclave","enclose","enclosure",
+"encode","encomium","encompass","encore","encounter",
+"encourage","encouragement","encroach","encroachment","encrust",
+"encumber","encumbrance","encyclical","encyclopaedia","encyclopaedic",
+"encyclopedia","encyclopedic","end","endanger","endear",
+"endearing","endearment","endeavor","endeavour","endemic",
+"ending","endive","endless","endocrine","endorse",
+"endow","endowment","endpaper","endurance","endure",
+"enduring","endways","enema","enemy","energetic",
+"energize","energy","enervate","enfeeble","enfilade",
+"enfold","enforce","enfranchise","engage","engaged",
+"engagement","engaging","engender","engine","engineer",
+"engineering","english","englishman","engraft","engrave",
+"engraving","engross","engrossing","engulf","enhance",
+"enigma","enigmatic","enjoin","enjoy","enjoyable",
+"enjoyment","enkindle","enlarge","enlargement","enlighten",
+"enlightened","enlightenment","enlist","enliven","enmesh",
+"enmity","ennoble","ennui","enormity","enormous",
+"enormously","enough","enplane","enquire","enquiring",
+"enquiry","enrage","enrapture","enrich","enrol",
+"enroll","enrollment","enrolment","ensanguined","ensconce",
+"ensemble","enshrine","enshroud","ensign","enslave",
+"ensnare","ensue","ensure","entail","entangle",
+"entanglement","entente","enter","enteritis","enterprise",
+"enterprising","entertain","entertainer","entertaining","entertainment",
+"enthral","enthrall","enthrone","enthroned","enthuse",
+"enthusiasm","enthusiast","entice","enticement","entire",
+"entirety","entitle","entity","entomb","entomology",
+"entourage","entrails","entrain","entrance","entrant",
+"entrap","entreat","entreaty","entrench","entrenched",
+"entrenchment","entrepreneur","entresol","entropy","entrust",
+"entry","entwine","enumerate","enunciate","enunciation",
+"envelop","envenom","enviable","envious","environed",
+"environment","environmental","environmentalist","environs","envisage",
+"envoi","envoy","envy","enzyme","eon",
+"epaulet","epaulette","ephemeral","epic","epicenter",
+"epicentre","epicure","epicurean","epidemic","epidermis",
+"epidiascope","epiglottis","epigram","epigrammatic","epilepsy",
+"epileptic","epilogue","epiphany","episcopacy","episcopal",
+"episcopalian","episode","episodic","epistle","epistolary",
+"epitaph","epithet","epitome","epitomise","epitomize",
+"epoch","eponymous","equability","equable","equal",
+"equalise","equalitarian","equality","equalize","equally",
+"equanimity","equate","equation","equator","equatorial",
+"equerry","equestrian","equidistant","equilateral","equilibrium",
+"equine","equinoctial","equinox","equip","equipage",
+"equipment","equipoise","equitable","equitation","equities",
+"equity","equivalence","equivalent","equivocal","equivocate",
+"equivocation","era","eradicate","eradicator","erase",
+"eraser","erasure","ere","erect","erectile",
+"erection","eremite","erg","ergo","ergonomics",
+"ermine","erode","erogenous","erosion","erotic",
+"erotica","eroticism","err","errand","errant",
+"erratic","erratum","erroneous","error","ersatz",
+"erse","eructation","erudite","erupt","eruption",
+"erysipelas","escalate","escalator","escalope","escapade",
+"escape","escapee","escapement","escapism","escapology",
+"escarpment","eschatology","eschew","escort","escritoire",
+"escutcheon","eskimo","esophagus","esoteric","esp",
+"espalier","especial","especially","esperanto","espionage",
+"esplanade","espousal","espouse","espresso","espy",
+"essay","essence","essential","essentially","establish",
+"establishment","estaminet","estate","esteem","esthete",
+"esthetic","esthetics","estimable","estimate","estimation",
+"estimator","estrange","estrangement","estrogen","estuary",
+"etch","etching","eternal","eternity","ether",
+"ethereal","ethic","ethical","ethically","ethics",
+"ethnic","ethnically","ethnographer","ethnography","ethnologist",
+"ethnology","ethos","ethyl","etiolate","etiology",
+"etiquette","etymologist","etymology","eucalyptus","eucharist",
+"euclidean","euclidian","eugenic","eugenics","eulogise",
+"eulogist","eulogistic","eulogize","eulogy","eunuch",
+"euphemism","euphemistic","euphonious","euphonium","euphony",
+"euphoria","euphuism","eurasian","eureka","eurhythmic",
+"eurhythmics","eurocrat","eurodollar","eurythmic","eurythmics",
+"euthanasia","evacuate","evacuee","evade","evaluate",
+"evanescent","evangelic","evangelical","evangelise","evangelist",
+"evangelize","evaporate","evasion","evasive","eve",
+"even","evening","evenings","evens","evensong",
+"event","eventful","eventide","eventual","eventuality",
+"eventually","eventuate","ever","evergreen","everlasting",
+"everlastingly","evermore","every","everybody","everyday",
+"everything","everywhere","evict","evidence","evident",
+"evidently","evil","evildoer","evince","eviscerate",
+"evocative","evoke","evolution","evolutionary","evolve",
+"ewe","ewer","exacerbate","exact","exacting",
+"exaction","exactly","exaggerate","exaggeration","exalt",
+"exaltation","exalted","exam","examination","examine",
+"example","exasperate","exasperation","excavate","excavation",
+"excavator","exceed","exceedingly","excel","excellence",
+"excellency","excellent","excelsior","except","excepted",
+"excepting","exception","exceptionable","exceptional","excerpt",
+"excess","excesses","excessive","exchange","exchequer",
+"excise","excision","excitable","excite","excited",
+"excitement","exciting","exclaim","exclamation","exclamatory",
+"exclude","excluding","exclusion","exclusive","exclusively",
+"excogitate","excommunicate","excommunication","excoriate","excrement",
+"excrescence","excreta","excrete","excretion","excruciating",
+"exculpate","excursion","excursionist","excusable","excuse",
+"execrable","execrate","executant","execute","execution",
+"executioner","executive","executor","exegesis","exemplary",
+"exemplification","exemplify","exempt","exemption","exercise",
+"exercises","exert","exertion","exeunt","exhalation",
+"exhale","exhaust","exhaustion","exhaustive","exhibit",
+"exhibition","exhibitionism","exhibitor","exhilarate","exhilarating",
+"exhort","exhortation","exhume","exigency","exigent",
+"exiguous","exile","exist","existence","existent",
+"existential","existentialism","existing","exit","exodus",
+"exogamy","exonerate","exorbitant","exorcise","exorcism",
+"exorcist","exorcize","exotic","expand","expanse",
+"expansion","expansive","expatiate","expatriate","expect",
+"expectancy","expectant","expectation","expectations","expectorate",
+"expediency","expedient","expedite","expedition","expeditionary",
+"expeditious","expel","expend","expendable","expenditure",
+"expense","expenses","expensive","experience","experienced",
+"experiment","experimental","experimentation","expert","expertise",
+"expiate","expiration","expire","explain","explanation",
+"explanatory","expletive","explicable","explicate","explicit",
+"explode","exploded","exploit","exploration","exploratory",
+"explore","explosion","explosive","expo","exponent",
+"exponential","export","exportation","exporter","expose",
+"exposition","expostulate","exposure","expound","express",
+"expression","expressionism","expressionless","expressive","expressly",
+"expressway","expropriate","expulsion","expunge","expurgate",
+"exquisite","extant","extemporaneous","extempore","extemporise",
+"extemporize","extend","extension","extensive","extent",
+"extenuate","extenuation","exterior","exteriorise","exteriorize",
+"exterminate","external","externalise","externalize","externally",
+"externals","exterritorial","extinct","extinction","extinguish",
+"extinguisher","extirpate","extol","extort","extortion",
+"extortionate","extortions","extra","extract","extraction",
+"extracurricular","extraditable","extradite","extrajudicial","extramarital",
+"extramural","extraneous","extraordinarily","extraordinary","extrapolate",
+"extraterrestrial","extraterritorial","extravagance","extravagant","extravaganza",
+"extravert","extreme","extremely","extremism","extremities",
+"extremity","extricate","extrinsic","extrovert","extrude",
+"exuberance","exuberant","exude","exult","exultant",
+"exultation","eye","eyeball","eyebrow","eyecup",
+"eyeful","eyeglass","eyeglasses","eyelash","eyelet",
+"eyelid","eyeliner","eyepiece","eyes","eyeshot",
+"eyesight","eyesore","eyestrain","eyetooth","eyewash",
+"eyewitness","eyot","eyrie","eyry","fabian",
+"fable","fabled","fabric","fabricate","fabrication",
+"fabulous","fabulously","face","facecloth","faceless",
+"facet","facetious","facial","facile","facilitate",
+"facilities","facility","facing","facings","facsimile",
+"fact","faction","factious","factitious","factor",
+"factorial","factorise","factorize","factory","factotum",
+"factual","faculty","fad","fade","faeces",
+"faerie","faery","fag","fagged","faggot",
+"fagot","fahrenheit","faience","fail","failing",
+"failure","fain","faint","fair","fairground",
+"fairly","fairway","fairy","fairyland","faith",
+"faithful","faithfully","faithless","fake","fakir",
+"falcon","falconer","falconry","fall","fallacious",
+"fallacy","fallen","fallible","fallout","fallow",
+"falls","false","falsehood","falsetto","falsies",
+"falsify","falsity","falter","fame","famed",
+"familial","familiar","familiarise","familiarity","familiarize",
+"familiarly","family","famine","famish","famished",
+"famous","famously","fan","fanatic","fanaticism",
+"fancier","fancies","fanciful","fancy","fancywork",
+"fandango","fanfare","fang","fanlight","fanny",
+"fantasia","fantastic","fantasy","far","faraway",
+"farce","fare","farewell","farfetched","farinaceous",
+"farm","farmer","farmhand","farmhouse","farming",
+"farmyard","farrago","farrier","farrow","farsighted",
+"fart","farther","farthest","farthing","fascia",
+"fascinate","fascinating","fascination","fascism","fascist",
+"fashion","fashionable","fast","fasten","fastener",
+"fastening","fastidious","fastness","fat","fatal",
+"fatalism","fatalist","fatality","fatally","fate",
+"fated","fateful","fates","fathead","father",
+"fatherhood","fatherly","fathom","fathomless","fatigue",
+"fatigues","fatless","fatted","fatten","fatty",
+"fatuity","fatuous","faucet","fault","faultfinding",
+"faultless","faulty","faun","fauna","favor",
+"favorable","favored","favorite","favoritism","favour",
+"favourable","favoured","favourite","favouritism","favours",
+"fawn","fay","faze","fbi","fealty",
+"fear","fearful","fearless","fearsome","feasible",
+"feast","feat","feather","featherbed","featherbrained",
+"featherweight","feathery","feature","featureless","features",
+"febrile","february","feces","feckless","fecund",
+"fed","federal","federalism","federalist","federate",
+"federation","fee","feeble","feebleminded","feed",
+"feedback","feedbag","feeder","feel","feeler",
+"feeling","feelings","feet","feign","feint",
+"feldspar","felicitate","felicitous","felicity","feline",
+"fell","fellah","fellatio","fellow","fellowship",
+"felon","felony","felspar","felt","felucca",
+"fem","female","feminine","femininity","feminism",
+"feminist","femur","fen","fence","fencer",
+"fencing","fend","fender","fennel","feoff",
+"feral","ferment","fermentation","fern","ferocious",
+"ferocity","ferret","ferroconcrete","ferrous","ferrule",
+"ferry","ferryboat","ferryman","fertile","fertilise",
+"fertility","fertilize","fertilizer","ferule","fervent",
+"fervid","fervor","fervour","festal","fester",
+"festival","festive","festivity","festoon","fetal",
+"fetch","fetching","fete","fetid","fetish",
+"fetishism","fetishist","fetlock","fetter","fettle",
+"fetus","feud","feudal","feudalism","feudatory",
+"fever","fevered","feverish","feverishly","few",
+"fey","fez","fiasco","fiat","fib",
+"fiber","fiberboard","fiberglass","fibre","fibreboard",
+"fibreglass","fibrositis","fibrous","fibula","fichu",
+"fickle","fiction","fictional","fictionalisation","fictionalization",
+"fictitious","fiddle","fiddler","fiddlesticks","fiddling",
+"fidelity","fidget","fidgets","fidgety","fie",
+"fief","field","fielder","fieldwork","fiend",
+"fiendish","fiendishly","fierce","fiery","fiesta",
+"fife","fifteen","fifth","fifty","fig",
+"fight","fighter","figment","figurative","figure",
+"figured","figurehead","figures","figurine","filament",
+"filbert","filch","file","filet","filial",
+"filibuster","filigree","filings","fill","filler",
+"fillet","filling","fillip","filly","film",
+"filmable","filmstrip","filmy","filter","filth",
+"filthy","fin","finable","final","finale",
+"finalise","finalist","finality","finalize","finally",
+"finance","finances","financial","financially","financier",
+"finch","find","finder","finding","fine",
+"fineable","finely","finery","finesse","finger",
+"fingerboard","fingering","fingernail","fingerplate","fingerpost",
+"fingerprint","fingerstall","fingertip","finicky","finis",
+"finish","finished","finite","fink","fiord",
+"fir","fire","firearm","fireball","firebomb",
+"firebox","firebrand","firebreak","firebrick","firebug",
+"fireclay","firecracker","firedamp","firedog","firefly",
+"fireguard","firelight","firelighter","fireman","fireplace",
+"firepower","fireproof","fireside","firestorm","firetrap",
+"firewalking","firewatcher","firewater","firewood","firework",
+"fireworks","firkin","firm","firmament","first",
+"firstborn","firstfruits","firsthand","firstly","firth",
+"firtree","fiscal","fish","fishcake","fisherman",
+"fishery","fishing","fishmonger","fishplate","fishwife",
+"fishy","fissile","fission","fissionable","fissure",
+"fist","fisticuffs","fistula","fit","fitful",
+"fitment","fitness","fitted","fitter","fitting",
+"five","fiver","fives","fix","fixation",
+"fixative","fixed","fixedly","fixity","fixture",
+"fizz","fizzle","fizzy","fjord","flabbergast",
+"flabby","flaccid","flag","flagellant","flagellate",
+"flageolet","flagon","flagpole","flagrancy","flagrant",
+"flagship","flagstaff","flagstone","flail","flair",
+"flak","flake","flaky","flambeau","flamboyant",
+"flame","flamenco","flaming","flamingo","flammable",
+"flan","flange","flank","flannel","flannelette",
+"flannels","flap","flapjack","flapper","flare",
+"flared","flares","flash","flashback","flashbulb",
+"flashcube","flasher","flashgun","flashlight","flashy",
+"flask","flat","flatcar","flatfish","flatfoot",
+"flatiron","flatlet","flatly","flatten","flatter",
+"flattery","flattop","flatulence","flaunt","flautist",
+"flavor","flavoring","flavour","flavouring","flaw",
+"flawless","flax","flaxen","flay","flea",
+"fleabag","fleabite","fleapit","fleck","fledged",
+"fledgling","flee","fleece","fleecy","fleet",
+"fleeting","flesh","fleshings","fleshly","fleshpot",
+"fleshy","flew","flex","flexible","flibbertigibbet",
+"flick","flicker","flicks","flier","flies",
+"flight","flightless","flighty","flimsy","flinch",
+"fling","flint","flintlock","flinty","flip",
+"flippancy","flippant","flipper","flipping","flirt",
+"flirtation","flirtatious","flit","flitch","flivver",
+"float","floatation","floating","flock","floe",
+"flog","flogging","flood","floodgate","floodlight",
+"floor","floorboard","flooring","floorwalker","floosy",
+"floozy","flop","floppy","flora","floral",
+"floriculture","florid","florin","florist","floss",
+"flotation","flotilla","flounce","flounder","flour",
+"flourish","flourmill","floury","flout","flow",
+"flower","flowerbed","flowered","flowering","flowerless",
+"flowerpot","flowery","flowing","flown","flu",
+"fluctuate","flue","fluency","fluent","fluff",
+"fluffy","fluid","fluidity","fluke","flukey",
+"fluky","flume","flummery","flummox","flung",
+"flunk","flunkey","flunky","fluorescent","fluoridate",
+"fluoride","fluorine","flurry","flush","flushed",
+"fluster","flute","fluting","flutist","flutter",
+"fluvial","flux","fly","flyaway","flyblown",
+"flyby","flycatcher","flyer","flying","flyleaf",
+"flyover","flypaper","flypast","flysheet","flyswatter",
+"flytrap","flyweight","flywheel","flywhisk","foal",
+"foam","fob","focal","focus","fodder",
+"foe","foeman","foetal","foetus","fog",
+"fogbank","fogbound","fogey","foggy","foghorn",
+"fogy","foible","foil","foist","fold",
+"foldaway","folder","foliage","folio","folk",
+"folklore","folklorist","folks","folksy","folktale",
+"folkway","follicle","follow","follower","following",
+"folly","foment","fomentation","fond","fondant",
+"fondle","fondly","fondu","fondue","font",
+"food","foodstuff","fool","foolery","foolhardy",
+"foolish","foolproof","foolscap","foot","footage",
+"football","footbath","footboard","footbridge","footer",
+"footfall","foothill","foothold","footing","footle",
+"footlights","footling","footloose","footman","footnote",
+"footpad","footpath","footplate","footprint","footrace",
+"footsie","footslog","footsore","footstep","footstool",
+"footsure","footwear","footwork","fop","foppish",
+"for","forage","foray","forbear","forbearance",
+"forbearing","forbid","forbidden","forbidding","force",
+"forced","forceful","forcemeat","forceps","forces",
+"forcible","forcibly","ford","fore","forearm",
+"forebode","foreboding","forecast","forecastle","foreclose",
+"foreclosure","forecourt","foredoomed","forefather","forefinger",
+"forefoot","forefront","forego","foregoing","foreground",
+"forehand","forehead","foreign","foreigner","foreknowledge",
+"foreland","foreleg","forelock","foreman","foremost",
+"forename","forenoon","forensic","foreordain","forepart",
+"foreplay","forerunner","foresail","foresee","foreseeable",
+"foreshadow","foreshore","foreshorten","foresight","foreskin",
+"forest","forestall","forester","forestry","foreswear",
+"foretaste","foretell","forethought","forever","forewarn",
+"forewent","forewoman","foreword","forfeit","forfeiture",
+"forgather","forgave","forge","forger","forgery",
+"forget","forgetful","forging","forgivable","forgive",
+"forgiveable","forgiveness","forgiving","forgo","fork",
+"forked","forkful","forklift","forlorn","form",
+"formal","formaldehyde","formalin","formalise","formalism",
+"formality","formalize","format","formation","formative",
+"formbook","former","formerly","formica","formidable",
+"formless","formula","formulaic","formulate","formulation",
+"fornicate","fornication","forrader","forsake","forsooth",
+"forswear","forsythia","fort","forte","forth",
+"forthcoming","forthright","forthwith","fortieth","fortification",
+"fortify","fortissimo","fortitude","fortnight","fortnightly",
+"fortress","fortuitous","fortunate","fortunately","fortune",
+"forty","forum","forward","forwarding","forwardly",
+"forwardness","forwent","foss","fosse","fossil",
+"fossilise","fossilize","foster","fought","foul",
+"found","foundation","foundations","founder","foundling",
+"foundry","fount","fountain","fountainhead","four",
+"foureyes","fourpenny","fours","foursquare","fourteen",
+"fourth","fowl","fox","foxglove","foxhole",
+"foxhound","foxhunt","foxtrot","foxy","foyer",
+"fracas","fraction","fractional","fractionally","fractious",
+"fracture","fragile","fragment","fragmentary","fragmentation",
+"fragrance","fragrant","frail","frailty","frame",
+"frames","framework","franc","franchise","franciscan",
+"frank","frankfurter","frankincense","franklin","frankly",
+"frantic","fraternal","fraternise","fraternity","fraternize",
+"fratricide","frau","fraud","fraudulence","fraudulent",
+"fraught","fraulein","fray","frazzle","freak",
+"freakish","freckle","free","freebee","freebie",
+"freeboard","freebooter","freeborn","freedman","freedom",
+"freehand","freehanded","freehold","freeholder","freelance",
+"freeload","freely","freeman","freemason","freemasonry",
+"freepost","freesia","freestanding","freestone","freestyle",
+"freethinker","freeway","freewheel","freewheeling","freewill",
+"freeze","freezer","freezing","freight","freighter",
+"freightliner","frenchman","frenetic","frenzied","frenzy",
+"frequency","frequent","fresco","fresh","freshen",
+"fresher","freshet","freshly","freshwater","fret",
+"fretful","fretsaw","fretwork","freudian","friable",
+"friar","friary","fricassee","fricative","friction",
+"friday","fridge","friend","friendless","friendly",
+"friends","friendship","frier","frieze","frig",
+"frigate","frigging","fright","frighten","frightened",
+"frightful","frightfully","frigid","frigidity","frill",
+"frilled","frills","frilly","fringe","frippery",
+"frisbee","frisian","frisk","frisky","frisson",
+"fritter","frivolity","frivolous","frizz","frizzle",
+"frizzy","fro","frock","frog","frogged",
+"frogman","frogmarch","frogspawn","frolic","frolicsome",
+"from","frond","front","frontage","frontal",
+"frontbench","frontier","frontiersman","frontispiece","frost",
+"frostbite","frostbitten","frostbound","frosting","frosty",
+"froth","frothy","frown","frowst","frowsty",
+"frowsy","frowzy","froze","frozen","frs",
+"fructification","fructify","frugal","frugality","fruit",
+"fruitcake","fruiterer","fruitful","fruition","fruitless",
+"fruits","fruity","frump","frustrate","frustration",
+"fry","fryer","fuchsia","fuck","fucker",
+"fucking","fuddle","fudge","fuehrer","fuel",
+"fug","fugitive","fugue","fuhrer","fulcrum",
+"fulfil","fulfill","fulfillment","fulfilment","full",
+"fullback","fuller","fully","fulmar","fulminate",
+"fulmination","fulness","fulsome","fumble","fume",
+"fumes","fumigate","fun","function","functional",
+"functionalism","functionalist","functionary","fund","fundamental",
+"fundamentalism","fundamentally","funds","funeral","funerary",
+"funereal","funfair","fungicide","fungoid","fungous",
+"fungus","funicular","funk","funky","funnel",
+"funnies","funnily","funny","fur","furbelow",
+"furbish","furious","furiously","furl","furlong",
+"furlough","furnace","furnish","furnishings","furniture",
+"furore","furrier","furrow","furry","further",
+"furtherance","furthermore","furthermost","furthest","furtive",
+"fury","furze","fuse","fused","fuselage",
+"fusilier","fusillade","fusion","fuss","fusspot",
+"fussy","fustian","fusty","futile","futility",
+"future","futureless","futures","futurism","futuristic",
+"futurity","fuzz","fuzzy","gab","gabardine",
+"gabble","gaberdine","gable","gabled","gad",
+"gadabout","gadfly","gadget","gadgetry","gaelic",
+"gaff","gaffe","gaffer","gag","gaga",
+"gaggle","gaiety","gaily","gain","gainful",
+"gainfully","gainsay","gait","gaiter","gal",
+"gala","galactic","galantine","galaxy","gale",
+"gall","gallant","gallantry","galleon","gallery",
+"galley","gallic","gallicism","gallivant","gallon",
+"gallop","galloping","gallows","gallstone","galore",
+"galosh","galumph","galvanic","galvanise","galvanism",
+"galvanize","gambit","gamble","gamboge","gambol",
+"game","gamecock","gamekeeper","games","gamesmanship",
+"gamey","gamma","gammon","gammy","gamp",
+"gamut","gamy","gander","gang","ganger",
+"gangling","ganglion","gangplank","gangrene","gangster",
+"gangway","gannet","gantry","gaol","gaolbird",
+"gaoler","gap","gape","gapes","garage",
+"garb","garbage","garble","garden","gardenia",
+"gardening","gargantuan","gargle","gargoyle","garish",
+"garland","garlic","garment","garner","garnet",
+"garnish","garret","garrison","garrote","garrotte",
+"garrulity","garrulous","garter","gas","gasbag",
+"gaseous","gash","gasholder","gasify","gasket",
+"gaslight","gasman","gasolene","gasoline","gasp",
+"gassy","gastric","gastritis","gastroenteritis","gastronomy",
+"gasworks","gat","gate","gatecrash","gatehouse",
+"gatekeeper","gatepost","gateway","gather","gathering",
+"gauche","gaucherie","gaucho","gaudy","gauge",
+"gaunt","gauntlet","gauze","gave","gavel",
+"gavotte","gawk","gawky","gawp","gay",
+"gayness","gaze","gazebo","gazelle","gazette",
+"gazetteer","gazump","gce","gear","gearbox",
+"gecko","gee","geese","geezer","geisha",
+"gel","gelatine","gelatinous","geld","gelding",
+"gelignite","gem","gemini","gen","gendarme",
+"gender","gene","genealogist","genealogy","genera",
+"general","generalisation","generalise","generalissimo","generality",
+"generalization","generalize","generally","generate","generation",
+"generative","generator","generic","generous","genesis",
+"genetic","geneticist","genetics","genial","geniality",
+"genie","genital","genitals","genitive","genius",
+"genocide","genre","gent","genteel","gentian",
+"gentile","gentility","gentle","gentlefolk","gentleman",
+"gentlemanly","gentlewoman","gently","gentry","gents",
+"genuflect","genuine","genus","geocentric","geographer",
+"geography","geologist","geology","geometric","geometry",
+"geophysics","geopolitics","georgette","geranium","geriatric",
+"geriatrician","geriatrics","germ","germane","germanic",
+"germicide","germinal","germinate","gerontology","gerrymander",
+"gerund","gestalt","gestapo","gestation","gesticulate",
+"gesture","get","getaway","getup","geum",
+"gewgaw","geyser","gharry","ghastly","ghat",
+"ghaut","ghee","gherkin","ghetto","ghi",
+"ghost","ghostly","ghoul","ghoulish","ghq",
+"ghyll","giant","giantess","gibber","gibberish",
+"gibbet","gibbon","gibbous","gibe","giblets",
+"giddy","gift","gifted","gig","gigantic",
+"giggle","gigolo","gild","gilded","gilding",
+"gill","gillie","gilly","gilt","gimcrack",
+"gimlet","gimmick","gimmicky","gin","ginger",
+"gingerbread","gingerly","gingham","gingivitis","gingko",
+"ginkgo","ginseng","gipsy","giraffe","gird",
+"girder","girdle","girl","girlfriend","girlhood",
+"girlie","girlish","girly","giro","girt",
+"girth","gist","give","giveaway","given",
+"gizzard","glacial","glacier","glad","gladden",
+"glade","gladiator","gladiolus","gladly","glamor",
+"glamorise","glamorize","glamorous","glamour","glamourous",
+"glance","glancing","gland","glandular","glare",
+"glaring","glass","glassblower","glasscutter","glasses",
+"glasshouse","glassware","glassworks","glassy","glaucoma",
+"glaucous","glaze","glazier","glazing","glc",
+"gleam","glean","gleaner","gleanings","glebe",
+"glee","gleeful","glen","glengarry","glib",
+"glide","glider","gliding","glimmer","glimmerings",
+"glimpse","glint","glissade","glissando","glisten",
+"glister","glitter","glittering","gloaming","gloat",
+"global","globe","globefish","globetrotter","globular",
+"globule","glockenspiel","gloom","gloomy","gloria",
+"glorification","glorify","glorious","glory","gloss",
+"glossary","glossy","glottal","glottis","glove",
+"glow","glower","glowing","glucose","glue",
+"gluey","glum","glut","gluten","glutinous",
+"glutton","gluttonous","gluttony","glycerin","glycerine",
+"gnarled","gnash","gnat","gnaw","gnawing",
+"gneiss","gnocchi","gnome","gnp","gnu",
+"goad","goal","goalkeeper","goalmouth","goalpost",
+"goat","goatee","goatherd","goatskin","gob",
+"gobbet","gobble","gobbledegook","gobbledygook","gobbler",
+"goblet","goblin","god","godchild","goddam",
+"goddamn","goddie","godforsaken","godhead","godless",
+"godlike","godly","godown","godparent","gods",
+"godsend","godspeed","goer","goggle","goggles",
+"goings","goiter","goitre","gold","goldbeater",
+"golden","goldfield","goldfinch","goldfish","goldmine",
+"goldsmith","golf","goliath","golliwog","golly",
+"gollywog","gonad","gondola","gondolier","gone",
+"goner","gong","gonna","gonorrhea","gonorrhoea",
+"goo","good","goodbye","goodish","goodly",
+"goodness","goodnight","goods","goodwill","goody",
+"gooey","goof","goofy","googly","goon",
+"goose","gooseberry","gooseflesh","goosestep","gopher",
+"gore","gorge","gorgeous","gorgon","gorgonzola",
+"gorilla","gormandise","gormandize","gormless","gorse",
+"gory","gosh","gosling","gospel","gossamer",
+"gossip","gossipy","got","gothic","gotta",
+"gotten","gouache","gouda","gouge","goulash",
+"gourd","gourmand","gourmet","gout","gouty",
+"govern","governance","governess","governing","government",
+"governor","gown","gpo","grab","grace",
+"graceful","graceless","graces","gracious","gradation",
+"grade","gradient","gradual","graduate","graduation",
+"graffiti","graft","grafter","grail","grain",
+"gram","grammar","grammarian","grammatical","gramme",
+"gramophone","grampus","gran","granary","grand",
+"grandad","grandchild","granddad","granddaughter","grandee",
+"grandeur","grandfather","grandiloquent","grandiose","grandma",
+"grandmother","grandpa","grandparent","grandson","grandstand",
+"grange","granite","grannie","granny","grant",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java
new file mode 100644
index 00000000000..4ffb6d00cc5
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData4.java
@@ -0,0 +1,715 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** A list of words used by Kstem
+ */
+class KStemData4 {
+ private KStemData4() {
+ }
+ static String[] data = {
+"granular","granulate","granule","grape","grapefruit",
+"grapeshot","grapevine","graph","graphic","graphical",
+"graphically","graphite","graphology","grapnel","grapple",
+"grasp","grasping","grass","grasshopper","grassland",
+"grassy","grate","grateful","grater","gratification",
+"gratify","gratifying","grating","gratis","gratitude",
+"gratuitous","gratuity","grave","gravel","gravelly",
+"gravestone","graveyard","gravitate","gravitation","gravity",
+"gravure","gravy","gray","graybeard","grayish",
+"graze","grease","greasepaint","greaseproof","greaser",
+"greasy","great","greatcoat","greater","greatly",
+"grebe","grecian","greed","greedy","green",
+"greenback","greenery","greenfly","greengage","greengrocer",
+"greenhorn","greenhouse","greenish","greenroom","greens",
+"greenwood","greet","greeting","gregarious","gremlin",
+"grenade","grenadier","grenadine","grew","grey",
+"greybeard","greyhound","greyish","grid","griddle",
+"gridiron","grief","grievance","grieve","grievous",
+"griffin","grill","grim","grimace","grime",
+"grimy","grin","grind","grinder","grindstone",
+"gringo","grip","gripe","gripes","gripping",
+"grisly","grist","gristle","grit","grits",
+"grizzle","grizzled","groan","groat","groats",
+"grocer","groceries","grocery","grog","groggy",
+"groin","groom","groove","groover","groovy",
+"grope","gropingly","gross","grotesque","grotto",
+"grotty","grouch","ground","grounding","groundless",
+"groundnut","grounds","groundsel","groundsheet","groundsman",
+"groundwork","group","groupie","grouping","grouse",
+"grove","grovel","grow","grower","growl",
+"grown","growth","groyne","grub","grubby",
+"grudge","grudging","gruel","grueling","gruelling",
+"gruesome","gruff","grumble","grumbling","grumpy",
+"grundyism","grunt","gryphon","guano","guarantee",
+"guarantor","guaranty","guard","guarded","guardhouse",
+"guardian","guardianship","guardrail","guardroom","guardsman",
+"guava","gubernatorial","gudgeon","guerilla","guerrilla",
+"guess","guesswork","guest","guesthouse","guestroom",
+"guffaw","guidance","guide","guidelines","guild",
+"guilder","guildhall","guile","guileless","guillemot",
+"guillotine","guilt","guilty","guinea","guipure",
+"guise","guitar","gulch","gulden","gulf",
+"gull","gullet","gulley","gullible","gully",
+"gulp","gum","gumbo","gumboil","gumboot",
+"gumdrop","gummy","gumption","gun","gunboat",
+"gundog","gunfire","gunge","gunman","gunmetal",
+"gunnel","gunner","gunnery","gunnysack","gunpoint",
+"gunpowder","gunrunner","gunshot","gunshy","gunsmith",
+"gunwale","guppy","gurgle","guru","gush",
+"gusher","gushing","gushy","gusset","gust",
+"gustatory","gusto","gusty","gut","gutless",
+"guts","gutsy","gutter","guttersnipe","guttural",
+"guv","guvnor","guy","guzzle","gym",
+"gymkhana","gymnasium","gymnast","gymnastic","gymnastics",
+"gymslip","gynaecology","gynecology","gyp","gypsum",
+"gypsy","gyrate","gyration","gyroscope","gyves",
+"haberdasher","haberdashery","habiliment","habit","habitable",
+"habitat","habitation","habitual","habituate","hacienda",
+"hack","hackles","hackney","hackneyed","hacksaw",
+"hackwork","had","haddock","hadji","haft",
+"hag","haggard","haggis","haggle","hagiography",
+"haiku","hail","hailstone","hailstorm","hair",
+"hairbrush","haircut","hairdo","hairdresser","hairgrip",
+"hairless","hairline","hairnet","hairpiece","hairpin",
+"hairspring","hairy","hajji","hake","halberd",
+"halcyon","hale","half","halfback","halfpence",
+"halfpenny","halfpennyworth","halftone","halfway","halibut",
+"halitosis","hall","halleluja","halliard","hallmark",
+"hallo","hallow","hallstand","hallucinate","hallucination",
+"hallucinatory","hallucinogenic","hallway","halma","halo",
+"halt","halter","halterneck","halting","halve",
+"halves","halyard","ham","hamadryad","hamburger",
+"hamlet","hammer","hammock","hamper","hamster",
+"hamstring","hand","handbag","handball","handbarrow",
+"handbill","handbook","handbrake","handcart","handclap",
+"handcuff","handcuffs","handful","handgun","handhold",
+"handicap","handicraft","handiwork","handkerchief","handle",
+"handlebars","handler","handloom","handmade","handmaiden",
+"handout","handpick","handrail","handshake","handsome",
+"handstand","handwork","handwriting","handwritten","handy",
+"handyman","hang","hangar","hangdog","hanger",
+"hanging","hangings","hangman","hangnail","hangout",
+"hangover","hangup","hank","hanker","hankering",
+"hankie","hanky","hansard","hansom","hap",
+"haphazard","hapless","haply","happen","happening",
+"happily","happiness","happy","harangue","harass",
+"harassment","harbinger","harbor","harbour","hard",
+"hardback","hardboard","hardbound","harden","hardheaded",
+"hardihood","hardiness","hardly","hardness","hardship",
+"hardtop","hardware","hardwearing","hardwood","hardy",
+"hare","harebell","harebrained","harelip","harem",
+"haricot","hark","harlequin","harlequinade","harlot",
+"harm","harmless","harmonic","harmonica","harmonise",
+"harmonium","harmonize","harmony","harness","harp",
+"harpoon","harpsichord","harpy","harquebus","harridan",
+"harrier","harrow","harrowing","harry","harsh",
+"hart","hartal","hartebeest","harvest","harvester",
+"has","hash","hashish","hasp","hassle",
+"hassock","hast","haste","hasten","hasty",
+"hat","hatband","hatch","hatchback","hatchery",
+"hatchet","hatching","hatchway","hate","hateful",
+"hath","hatless","hatpin","hatred","hatter",
+"hauberk","haughty","haul","haulage","haulier",
+"haulm","haunch","haunt","haunting","hautbois",
+"hautboy","hauteur","havana","have","haven",
+"haver","haversack","haves","havoc","haw",
+"hawk","hawker","hawser","hawthorn","hay",
+"haycock","hayfork","haymaker","haystack","haywire",
+"hazard","hazardous","haze","hazel","hazy",
+"head","headache","headband","headboard","headcheese",
+"headdress","header","headfirst","headgear","headhunter",
+"heading","headland","headless","headlight","headline",
+"headlong","headman","headmaster","headphone","headpiece",
+"headquarters","headrest","headroom","headset","headship",
+"headshrinker","headstall","headstone","headstrong","headway",
+"headwind","headword","heady","heal","health",
+"healthful","healthy","heap","hear","hearer",
+"hearing","hearken","hearsay","hearse","heart",
+"heartache","heartbeat","heartbreak","heartbreaking","heartbroken",
+"heartburn","hearten","heartening","heartfelt","hearth",
+"hearthrug","heartily","heartless","heartrending","heartsease",
+"heartsick","heartstrings","heartthrob","heartwarming","heartwood",
+"hearty","heat","heated","heater","heath",
+"heathen","heather","heating","heatstroke","heave",
+"heaven","heavenly","heavenwards","heavy","heavyhearted",
+"heavyweight","hebdomadal","hebraic","hebrew","hecatomb",
+"heck","heckle","hectare","hectic","hector",
+"hedge","hedgehog","hedgehop","hedgerow","hedonism",
+"heed","heel","heelball","hefty","hegemony",
+"hegira","heifer","height","heighten","heinous",
+"heir","heiress","heirloom","hejira","held",
+"helicopter","heliograph","heliotrope","heliport","helium",
+"hell","hellcat","hellene","hellenic","hellenistic",
+"hellish","hellishly","hello","helm","helmet",
+"helmeted","helmsman","helot","help","helpful",
+"helping","helpless","helpmate","helve","hem",
+"hemisphere","hemline","hemlock","hemoglobin","hemophilia",
+"hemophiliac","hemorrhage","hemorrhoid","hemp","hempen",
+"hemstitch","hen","henbane","hence","henceforth",
+"henchman","henna","hennaed","henpecked","hepatitis",
+"heptagon","her","herald","heraldic","heraldry",
+"herb","herbaceous","herbage","herbal","herbalist",
+"herbivorous","herculean","herd","herdsman","here",
+"hereabouts","hereafter","hereby","hereditament","hereditary",
+"heredity","herein","hereinafter","hereof","heresy",
+"heretic","hereto","heretofore","hereunder","hereupon",
+"herewith","heritable","heritage","hermaphrodite","hermetic",
+"hermit","hermitage","hernia","hero","heroic",
+"heroics","heroin","heroism","heron","heronry",
+"herpes","herr","herring","herringbone","hers",
+"herself","hertz","hesitancy","hesitant","hesitate",
+"hesitation","hesperus","hessian","heterodox","heterodoxy",
+"heterogeneous","heterosexual","heuristic","heuristics","hew",
+"hewer","hex","hexagon","hexagram","hexameter",
+"hey","heyday","hiatus","hibernate","hibiscus",
+"hiccough","hiccup","hick","hickory","hide",
+"hideaway","hidebound","hideous","hiding","hie",
+"hierarchy","hieroglyph","hieroglyphics","high","highball",
+"highborn","highboy","highbrow","higher","highfalutin",
+"highland","highlander","highlands","highlight","highly",
+"highness","highpitched","highroad","highway","highwayman",
+"hijack","hike","hilarious","hilarity","hill",
+"hillbilly","hillock","hillside","hilly","hilt",
+"him","himself","hind","hinder","hindmost",
+"hindquarters","hindrance","hindsight","hindu","hinduism",
+"hinge","hint","hinterland","hip","hipbath",
+"hippie","hippodrome","hippopotamus","hippy","hipster",
+"hire","hireling","hirsute","his","hiss",
+"hist","histamine","histology","historian","historic",
+"historical","history","histrionic","histrionics","hit",
+"hitch","hitchhike","hither","hitherto","hive",
+"hives","hms","hoard","hoarding","hoarfrost",
+"hoarse","hoary","hoax","hob","hobble",
+"hobbledehoy","hobby","hobbyhorse","hobgoblin","hobnail",
+"hobnob","hobo","hock","hockey","hod",
+"hodgepodge","hoe","hog","hoggish","hogmanay",
+"hogshead","hogwash","hoist","hold","holdall",
+"holder","holding","holdover","holdup","hole",
+"holiday","holidaymaker","holiness","holler","hollow",
+"holly","hollyhock","hollywood","holocaust","holograph",
+"holstein","holster","holy","homage","homburg",
+"home","homecoming","homegrown","homeland","homelike",
+"homely","homemade","homeopath","homeopathy","homeric",
+"homesick","homespun","homestead","hometown","homeward",
+"homewards","homework","homey","homicidal","homicide",
+"homiletic","homiletics","homily","homing","hominy",
+"homoeopath","homoeopathy","homogeneous","homogenise","homogenize",
+"homograph","homonym","homophone","homosexual","homy",
+"hone","honest","honestly","honesty","honey",
+"honeybee","honeycomb","honeycombed","honeydew","honeyed",
+"honeymoon","honeysuckle","honk","honkie","honky",
+"honor","honorable","honorarium","honorary","honorific",
+"honors","honour","honourable","honours","hooch",
+"hood","hooded","hoodlum","hoodoo","hoodwink",
+"hooey","hoof","hook","hookah","hooked",
+"hooker","hookey","hookup","hookworm","hooky",
+"hooligan","hoop","hooray","hoot","hooter",
+"hoover","hooves","hop","hope","hopeful",
+"hopefully","hopeless","hopper","hopscotch","horde",
+"horizon","horizontal","hormone","horn","hornbeam",
+"hornbill","horned","hornet","hornpipe","horny",
+"horology","horoscope","horrendous","horrible","horrid",
+"horrific","horrify","horror","horrors","horse",
+"horseback","horsebox","horseflesh","horsefly","horsehair",
+"horselaugh","horseman","horsemanship","horsemeat","horseplay",
+"horsepower","horseracing","horseradish","horseshit","horseshoe",
+"horsewhip","horsewoman","horsy","hortative","horticulture",
+"hosanna","hose","hosier","hosiery","hospice",
+"hospitable","hospital","hospitalise","hospitality","hospitalize",
+"host","hostage","hostel","hosteler","hosteller",
+"hostelry","hostess","hostile","hostilities","hostility",
+"hostler","hot","hotbed","hotchpotch","hotel",
+"hotelier","hotfoot","hothead","hothouse","hotly",
+"hotplate","hotpot","hottentot","hound","hour",
+"hourglass","houri","hourly","house","houseboat",
+"housebound","houseboy","housebreaker","housebroken","housecoat",
+"housecraft","housedog","housefather","housefly","houseful",
+"household","householder","housekeeper","housekeeping","housemaid",
+"houseman","housemaster","housemother","houseroom","housetops",
+"housewarming","housewife","housewifery","housework","housing",
+"hove","hovel","hover","hovercraft","how",
+"howdah","howdy","however","howitzer","howl",
+"howler","howling","howsoever","hoyden","hrh",
+"hub","hubbub","hubby","hubcap","hubris",
+"huckaback","huckleberry","huckster","huddle","hue",
+"huff","huffish","huffy","hug","huge",
+"hugely","huguenot","huh","hula","hulk",
+"hulking","hull","hullabaloo","hullo","hum",
+"human","humane","humanise","humanism","humanitarian",
+"humanitarianism","humanities","humanity","humanize","humankind",
+"humanly","humble","humbug","humdinger","humdrum",
+"humerus","humid","humidify","humidity","humidor",
+"humiliate","humility","hummingbird","hummock","humor",
+"humorist","humorous","humour","hump","humpback",
+"humph","humus","hun","hunch","hunchback",
+"hundred","hundredweight","hung","hunger","hungry",
+"hunk","hunkers","hunt","hunter","hunting",
+"huntress","huntsman","hurdle","hurl","hurling",
+"hurray","hurricane","hurried","hurry","hurt",
+"hurtful","hurtle","husband","husbandman","husbandry",
+"hush","husk","husky","hussar","hussy",
+"hustings","hustle","hustler","hut","hutch",
+"hutment","huzza","huzzah","hyacinth","hyaena",
+"hybrid","hybridise","hybridize","hydra","hydrangea",
+"hydrant","hydrate","hydraulic","hydraulics","hydrocarbon",
+"hydroelectric","hydrofoil","hydrogen","hydrophobia","hydroplane",
+"hydroponics","hydrotherapy","hyena","hygiene","hygienic",
+"hymen","hymeneal","hymn","hymnal","hyperbola",
+"hyperbole","hyperbolic","hypercritical","hypermarket","hypersensitive",
+"hyphen","hyphenate","hypnosis","hypnotise","hypnotism",
+"hypnotist","hypnotize","hypo","hypochondria","hypochondriac",
+"hypocrisy","hypocrite","hypodermic","hypotenuse","hypothermia",
+"hypothesis","hypothetical","hysterectomy","hysteria","hysterical",
+"hysterics","iamb","iberian","ibex","ibidem",
+"ibis","icbm","ice","iceberg","icebound",
+"icebox","icebreaker","icefall","icehouse","iceman",
+"icicle","icing","icon","iconoclast","icy",
+"idea","ideal","idealise","idealism","idealist",
+"idealize","ideally","idem","identical","identification",
+"identify","identikit","identity","ideogram","ideology",
+"ides","idiocy","idiom","idiomatic","idiosyncrasy",
+"idiot","idle","idol","idolater","idolatrous",
+"idolatry","idolise","idolize","idyl","idyll",
+"igloo","igneous","ignite","ignition","ignoble",
+"ignominious","ignominy","ignoramus","ignorance","ignorant",
+"ignore","iguana","ikon","ilex","ilk",
+"ill","illegal","illegality","illegible","illegitimate",
+"illiberal","illicit","illimitable","illiterate","illness",
+"illogical","illuminate","illuminating","illumination","illuminations",
+"illusion","illusionist","illusory","illustrate","illustration",
+"illustrative","illustrator","illustrious","image","imagery",
+"imaginable","imaginary","imagination","imaginative","imagine",
+"imam","imbalance","imbecile","imbecility","imbed",
+"imbibe","imbroglio","imbue","imitate","imitation",
+"imitative","imitator","immaculate","immanence","immanent",
+"immaterial","immature","immeasurable","immediacy","immediate",
+"immediately","immemorial","immense","immensely","immensity",
+"immerse","immersion","immigrant","immigrate","imminence",
+"imminent","immobile","immobilise","immobilize","immoderate",
+"immodest","immolate","immoral","immorality","immortal",
+"immortalise","immortality","immortalize","immovable","immune",
+"immunise","immunize","immure","immutable","imp",
+"impact","impacted","impair","impala","impale",
+"impalpable","impanel","impart","impartial","impassable",
+"impasse","impassioned","impassive","impatience","impatient",
+"impeach","impeccable","impecunious","impedance","impede",
+"impediment","impedimenta","impel","impending","impenetrable",
+"impenitent","imperative","imperceptible","imperfect","imperial",
+"imperialism","imperialist","imperialistic","imperil","imperious",
+"imperishable","impermanent","impermeable","impersonal","impersonate",
+"impertinent","imperturbable","impervious","impetigo","impetuous",
+"impetus","impiety","impinge","impious","impish",
+"implacable","implant","implement","implicate","implication",
+"implicit","implore","implosion","imply","impolite",
+"impolitic","imponderable","import","importance","important",
+"importation","importunate","importune","impose","imposing",
+"imposition","impossible","impostor","imposture","impotent",
+"impound","impoverish","impracticable","impractical","imprecation",
+"impregnable","impregnate","impresario","impress","impression",
+"impressionable","impressionism","impressionist","impressionistic","impressive",
+"imprimatur","imprint","imprison","improbability","improbable",
+"impromptu","improper","impropriety","improve","improvement",
+"improvident","improvise","imprudent","impudent","impugn",
+"impulse","impulsion","impulsive","impunity","impure",
+"impurity","imputation","impute","inability","inaccessible",
+"inaccurate","inaction","inactive","inadequacy","inadequate",
+"inadmissible","inadvertent","inalienable","inamorata","inane",
+"inanimate","inanition","inanity","inapplicable","inappropriate",
+"inapt","inaptitude","inarticulate","inartistic","inattention",
+"inattentive","inaudible","inaugural","inaugurate","inauspicious",
+"inboard","inborn","inbound","inbred","inbreeding",
+"inc","incalculable","incandescent","incantation","incapable",
+"incapacitate","incapacity","incarcerate","incarnate","incarnation",
+"incautious","incendiarism","incendiary","incense","incentive",
+"inception","incertitude","incessant","incest","incestuous",
+"inch","inchoate","incidence","incident","incidental",
+"incidentally","incidentals","incinerate","incinerator","incipience",
+"incipient","incise","incision","incisive","incisor",
+"incite","incivility","inclement","inclination","incline",
+"inclined","inclose","inclosure","include","included",
+"including","inclusion","inclusive","incognito","incoherent",
+"incombustible","income","incoming","incommensurable","incommensurate",
+"incommode","incommodious","incommunicable","incommunicado","incommunicative",
+"incomparable","incompatible","incompetence","incompetent","incomplete",
+"incomprehensible","incomprehensibly","incomprehension","inconceivable","inconclusive",
+"incongruity","incongruous","inconsequent","inconsequential","inconsiderable",
+"inconsiderate","inconsistent","inconsolable","inconspicuous","inconstant",
+"incontestable","incontinent","incontrovertible","inconvenience","inconvenient",
+"incorporate","incorporated","incorporeal","incorrect","incorrigible",
+"incorruptible","increase","increasingly","incredible","incredulity",
+"incredulous","increment","incriminate","incrust","incrustation",
+"incubate","incubation","incubator","incubus","inculcate",
+"inculpate","incumbency","incumbent","incur","incurable",
+"incurious","incursion","incurved","indebted","indecent",
+"indecipherable","indecision","indecisive","indecorous","indecorum",
+"indeed","indefatigable","indefensible","indefinable","indefinite",
+"indefinitely","indelible","indelicate","indemnification","indemnify",
+"indemnity","indent","indentation","indenture","independence",
+"independent","indescribable","indestructible","indeterminable","indeterminate",
+"index","indian","indicate","indication","indicative",
+"indicator","indices","indict","indictable","indifferent",
+"indigenous","indigent","indigestible","indigestion","indignant",
+"indignation","indignity","indigo","indirect","indiscernible",
+"indiscipline","indiscreet","indiscretion","indiscriminate","indispensable",
+"indisposed","indisposition","indisputable","indissoluble","indistinct",
+"indistinguishable","individual","individualise","individualism","individuality",
+"individualize","individually","indivisible","indocile","indoctrinate",
+"indolent","indomitable","indoor","indoors","indorse",
+"indrawn","indubitable","induce","inducement","induct",
+"induction","inductive","indue","indulge","indulgence",
+"indulgent","industrial","industrialise","industrialism","industrialist",
+"industrialize","industrious","industry","inebriate","inedible",
+"ineducable","ineffable","ineffaceable","ineffective","ineffectual",
+"inefficient","inelastic","inelegant","ineligible","ineluctable",
+"inept","ineptitude","inequality","inequitable","inequity",
+"ineradicable","inert","inertia","inescapable","inessential",
+"inestimable","inevitable","inexact","inexactitude","inexcusable",
+"inexhaustible","inexorable","inexpediency","inexpedient","inexpensive",
+"inexperience","inexperienced","inexpert","inexpiable","inexplicable",
+"inexplicably","inexpressible","inextinguishable","inextricable","infallible",
+"infallibly","infamous","infamy","infancy","infant",
+"infanticide","infantile","infantry","infantryman","infatuated",
+"infatuation","infect","infection","infectious","infelicitous",
+"infer","inference","inferential","inferior","infernal",
+"inferno","infertile","infest","infidel","infidelity",
+"infield","infighting","infiltrate","infiltration","infinite",
+"infinitesimal","infinitive","infinitude","infinity","infirm",
+"infirmary","infirmity","inflame","inflamed","inflammable",
+"inflammation","inflammatory","inflatable","inflate","inflated",
+"inflation","inflationary","inflect","inflection","inflexible",
+"inflexion","inflict","infliction","inflow","influence",
+"influential","influenza","influx","info","inform",
+"informal","informant","information","informative","informed",
+"informer","infra","infraction","infrared","infrastructure",
+"infrequent","infringe","infuriate","infuse","infusion",
+"ingathering","ingenious","ingenuity","ingenuous","ingest",
+"inglenook","inglorious","ingoing","ingot","ingraft",
+"ingrained","ingratiate","ingratiating","ingratitude","ingredient",
+"ingress","ingrown","inhabit","inhabitant","inhale",
+"inhaler","inharmonious","inhere","inherent","inherently",
+"inherit","inheritance","inhibit","inhibited","inhibition",
+"inhospitable","inhuman","inhumane","inhumanity","inimical",
+"inimitable","iniquitous","iniquity","initial","initially",
+"initiate","initiation","initiative","inject","injection",
+"injudicious","injunction","injure","injurious","injury",
+"injustice","ink","inkbottle","inkling","inkpad",
+"inkstand","inkwell","inky","inlaid","inland",
+"inlay","inlet","inmate","inmost","inn",
+"innards","innate","inner","inning","innings",
+"innkeeper","innocent","innocuous","innovate","innovation",
+"innuendo","innumerable","inoculate","inoffensive","inoperable",
+"inoperative","inopportune","inordinate","inorganic","input",
+"inquest","inquietude","inquire","inquiring","inquiry",
+"inquisition","inquisitive","inquisitor","inquisitorial","inroad",
+"inrush","insalubrious","insane","insanitary","insanity",
+"insatiable","insatiate","inscribe","inscription","inscrutable",
+"insect","insecticide","insectivore","insectivorous","insecure",
+"inseminate","insemination","insensate","insensibility","insensible",
+"insensitive","inseparable","insert","insertion","inset",
+"inshore","inside","insider","insidious","insight",
+"insignia","insignificant","insincere","insinuate","insinuation",
+"insipid","insist","insistence","insistency","insistent",
+"insole","insolent","insoluble","insolvable","insolvent",
+"insomnia","insomniac","insouciance","inspect","inspection",
+"inspector","inspectorate","inspectorship","inspiration","inspire",
+"inspired","instability","install","installation","installment",
+"instalment","instance","instant","instantaneous","instantly",
+"instead","instep","instigate","instigation","instil",
+"instill","instinct","instinctive","institute","institution",
+"instruct","instruction","instructive","instructor","instructress",
+"instrument","instrumental","instrumentalist","instrumentality","instrumentation",
+"insubordinate","insubstantial","insufferable","insufficiency","insufficient",
+"insular","insularity","insulate","insulation","insulator",
+"insulin","insult","insuperable","insupportable","insurance",
+"insure","insured","insurer","insurgent","insurmountable",
+"insurrection","intact","intaglio","intake","intangible",
+"integer","integral","integrate","integrated","integrity",
+"integument","intellect","intellectual","intelligence","intelligent",
+"intelligentsia","intelligible","intemperate","intend","intended",
+"intense","intensifier","intensify","intensity","intensive",
+"intent","intention","intentional","intentions","inter",
+"interact","interaction","interbreed","intercalary","intercalate",
+"intercede","intercept","interceptor","intercession","interchange",
+"interchangeable","intercity","intercollegiate","intercom","intercommunicate",
+"intercommunion","intercontinental","intercourse","interdenominational","interdependent",
+"interdict","interest","interested","interesting","interests",
+"interface","interfere","interference","interim","interior",
+"interject","interjection","interlace","interlard","interleave",
+"interline","interlinear","interlink","interlock","interlocutor",
+"interloper","interlude","intermarriage","intermarry","intermediary",
+"intermediate","interment","intermezzo","interminable","intermingle",
+"intermission","intermittent","intern","internal","internalise",
+"internalize","international","internationale","internationalise","internationalism",
+"internationalize","interne","internecine","internee","internment",
+"interpellate","interpenetrate","interpersonal","interplanetary","interplay",
+"interpol","interpolate","interpolation","interpose","interposition",
+"interpret","interpretation","interpretative","interpreter","interracial",
+"interregnum","interrelate","interrelation","interrogate","interrogative",
+"interrogatory","interrupt","intersect","intersection","intersperse",
+"interstate","interstellar","interstice","intertribal","intertwine",
+"interurban","interval","intervene","intervention","interview",
+"interweave","intestate","intestinal","intestine","intimacy",
+"intimate","intimidate","intimidation","into","intolerable",
+"intolerant","intonation","intone","intoxicant","intoxicate",
+"intractable","intramural","intransigent","intransitive","intravenous",
+"intrench","intrepid","intricacy","intricate","intrigue",
+"intrinsic","intro","introduce","introduction","introductory",
+"introit","introspection","introspective","introvert","introverted",
+"intrude","intruder","intrusion","intrusive","intrust",
+"intuit","intuition","intuitive","intumescence","inundate",
+"inundation","inure","invade","invalid","invalidate",
+"invalidism","invaluable","invariable","invasion","invective",
+"inveigh","inveigle","invent","invention","inventive",
+"inventor","inventory","inverse","inversion","invert",
+"invertebrate","invest","investigate","investiture","investment",
+"inveterate","invidious","invigilate","invigorate","invincible",
+"inviolable","inviolate","invisible","invitation","invite",
+"inviting","invocation","invoice","invoke","involuntary",
+"involve","involved","invulnerable","inward","inwardness",
+"inwards","inwrought","iodin","iodine","iodise",
+"iodize","ion","ionic","ionise","ionize",
+"ionosphere","iota","iou","ipa","ira",
+"irascible","irate","ire","iridescent","iridium",
+"irishman","irk","irksome","iron","ironclad",
+"ironic","ironically","ironing","ironmonger","ironmongery",
+"ironmould","irons","ironstone","ironware","ironwork",
+"ironworks","irony","irradiate","irrational","irreconcilable",
+"irrecoverable","irredeemable","irreducible","irrefutable","irregular",
+"irregularity","irrelevance","irrelevant","irreligious","irremediable",
+"irremovable","irreparable","irreplaceable","irrepressible","irreproachable",
+"irresistible","irresolute","irresponsible","irretrievable","irreverent",
+"irreversible","irrevocable","irrigate","irritable","irritant",
+"irritate","irritation","irruption","isinglass","islam",
+"island","islander","isle","islet","ism",
+"isobar","isolate","isolated","isolation","isolationism",
+"isotherm","isotope","israelite","issue","isthmus",
+"ita","italic","italicise","italicize","italics",
+"itch","itchy","item","itemise","itemize",
+"iterate","itinerant","itinerary","itn","its",
+"itself","itv","iud","ivied","ivory",
+"ivy","jab","jabber","jack","jackal",
+"jackanapes","jackaroo","jackass","jackboot","jackdaw",
+"jackeroo","jacket","jackpot","jackrabbit","jacobean",
+"jacobite","jade","jaded","jaffa","jag",
+"jagged","jaguar","jail","jailbird","jailbreak",
+"jailer","jailor","jalopy","jam","jamb",
+"jamboree","jammy","jangle","janissary","janitor",
+"january","japan","jape","japonica","jar",
+"jargon","jasmine","jasper","jaundice","jaundiced",
+"jaunt","jaunty","javelin","jaw","jawbone",
+"jawbreaker","jaws","jay","jaywalk","jazz",
+"jazzy","jealous","jealousy","jeans","jeep",
+"jeer","jehovah","jejune","jell","jellied",
+"jello","jelly","jellyfish","jemmy","jenny",
+"jeopardise","jeopardize","jeopardy","jerboa","jeremiad",
+"jerk","jerkin","jerky","jeroboam","jerry",
+"jersey","jest","jester","jesting","jesuit",
+"jesuitical","jet","jetsam","jettison","jetty",
+"jew","jewel","jeweled","jeweler","jewelled",
+"jeweller","jewellery","jewelry","jewess","jewish",
+"jezebel","jib","jibe","jiffy","jig",
+"jigger","jiggered","jiggle","jigsaw","jihad",
+"jilt","jiminy","jimjams","jimmy","jingle",
+"jingo","jingoism","jinks","jinn","jinrikisha",
+"jinx","jitney","jitterbug","jitters","jiujitsu",
+"jive","jnr","job","jobber","jobbery",
+"jobbing","jobless","jockey","jockstrap","jocose",
+"jocular","jocund","jodhpurs","jog","joggle",
+"john","johnny","join","joiner","joinery",
+"joint","joist","joke","joker","jollification",
+"jollity","jolly","jolt","jolty","jonah",
+"jonquil","josh","jostle","jot","jotter",
+"jotting","joule","journal","journalese","journalism",
+"journalist","journey","journeyman","joust","jove",
+"jovial","jowl","joy","joyful","joyless",
+"joyous","joyride","joystick","jubilant","jubilation",
+"jubilee","judaic","judaism","judder","judge",
+"judgement","judgment","judicature","judicial","judiciary",
+"judicious","judo","jug","juggernaut","juggle",
+"juice","juicy","jujitsu","juju","jujube",
+"jukebox","julep","july","jumble","jumbo",
+"jump","jumper","jumps","jumpy","junction",
+"juncture","june","jungle","junior","juniper",
+"junk","junket","junketing","junkie","junky",
+"junoesque","junta","jupiter","juridical","jurisdiction",
+"jurisprudence","jurist","juror","jury","juryman",
+"just","justice","justifiable","justification","justified",
+"justify","jut","jute","juvenile","juxtapose",
+"juxtaposition","kaffir","kafir","kaftan","kail",
+"kaiser","kale","kaleidoscope","kaleidoscopic","kalends",
+"kampong","kangaroo","kaolin","kapok","kappa",
+"kaput","karat","karate","karma","katydid",
+"kayak","kazoo","kebab","kebob","kedgeree",
+"keel","keelhaul","keen","keep","keeper",
+"keeping","keeps","keepsake","keg","kelp",
+"kelvin","ken","kennel","kennels","kepi",
+"kept","kerb","kerchief","kerfuffle","kernel",
+"kerosene","kerosine","kersey","kestrel","ketch",
+"ketchup","kettle","kettledrum","key","keyboard",
+"keyhole","keyless","keynote","keypunch","keystone",
+"khaki","khalif","khalifate","khan","kibbutz",
+"kibosh","kick","kickback","kicker","kickoff",
+"kicks","kid","kiddie","kiddy","kidnap",
+"kidney","kike","kill","killer","killing",
+"killjoy","kiln","kilo","kilogram","kilogramme",
+"kilohertz","kiloliter","kilolitre","kilometer","kilometre",
+"kilowatt","kilt","kimono","kin","kind",
+"kindergarten","kindle","kindling","kindly","kindness",
+"kindred","kine","kinetic","kinetics","kinfolk",
+"king","kingcup","kingdom","kingfisher","kingly",
+"kingmaker","kingpin","kings","kingship","kink",
+"kinky","kinsfolk","kinship","kinsman","kiosk",
+"kip","kipper","kirk","kirsch","kirtle",
+"kismet","kiss","kisser","kit","kitchen",
+"kitchenette","kite","kitsch","kitten","kittenish",
+"kittiwake","kitty","kiwi","klaxon","kleenex",
+"kleptomania","kleptomaniac","knack","knacker","knackered",
+"knapsack","knave","knavery","knead","knee",
+"kneecap","kneel","knell","knew","knickerbockers",
+"knickers","knife","knight","knighthood","knightly",
+"knit","knitter","knitting","knitwear","knives",
+"knob","knobbly","knobkerrie","knock","knockabout",
+"knockdown","knocker","knockers","knockout","knoll",
+"knot","knothole","knotty","knout","know",
+"knowing","knowingly","knowledge","knowledgeable","known",
+"knuckle","koala","kohl","kohlrabi","kookaburra",
+"kopeck","kopek","kopje","koppie","koran",
+"kosher","kowtow","kraal","kremlin","kris",
+"krona","krone","kudos","kukri","kumis",
+"kumquat","kuomintang","kurus","kvass","kwashiorkor",
+"kwela","laager","lab","label","labial",
+"labor","laboratory","laborer","laborious","labour",
+"labourer","labourite","labrador","laburnum","labyrinth",
+"lace","lacerate","laceration","lachrymal","lachrymose",
+"lack","lackadaisical","lackey","lacking","lackluster",
+"lacklustre","laconic","lacquer","lacrosse","lactation",
+"lactic","lactose","lacuna","lacy","lad",
+"ladder","laddie","laddy","laden","ladies",
+"lading","ladle","lady","ladybird","ladylike",
+"ladyship","lag","lager","laggard","lagging",
+"lagoon","laid","lain","lair","laird",
+"laity","lake","lam","lama","lamaism",
+"lamasery","lamb","lambaste","lambent","lambkin",
+"lamblike","lambskin","lame","lament","lamentable",
+"lamentation","laminate","lamming","lamp","lampoon",
+"lamppost","lamprey","lampshade","lance","lancer",
+"lancers","lancet","land","landau","landed",
+"landfall","landing","landlady","landlocked","landlord",
+"landlubber","landmark","landmine","lands","landscape",
+"landslide","landslip","landward","landwards","lane",
+"language","languid","languish","languor","lank",
+"lanky","lanolin","lantern","lanternslide","lanyard",
+"lap","lapdog","lapel","lapidary","lapse",
+"lapsed","lapwing","larboard","larceny","larch",
+"lard","larder","large","largely","largess",
+"largesse","largo","lariat","lark","larkspur",
+"larrup","larva","laryngeal","laryngitis","laryngoscope",
+"larynx","lasagna","lascivious","laser","lash",
+"lashing","lashings","lass","lasso","last",
+"lasting","lastly","lat","latch","latchkey",
+"late","latecomer","lately","latent","lateral",
+"latest","latex","lath","lathe","lather",
+"latin","latinise","latinize","latitude","latitudes",
+"latitudinal","latitudinarian","latrine","latter","latterly",
+"lattice","laud","laudable","laudanum","laudatory",
+"laugh","laughable","laughingstock","laughter","launch",
+"launder","launderette","laundress","laundry","laureate",
+"laurel","laurels","lava","lavatory","lave",
+"lavender","lavish","law","lawful","lawless",
+"lawn","lawsuit","lawyer","lax","laxative",
+"laxity","lay","layabout","layer","layette",
+"layman","layout","laze","lazy","lbw",
+"lcm","lea","leach","lead","leaden",
+"leader","leadership","leading","leads","leaf",
+"leafage","leafed","leaflet","leafy","league",
+"leak","leakage","leaky","lean","leaning",
+"leap","leapfrog","learn","learned","learner",
+"learning","lease","leasehold","leash","least",
+"leastways","leather","leatherette","leathery","leave",
+"leaved","leaven","leavening","leaves","leavings",
+"lech","lecher","lecherous","lechery","lectern",
+"lecture","lecturer","lectureship","led","ledge",
+"ledger","lee","leech","leek","leer",
+"leery","lees","leeward","leeway","left",
+"leftist","leftovers","leftward","leftwards","leg",
+"legacy","legal","legalise","legality","legalize",
+"legate","legatee","legation","legato","legend",
+"legendary","leger","legerdemain","legged","leggings",
+"leggy","legible","legion","legionary","legislate",
+"legislation","legislative","legislator","legislature","legit",
+"legitimate","legitimatise","legitimatize","legroom","legume",
+"leguminous","lei","leisure","leisured","leisurely",
+"leitmotif","leitmotive","lemming","lemon","lemonade",
+"lemur","lend","length","lengthen","lengthways",
+"lengthy","lenience","lenient","lenity","lens",
+"lent","lentil","lento","leo","leonine",
+"leopard","leotard","leper","leprechaun","leprosy",
+"lesbian","lesion","less","lessee","lessen",
+"lesser","lesson","lessor","lest","let",
+"letdown","lethal","lethargy","letraset","letter",
+"letterbox","lettered","letterhead","lettering","letterpress",
+"letters","letting","lettuce","letup","leucocyte",
+"leucotomy","leukaemia","leukemia","leukocyte","levee",
+"level","leveler","leveller","lever","leverage",
+"leveret","leviathan","levitate","levity","levodopa",
+"levy","lewd","lexical","lexicographer","lexicography",
+"lexicon","lexis","liability","liable","liaise",
+"liaison","liana","liar","lib","libation",
+"libel","libellous","libelous","liberal","liberalise",
+"liberalism","liberality","liberalize","liberally","liberate",
+"liberated","liberation","libertarian","liberties","libertine",
+"liberty","libidinous","libido","libra","librarian",
+"library","librettist","libretto","lice","licence",
+"licenced","license","licensed","licensee","licentiate",
+"licentious","lichen","licit","lick","licking",
+"licorice","lid","lido","lie","lieder",
+"lief","liege","lien","lieu","lieutenant",
+"life","lifeblood","lifeboat","lifeguard","lifeless",
+"lifelike","lifeline","lifelong","lifer","lifetime",
+"lift","liftboy","ligament","ligature","light",
+"lighten","lighter","lighterage","lighthouse","lighting",
+"lightly","lightness","lightning","lights","lightship",
+"lightweight","ligneous","lignite","likable","like",
+"likeable","likelihood","likely","liken","likeness",
+"likes","likewise","liking","lilac","lilliputian",
+"lilo","lilt","lily","limb","limber",
+"limbo","lime","limeade","limejuice","limekiln",
+"limelight","limerick","limestone","limey","limit",
+"limitation","limited","limiting","limitless","limn",
+"limousine","limp","limpet","limpid","limy",
+"linchpin","linctus","linden","line","lineage",
+"lineal","lineament","linear","lineman","linen",
+"lineout","liner","linertrain","lines","lineshooter",
+"linesman","lineup","ling","linger","lingerie",
+"lingering","lingo","lingual","linguist","linguistic",
+"linguistics","liniment","lining","link","linkage",
+"linkman","links","linkup","linnet","linocut",
+"linoleum","linotype","linseed","lint","lintel",
+"lion","lionize","lip","lipid","lipstick",
+"liquefaction","liquefy","liquescent","liqueur","liquid",
+"liquidate","liquidation","liquidator","liquidity","liquidize",
+"liquidizer","liquor","liquorice","lira","lisle",
+"lisp","lissom","lissome","list","listen",
+"listenable","listener","listless","lists","lit",
+"litany","litchi","liter","literacy","literal",
+"literally","literary","literate","literati","literature",
+"lithe","lithium","lithograph","lithographic","lithography",
+"litigant","litigate","litigation","litigious","litmus",
+"litotes","litre","litter","litterateur","litterbin",
+"litterlout","little","littoral","liturgical","liturgy",
+"livable","live","liveable","livelihood","livelong",
+"lively","liven","liver","liveried","liverish",
+"livery","liveryman","lives","livestock","livid",
+"living","lizard","llama","load","loaded",
+"loadstar","loadstone","loaf","loafsugar","loam",
+"loan","loanword","loath","loathe","loathing",
+"loathsome","loaves","lob","lobby","lobed",
+"lobotomy","lobster","lobsterpot","local","locale",
+"localise","localism","locality","localize","locally",
+"locate","located","location","loch","loci",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java
new file mode 100644
index 00000000000..c917c7ace29
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData5.java
@@ -0,0 +1,715 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** A portion of the word list used by the KStem stemmer; the entries visible here start at "lock" and are in alphabetical order.
+ */
+class KStemData5 {
+ private KStemData5() { // private: prevents instantiation from outside this class
+ }
+ static String[] data = {
+"lock","locker","locket","lockjaw","locknut",
+"lockout","locks","locksmith","lockstitch","lockup",
+"loco","locomotion","locomotive","locum","locus",
+"locust","locution","lode","lodestar","lodestone",
+"lodge","lodgement","lodger","lodging","lodgings",
+"lodgment","loess","loft","lofted","lofty",
+"log","loganberry","logarithm","logarithmic","logbook",
+"logger","loggerheads","loggia","logic","logical",
+"logically","logician","logistic","logistics","logjam",
+"logrolling","loin","loincloth","loins","loiter",
+"loll","lollipop","lollop","lolly","lone",
+"lonely","loner","lonesome","long","longboat",
+"longbow","longevity","longhaired","longhand","longheaded",
+"longhop","longing","longish","longitude","longitudinal",
+"longship","longshoreman","longsighted","longstanding","longstop",
+"longsuffering","longueur","longways","longwearing","longwinded",
+"longwise","loo","loofa","loofah","look",
+"looker","lookout","looks","loom","loon",
+"loony","loop","loophole","loose","loosebox",
+"loosen","loot","lop","lope","loppings",
+"loquacious","loquat","lord","lordly","lords",
+"lordship","lore","lorgnette","lorn","lorry",
+"lose","loser","loss","lost","lot",
+"loth","lotion","lottery","lotto","lotus",
+"loud","loudhailer","loudmouth","loudspeaker","lough",
+"lounge","lounger","lour","louse","lousy",
+"lout","louver","louvre","lovable","love",
+"loveable","lovebird","lovechild","loveless","lovelorn",
+"lovely","lovemaking","lover","lovers","lovesick",
+"lovey","loving","low","lowborn","lowbred",
+"lowbrow","lowdown","lower","lowermost","lowland",
+"lowlander","lowly","loyal","loyalist","loyalty",
+"lozenge","lsd","ltd","lubber","lubricant",
+"lubricate","lubricator","lubricious","lucerne","lucid",
+"luck","luckless","lucky","lucrative","lucre",
+"ludicrous","ludo","luff","lug","luggage",
+"lugger","lughole","lugsail","lugubrious","lugworm",
+"lukewarm","lull","lullaby","lumbago","lumbar",
+"lumber","lumberjack","lumberman","lumberyard","luminary",
+"luminous","lumme","lummox","lummy","lump",
+"lumpish","lumpy","lunacy","lunar","lunate",
+"lunatic","lunch","lunchtime","lung","lunge",
+"lungfish","lungpower","lupin","lurch","lure",
+"lurgy","lurid","lurk","luscious","lush",
+"lust","luster","lustful","lustre","lustrous",
+"lusty","lutanist","lute","lutenist","luv",
+"luxuriant","luxuriate","luxurious","luxury","lychee",
+"lychgate","lye","lymph","lymphatic","lynch",
+"lynx","lyre","lyrebird","lyric","lyrical",
+"lyricism","lyricist","lyrics","mac","macabre",
+"macadam","macadamise","macadamize","macaroni","macaroon",
+"macaw","mace","macerate","mach","machete",
+"machiavellian","machination","machine","machinegun","machinery",
+"machinist","mackerel","mackintosh","macrobiotic","macrocosm",
+"mad","madam","madame","madcap","madden",
+"maddening","madder","made","madeira","mademoiselle",
+"madhouse","madly","madman","madness","madonna",
+"madrigal","maelstrom","maenad","maestro","mafia",
+"mag","magazine","magenta","maggot","maggoty",
+"magi","magic","magical","magician","magisterial",
+"magistracy","magistrate","magma","magnanimity","magnanimous",
+"magnate","magnesia","magnesium","magnet","magnetic",
+"magnetise","magnetism","magnetize","magneto","magnificat",
+"magnification","magnificent","magnifier","magnify","magniloquent",
+"magnitude","magnolia","magnum","magpie","magus",
+"maharaja","maharajah","maharanee","maharani","mahatma",
+"mahlstick","mahogany","mahout","maid","maiden",
+"maidenhair","maidenhead","maidenhood","maidenly","maidservant",
+"mail","mailbag","mailbox","maim","main",
+"mainland","mainline","mainly","mainmast","mains",
+"mainsail","mainspring","mainstay","mainstream","maintain",
+"maintenance","maisonette","maisonnette","maize","majestic",
+"majesty","majolica","major","majordomo","majorette",
+"majority","make","maker","makeshift","making",
+"makings","malachite","maladjusted","maladministration","maladroit",
+"malady","malaise","malapropism","malapropos","malaria",
+"malarial","malay","malcontent","malcontented","male",
+"malediction","malefactor","maleficent","malevolent","malfeasance",
+"malformation","malformed","malfunction","malice","malicious",
+"malign","malignancy","malignant","malignity","malinger",
+"mall","mallard","malleable","mallet","mallow",
+"malmsey","malnutrition","malodorous","malpractice","malt",
+"malthusian","maltreat","maltster","mama","mamba",
+"mambo","mamma","mammal","mammary","mammon",
+"mammoth","mammy","man","manacle","manage",
+"manageable","management","manager","manageress","managerial",
+"manatee","mandarin","mandate","mandatory","mandible",
+"mandolin","mandrake","mandrill","maneuver","maneuverable",
+"manful","manganese","mange","manger","mangle",
+"mango","mangosteen","mangrove","mangy","manhandle",
+"manhole","manhood","manhour","mania","maniac",
+"maniacal","manic","manicure","manicurist","manifest",
+"manifestation","manifesto","manifold","manikin","manila",
+"manilla","manipulate","manipulation","mankind","manly",
+"manna","manned","mannequin","manner","mannered",
+"mannerism","mannerly","manners","mannikin","mannish",
+"manoeuverable","manoeuvre","manometer","manor","manorial",
+"manpower","mansard","manse","manservant","mansion",
+"mansions","manslaughter","mantelpiece","mantelshelf","mantilla",
+"mantis","mantle","mantrap","manual","manufacture",
+"manufacturer","manumit","manure","manuscript","manx",
+"many","maoism","maori","map","maple",
+"mapping","maquis","mar","marabou","marabout",
+"maraschino","marathon","maraud","marble","marbled",
+"marbles","marc","marcasite","march","marchioness",
+"margarine","margin","marginal","marguerite","marigold",
+"marihuana","marijuana","marimba","marina","marinade",
+"marinate","marine","mariner","marionette","marital",
+"maritime","marjoram","mark","markdown","marked",
+"marker","market","marketeer","marketer","marketing",
+"marketplace","marking","marksman","marksmanship","markup",
+"marl","marlinespike","marmalade","marmoreal","marmoset",
+"marmot","marocain","maroon","marquee","marquess",
+"marquetry","marquis","marriage","marriageable","married",
+"marrow","marrowbone","marrowfat","marry","mars",
+"marsala","marseillaise","marsh","marshal","marshmallow",
+"marshy","marsupial","mart","marten","martial",
+"martian","martin","martinet","martini","martinmas",
+"martyr","martyrdom","marvel","marvellous","marvelous",
+"marxism","marzipan","mascara","mascot","masculine",
+"masculinity","maser","mash","mashie","mask",
+"masked","masochism","mason","masonic","masonry",
+"masque","masquerade","mass","massacre","massage",
+"masses","masseur","massif","massive","massy",
+"mast","mastectomy","master","masterful","masterly",
+"mastermind","masterpiece","mastership","masterstroke","mastery",
+"masthead","mastic","masticate","mastiff","mastitis",
+"mastodon","mastoid","mastoiditis","masturbate","mat",
+"matador","match","matchbox","matching","matchless",
+"matchlock","matchmaker","matchstick","matchwood","mate",
+"material","materialise","materialism","materialist","materialize",
+"maternal","maternity","matey","mathematician","mathematics",
+"matins","matriarch","matriarchy","matricide","matriculate",
+"matrimony","matrix","matron","matronly","matt",
+"matter","matting","mattins","mattock","mattress",
+"maturation","mature","maturity","maudlin","maul",
+"maulstick","maunder","mausoleum","mauve","maverick",
+"maw","mawkish","maxi","maxim","maximal",
+"maximise","maximize","maximum","may","maybe",
+"maybeetle","mayday","mayfly","mayhem","mayonnaise",
+"mayor","mayoralty","mayoress","maypole","mayst",
+"maze","mazed","mazurka","mccarthyism","mead",
+"meadow","meadowsweet","meager","meagre","meal",
+"mealie","mealtime","mealy","mealybug","mean",
+"meander","meanderings","meaning","meaningful","meaningless",
+"means","meant","meantime","meanwhile","measles",
+"measly","measurable","measure","measured","measureless",
+"measurement","meat","meatball","meaty","mecca",
+"mechanic","mechanical","mechanics","mechanise","mechanism",
+"mechanistic","mechanize","medal","medalist","medallion",
+"medallist","meddle","meddlesome","media","mediaeval",
+"medial","median","mediate","medic","medical",
+"medicament","medicare","medicate","medication","medicinal",
+"medicine","medico","medieval","mediocre","mediocrity",
+"meditate","meditation","meditative","mediterranean","medium",
+"medlar","medley","meed","meek","meerschaum",
+"meet","meeting","meetinghouse","megadeath","megahertz",
+"megalith","megalithic","megalomania","megalomaniac","megaphone",
+"megaton","megrim","meiosis","melancholia","melancholic",
+"melancholy","meld","melee","meliorate","meliorism",
+"mellifluous","mellow","melodic","melodious","melodrama",
+"melodramatic","melody","melon","melt","melting",
+"member","membership","membrane","membranous","memento",
+"memo","memoir","memoirs","memorabilia","memorable",
+"memorandum","memorial","memorise","memorize","memory",
+"memsahib","men","menace","menagerie","mend",
+"mendacious","mendacity","mendelian","mendicant","mending",
+"menfolk","menial","meningitis","meniscus","menopause",
+"menses","menstrual","menstruate","mensurable","mensuration",
+"mental","mentality","menthol","mentholated","mention",
+"mentor","menu","meow","mephistopheles","mercantile",
+"mercenary","mercer","mercerise","mercerize","merchandise",
+"merchant","merchantman","merciful","merciless","mercurial",
+"mercury","mercy","mere","merely","meretricious",
+"merge","merger","meridian","meridional","meringue",
+"merino","merit","meritocracy","meritorious","mermaid",
+"merman","merriment","merry","merrymaking","mesa",
+"mescalin","mescaline","mesdames","mesdemoiselles","meseems",
+"mesh","mesmeric","mesmerise","mesmerism","mesmerist",
+"mesmerize","mess","message","messenger","messiah",
+"messianic","messieurs","messmate","messrs","messuage",
+"messy","mestizo","met","metabolic","metabolise",
+"metabolism","metabolize","metacarpal","metal","metalanguage",
+"metallic","metallurgist","metallurgy","metalwork","metamorphose",
+"metamorphosis","metaphor","metaphorical","metaphysics","metatarsal",
+"mete","metempsychosis","meteor","meteoric","meteorite",
+"meteoroid","meteorologist","meteorology","meter","methane",
+"methinks","method","methodical","methodism","methodology",
+"meths","methuselah","meticulous","metre","metric",
+"metrical","metrication","metricise","metricize","metro",
+"metronome","metropolis","metropolitan","mettle","mettlesome",
+"mew","mews","mezzanine","mezzo","mezzotint",
+"miaow","miasma","mica","mice","michaelmas",
+"mick","mickey","microbe","microbiologist","microbiology",
+"microcosm","microelectronics","microfiche","microfilm","micromesh",
+"micrometer","micron","microorganism","microphone","microscope",
+"microscopic","microsecond","microwave","mid","midair",
+"midcourse","midday","midden","middle","middlebrow",
+"middleman","middleweight","middling","midge","midget",
+"midi","midland","midlands","midmost","midnight",
+"midpoint","midriff","midshipman","midships","midst",
+"midsummer","midway","midweek","midwest","midwicket",
+"midwife","midwifery","mien","miffed","might",
+"mightily","mighty","mignonette","migraine","migrant",
+"migrate","migration","migratory","mikado","mike",
+"milady","mild","mildew","mildly","mile",
+"mileage","mileometer","miler","milestone","milieu",
+"militancy","militant","militarise","militarism","militarize",
+"military","militate","militia","militiaman","milk",
+"milker","milkmaid","milkman","milksop","milkweed",
+"milky","mill","millboard","milldam","millenarian",
+"millenium","millepede","miller","millet","millibar",
+"milligram","milligramme","milliliter","millilitre","millimeter",
+"millimetre","milliner","millinery","million","millionaire",
+"millipede","millpond","millrace","millstone","millwheel",
+"millwright","milometer","milord","milt","mime",
+"mimeograph","mimetic","mimic","mimicry","mimosa",
+"min","minaret","minatory","mince","mincemeat",
+"mincer","mincingly","mind","minded","mindful",
+"mindless","mine","minefield","minelayer","miner",
+"mineral","mineralogist","mineralogy","minestrone","minesweeper",
+"mingle","mingy","mini","miniature","miniaturist",
+"minibus","minim","minimal","minimise","minimize",
+"minimum","mining","minion","minister","ministerial",
+"ministrant","ministration","ministry","miniver","mink",
+"minnow","minor","minority","minotaur","minster",
+"minstrel","minstrelsy","mint","minuet","minus",
+"minuscule","minute","minutely","minuteman","minutes",
+"minutia","minx","miracle","miraculous","mirage",
+"mire","mirror","mirth","miry","misadventure",
+"misadvise","misalliance","misanthrope","misanthropy","misapplication",
+"misapply","misapprehend","misapprehension","misappropriate","misbegotten",
+"misbehave","misbehaved","misbehavior","misbehaviour","miscalculate",
+"miscall","miscarry","miscast","miscegenation","miscellaneous",
+"miscellany","mischance","mischief","mischievous","misconceive",
+"misconception","misconduct","misconstruction","misconstrue","miscount",
+"miscreant","miscue","misdate","misdeal","misdeed",
+"misdemeanor","misdemeanour","misdirect","misdoing","miser",
+"miserable","miserably","miserly","misery","misfire",
+"misfit","misfortune","misgiving","misgovern","misguide",
+"misguided","mishandle","mishap","mishear","mishit",
+"mishmash","misinform","misinterpret","misjudge","misjudgement",
+"misjudgment","mislay","mislead","mismanage","mismatch",
+"misname","misnomer","misogynist","misogyny","misplace",
+"misprint","mispronounce","mispronunciation","misquote","misread",
+"misreport","misrepresent","misrule","miss","missal",
+"misshapen","missile","missing","mission","missionary",
+"missis","missive","misspell","misspend","misstate",
+"misstatement","missus","missy","mist","mistake",
+"mistaken","mister","mistime","mistletoe","mistral",
+"mistranslate","mistress","mistrial","mistrust","mistrustful",
+"mists","misty","misunderstand","misunderstanding","misuse",
+"mite","miter","mitigate","mitosis","mitre",
+"mitt","mitten","mix","mixed","mixer",
+"mixture","mizen","mizzen","mizzenmast","mizzle",
+"mnemonic","mnemonics","moa","moan","moat",
+"moated","mob","mobile","mobilisation","mobilise",
+"mobility","mobilization","mobilize","mobster","moccasin",
+"mocha","mock","mockers","mockery","mockingbird",
+"modal","mode","model","moderate","moderately",
+"moderation","moderations","moderato","moderator","modern",
+"modernise","modernism","modernistic","modernity","modernize",
+"modest","modesty","modicum","modification","modifier",
+"modify","modish","mods","modular","modulate",
+"modulation","module","moggy","mogul","moh",
+"mohair","mohammedan","mohammedanism","moiety","moist",
+"moisten","moisture","moisturise","moisturize","moke",
+"molar","molasses","mold","molder","molding",
+"moldy","mole","molecular","molecule","molehill",
+"moleskin","molest","moll","mollify","mollusc",
+"mollusk","mollycoddle","molt","molten","molto",
+"molybdenum","mom","moment","momentarily","momentary",
+"momentous","moments","momentum","momma","mommy",
+"monarch","monarchic","monarchism","monarchist","monarchy",
+"monastery","monastic","monasticism","monaural","monday",
+"monetary","money","moneybags","moneybox","moneychanger",
+"moneyed","moneylender","moneymaker","moneys","monger",
+"mongol","mongolism","mongoose","mongrel","monies",
+"monitor","monk","monkey","mono","monochrome",
+"monocle","monogamous","monogamy","monogram","monograph",
+"monolith","monolithic","monolog","monologue","monomania",
+"monomaniac","mononucleosis","monophonic","monophthong","monoplane",
+"monopolise","monopolist","monopolize","monopoly","monorail",
+"monosyllabic","monosyllable","monotheism","monotone","monotonous",
+"monotony","monotype","monoxide","monsieur","monsignor",
+"monsoon","monster","monstrance","monstrosity","monstrous",
+"montage","month","monthly","monument","monumental",
+"monumentally","moo","mooch","moocow","mood",
+"moody","moon","moonbeam","mooncalf","moonlight",
+"moonlit","moonshine","moonstone","moonstruck","moony",
+"moor","moorhen","moorings","moorish","moorland",
+"moose","moot","mop","mope","moped",
+"moppet","moquette","moraine","moral","morale",
+"moralise","moralist","moralistic","morality","moralize",
+"morally","morals","morass","moratorium","morbid",
+"morbidity","mordant","more","morello","moreover",
+"mores","moresque","morganatic","morgue","moribund",
+"mormon","mormonism","morn","morning","mornings",
+"morocco","moron","moronic","morose","morpheme",
+"morphemics","morpheus","morphine","morphology","morrow",
+"morsel","mortal","mortality","mortally","mortar",
+"mortarboard","mortgage","mortgagee","mortgagor","mortice",
+"mortician","mortification","mortify","mortise","mortuary",
+"mosaic","moselle","mosey","moslem","mosque",
+"mosquito","moss","mossy","most","mostly",
+"mote","motel","motet","moth","mothball",
+"mothballs","mother","motherhood","motherly","mothproof",
+"motif","motion","motionless","motions","motivate",
+"motivation","motive","motley","motocross","motor",
+"motorbike","motorboat","motorcade","motorcar","motorcycle",
+"motorcyclist","motoring","motorise","motorist","motorize",
+"motorman","motorway","mottled","motto","mould",
+"moulder","moulding","mouldy","moult","mound",
+"mount","mountain","mountaineer","mountaineering","mountainous",
+"mountainside","mountaintop","mountebank","mountie","mourn",
+"mourner","mournful","mourning","mouse","mouser",
+"mousetrap","moussaka","mousse","moustache","mousy",
+"mouth","mouthful","mouthorgan","mouthpiece","mouthwash",
+"movable","move","moveable","movement","movements",
+"mover","movie","movies","moving","mow",
+"mower","mpg","mph","mra","mrs",
+"msc","much","muchness","mucilage","muck",
+"muckheap","muckrake","mucky","mucous","mucus",
+"mud","muddle","muddy","mudflat","mudguard",
+"mudpack","mudslinger","muesli","muezzin","muff",
+"muffin","muffle","muffler","mufti","mug",
+"mugger","muggins","muggy","mugwump","muhammadan",
+"muhammadanism","mulatto","mulberry","mulch","mulct",
+"mule","muleteer","mulish","mull","mullah",
+"mullet","mulligatawny","mullion","mullioned","multifarious",
+"multiform","multilateral","multilingual","multimillionaire","multiple",
+"multiplex","multiplication","multiplicity","multiply","multiracial",
+"multistorey","multitude","multitudinous","mum","mumble",
+"mummer","mummery","mummify","mumming","mummy",
+"mumps","munch","mundane","municipal","municipality",
+"munificence","munificent","muniments","munition","munitions",
+"mural","murder","murderous","murk","murky",
+"murmur","murphy","murrain","muscatel","muscle",
+"muscled","muscleman","muscovite","muscular","muse",
+"museum","mush","mushroom","mushy","music",
+"musical","musically","musician","musicianship","musk",
+"musket","musketeer","musketry","muskmelon","muskrat",
+"musky","muslim","muslin","musquash","muss",
+"mussel","must","mustache","mustachio","mustang",
+"mustard","muster","musty","mutable","mutant",
+"mutation","mute","muted","mutilate","mutilation",
+"mutineer","mutinous","mutiny","mutt","mutter",
+"mutton","muttonchops","mutual","mutuality","muzak",
+"muzzle","muzzy","mycology","myelitis","myna",
+"mynah","myopia","myriad","myrrh","myrtle",
+"myself","mysterious","mystery","mystic","mystical",
+"mysticism","mystification","mystify","mystique","myth",
+"mythical","mythological","mythologist","mythology","myxomatosis",
+"nab","nabob","nacelle","nacre","nadir",
+"nag","naiad","nail","nailbrush","naive",
+"naivete","naivety","naked","name","namedrop",
+"nameless","namely","nameplate","namesake","nanny",
+"nap","napalm","naphtha","naphthalene","napkin",
+"nappy","narc","narcissism","narcissus","narcotic",
+"nark","narky","narrate","narration","narrative",
+"narrator","narrow","narrowly","narrows","narwhal",
+"nasal","nasalise","nasalize","nascent","nasturtium",
+"nasty","natal","nation","national","nationalise",
+"nationalism","nationalist","nationalistic","nationality","nationalize",
+"nationwide","native","nativity","nato","natter",
+"natty","natural","naturalise","naturalism","naturalist",
+"naturalistic","naturalize","naturally","naturalness","nature",
+"naturism","naturopath","naught","naughty","nausea",
+"nauseate","nauseous","nautch","nautical","nautilus",
+"naval","nave","navel","navigable","navigate",
+"navigation","navigator","navvy","navy","nay",
+"nazi","nco","neanderthal","neapolitan","near",
+"nearby","nearly","nearside","nearsighted","neat",
+"nebula","nebular","nebulous","necessaries","necessarily",
+"necessary","necessitate","necessitous","necessity","neck",
+"neckband","neckerchief","necklace","necklet","neckline",
+"necktie","neckwear","necromancer","necromancy","necrophilia",
+"necrophiliac","necropolis","nectar","nectarine","need",
+"needful","needle","needless","needlessly","needlewoman",
+"needlework","needs","needy","nefarious","negate",
+"negative","neglect","neglectful","negligee","negligence",
+"negligent","negligible","negotiable","negotiate","negotiation",
+"negress","negro","negus","neigh","neighbor",
+"neighborhood","neighboring","neighborly","neighbour","neighbourhood",
+"neighbouring","neighbourly","neither","nelson","nemesis",
+"neoclassical","neocolonialism","neolithic","neologism","neon",
+"neonate","neophyte","neoplasm","nephew","nephritis",
+"nepotism","neptune","nereid","nerve","nerveless",
+"nerves","nervous","nervy","ness","nest",
+"nesting","nestle","nestling","nestor","net",
+"netball","nether","nethermost","nets","nett",
+"netting","nettle","network","neural","neuralgia",
+"neurasthenia","neurasthenic","neuritis","neurologist","neurology",
+"neurosis","neurotic","neuter","neutral","neutralise",
+"neutrality","neutralize","neutralizer","neutron","never",
+"nevermore","nevertheless","new","newborn","newcomer",
+"newel","newfangled","newfoundland","newly","newlywed",
+"newmarket","news","newsagent","newsboy","newscast",
+"newscaster","newsletter","newsmonger","newspaper","newsprint",
+"newsreel","newsroom","newssheet","newsstand","newsvendor",
+"newsworthy","newsy","newt","newtonian","next",
+"nexus","nhs","niacin","nib","nibble",
+"niblick","nibs","nice","nicely","nicety",
+"niche","nick","nickel","nicker","nicknack",
+"nickname","nicotine","niece","niff","nifty",
+"niggard","niggardly","nigger","niggle","niggling",
+"nigh","night","nightcap","nightclothes","nightclub",
+"nightdress","nightfall","nighthawk","nightingale","nightjar",
+"nightlife","nightlight","nightline","nightlong","nightly",
+"nightmare","nights","nightshade","nightshirt","nightstick",
+"nighttime","nihilism","nilotic","nimble","nimbus",
+"nimrod","nincompoop","nine","ninepin","ninepins",
+"nines","nineteen","ninety","ninny","ninth",
+"nip","nipper","nippers","nipping","nipple",
+"nippy","nirvana","nisi","nit","niter",
+"nitpick","nitpicking","nitrate","nitre","nitric",
+"nitrochalk","nitrogen","nitroglycerin","nitroglycerine","nitrous",
+"nitwit","nix","nob","nobble","nobility",
+"noble","nobleman","nobly","nobody","nocturnal",
+"nocturne","nod","nodal","noddle","nodular",
+"nodule","noel","noes","nog","noggin",
+"nohow","noise","noisome","noisy","nomad",
+"nomadic","nomenclature","nominal","nominate","nomination",
+"nominative","nominee","nonage","nonagenarian","nonaggression",
+"nonaligned","nonalignment","nonassertive","nonce","nonchalance",
+"nonchalant","noncombatant","noncommittal","nonconductor","nonconformist",
+"nonconformity","noncontributory","nondescript","none","nonentity",
+"nonesuch","nonetheless","nonfiction","nonflammable","nonintervention",
+"nonobservance","nonpareil","nonpayment","nonplus","nonproliferation",
+"nonresident","nonrestrictive","nonsense","nonsensical","nonskid",
+"nonsmoker","nonstandard","nonstarter","nonstick","nonstop",
+"nonunion","nonverbal","nonviolence","nonviolent","nonwhite",
+"noodle","nook","noon","noonday","noose",
+"nope","nor","nordic","norm","normal",
+"normalise","normality","normalize","normally","norman",
+"normative","north","northbound","northeast","northeaster",
+"northeasterly","northeastern","northeastward","northeastwards","northerly",
+"northern","northerner","northernmost","northward","northwards",
+"northwest","northwester","northwesterly","northwestern","northwestward",
+"northwestwards","nos","nose","nosebag","nosebleed",
+"nosecone","nosedive","nosegay","nosey","nosh",
+"nostalgia","nostril","nostrum","nosy","not",
+"notability","notable","notably","notarise","notarize",
+"notary","notation","notch","note","notebook",
+"notecase","noted","notepaper","noteworthy","nothing",
+"nothingness","notice","noticeable","notifiable","notification",
+"notify","notion","notional","notions","notoriety",
+"notorious","notwithstanding","nougat","nought","noun",
+"nourish","nourishment","nous","nova","novel",
+"novelette","novelettish","novelist","novella","novelty",
+"november","novice","noviciate","novitiate","novocaine",
+"now","nowadays","nowhere","nowise","noxious",
+"nozzle","nth","nuance","nub","nubile",
+"nuclear","nucleus","nude","nudge","nudism",
+"nudity","nugatory","nugget","nuisance","null",
+"nullah","nullify","nullity","numb","number",
+"numberless","numberplate","numbers","numbly","numbskull",
+"numeracy","numeral","numerate","numeration","numerator",
+"numerical","numerology","numerous","numinous","numismatic",
+"numismatics","numskull","nun","nuncio","nunnery",
+"nuptial","nuptials","nurse","nurseling","nursemaid",
+"nursery","nurseryman","nursing","nursling","nurture",
+"nut","nutcase","nutcracker","nuthouse","nutmeg",
+"nutria","nutrient","nutriment","nutrition","nutritious",
+"nutritive","nuts","nutshell","nutty","nuzzle",
+"nylon","nylons","nymph","nymphet","nymphomania",
+"nymphomaniac","oaf","oak","oaken","oakum",
+"oap","oar","oarlock","oarsman","oarsmanship",
+"oasis","oat","oatcake","oath","oatmeal",
+"oats","obbligato","obdurate","obeah","obedient",
+"obeisance","obelisk","obese","obey","obfuscate",
+"obituary","object","objection","objectionable","objective",
+"objector","oblation","obligate","obligation","obligatory",
+"oblige","obliging","oblique","obliterate","oblivion",
+"oblivious","oblong","obloquy","obnoxious","oboe",
+"oboist","obscene","obscenity","obscurantism","obscure",
+"obscurity","obsequies","obsequious","observable","observance",
+"observant","observation","observations","observatory","observe",
+"observer","observing","obsess","obsession","obsessional",
+"obsessive","obsidian","obsolescent","obsolete","obstacle",
+"obstetrician","obstetrics","obstinate","obstreperous","obstruct",
+"obstruction","obstructionism","obstructive","obtain","obtainable",
+"obtrude","obtrusive","obtuse","obverse","obviate",
+"obvious","obviously","ocarina","occasion","occasional",
+"occident","occidental","occult","occupancy","occupant",
+"occupation","occupational","occupier","occupy","occur",
+"occurrence","ocean","oceangoing","oceanography","ocelot",
+"ocher","ochre","octagon","octane","octave",
+"octavo","octet","october","octogenarian","octopus",
+"octosyllabic","ocular","oculist","odalisque","odd",
+"oddball","oddity","oddly","oddment","odds",
+"ode","odious","odium","odor","odoriferous",
+"odorous","odour","odyssey","oecumenical","oecumenicalism",
+"oesophagus","oestrogen","off","offal","offbeat",
+"offence","offend","offender","offense","offensive",
+"offer","offering","offertory","offhand","office",
+"officeholder","officer","offices","official","officialdom",
+"officialese","officially","officiate","officious","offing",
+"offish","offprint","offset","offshoot","offshore",
+"offside","offspring","offstage","oft","often",
+"ogle","ogre","ohm","oho","oil",
+"oilcake","oilcan","oilcloth","oiled","oilfield",
+"oilman","oilrig","oils","oilskin","oilskins",
+"oily","oink","ointment","okapi","okay",
+"okra","old","olden","oldish","oldster",
+"oleaginous","oleander","oleograph","olfactory","oligarch",
+"oligarchy","olive","olympiad","olympian","olympic",
+"ombudsman","omega","omelet","omelette","omen",
+"ominous","omission","omit","omnibus","omnipotent",
+"omnipresent","omniscient","omnivorous","once","oncoming",
+"one","onerous","oneself","onetime","ongoing",
+"onion","onlooker","only","onomatopoeia","onrush",
+"onset","onshore","onside","onslaught","onto",
+"ontology","onus","onward","onwards","onyx",
+"oodles","oof","oomph","oops","ooze",
+"opacity","opal","opalescent","opaque","ope",
+"open","opencast","opener","openhearted","opening",
+"openly","openwork","opera","operable","operate",
+"operation","operational","operative","operator","operetta",
+"ophthalmia","ophthalmic","ophthalmology","ophthalmoscope","opiate",
+"opine","opinion","opinionated","opium","opossum",
+"opponent","opportune","opportunism","opportunity","oppose",
+"opposite","opposition","oppress","oppression","oppressive",
+"oppressor","opprobrious","opprobrium","ops","opt",
+"optative","optic","optical","optician","optics",
+"optimism","optimum","option","optional","opulence",
+"opulent","opus","oracle","oracular","oral",
+"orange","orangeade","orangeman","orangutang","oration",
+"orator","oratorical","oratorio","oratory","orb",
+"orbit","orchard","orchestra","orchestral","orchestrate",
+"orchid","ordain","ordeal","order","ordered",
+"orderly","orders","ordinal","ordinance","ordinand",
+"ordinarily","ordinary","ordinate","ordination","ordnance",
+"ordure","ore","oregano","organ","organdie",
+"organdy","organic","organisation","organise","organised",
+"organism","organist","organization","organize","organized",
+"orgasm","orgiastic","orgy","orient","oriental",
+"orientalist","orientate","orientation","orifice","origin",
+"original","originality","originally","originate","oriole",
+"orison","orlon","ormolu","ornament","ornamental",
+"ornamentation","ornate","ornery","ornithology","orotund",
+"orphan","orphanage","orrery","orrisroot","orthodontic",
+"orthodontics","orthodox","orthodoxy","orthography","orthopaedic",
+"orthopaedics","orthopedic","orthopedics","ortolan","oryx",
+"oscar","oscillate","oscillation","oscillator","oscillograph",
+"oscilloscope","osculation","osier","osmosis","osprey",
+"osseous","ossification","ossify","ostensible","ostentation",
+"osteoarthritis","osteopath","osteopathy","ostler","ostracise",
+"ostracize","ostrich","other","otherwise","otherworldly",
+"otiose","otter","ottoman","oubliette","ouch",
+"ought","ounce","our","ours","ourselves",
+"ousel","oust","out","outback","outbalance",
+"outbid","outbound","outbrave","outbreak","outbuilding",
+"outburst","outcast","outcaste","outclass","outcome",
+"outcrop","outcry","outdated","outdistance","outdo",
+"outdoor","outdoors","outer","outermost","outface",
+"outfall","outfield","outfight","outfit","outflank",
+"outflow","outfox","outgeneral","outgoing","outgoings",
+"outgrow","outgrowth","outhouse","outing","outlandish",
+"outlast","outlaw","outlay","outlet","outline",
+"outlive","outlook","outlying","outmaneuver","outmanoeuvre",
+"outmarch","outmatch","outmoded","outmost","outnumber",
+"outpatient","outplay","outpoint","outpost","outpourings",
+"output","outrage","outrageous","outrange","outrank",
+"outride","outrider","outrigger","outright","outrival",
+"outrun","outsell","outset","outshine","outside",
+"outsider","outsize","outskirts","outsmart","outspoken",
+"outspread","outstanding","outstay","outstretched","outstrip",
+"outtalk","outvote","outward","outwardly","outwards",
+"outwear","outweigh","outwit","outwork","outworn",
+"ouzel","ouzo","ova","oval","ovarian",
+"ovary","ovation","oven","ovenware","over",
+"overact","overage","overall","overalls","overarch",
+"overarm","overawe","overbalance","overbear","overbearing",
+"overbid","overblown","overboard","overburden","overcall",
+"overcapitalise","overcapitalize","overcast","overcharge","overcloud",
+"overcoat","overcome","overcompensate","overcrop","overcrowd",
+"overdevelop","overdo","overdone","overdose","overdraft",
+"overdraw","overdrawn","overdress","overdrive","overdue",
+"overestimate","overexpose","overflow","overfly","overgrown",
+"overgrowth","overhand","overhang","overhaul","overhead",
+"overheads","overhear","overjoyed","overkill","overland",
+"overlap","overlay","overleaf","overleap","overload",
+"overlong","overlook","overlord","overly","overman",
+"overmaster","overmuch","overnight","overpass","overpay",
+"overplay","overpopulated","overpopulation","overpower","overpowering",
+"overprint","overrate","overreach","override","overriding",
+"overrule","overrun","overseas","oversee","overseer",
+"oversell","oversexed","overshadow","overshoe","overshoot",
+"overside","oversight","oversimplify","oversleep","overspill",
+"overstate","overstatement","overstay","oversteer","overstep",
+"overstock","overstrung","overstuffed","oversubscribed","overt",
+"overtake","overtax","overthrow","overtime","overtone",
+"overtones","overtop","overtrump","overture","overtures",
+"overturn","overweening","overweight","overwhelm","overwhelming",
+"overwork","overwrought","oviduct","oviparous","ovoid",
+"ovulate","ovum","owe","owl","owlet",
+"owlish","own","owner","ownership","oxbridge",
+"oxcart","oxeye","oxide","oxidise","oxidize",
+"oxon","oxonian","oxtail","oxyacetylene","oxygen",
+"oxygenate","oyez","oyster","oystercatcher","ozone",
+"pabulum","pace","pacemaker","pacesetter","pachyderm",
+"pacific","pacifier","pacifism","pacifist","pacify",
+"pack","package","packed","packer","packet",
+"packing","packsaddle","pact","pad","padding",
+"paddle","paddock","paddy","padlock","padre",
+"paean","paederast","paederasty","paediatrician","paediatrics",
+"paella","paeony","pagan","paganism","page",
+"pageant","pageantry","pagination","pagoda","paid",
+"pail","paillasse","pain","pained","painful",
+"painkiller","painless","pains","painstaking","paint",
+"paintbrush","painter","painting","paints","paintwork",
+"pair","paisley","pajama","pajamas","pal",
+"palace","paladin","palais","palakeen","palanquin",
+"palatable","palatal","palatalize","palate","palatial",
+"palatinate","palaver","pale","paleface","paleography",
+"paleolithic","paleontology","palette","palfrey","palimpsest",
+"palindrome","paling","palings","palisade","palish",
+"pall","palladian","pallbearer","pallet","palliasse",
+"palliate","palliation","palliative","pallid","pallor",
+"pally","palm","palmer","palmetto","palmist",
+"palmistry","palmy","palomino","palpable","palpate",
+"palpitate","palpitation","palsied","palsy","palter",
+"paltry","pampas","pamper","pamphlet","pamphleteer",
+"pan","panacea","panache","panama","panatela",
+"panatella","pancake","panchromatic","pancreas","panda",
+"pandemic","pandemonium","pander","pandit","panegyric",
+"panel","paneling","panelist","panelling","panellist",
+"pang","panhandle","panic","panicky","panjabi",
+"panjandrum","pannier","pannikin","panoplied","panoply",
+"panorama","panpipes","pansy","pant","pantaloon",
+"pantaloons","pantechnicon","pantheism","pantheon","panther",
+"panties","pantile","panto","pantograph","pantomime",
+"pantry","pants","panty","panzer","pap",
+"papa","papacy","papadum","papal","papaya",
+"paper","paperback","paperboy","paperhanger","papers",
+"paperweight","paperwork","papery","papist","papoose",
+"pappy","paprika","papyrus","par","parable",
+"parabola","parachute","parachutist","paraclete","parade",
+"paradigm","paradigmatic","paradise","paradisiacal","paradox",
+"paraffin","paragon","paragraph","parakeet","parallel",
+"parallelism","parallelogram","paralyse","paralysis","paralytic",
+"paralyze","paramilitary","paramount","paramountcy","paramour",
+"paranoia","paranoiac","paranoid","parapet","paraphernalia",
+"paraphrase","paraplegia","paraplegic","paraquat","paras",
+"parasite","parasitic","parasol","parathyroid","paratrooper",
+"paratroops","paratyphoid","parboil","parcel","parch",
+"parchment","pard","pardon","pardonable","pardonably",
+"pardoner","pare","parent","parentage","parental",
+"parenthesis","parenthetic","parenthood","parer","parhelion",
+"pariah","paring","parish","parishioner","parisian",
+"parity","park","parka","parkin","parking",
+"parkland","parky","parlance","parley","parliament",
+"parliamentarian","parliamentary","parlor","parlour","parlous",
+"parmesan","parochial","parodist","parody","parole",
+"paroxysm","parquet","parr","parricide","parrot",
+"parry","parse","parsee","parsi","parsimonious",
+"parsimony","parsley","parsnip","parson","parsonage",
+"part","partake","parterre","parthenogenesis","partial",
+"partiality","partially","participant","participate","participation",
+"participial","participle","particle","particular","particularise",
+"particularity","particularize","particularly","particulars","parting",
+"partisan","partita","partition","partitive","partizan",
+"partly","partner","partnership","partook","partridge",
+"parts","parturition","party","parvenu","paschal",
+"pasha","pass","passable","passage","passageway",
+"passbook","passenger","passerby","passim","passing",
+"passion","passionate","passionately","passionflower","passive",
+"passivity","passivize","passkey","passover","passport",
+"password","past","pasta","paste","pasteboard",
+"pastel","pastern","pasteurise","pasteurize","pastiche",
+"pastille","pastime","pasting","pastor","pastoral",
+"pastorale","pastorate","pastrami","pastry","pasturage",
+"pasture","pasty","pat","patch","patchouli",
+"patchwork","patchy","patella","patent","patentee",
+"patently","pater","paterfamilias","paternal","paternalism",
+"paternity","paternoster","path","pathan","pathetic",
+"pathfinder","pathological","pathologist","pathology","pathos",
+"pathway","patience","patient","patina","patio",
+"patisserie","patois","patrial","patriarch","patriarchal",
+"patriarchate","patriarchy","patrician","patricide","patrimony",
+"patriot","patriotic","patriotism","patrol","patrolman",
+"patron","patronage","patroness","patronise","patronize",
+"patronymic","patten","patter","pattern","patty",
+"paucity","paunch","paunchy","pauper","pauperise",
+"pauperism","pauperize","pause","pavan","pavane",
+"pave","paved","pavement","pavilion","paving",
+"paw","pawky","pawl","pawn","pawnbroker",
+"pawnshop","pawpaw","pay","payable","payday",
+"payee","payer","payload","paymaster","payment",
+"paynim","payoff","payola","payroll","pea",
+"peace","peaceable","peaceful","peacekeeping","peacemaker",
+"peacetime","peach","peachick","peacock","peafowl",
+"peahen","peak","peaked","peaky","peal",
+"peanut","peanuts","pear","pearl","pearly",
+"pearmain","peasant","peasantry","peashooter","peat",
+"pebble","pebbledash","pebbly","pecan","peccadillo",
+"peccary","peck","pecker","peckish","pectic",
+"pectin","pectoral","peculate","peculiar","peculiarity",
+"peculiarly","pecuniary","pedagogue","pedagogy","pedal",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java
new file mode 100644
index 00000000000..636af5f8251
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData6.java
@@ -0,0 +1,715 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** A list of words used by Kstem
+ */
+class KStemData6 {
+ private KStemData6() {
+ }
+ static String[] data = {
+"pedant","pedantic","pedantry","peddle","peddler",
+"pederast","pederasty","pedestal","pedestrian","pediatrician",
+"pediatrics","pedicab","pedicel","pedicure","pedigree",
+"pediment","pedlar","pedometer","pee","peek",
+"peekaboo","peel","peeler","peelings","peep",
+"peeper","peephole","peepul","peer","peerage",
+"peeress","peerless","peeve","peevish","peewit",
+"peg","pejorative","pekinese","pekingese","pekoe",
+"pelagic","pelf","pelican","pellagra","pellet",
+"pellucid","pelmet","pelota","pelt","pelvic",
+"pelvis","pemican","pemmican","pen","penal",
+"penalise","penalize","penalty","penance","pence",
+"penchant","pencil","pendant","pendent","pending",
+"pendulous","pendulum","penetrate","penetrating","penetration",
+"penetrative","penguin","penicillin","peninsula","penis",
+"penitent","penitential","penitentiary","penknife","penmanship",
+"pennant","penniless","pennon","penny","pennyweight",
+"pennywort","penology","pension","pensionable","pensioner",
+"pensive","pentagon","pentagram","pentameter","pentateuch",
+"pentathlon","pentecost","penthouse","penultimate","penumbra",
+"penurious","penury","peon","peony","people",
+"pep","pepper","peppercorn","peppermint","peppery",
+"pepsin","peptic","per","peradventure","perambulate",
+"perambulator","perceive","percentage","percentile","perceptible",
+"perception","perceptive","perch","perchance","percipient",
+"percolate","percolator","percussion","percussionist","perdition",
+"peregrination","peremptory","perennial","perfect","perfectible",
+"perfection","perfectionist","perfectly","perfidious","perfidy",
+"perforate","perforation","perforce","perform","performance",
+"performer","perfume","perfumier","perfunctory","pergola",
+"perhaps","perigee","perihelion","peril","perilous",
+"perimeter","period","periodic","periodical","periods",
+"peripatetic","peripheral","periphery","periphrasis","periphrastic",
+"periscope","perish","perishable","perisher","perishing",
+"peristyle","peritonitis","periwig","periwinkle","perjure",
+"perjurer","perjury","perk","perky","perm",
+"permafrost","permanence","permanency","permanent","permanganate",
+"permeable","permeate","permissible","permission","permissive",
+"permit","permutation","permute","pernicious","pernickety",
+"pernod","peroration","peroxide","perpendicular","perpetrate",
+"perpetual","perpetuate","perpetuity","perplex","perplexed",
+"perplexity","perquisite","perry","persecute","persecution",
+"perseverance","persevere","persevering","persian","persiflage",
+"persimmon","persist","persistence","persistent","persnickety",
+"person","persona","personable","personage","personal",
+"personalise","personalities","personality","personalize","personally",
+"personification","personify","personnel","perspective","perspex",
+"perspicacious","perspiration","perspire","persuade","persuasion",
+"persuasive","pert","pertain","pertinacious","pertinent",
+"perturb","perturbation","peruke","peruse","pervade",
+"pervasive","perverse","perversion","perversity","pervert",
+"peseta","pesky","peso","pessary","pessimism",
+"pessimist","pest","pester","pesticide","pestiferous",
+"pestilence","pestilent","pestle","pet","petal",
+"petaled","petalled","petard","peterman","petite",
+"petition","petitioner","petrel","petrifaction","petrify",
+"petrochemical","petrol","petroleum","petrology","petticoat",
+"pettifogging","pettish","petty","petulant","petunia",
+"pew","pewit","pewter","peyote","pfennig",
+"phaeton","phagocyte","phalanx","phalarope","phallic",
+"phallus","phantasmagoria","phantasmal","phantasy","phantom",
+"pharaoh","pharisaic","pharisee","pharmaceutical","pharmacist",
+"pharmacology","pharmacopoeia","pharmacy","pharyngitis","pharynx",
+"phase","phd","pheasant","phenobarbitone","phenol",
+"phenomenal","phenomenally","phenomenon","phew","phi",
+"phial","philander","philanthropic","philanthropist","philanthropy",
+"philatelist","philately","philharmonic","philhellene","philippic",
+"philistine","philological","philologist","philology","philosopher",
+"philosophical","philosophise","philosophize","philosophy","philter",
+"philtre","phizog","phlebitis","phlebotomy","phlegm",
+"phlegmatic","phlox","phobia","phoenician","phoenix",
+"phone","phoneme","phonemic","phonemics","phonetic",
+"phonetician","phonetics","phoney","phonic","phonics",
+"phonograph","phonology","phony","phooey","phosphate",
+"phosphorescence","phosphorescent","phosphoric","phosphorus","photo",
+"photocopier","photocopy","photoelectric","photogenic","photograph",
+"photographer","photographic","photography","photosensitive","photosensitize",
+"photostat","photosynthesis","phototsensitise","phrasal","phrase",
+"phrasebook","phraseology","phrenetic","phrenology","phthisis",
+"phut","phylloxera","phylum","physic","physical",
+"physically","physician","physicist","physics","physio",
+"physiognomy","physiology","physiotherapy","physique","pianissimo",
+"pianist","piano","pianola","piaster","piastre",
+"piazza","pibroch","picador","picaresque","piccalilli",
+"piccaninny","piccolo","pick","pickaback","pickaninny",
+"pickax","pickaxe","picked","picker","pickerel",
+"picket","pickings","pickle","pickled","pickpocket",
+"picky","picnic","picnicker","pictorial","picture",
+"pictures","picturesque","piddle","piddling","pidgin",
+"pie","piebald","piece","piecemeal","pieces",
+"piecework","piecrust","pied","pier","pierce",
+"piercing","pierrot","piety","piezoelectric","piffle",
+"piffling","pig","pigeon","pigeonhole","piggery",
+"piggish","piggy","piggyback","piggybank","pigheaded",
+"piglet","pigment","pigmentation","pigmy","pignut",
+"pigskin","pigsticking","pigsty","pigswill","pigtail",
+"pike","pikestaff","pilaster","pilau","pilchard",
+"pile","piles","pileup","pilfer","pilferage",
+"pilgrim","pilgrimage","pill","pillage","pillar",
+"pillbox","pillion","pillock","pillory","pillow",
+"pillowcase","pilot","pimento","pimp","pimpernel",
+"pimple","pin","pinafore","pincer","pincers",
+"pinch","pinchbeck","pinched","pinchpenny","pincushion",
+"pine","pineal","pineapple","pinecone","pinewood",
+"piney","ping","pinhead","pinion","pink",
+"pinkeye","pinkie","pinkish","pinko","pinky",
+"pinnace","pinnacle","pinnate","pinny","pinpoint",
+"pinprick","pinstripe","pint","pinta","pintable",
+"pinup","pinwheel","piny","pioneer","pious",
+"piousness","pip","pipal","pipe","pipeline",
+"piper","pipes","pipette","piping","pipit",
+"pippin","pipsqueak","piquant","pique","piquet",
+"piracy","piranha","pirate","pirouette","piscatorial",
+"pish","piss","pissed","pistachio","pistil",
+"pistol","piston","pit","pitch","pitchblende",
+"pitcher","pitchfork","piteous","pitfall","pith",
+"pithead","pithy","pitiable","pitiful","pitiless",
+"pitman","piton","pittance","pituitary","pity",
+"pivot","pivotal","pixie","pixilated","pixy",
+"pizza","pizzicato","placard","placate","place",
+"placebo","placed","placekick","placement","placenta",
+"placid","placket","plagarise","plagarize","plagiarism",
+"plague","plaguey","plaice","plaid","plain",
+"plainly","plainsman","plainsong","plainspoken","plaint",
+"plaintiff","plaintive","plait","plan","planchette",
+"planet","planetarium","planetary","plangent","plank",
+"planking","plankton","planner","plant","plantain",
+"plantation","planter","plaque","plash","plasma",
+"plaster","plasterboard","plastered","plasterer","plastering",
+"plastic","plasticine","plasticity","plastics","plastron",
+"plate","plateau","platelayer","platform","plating",
+"platinum","platitude","platonic","platoon","platter",
+"platypus","plaudit","plausible","play","playable",
+"playback","playbill","playboy","player","playful",
+"playgoer","playground","playgroup","playhouse","playmate",
+"playpen","playroom","playsuit","plaything","playtime",
+"playwright","plaza","plea","pleach","plead",
+"pleading","pleadings","pleasant","pleasantry","please",
+"pleased","pleasing","pleasurable","pleasure","pleat",
+"pleb","plebeian","plebiscite","plectrum","pled",
+"pledge","pleistocene","plenary","plenipotentiary","plenitude",
+"plenteous","plentiful","plenty","pleonasm","plethora",
+"pleurisy","plexus","pliable","pliant","pliers",
+"plight","plimsoll","plinth","pliocene","plod",
+"plodder","plonk","plop","plosive","plot",
+"plough","ploughboy","ploughman","ploughshare","plover",
+"plow","plowboy","plowman","plowshare","ploy",
+"pluck","plucky","plug","plughole","plum",
+"plumage","plumb","plumbago","plumber","plumbing",
+"plume","plumed","plummet","plummy","plump",
+"plunder","plunge","plunger","plunk","pluperfect",
+"plural","pluralism","plurality","pluribus","plus",
+"plush","plushy","pluto","plutocracy","plutocrat",
+"plutonium","ply","plywood","pneumatic","pneumoconiosis",
+"pneumonia","poach","poacher","pock","pocked",
+"pocket","pocketbook","pocketful","pocketknife","pockmark",
+"pockmarked","pod","podgy","podiatry","podium",
+"poem","poesy","poet","poetaster","poetess",
+"poetic","poetical","poetry","pogrom","poignancy",
+"poignant","poinsettia","point","pointed","pointer",
+"pointillism","pointless","points","pointsman","poise",
+"poised","poison","poisonous","poke","poker",
+"pokerwork","poky","polack","polar","polarisation",
+"polarise","polarity","polarization","polarize","polaroid",
+"polaroids","polder","pole","poleax","poleaxe",
+"polecat","polemic","polemical","polemics","police",
+"policeman","policewoman","policy","polio","polish",
+"polisher","politburo","polite","politic","politicalise",
+"politicalize","politician","politicise","politicize","politicking",
+"politico","politics","polity","polka","poll",
+"pollard","pollen","pollinate","polling","pollster",
+"pollutant","pollute","pollution","polly","pollyanna",
+"polo","polonaise","polony","poltergeist","poltroon",
+"poly","polyandrous","polyandry","polyanthus","polyester",
+"polyethylene","polygamist","polygamous","polygamy","polyglot",
+"polygon","polymath","polymer","polymorphous","polyp",
+"polyphony","polypus","polystyrene","polysyllable","polytechnic",
+"polytheism","polythene","polyurethane","pomade","pomander",
+"pomegranate","pomeranian","pommel","pommy","pomp",
+"pompom","pomposity","pompous","ponce","poncho",
+"poncy","pond","ponder","ponderous","pone",
+"pong","poniard","pontiff","pontifical","pontificals",
+"pontificate","pontoon","pony","ponytail","pooch",
+"poodle","poof","pooh","pool","poolroom",
+"pools","poop","pooped","poor","poorhouse",
+"poorly","poorness","poove","pop","popadam",
+"popadum","popcorn","popery","popgun","popinjay",
+"popish","poplar","poplin","poppa","popper",
+"poppet","poppy","poppycock","popshop","popsy",
+"populace","popular","popularise","popularity","popularize",
+"popularly","populate","population","populism","populist",
+"populous","porcelain","porch","porcine","porcupine",
+"pore","pork","porker","porky","porn",
+"pornography","porosity","porous","porphyry","porpoise",
+"porridge","porringer","port","portable","portage",
+"portal","portals","portcullis","portend","portent",
+"portentous","porter","porterage","porterhouse","portfolio",
+"porthole","portico","portion","portly","portmanteau",
+"portrait","portraitist","portraiture","portray","portrayal",
+"pose","poser","poseur","posh","posit",
+"position","positional","positive","positively","positiveness",
+"positivism","positron","posse","possess","possessed",
+"possession","possessive","possessor","posset","possibility",
+"possible","possibly","possum","post","postage",
+"postal","postbag","postbox","postcard","postcode",
+"postdate","poster","posterior","posterity","postern",
+"postgraduate","posthaste","posthumous","postilion","postillion",
+"posting","postman","postmark","postmaster","postmortem",
+"postpaid","postpone","postprandial","postscript","postulant",
+"postulate","posture","postwar","posy","pot",
+"potable","potash","potassium","potation","potato",
+"potbellied","potbelly","potboiler","potbound","poteen",
+"potency","potent","potentate","potential","potentiality",
+"pothead","pother","potherb","pothole","potholing",
+"pothouse","pothunter","potion","potluck","potpourri",
+"potsherd","potshot","pottage","potted","potter",
+"potteries","pottery","potty","pouch","pouf",
+"pouffe","poulterer","poultice","poultry","pounce",
+"pound","poundage","pounding","pour","pout",
+"poverty","powder","powdered","powdery","power",
+"powerboat","powerful","powerhouse","powerless","powers",
+"powwow","pox","pps","practicable","practical",
+"practicality","practically","practice","practiced","practise",
+"practised","practitioner","praesidium","praetor","praetorian",
+"pragmatic","pragmatism","prairie","praise","praises",
+"praiseworthy","praline","pram","prance","prank",
+"prankster","prat","prate","pratfall","prattle",
+"prawn","praxis","pray","prayer","preach",
+"preachify","preamble","prearrange","prebend","prebendary",
+"precarious","precast","precaution","precede","precedence",
+"precedent","preceding","precentor","precept","preceptor",
+"precession","precinct","precincts","preciosity","precious",
+"precipice","precipitate","precipitation","precipitous","precise",
+"precisely","precision","preclude","precocious","precognition",
+"preconceived","preconception","precondition","precook","precursor",
+"predator","predatory","predecease","predecessor","predestinate",
+"predestination","predestine","predetermine","predeterminer","predicament",
+"predicate","predicative","predict","predictable","prediction",
+"predigest","predilection","predispose","predisposition","predominance",
+"predominant","predominantly","predominate","preeminent","preeminently",
+"preempt","preemption","preemptive","preen","preexist",
+"preexistence","prefab","prefabricate","prefabricated","preface",
+"prefatory","prefect","prefecture","prefer","preferable",
+"preference","preferential","preferment","prefigure","prefix",
+"pregnancy","pregnant","preheat","prehensile","prehistoric",
+"prehistory","prejudge","prejudice","prejudiced","prejudicial",
+"prelacy","prelate","prelim","preliminary","prelims",
+"preliterate","prelude","premarital","premature","premeditate",
+"premeditated","premier","premise","premises","premiss",
+"premium","premonition","premonitory","prenatal","prentice",
+"preoccupation","preoccupied","preoccupy","preordain","prep",
+"prepack","preparation","preparatory","prepare","prepared",
+"preparedness","prepay","preponderance","preponderant","preponderate",
+"preposition","prepositional","prepossessed","prepossessing","prepossession",
+"preposterous","prepuce","prerecord","prerequisite","prerogative",
+"presage","presbyter","presbyterian","presbytery","preschool",
+"prescient","prescribe","prescribed","prescript","prescription",
+"prescriptive","presence","present","presentable","presentation",
+"presenter","presentiment","presently","presents","preservable",
+"preservation","preservative","preserve","preserver","preset",
+"preshrunk","preside","presidency","president","presidential",
+"presidium","press","pressed","pressgang","pressing",
+"pressman","pressmark","pressure","pressurise","pressurize",
+"prestidigitation","prestige","prestigious","prestissimo","presto",
+"prestressed","presumable","presume","presumption","presumptive",
+"presumptuous","presuppose","presupposition","pretence","pretend",
+"pretended","pretender","pretense","pretension","pretentious",
+"pretentiousness","preterit","preterite","preternatural","pretext",
+"pretor","pretorian","prettify","prettily","pretty",
+"pretzel","prevail","prevailing","prevalent","prevaricate",
+"prevent","prevention","preventive","preview","previous",
+"prevision","prewar","prey","price","priceless",
+"pricey","prick","prickle","prickly","pricy",
+"pride","priest","priesthood","priestly","prig",
+"priggish","prim","primacy","primaeval","primal",
+"primarily","primary","primate","prime","primer",
+"primeval","priming","primitive","primogeniture","primordial",
+"primp","primrose","primula","primus","prince",
+"princedom","princely","princess","principal","principality",
+"principally","principle","principled","principles","prink",
+"print","printable","printer","printing","printout",
+"prior","priority","priory","prise","prism",
+"prismatic","prison","prisoner","prissy","pristine",
+"prithee","privacy","private","privateer","privation",
+"privet","privilege","privileged","privily","privy",
+"prize","prizefight","prizeman","pro","probability",
+"probable","probably","probate","probation","probationer",
+"probe","probity","problem","problematic","proboscis",
+"procedural","procedure","proceed","proceeding","proceedings",
+"proceeds","process","procession","processional","proclaim",
+"proclamation","proclivity","proconsul","proconsulate","procrastinate",
+"procreate","proctor","procure","procurer","prod",
+"prodigal","prodigious","prodigy","produce","producer",
+"product","production","productive","productivity","proem",
+"prof","profanation","profane","profanity","profess",
+"professed","professedly","profession","professional","professionalism",
+"professor","professorial","professorship","proffer","proficient",
+"profile","profit","profitable","profiteer","profligacy",
+"profligate","profound","profundity","profuse","profusion",
+"progenitor","progeny","progesterone","prognathous","prognosis",
+"prognostic","prognosticate","prognostication","program","programer",
+"programmer","progress","progression","progressive","prohibit",
+"prohibition","prohibitionist","prohibitive","prohibitory","project",
+"projectile","projection","projectionist","projector","prolapse",
+"prole","prolegomena","proletarian","proletariat","proliferate",
+"proliferation","prolific","prolix","prolog","prologue",
+"prolong","prolongation","prolonged","prom","promenade",
+"promenader","prominence","prominent","promiscuity","promiscuous",
+"promise","promising","promontory","promote","promoter",
+"promotion","prompt","prompter","promptness","promulgate",
+"pron","prone","prong","pronominal","pronoun",
+"pronounce","pronounceable","pronounced","pronouncement","pronto",
+"pronunciamento","pronunciation","proof","proofread","prop",
+"propaganda","propagandise","propagandist","propagandize","propagate",
+"propagation","propane","propel","propellant","propellent",
+"propeller","propensity","proper","properly","propertied",
+"property","prophecy","prophesy","prophet","prophetess",
+"prophetic","prophets","prophylactic","prophylaxis","propinquity",
+"propitiate","propitiatory","propitious","propjet","proponent",
+"proportion","proportional","proportionate","proportions","proposal",
+"propose","proposition","propound","proprietary","proprieties",
+"proprietor","proprietress","propriety","propulsion","propulsive",
+"propylene","prorogation","prorogue","prosaic","proscenium",
+"proscribe","proscription","prose","prosecute","prosecution",
+"prosecutor","proselyte","proselytise","proselytize","prosody",
+"prospect","prospective","prospector","prospects","prospectus",
+"prosper","prosperity","prosperous","prostate","prosthesis",
+"prostitute","prostitution","prostrate","prostration","prosy",
+"protagonist","protean","protect","protection","protectionism",
+"protective","protector","protectorate","protein","protest",
+"protestant","protestation","protocol","proton","protoplasm",
+"prototype","protozoa","protozoan","protozoon","protract",
+"protraction","protractor","protrude","protrusion","protrusive",
+"protuberance","protuberant","proud","provable","prove",
+"proven","provenance","provender","proverb","proverbial",
+"proverbially","proverbs","provide","provided","providence",
+"provident","providential","provider","providing","province",
+"provinces","provincial","provision","provisional","provisions",
+"proviso","provocation","provocative","provoke","provoking",
+"provost","prow","prowess","prowl","prowler",
+"prox","proximal","proximate","proximity","proximo",
+"proxy","prude","prudence","prudent","prudential",
+"prudery","prudish","prune","pruning","prurience",
+"prurient","pruritus","prussian","pry","psalm",
+"psalmist","psalmody","psalms","psalter","psaltery",
+"psephology","pseud","pseudonym","pseudonymous","pshaw",
+"psittacosis","psoriasis","psst","psyche","psychedelic",
+"psychiatric","psychiatrist","psychiatry","psychic","psycho",
+"psychoanalyse","psychoanalysis","psychoanalyst","psychoanalytic","psychoanalyze",
+"psychokinesis","psychological","psychologist","psychology","psychopath",
+"psychosis","psychosomatic","psychotherapy","psychotic","pta",
+"ptarmigan","pterodactyl","pto","ptomaine","pub",
+"puberty","pubic","public","publican","publication",
+"publicise","publicist","publicity","publicize","publish",
+"publisher","publishing","puce","puck","pucker",
+"puckish","pud","pudding","puddle","pudendum",
+"pudgy","pueblo","puerile","puerility","puerperal",
+"puff","puffball","puffed","puffer","puffin",
+"puffy","pug","pugilism","pugilist","pugnacious",
+"pugnacity","puissance","puissant","puke","pukka",
+"pulchritude","pulchritudinous","pule","pull","pullback",
+"pullet","pulley","pullman","pullout","pullover",
+"pullthrough","pullulate","pulmonary","pulp","pulpit",
+"pulsar","pulsate","pulsation","pulse","pulverise",
+"pulverize","puma","pumice","pummel","pump",
+"pumpernickel","pumpkin","pun","punch","punchy",
+"punctilio","punctilious","punctual","punctuate","punctuation",
+"puncture","pundit","pungent","punic","punish",
+"punishable","punishing","punishment","punitive","punjabi",
+"punk","punkah","punnet","punster","punt",
+"puny","pup","pupa","pupate","pupil",
+"puppet","puppeteer","puppy","purblind","purchase",
+"purchaser","purdah","pure","pureblooded","purebred",
+"puree","purely","pureness","purgation","purgative",
+"purgatory","purge","purification","purify","purist",
+"puritan","puritanical","purity","purl","purler",
+"purlieus","purloin","purple","purplish","purport",
+"purpose","purposeful","purposeless","purposely","purposive",
+"purr","purse","purser","pursuance","pursue",
+"pursuer","pursuit","purulent","purvey","purveyance",
+"purveyor","purview","pus","push","pushbike",
+"pushcart","pushchair","pushed","pusher","pushover",
+"pushy","pusillanimous","puss","pussy","pussycat",
+"pussyfoot","pustule","put","putative","putrefaction",
+"putrefactive","putrefy","putrescent","putrid","putsch",
+"putt","puttee","putter","putto","putty",
+"puzzle","puzzlement","puzzler","pvc","pygmy",
+"pyjama","pyjamas","pylon","pyorrhea","pyorrhoea",
+"pyramid","pyre","pyrex","pyrexia","pyrites",
+"pyromania","pyromaniac","pyrotechnic","pyrotechnics","python",
+"pyx","qed","qty","qua","quack",
+"quackery","quad","quadragesima","quadrangle","quadrangular",
+"quadrant","quadrilateral","quadrille","quadrillion","quadroon",
+"quadruped","quadruple","quadruplet","quadruplicate","quaff",
+"quagga","quagmire","quail","quaint","quake",
+"quaker","qualification","qualifications","qualified","qualifier",
+"qualify","qualitative","quality","qualm","quandary",
+"quantify","quantitative","quantity","quantum","quarantine",
+"quark","quarrel","quarrelsome","quarry","quart",
+"quarter","quarterdeck","quarterfinal","quartering","quarterly",
+"quartermaster","quarters","quarterstaff","quartet","quartette",
+"quarto","quartz","quasar","quash","quatercentenary",
+"quatrain","quaver","quay","quean","queasy",
+"queen","queenly","queer","quell","quench",
+"quenchless","querulous","query","quest","question",
+"questionable","questioner","questioning","questionnaire","quetzal",
+"queue","quibble","quick","quicken","quickie",
+"quicklime","quicksand","quicksilver","quickstep","quid",
+"quiescent","quiet","quieten","quietism","quietude",
+"quietus","quiff","quill","quilt","quilted",
+"quin","quince","quinine","quinquagesima","quinsy",
+"quintal","quintessence","quintet","quintette","quintuplet",
+"quip","quire","quirk","quisling","quit",
+"quits","quittance","quitter","quiver","quixotic",
+"quiz","quizmaster","quizzical","quod","quoit",
+"quoits","quondam","quorum","quota","quotable",
+"quotation","quote","quoth","quotidian","quotient",
+"rabbi","rabbinical","rabbit","rabble","rabelaisian",
+"rabid","rabies","rac","raccoon","race",
+"racecourse","racehorse","raceme","racer","races",
+"racetrack","racial","racialism","racially","racing",
+"rack","racket","racketeer","racketeering","rackets",
+"raconteur","racoon","racquet","racquets","racy",
+"radar","radial","radiance","radiant","radiate",
+"radiation","radiator","radical","radicalise","radicalism",
+"radicalize","radicle","radii","radio","radioactive",
+"radioactivity","radiogram","radiograph","radiographer","radiography",
+"radioisotope","radiolocation","radiology","radiotherapist","radiotherapy",
+"radish","radium","radius","raffia","raffish",
+"raffle","raft","rafter","raftered","raftsman",
+"rag","raga","ragamuffin","ragbag","rage",
+"ragged","raglan","ragout","ragtag","ragtime",
+"raid","raider","rail","railhead","railing",
+"raillery","railroad","rails","railway","raiment",
+"rain","rainbow","raincoat","raindrop","rainfall",
+"rainproof","rains","rainstorm","rainwater","rainy",
+"raise","raisin","raj","raja","rajah",
+"rake","rakish","rallentando","rally","ram",
+"ramadan","ramble","rambler","rambling","rambunctious",
+"ramekin","ramification","ramify","ramjet","ramp",
+"rampage","rampant","rampart","ramrod","ramshackle",
+"ran","ranch","rancher","rancid","rancor",
+"rancorous","rancour","rand","random","randy",
+"ranee","rang","range","ranger","rani",
+"rank","ranker","ranking","rankle","ranks",
+"ransack","ransom","rant","rap","rapacious",
+"rapacity","rape","rapid","rapids","rapier",
+"rapine","rapist","rapport","rapprochement","rapscallion",
+"rapt","rapture","rapturous","rare","rarebit",
+"rarefied","rarefy","rarely","raring","rarity",
+"rascal","rascally","rash","rasher","rasp",
+"raspberry","rat","ratable","ratchet","rate",
+"rateable","ratepayer","rather","ratify","rating",
+"ratio","ratiocination","ration","rational","rationale",
+"rationalise","rationalism","rationalist","rationalize","rations",
+"ratlin","ratline","rats","rattan","ratter",
+"rattle","rattlebrained","rattlesnake","rattletrap","rattling",
+"ratty","raucous","raunchy","ravage","ravages",
+"rave","ravel","raven","ravening","ravenous",
+"raver","ravine","raving","ravings","ravioli",
+"ravish","ravishing","ravishment","raw","rawhide",
+"ray","rayon","raze","razor","razorback",
+"razzle","reach","react","reaction","reactionary",
+"reactivate","reactive","reactor","read","readable",
+"readdress","reader","readership","readily","readiness",
+"reading","readjust","readout","ready","reafforest",
+"reagent","real","realign","realisable","realisation",
+"realise","realism","realist","realistic","reality",
+"realizable","realization","realize","really","realm",
+"realpolitik","realtor","realty","ream","reanimate",
+"reap","reaper","reappear","reappraisal","rear",
+"rearguard","rearm","rearmament","rearmost","rearrange",
+"rearward","rearwards","reason","reasonable","reasonably",
+"reasoned","reasoning","reassure","rebarbative","rebate",
+"rebel","rebellion","rebellious","rebind","rebirth",
+"reborn","rebound","rebuff","rebuild","rebuke",
+"rebus","rebut","rebuttal","recalcitrance","recalcitrant",
+"recall","recant","recap","recapitulate","recapitulation",
+"recapture","recast","recce","recd","recede",
+"receipt","receipts","receivable","receive","received",
+"receiver","receivership","receiving","recent","recently",
+"receptacle","reception","receptionist","receptive","recess",
+"recession","recessional","recessive","recharge","recidivist",
+"recipe","recipient","reciprocal","reciprocate","reciprocity",
+"recital","recitation","recitative","recite","reck",
+"reckless","reckon","reckoner","reckoning","reclaim",
+"reclamation","recline","recluse","recognise","recognition",
+"recognizance","recognize","recoil","recollect","recollection",
+"recommend","recommendation","recompense","reconcile","reconciliation",
+"recondite","recondition","reconnaissance","reconnoiter","reconnoitre",
+"reconsider","reconstitute","reconstruct","reconstruction","record",
+"recorder","recording","recordkeeping","recount","recoup",
+"recourse","recover","recovery","recreant","recreate",
+"recreation","recreational","recriminate","recrimination","recrudescence",
+"recruit","rectal","rectangle","rectangular","rectification",
+"rectifier","rectify","rectilinear","rectitude","recto",
+"rector","rectory","rectum","recumbent","recuperate",
+"recuperative","recur","recurrence","recurrent","recurved",
+"recusant","recycle","red","redbreast","redbrick",
+"redcap","redcoat","redcurrant","redden","reddish",
+"redecorate","redeem","redeemer","redemption","redemptive",
+"redeploy","redhead","rediffusion","redirect","redistribute",
+"redo","redolence","redolent","redouble","redoubt",
+"redoubtable","redound","redress","redskin","reduce",
+"reduction","redundancy","redundant","reduplicate","redwing",
+"redwood","reecho","reed","reeds","reeducate",
+"reedy","reef","reefer","reek","reel",
+"reentry","reeve","ref","reface","refashion",
+"refectory","refer","referee","reference","referendum",
+"refill","refine","refined","refinement","refiner",
+"refinery","refit","reflate","reflation","reflect",
+"reflection","reflective","reflector","reflex","reflexes",
+"reflexive","refloat","refoot","reforest","reform",
+"reformation","reformatory","refract","refractory","refrain",
+"refresh","refresher","refreshing","refreshment","refreshments",
+"refrigerant","refrigerate","refrigeration","refrigerator","reft",
+"refuel","refuge","refugee","refulgence","refulgent",
+"refund","refurbish","refusal","refuse","refutable",
+"refutation","refute","regain","regal","regale",
+"regalia","regard","regardful","regarding","regardless",
+"regards","regatta","regency","regenerate","regent",
+"reggae","regicide","regime","regimen","regiment",
+"regimental","regimentals","regina","region","regional",
+"regions","register","registrar","registration","registry",
+"regnant","regress","regressive","regret","regrets",
+"regrettable","regrettably","regroup","regular","regularise",
+"regularity","regularize","regularly","regulate","regulation",
+"regulator","regulo","regurgitate","rehabilitate","rehash",
+"rehear","rehearsal","rehearse","rehouse","reich",
+"reification","reify","reign","reimburse","reimbursement",
+"rein","reincarnate","reincarnation","reindeer","reinforce",
+"reinforcement","reinforcements","reins","reinstate","reinsure",
+"reissue","reiterate","reject","rejection","rejoice",
+"rejoicing","rejoicings","rejoin","rejoinder","rejuvenate",
+"rekindle","relaid","relapse","relate","related",
+"relation","relational","relations","relationship","relative",
+"relatively","relativism","relativistic","relativity","relax",
+"relaxation","relaxing","relay","release","relegate",
+"relent","relentless","relevance","relevant","reliability",
+"reliable","reliance","reliant","relic","relics",
+"relict","relief","relieve","relieved","religion",
+"religious","religiously","reline","relinquish","reliquary",
+"relish","relive","reload","relocate","reluctance",
+"reluctant","reluctantly","rely","remain","remainder",
+"remains","remake","remand","remark","remarkable",
+"remarkably","remarry","remediable","remedial","remedy",
+"remember","remembrance","remilitarise","remilitarize","remind",
+"reminder","reminisce","reminiscence","reminiscences","reminiscent",
+"remiss","remission","remit","remittance","remittent",
+"remnant","remodel","remold","remonstrance","remonstrate",
+"remorse","remorseful","remote","remotely","remould",
+"remount","removal","remove","remover","remunerate",
+"remunerative","renaissance","renal","rename","renascent",
+"rend","render","rendering","rendezvous","rendition",
+"renegade","renege","renegue","renew","renewable",
+"renewal","rennet","renounce","renovate","renown",
+"renowned","rent","rental","renter","rentier",
+"renunciation","reopen","reorganise","reorganize","rep",
+"repaid","repair","reparable","reparation","reparations",
+"repartee","repast","repatriate","repay","repayable",
+"repayment","repeal","repeat","repeated","repeatedly",
+"repeater","repeating","repel","repellent","repent",
+"repentance","repentant","repercussion","repertoire","repertory",
+"repetition","repetitious","repine","replace","replacement",
+"replay","replenish","replete","repletion","replica",
+"replicate","reply","repoint","report","reportage",
+"reportedly","reporter","repose","repository","repossess",
+"repot","repp","reprehend","reprehensible","represent",
+"representation","representational","representations","representative","repress",
+"repressed","repression","repressive","reprieve","reprimand",
+"reprint","reprisal","reprise","reproach","reprobate",
+"reproduce","reproducer","reproduction","reproductive","reproof",
+"reprove","reproving","reptile","reptilian","republic",
+"republican","republicanism","repudiate","repugnance","repugnant",
+"repulse","repulsion","repulsive","reputable","reputation",
+"repute","reputed","reputedly","request","requiem",
+"require","requirement","requisite","requisition","requital",
+"requite","reredos","rerun","rescind","rescript",
+"rescue","research","reseat","resemblance","resemble",
+"resent","resentment","reservation","reserve","reserved",
+"reservedly","reservist","reservoir","reset","resettle",
+"reshuffle","reside","residence","residency","resident",
+"residential","residual","residuary","residue","resign",
+"resignation","resigned","resilience","resilient","resin",
+"resinated","resist","resistance","resistant","resistor",
+"resole","resolute","resolution","resolvable","resolve",
+"resonance","resonant","resonate","resonator","resort",
+"resound","resounding","resource","resourceful","resources",
+"respect","respectability","respectable","respecter","respectful",
+"respecting","respective","respectively","respects","respiration",
+"respirator","respiratory","respire","respite","resplendence",
+"resplendent","respond","respondent","response","responsibility",
+"responsible","responsibly","responsive","rest","restage",
+"restate","restaurant","restaurateur","restful","restitution",
+"restive","restless","restock","restoration","restorative",
+"restore","restorer","restrain","restrained","restraint",
+"restrict","restricted","restriction","restrictive","restructure",
+"result","resultant","resume","resumption","resurface",
+"resurgence","resurgent","resurrect","resurrection","resuscitate",
+"retail","retailer","retain","retainer","retake",
+"retaliate","retaliation","retaliatory","retard","retarded",
+"retch","retd","retell","retention","retentive",
+"rethink","reticence","reticent","reticulated","reticulation",
+"reticule","retina","retinue","retire","retired",
+"retirement","retiring","retort","retouch","retrace",
+"retract","retractable","retractile","retraction","retread",
+"retreat","retrench","retrial","retraining","retribution",
+"retributive","retrieval","retrieve","retriever","retroactive",
+"retroflex","retrograde","retrogress","retrogressive","retrospect",
+"retrospection","retrospective","retroversion","retsina","return",
+"returnable","returns","reunion","reunite","reuse",
+"rev","revalue","revamp","reveal","revealing",
+"reveille","revel","revelation","revelry","revenge",
+"revenue","reverberant","reverberate","reverberation","revere",
+"reverence","reverend","reverent","reverential","reverie",
+"revers","reversal","reverse","reversion","reversionary",
+"revert","revetment","review","reviewer","revile",
+"revise","revision","revisionism","revitalise","revitalize",
+"revival","revivalist","revive","revivify","revocable",
+"revocation","revoke","revolt","revolting","revolution",
+"revolutionary","revolutionise","revolutionize","revolve","revolver",
+"revolving","revue","revulsion","reward","rewarding",
+"rewards","rewire","reword","rewrite","rex",
+"rhapsodise","rhapsodize","rhapsody","rhea","rhenish",
+"rheostat","rhetoric","rhetorical","rhetorically","rhetorician",
+"rheum","rheumatic","rheumaticky","rheumatics","rheumatism",
+"rheumatoid","rhinestone","rhinoceros","rhizome","rhododendron",
+"rhomboid","rhombus","rhubarb","rhyme","rhymed",
+"rhymester","rhythm","rhythmic","rib","ribald",
+"ribaldry","ribbed","ribbing","ribbon","riboflavin",
+"rice","rich","riches","richly","richness",
+"rick","rickets","rickety","ricksha","rickshaw",
+"ricochet","rid","riddance","ridden","riddle",
+"ride","rider","riderless","ridge","ridgepole",
+"ridicule","ridiculous","riding","riesling","rife",
+"riff","riffle","riffraff","rifle","rifleman",
+"rifles","rifling","rift","rig","rigging",
+"right","righteous","rightful","rightist","rightly",
+"rights","rightward","rightwards","rigid","rigidity",
+"rigmarole","rigor","rigorous","rigour","rile",
+"rill","rim","rime","rind","rinderpest",
+"ring","ringer","ringleader","ringlet","ringmaster",
+"ringside","ringworm","rink","rinse","riot",
+"riotous","rip","riparian","ripcord","ripen",
+"riposte","ripple","ripsaw","riptide","rise",
+"riser","risibility","risible","rising","risk",
+"risky","risotto","rissole","rite","ritual",
+"ritualism","ritzy","rival","rivalry","rive",
+"river","riverbed","riverside","rivet","riveter",
+"riveting","riviera","rivulet","rna","roach",
+"road","roadbed","roadblock","roadhouse","roadman",
+"roadside","roadstead","roadster","roadway","roadworthy",
+"roam","roan","roar","roaring","roast",
+"roaster","roasting","rob","robber","robbery",
+"robe","robin","robot","robust","rock",
+"rockbound","rocker","rockery","rocket","rocketry",
+"rocks","rocky","rococo","rod","rode",
+"rodent","rodeo","rodomontade","roe","roebuck",
+"rogation","roger","rogue","roguery","roguish",
+"roisterer","role","roll","roller","rollicking",
+"rolling","rolls","romaic","roman","romance",
+"romanesque","romantic","romanticise","romanticism","romanticize",
+"romany","romish","romp","romper","rompers",
+"rondeau","rondo","roneo","rood","roodscreen",
+"roof","roofing","roofless","rooftree","rook",
+"rookery","rookie","room","roomer","roommate",
+"rooms","roomy","roost","rooster","root",
+"rooted","rootless","roots","rope","ropedancer",
+"ropes","ropewalk","ropeway","ropey","ropy",
+"roquefort","rosary","rose","roseate","rosebud",
+"roseleaf","rosemary","rosette","rosewater","rosewood",
+"rosin","roster","rostrum","rosy","rot",
+"rota","rotary","rotate","rotation","rotatory",
+"rotgut","rotisserie","rotogravure","rotor","rotten",
+"rottenly","rotter","rotund","rotunda","rouble",
+"rouge","rough","roughage","roughcast","roughen",
+"roughhouse","roughly","roughneck","roughness","roughrider",
+"roughshod","roulette","round","roundabout","roundel",
+"roundelay","rounders","roundhead","roundhouse","roundish",
+"roundly","rounds","roundsman","roundup","roup",
+"rouse","rousing","roustabout","rout","route",
+"routine","roux","rove","rover","row",
+"rowan","rowanberry","rowdy","rowdyism","rowel",
+"rower","rowing","rowlock","royal","royalist",
+"royalty","rpm","rsm","rsvp","rub",
+"rubber","rubberise","rubberize","rubberneck","rubbery",
+"rubbing","rubbish","rubbishy","rubble","rubdown",
+"rubella","rubicon","rubicund","ruble","rubric",
+"ruby","ruck","rucksack","ruckus","ruction",
+"ructions","rudder","ruddle","ruddy","rude",
+"rudely","rudiment","rudimentary","rudiments","rue",
+"rueful","ruff","ruffian","ruffianly","ruffle",
+"rug","rugby","rugged","ruin","ruination",
+"ruinous","ruins","rule","rulebook","ruler",
+"ruling","rum","rumba","rumble","rumbling",
+"rumbustious","ruminant","ruminate","ruminative","rummage",
+"rummy","rumor","rumored","rumormonger","rumour",
+"rumoured","rumourmonger","rump","rumple","rumpus",
+"run","runaway","rung","runnel","runner",
+"running","runny","runs","runt","runway",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java
new file mode 100644
index 00000000000..9ac12daf8df
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData7.java
@@ -0,0 +1,715 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This algorithm is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** One part of the list of words used by the KStem stemmer; the full list is split across the numbered KStemData classes.
+ */
+class KStemData7 {
+ private KStemData7() { // private and empty: blocks instantiation from other classes
+ }
+ static String[] data = {
+"rupee","rupture","rural","ruritanian","ruse",
+"rush","rushes","rushlight","rusk","russet",
+"rust","rustic","rusticate","rustication","rustle",
+"rustler","rustless","rustling","rustproof","rusty",
+"rut","ruthless","rutting","rye","sabbatarian",
+"sabbath","sabbatical","saber","sable","sabot",
+"sabotage","saboteur","sabra","sabre","sac",
+"saccharin","saccharine","sacerdotal","sacerdotalism","sachet",
+"sack","sackbut","sackcloth","sacral","sacrament",
+"sacramental","sacred","sacrifice","sacrificial","sacrilege",
+"sacrilegious","sacristan","sacristy","sacroiliac","sacrosanct",
+"sad","sadden","saddle","saddlebag","saddler",
+"saddlery","sadducee","sadhu","sadism","sadly",
+"sadomasochism","safari","safe","safebreaker","safeguard",
+"safekeeping","safety","saffron","sag","saga",
+"sagacious","sagacity","sagebrush","sago","sahib",
+"said","sail","sailcloth","sailing","sailor",
+"sailplane","saint","sainted","saintly","saith",
+"sake","saki","salaam","salable","salacious",
+"salacity","salad","salamander","salami","salaried",
+"salary","sale","saleable","saleroom","sales",
+"salesclerk","salesgirl","saleslady","salesman","salesmanship",
+"salient","saliferous","salify","saline","salinometer",
+"saliva","salivary","salivate","sallow","sally",
+"salmon","salmonella","salon","saloon","salsify",
+"salt","saltcellar","saltire","saltlick","saltpan",
+"saltpeter","saltpetre","salts","saltshaker","saltwater",
+"salty","salubrious","salutary","salutation","salute",
+"salvage","salvation","salvationist","salve","salvedge",
+"salver","salvia","salvo","samaritan","samaritans",
+"samba","same","sameness","samovar","sampan",
+"sample","sampler","samurai","sanatorium","sanctify",
+"sanctimonious","sanction","sanctities","sanctity","sanctuary",
+"sanctum","sanctus","sand","sandal","sandalwood",
+"sandbag","sandbank","sandbar","sandblast","sandbox",
+"sandboy","sandcastle","sander","sandglass","sandman",
+"sandpaper","sandpiper","sandpit","sands","sandshoe",
+"sandstone","sandstorm","sandwich","sandy","sane",
+"sang","sangfroid","sangria","sanguinary","sanguine",
+"sanitary","sanitation","sanitorium","sanity","sank",
+"sans","sanskrit","sap","sapience","sapient",
+"sapless","sapling","sapper","sapphic","sapphire",
+"sappy","sapwood","saraband","sarabande","sarcasm",
+"sarcastic","sarcophagus","sardine","sardonic","sarge",
+"sari","sarky","sarong","sarsaparilla","sartorial",
+"sash","sashay","sass","sassafras","sassy",
+"sat","satan","satanic","satanism","satchel",
+"sate","sateen","satellite","satiable","satiate",
+"satiety","satin","satinwood","satiny","satire",
+"satirical","satirise","satirize","satisfaction","satisfactory",
+"satisfy","satisfying","satrap","satsuma","saturate",
+"saturation","saturday","saturn","saturnalia","saturnine",
+"satyr","sauce","saucepan","saucer","saucy",
+"sauerkraut","sauna","saunter","saurian","sausage",
+"sauterne","sauternes","savage","savagery","savanna",
+"savannah","savant","save","saveloy","saver",
+"saving","savings","savior","saviour","savor",
+"savory","savour","savoury","savoy","savvy",
+"saw","sawbones","sawbuck","sawdust","sawhorse",
+"sawmill","sawpit","sawyer","saxifrage","saxon",
+"saxophone","saxophonist","say","saying","scab",
+"scabbard","scabby","scabies","scabious","scabrous",
+"scads","scaffold","scaffolding","scalar","scalawag",
+"scald","scalding","scale","scalene","scallion",
+"scallop","scallywag","scalp","scalpel","scaly",
+"scamp","scamper","scampi","scan","scandal",
+"scandalise","scandalize","scandalmonger","scandalous","scandinavian",
+"scanner","scansion","scant","scanty","scapegoat",
+"scapegrace","scapula","scar","scarab","scarce",
+"scarcely","scarcity","scare","scarecrow","scared",
+"scaremonger","scarf","scarify","scarlet","scarp",
+"scarper","scary","scat","scathing","scatology",
+"scatter","scatterbrain","scatterbrained","scattered","scatty",
+"scavenge","scavenger","scenario","scenarist","scene",
+"scenery","sceneshifter","scenic","scent","scepter",
+"sceptic","sceptical","scepticism","sceptre","schedule",
+"schema","schematic","schematize","scheme","scherzo",
+"schism","schismatic","schist","schizoid","schizophrenia",
+"schizophrenic","schmaltz","schmalz","schnapps","schnitzel",
+"schnorkel","scholar","scholarly","scholarship","scholastic",
+"scholasticism","school","schoolboy","schoolhouse","schooling",
+"schoolman","schoolmarm","schoolmaster","schoolmastering","schoolmate",
+"schoolwork","schooner","schwa","sciatic","sciatica",
+"science","scientific","scientist","scientology","scimitar",
+"scintilla","scintillate","scion","scissor","scissors",
+"sclerosis","scoff","scold","scollop","sconce",
+"scone","scoop","scoot","scooter","scope",
+"scorbutic","scorch","scorcher","scorching","score",
+"scoreboard","scorebook","scorecard","scorekeeper","scoreless",
+"scorer","scorn","scorpio","scorpion","scotch",
+"scoundrel","scoundrelly","scour","scourer","scourge",
+"scout","scoutmaster","scow","scowl","scrabble",
+"scrag","scraggly","scraggy","scram","scramble",
+"scrap","scrapbook","scrape","scraper","scrapings",
+"scrappy","scraps","scratch","scratchpad","scratchy",
+"scrawl","scrawny","scream","screamingly","scree",
+"screech","screed","screen","screening","screenplay",
+"screw","screwball","screwdriver","screwy","scribble",
+"scribbler","scribe","scrimmage","scrimp","scrimshank",
+"scrimshaw","scrip","script","scripted","scriptural",
+"scripture","scriptwriter","scrivener","scrofula","scrofulous",
+"scroll","scrollwork","scrooge","scrotum","scrounge",
+"scrub","scrubber","scrubby","scruff","scruffy",
+"scrum","scrumcap","scrumhalf","scrummage","scrumptious",
+"scrumpy","scrunch","scruple","scrupulous","scrutineer",
+"scrutinise","scrutinize","scrutiny","scuba","scud",
+"scuff","scuffle","scull","scullery","scullion",
+"sculptor","sculptural","sculpture","scum","scupper",
+"scurf","scurrility","scurrilous","scurry","scurvy",
+"scut","scutcheon","scuttle","scylla","scythe",
+"sea","seabed","seabird","seaboard","seaborne",
+"seafaring","seafood","seafront","seagirt","seagoing",
+"seagull","seahorse","seakale","seal","sealer",
+"sealing","sealskin","sealyham","seam","seaman",
+"seamanlike","seamanship","seamstress","seamy","seaplane",
+"seaport","sear","search","searching","searchlight",
+"searing","seascape","seashell","seashore","seasick",
+"seaside","season","seasonable","seasonal","seasoning",
+"seat","seating","seawall","seaward","seawards",
+"seawater","seaway","seaweed","seaworthy","sec",
+"secateurs","secede","secession","seclude","secluded",
+"seclusion","seclusive","second","secondary","seconds",
+"secrecy","secret","secretarial","secretariat","secretary",
+"secrete","secretion","secretive","sect","sectarian",
+"section","sectional","sectionalism","sector","secular",
+"secularise","secularism","secularize","secure","security",
+"sedan","sedate","sedation","sedative","sedentary",
+"sedge","sediment","sedimentary","sedimentation","sedition",
+"seditious","seduce","seduction","seductive","sedulous",
+"see","seed","seedbed","seedcake","seedling",
+"seedsman","seedy","seeing","seek","seem",
+"seeming","seemingly","seemly","seen","seep",
+"seepage","seer","seersucker","seesaw","seethe",
+"segment","segmentation","segregate","segregated","segregation",
+"seigneur","seine","seismic","seismograph","seismology",
+"seize","seizure","seldom","select","selection",
+"selective","selector","selenium","self","selfish",
+"selfless","selfsame","sell","seller","sellotape",
+"selvage","selves","semantic","semantics","semaphore",
+"semblance","semeiology","semen","semester","semibreve",
+"semicircle","semicolon","semiconductor","semidetached","semifinal",
+"semifinalist","seminal","seminar","seminarist","seminary",
+"semiology","semiprecious","semiquaver","semitic","semitone",
+"semitropical","semivowel","semiweekly","semolina","sempstress",
+"sen","senate","senator","senatorial","send",
+"sender","senescence","senescent","seneschal","senile",
+"senility","senior","seniority","senna","sensation",
+"sensational","sensationalism","sense","senseless","senses",
+"sensibility","sensible","sensitise","sensitive","sensitivity",
+"sensitize","sensor","sensory","sensual","sensualist",
+"sensuality","sensuous","sent","sentence","sententious",
+"sentient","sentiment","sentimental","sentimentalise","sentimentalism",
+"sentimentality","sentimentalize","sentinel","sentry","sepal",
+"separable","separate","separation","separatism","separator",
+"sepia","sepoy","sepsis","september","septet",
+"septic","septicaemia","septicemia","septuagenarian","septuagesima",
+"septuagint","sepulcher","sepulchral","sepulchre","sequel",
+"sequence","sequencing","sequent","sequential","sequester",
+"sequestrate","sequestration","sequin","sequoia","seraglio",
+"seraph","seraphic","sere","serenade","serendipity",
+"serene","serf","serfdom","serge","sergeant",
+"serial","serialise","serialize","seriatim","sericulture",
+"series","serif","seriocomic","serious","seriously",
+"sermon","sermonise","sermonize","serous","serpent",
+"serpentine","serrated","serried","serum","serval",
+"servant","serve","server","servery","service",
+"serviceable","serviceman","serviette","servile","serving",
+"servitor","servitude","servomechanism","servomotor","sesame",
+"session","sessions","set","setback","setscrew",
+"setsquare","sett","settee","setter","setting",
+"settle","settled","settlement","settler","seven",
+"seventeen","seventy","sever","several","severally",
+"severance","severity","sew","sewage","sewer",
+"sewerage","sewing","sex","sexagenarian","sexagesima",
+"sexism","sexist","sexless","sextant","sextet",
+"sexton","sextuplet","sexual","sexuality","sexy",
+"sforzando","sgt","shabby","shack","shackle",
+"shad","shade","shades","shading","shadow",
+"shadowbox","shadowy","shady","shaft","shag",
+"shagged","shaggy","shagreen","shah","shake",
+"shakedown","shaker","shakes","shako","shaky",
+"shale","shall","shallop","shallot","shallow",
+"shallows","shalom","shalt","sham","shaman",
+"shamble","shambles","shame","shamefaced","shameful",
+"shameless","shammy","shampoo","shamrock","shandy",
+"shanghai","shank","shantung","shanty","shantytown",
+"shape","shaped","shapely","shard","share",
+"sharecropper","shareholder","shares","shark","sharkskin",
+"sharp","sharpen","sharpener","sharper","sharpshooter",
+"shatter","shave","shaver","shaving","shawl",
+"shay","she","sheaf","shear","shears",
+"sheath","sheathe","sheathing","shebang","shebeen",
+"shed","sheen","sheep","sheepdip","sheepdog",
+"sheepfold","sheepish","sheepskin","sheer","sheet",
+"sheeting","sheik","sheikdom","sheikh","sheikhdom",
+"sheila","shekels","shelduck","shelf","shell",
+"shellac","shellacking","shellfish","shellshock","shelter",
+"sheltered","shelve","shelves","shelving","shenanigan",
+"shepherd","shepherdess","sheraton","sherbet","sherd",
+"sheriff","sherpa","sherry","shew","shh",
+"shibboleth","shield","shift","shiftless","shifty",
+"shilling","shimmer","shin","shinbone","shindig",
+"shindy","shine","shiner","shingle","shingles",
+"shining","shinny","shinto","shiny","ship",
+"shipboard","shipbroker","shipbuilding","shipmate","shipment",
+"shipper","shipping","shipshape","shipwreck","shipwright",
+"shipyard","shire","shires","shirk","shirring",
+"shirt","shirtfront","shirting","shirtsleeve","shirttail",
+"shirtwaist","shirtwaister","shirty","shit","shits",
+"shitty","shiver","shivers","shivery","shoal",
+"shock","shocker","shockheaded","shocking","shockproof",
+"shod","shoddy","shoe","shoeblack","shoehorn",
+"shoelace","shoemaker","shoeshine","shoestring","shone",
+"shoo","shook","shoot","shop","shopkeeper",
+"shoplift","shopsoiled","shopworn","shore","shorn",
+"short","shortage","shortbread","shortcake","shortcoming",
+"shorten","shortening","shortfall","shorthand","shorthanded",
+"shorthorn","shortie","shortly","shorts","shortsighted",
+"shorty","shot","shotgun","should","shoulder",
+"shouldst","shout","shouting","shove","shovel",
+"shovelboard","show","showboat","showcase","showdown",
+"shower","showery","showgirl","showing","showman",
+"showmanship","shown","showpiece","showplace","showroom",
+"showy","shrank","shrapnel","shred","shredder",
+"shrew","shrewd","shrewish","shriek","shrift",
+"shrike","shrill","shrimp","shrine","shrink",
+"shrinkage","shrive","shrivel","shroud","shrub",
+"shrubbery","shrug","shuck","shucks","shudder",
+"shuffle","shuffleboard","shufty","shun","shunt",
+"shunter","shush","shut","shutdown","shutter",
+"shuttle","shuttlecock","shy","shyster","sibilant",
+"sibling","sibyl","sibylline","sic","sick",
+"sickbay","sickbed","sicken","sickening","sickle",
+"sickly","sickness","sickroom","side","sidearm",
+"sideboard","sideboards","sidecar","sidekick","sidelight",
+"sideline","sidelong","sidereal","sidesaddle","sideshow",
+"sideslip","sidesman","sidesplitting","sidestep","sidestroke",
+"sideswipe","sidetrack","sidewalk","sideward","sidewards",
+"sideways","siding","sidle","siege","sienna",
+"sierra","siesta","sieve","sift","sifter",
+"sigh","sight","sighted","sightless","sightly",
+"sightscreen","sightsee","sightseer","sign","signal",
+"signaler","signalise","signalize","signaller","signally",
+"signalman","signatory","signature","signer","signet",
+"significance","significant","signification","signify","signor",
+"signora","signorina","signpost","signposted","silage",
+"silence","silencer","silent","silhouette","silica",
+"silicate","silicon","silicone","silicosis","silk",
+"silken","silkworm","silky","sill","sillabub",
+"silly","silo","silt","silvan","silver",
+"silverfish","silverside","silversmith","silverware","silvery",
+"simian","similar","similarity","similarly","simile",
+"similitude","simmer","simony","simper","simple",
+"simpleton","simplicity","simplify","simply","simulacrum",
+"simulate","simulated","simulation","simulator","simultaneous",
+"sin","since","sincere","sincerely","sincerity",
+"sinecure","sinew","sinewy","sinful","sing",
+"singe","singhalese","singing","single","singleness",
+"singles","singlestick","singlet","singleton","singly",
+"singsong","singular","singularly","sinhalese","sinister",
+"sink","sinker","sinless","sinner","sinology",
+"sinuous","sinus","sip","siphon","sir",
+"sire","siren","sirloin","sirocco","sirrah",
+"sis","sisal","sissy","sister","sisterhood",
+"sisterly","sit","sitar","site","sitter",
+"sitting","situated","situation","six","sixpence",
+"sixteen","sixty","sizable","size","sizeable",
+"sizzle","sizzler","skate","skateboard","skedaddle",
+"skeet","skein","skeleton","skeptic","skeptical",
+"skepticism","sketch","sketchpad","sketchy","skew",
+"skewbald","skewer","ski","skibob","skid",
+"skidlid","skidpan","skiff","skiffle","skilful",
+"skill","skilled","skillet","skillful","skim",
+"skimmer","skimp","skimpy","skin","skinflint",
+"skinful","skinhead","skinny","skint","skip",
+"skipper","skirl","skirmish","skirt","skit",
+"skitter","skittish","skittle","skittles","skive",
+"skivvy","skua","skulduggery","skulk","skull",
+"skullcap","skullduggery","skunk","sky","skydiving",
+"skyhook","skyjack","skylark","skylight","skyline",
+"skyrocket","skyscraper","skywriting","slab","slack",
+"slacken","slacker","slacks","slag","slagheap",
+"slain","slake","slalom","slam","slander",
+"slanderous","slang","slangy","slant","slantwise",
+"slap","slapdash","slaphappy","slapstick","slash",
+"slat","slate","slattern","slaty","slaughter",
+"slaughterhouse","slave","slaver","slavery","slavic",
+"slavish","slay","sleazy","sled","sledge",
+"sledgehammer","sleek","sleep","sleeper","sleepless",
+"sleepwalker","sleepy","sleepyhead","sleet","sleeve",
+"sleigh","slender","slenderise","slenderize","slept",
+"sleuth","slew","slewed","slice","slick",
+"slicker","slide","slight","slightly","slim",
+"slimy","sling","slingshot","slink","slip",
+"slipcover","slipknot","slipover","slipper","slippery",
+"slippy","slips","slipshod","slipstream","slipway",
+"slit","slither","slithery","sliver","slivovitz",
+"slob","slobber","sloe","slog","slogan",
+"sloop","slop","slope","sloppy","slosh",
+"sloshed","slot","sloth","slothful","slouch",
+"slough","sloven","slovenly","slow","slowcoach",
+"slowworm","sludge","slue","slug","sluggard",
+"sluggish","sluice","sluiceway","slum","slumber",
+"slumberous","slummy","slump","slung","slunk",
+"slur","slurp","slurry","slush","slut",
+"sly","smack","smacker","small","smallholder",
+"smallholding","smallpox","smalls","smarmy","smart",
+"smarten","smash","smashed","smasher","smashing",
+"smattering","smear","smell","smelly","smelt",
+"smile","smirch","smirk","smite","smith",
+"smithereens","smithy","smitten","smock","smocking",
+"smog","smoke","smoker","smokescreen","smokestack",
+"smoking","smoky","smolder","smooch","smooth",
+"smoothie","smoothy","smorgasbord","smote","smother",
+"smoulder","smudge","smug","smuggle","smut",
+"smutty","snack","snaffle","snag","snail",
+"snake","snakebite","snaky","snap","snapdragon",
+"snapper","snappish","snappy","snapshot","snare",
+"snarl","snatch","snazzy","sneak","sneaker",
+"sneaking","sneaky","sneer","sneeze","snick",
+"snicker","snide","sniff","sniffle","sniffles",
+"sniffy","snifter","snigger","snip","snippet",
+"snips","snitch","snivel","snob","snobbery",
+"snobbish","snog","snood","snook","snooker",
+"snoop","snooper","snoot","snooty","snooze",
+"snore","snorkel","snort","snorter","snot",
+"snotty","snout","snow","snowball","snowberry",
+"snowbound","snowdrift","snowdrop","snowfall","snowfield",
+"snowflake","snowline","snowman","snowplough","snowplow",
+"snowshoe","snowstorm","snowy","snr","snub",
+"snuff","snuffer","snuffle","snug","snuggle",
+"soak","soaked","soaking","soap","soapbox",
+"soapstone","soapsuds","soapy","soar","sob",
+"sober","sobriety","sobriquet","soccer","sociable",
+"social","socialise","socialism","socialist","socialite",
+"socialize","society","sociology","sock","socket",
+"sod","soda","sodden","sodium","sodomite",
+"sodomy","soever","sofa","soft","softball",
+"soften","softhearted","softie","software","softwood",
+"softy","soggy","soigne","soignee","soil",
+"sojourn","sol","solace","solar","solarium",
+"sold","solder","soldier","soldierly","soldiery",
+"sole","solecism","solely","solemn","solemnise",
+"solemnity","solemnize","solicit","solicitor","solicitous",
+"solicitude","solid","solidarity","solidify","solidity",
+"solidus","soliloquise","soliloquize","soliloquy","solipsism",
+"solitaire","solitary","solitude","solo","soloist",
+"solstice","soluble","solution","solve","solvency",
+"solvent","somber","sombre","sombrero","some",
+"somebody","someday","somehow","somersault","something",
+"sometime","sometimes","someway","somewhat","somewhere",
+"somnambulism","somnolent","son","sonar","sonata",
+"song","songbird","songbook","songster","sonic",
+"sonnet","sonny","sonority","sonorous","sonsy",
+"soon","soot","soothe","soothsayer","sop",
+"sophism","sophisticate","sophisticated","sophistication","sophistry",
+"sophomore","soporific","sopping","soppy","soprano",
+"sorbet","sorcerer","sorcery","sordid","sore",
+"sorehead","sorely","sorghum","sorority","sorrel",
+"sorrow","sorry","sort","sortie","sos",
+"sot","sottish","sou","soubrette","soubriquet",
+"sough","sought","soul","soulful","soulless",
+"sound","soundings","soundproof","soundtrack","soup",
+"sour","source","sourdough","sourpuss","sousaphone",
+"souse","soused","south","southbound","southeast",
+"southeaster","southeasterly","southeastern","southeastward","southeastwards",
+"southerly","southern","southerner","southernmost","southpaw",
+"southward","southwards","southwest","southwester","southwesterly",
+"southwestern","southwestward","southwestwards","souvenir","sovereign",
+"sovereignty","soviet","sow","sox","soy",
+"soybean","sozzled","spa","space","spacecraft",
+"spaceship","spacesuit","spacing","spacious","spade",
+"spadework","spaghetti","spake","spam","span",
+"spangle","spaniel","spank","spanking","spanner",
+"spar","spare","spareribs","sparing","spark",
+"sparkle","sparkler","sparks","sparrow","sparse",
+"spartan","spasm","spasmodic","spastic","spat",
+"spatchcock","spate","spatial","spatter","spatula",
+"spavin","spawn","spay","speak","speakeasy",
+"speaker","speakership","spear","spearhead","spearmint",
+"spec","special","specialise","specialised","specialist",
+"speciality","specialize","specialized","specially","specie",
+"species","specific","specifically","specification","specifics",
+"specify","specimen","specious","speck","speckle",
+"spectacle","spectacled","spectacles","spectacular","spectator",
+"specter","spectral","spectre","spectroscope","spectrum",
+"speculate","speculation","speculative","speech","speechify",
+"speechless","speed","speedboat","speeding","speedometer",
+"speedway","speedwell","speedy","spelaeology","speleology",
+"spell","spellbind","spelling","spend","spender",
+"spendthrift","spent","sperm","spermaceti","spermatozoa",
+"spew","sphagnum","sphere","spherical","spheroid",
+"sphincter","sphinx","spice","spicy","spider",
+"spidery","spiel","spigot","spike","spikenard",
+"spiky","spill","spillover","spillway","spin",
+"spinach","spinal","spindle","spindly","spine",
+"spineless","spinet","spinnaker","spinner","spinney",
+"spinster","spiny","spiral","spire","spirit",
+"spirited","spiritless","spirits","spiritual","spiritualise",
+"spiritualism","spirituality","spiritualize","spirituous","spirt",
+"spit","spite","spitfire","spittle","spittoon",
+"spiv","splash","splashy","splat","splatter",
+"splay","splayfoot","spleen","splendid","splendiferous",
+"splendor","splendour","splenetic","splice","splicer",
+"splint","splinter","split","splits","splitting",
+"splotch","splurge","splutter","spoil","spoilage",
+"spoils","spoilsport","spoke","spoken","spokeshave",
+"spokesman","spoliation","spondee","sponge","spongy",
+"sponsor","spontaneous","spoof","spook","spooky",
+"spool","spoon","spoonerism","spoonful","spoor",
+"sporadic","spore","sporran","sport","sporting",
+"sportive","sports","sportsman","sportsmanlike","sportsmanship",
+"sporty","spot","spotless","spotlight","spotted",
+"spotter","spotty","spouse","spout","sprain",
+"sprang","sprat","sprawl","spray","sprayer",
+"spread","spree","sprig","sprigged","sprightly",
+"spring","springboard","springbok","springtime","springy",
+"sprinkle","sprinkler","sprinkling","sprint","sprite",
+"sprocket","sprout","spruce","sprung","spry",
+"spud","spume","spun","spunk","spur",
+"spurious","spurn","spurt","sputter","sputum",
+"spy","spyglass","squab","squabble","squad",
+"squadron","squalid","squall","squalor","squander",
+"square","squash","squashy","squat","squatter",
+"squaw","squawk","squeak","squeaky","squeal",
+"squeamish","squeegee","squeeze","squeezer","squelch",
+"squib","squid","squidgy","squiffy","squiggle",
+"squint","squirarchy","squire","squirearchy","squirm",
+"squirrel","squirt","squirter","sri","srn",
+"ssh","stab","stabbing","stabilise","stabiliser",
+"stability","stabilize","stabilizer","stable","stabling",
+"staccato","stack","stadium","staff","stag",
+"stage","stagecoach","stager","stagestruck","stagger",
+"staggering","staggers","staging","stagnant","stagnate",
+"stagy","staid","stain","stainless","stair",
+"staircase","stairs","stairwell","stake","stakeholder",
+"stakes","stalactite","stalagmite","stale","stalemate",
+"stalk","stall","stallholder","stallion","stalls",
+"stalwart","stamen","stamina","stammer","stamp",
+"stampede","stance","stanch","stanchion","stand",
+"standard","standardise","standardize","standby","standing",
+"standoffish","standpipe","standpoint","standstill","stank",
+"stanza","staple","stapler","star","starboard",
+"starch","starchy","stardom","stardust","stare",
+"starfish","stargazer","stargazing","staring","stark",
+"starkers","starlet","starlight","starling","starlit",
+"starry","stars","start","starter","starters",
+"startle","starvation","starve","starveling","stash",
+"state","statecraft","statehood","stateless","stately",
+"statement","stateroom","states","stateside","statesman",
+"static","statics","station","stationary","stationer",
+"stationery","stationmaster","statistic","statistician","statistics",
+"statuary","statue","statuesque","statuette","stature",
+"status","statute","statutory","staunch","stave",
+"staves","stay","stayer","stays","std",
+"stead","steadfast","steady","steak","steal",
+"stealth","stealthy","steam","steamboat","steamer",
+"steamroller","steamship","steed","steel","steelworker",
+"steelworks","steely","steelyard","steenbok","steep",
+"steepen","steeple","steeplechase","steeplejack","steer",
+"steerage","steerageway","steersman","stein","steinbok",
+"stele","stellar","stem","stench","stencil",
+"stenographer","stenography","stentorian","step","stepbrother",
+"stepchild","stepladder","stepparent","steps","stepsister",
+"stereo","stereoscope","stereoscopic","stereotype","sterile",
+"sterilise","sterility","sterilize","sterling","stern",
+"sternum","steroid","stertorous","stet","stethoscope",
+"stetson","stevedore","stew","steward","stewardess",
+"stewardship","stewed","stick","sticker","stickleback",
+"stickler","stickpin","sticks","sticky","stiff",
+"stiffen","stiffener","stiffening","stifle","stigma",
+"stigmata","stigmatise","stigmatize","stile","stiletto",
+"still","stillbirth","stillborn","stillroom","stilly",
+"stilt","stilted","stilton","stimulant","stimulate",
+"stimulus","sting","stinger","stingo","stingray",
+"stingy","stink","stinking","stint","stipend",
+"stipendiary","stipple","stipulate","stipulation","stir",
+"stirrer","stirring","stirrup","stitch","stoat",
+"stock","stockade","stockbreeder","stockbroker","stockcar",
+"stockfish","stockholder","stockily","stockinet","stockinette",
+"stocking","stockist","stockjobber","stockman","stockpile",
+"stockpot","stockroom","stocks","stocktaking","stocky",
+"stockyard","stodge","stodgy","stoic","stoical",
+"stoicism","stoke","stokehold","stoker","stole",
+"stolen","stolid","stomach","stomachache","stomachful",
+"stomp","stone","stonebreaker","stonecutter","stoned",
+"stoneless","stonemason","stonewall","stoneware","stonework",
+"stony","stood","stooge","stool","stoolpigeon",
+"stoop","stop","stopcock","stopgap","stopover",
+"stoppage","stopper","stopping","stopwatch","storage",
+"store","storehouse","storekeeper","storeroom","stores",
+"storey","storied","stork","storm","stormbound",
+"stormy","story","storybook","storyteller","stoup",
+"stout","stouthearted","stove","stovepipe","stow",
+"stowage","stowaway","straddle","stradivarius","strafe",
+"straggle","straggly","straight","straightaway","straightedge",
+"straighten","straightforward","straightway","strain","strained",
+"strainer","strait","straitened","straitjacket","straitlaced",
+"straits","strand","stranded","strange","stranger",
+"strangle","stranglehold","strangulate","strangulation","strap",
+"straphanging","strapless","strapping","strata","stratagem",
+"strategic","strategist","strategy","stratification","stratify",
+"stratosphere","stratum","straw","strawberry","strawboard",
+"stray","streak","streaker","streaky","stream",
+"streamer","streamline","streamlined","street","streetcar",
+"streetwalker","strength","strengthen","strenuous","streptococcus",
+"streptomycin","stress","stretch","stretcher","stretchy",
+"strew","strewth","striated","striation","stricken",
+"strict","stricture","stride","stridency","strident",
+"stridulate","strife","strike","strikebound","strikebreaker",
+"strikebreaking","striker","striking","string","stringency",
+"stringent","strings","stringy","strip","stripe",
+"striped","stripling","stripper","striptease","stripy",
+"strive","strode","stroke","stroll","stroller",
+"strolling","strong","strongarm","strongbox","stronghold",
+"strontium","strop","strophe","stroppy","strove",
+"struck","structural","structure","strudel","struggle",
+"strum","strumpet","strung","strut","strychnine",
+"stub","stubble","stubborn","stubby","stucco",
+"stuck","stud","studbook","student","studied",
+"studio","studious","study","stuff","stuffing",
+"stuffy","stultify","stumble","stump","stumper",
+"stumpy","stun","stung","stunk","stunner",
+"stunning","stunt","stupefaction","stupefy","stupendous",
+"stupid","stupidity","stupor","sturdy","sturgeon",
+"stutter","sty","stye","stygian","style",
+"stylise","stylish","stylist","stylistic","stylistics",
+"stylize","stylus","stymie","styptic","suasion",
+"suave","sub","subaltern","subatomic","subcommittee",
+"subconscious","subcontinent","subcontract","subcontractor","subcutaneous",
+"subdivide","subdue","subdued","subedit","subeditor",
+"subheading","subhuman","subject","subjection","subjective",
+"subjoin","subjugate","subjunctive","sublease","sublet",
+"sublieutenant","sublimate","sublime","subliminal","submarine",
+"submariner","submerge","submergence","submersible","submission",
+"submissive","submit","subnormal","suborbital","subordinate",
+"suborn","subplot","subpoena","subscribe","subscriber",
+"subscription","subsequent","subservience","subservient","subside",
+"subsidence","subsidiary","subsidise","subsidize","subsidy",
+"subsist","subsistence","subsoil","subsonic","substance",
+"substandard","substantial","substantially","substantiate","substantival",
+"substantive","substation","substitute","substratum","substructure",
+"subsume","subtenant","subtend","subterfuge","subterranean",
+"subtitle","subtitles","subtle","subtlety","subtopia",
+"subtract","subtraction","subtropical","suburb","suburban",
+"suburbanite","suburbia","suburbs","subvention","subversive",
+"subvert","subway","succeed","success","successful",
+"succession","successive","successor","succinct","succor",
+"succour","succubus","succulence","succulent","succumb",
+"such","suchlike","suck","sucker","suckle",
+"suckling","sucrose","suction","sudden","suds",
+"sue","suet","suffer","sufferable","sufferance",
+"sufferer","suffering","suffice","sufficiency","sufficient",
+"suffix","suffocate","suffragan","suffrage","suffragette",
+"suffuse","sugar","sugarcane","sugarcoated","sugarloaf",
+"sugary","suggest","suggestible","suggestion","suggestive",
+"suicidal","suicide","suit","suitability","suitable",
+"suitcase","suiting","suitor","sulfate","sulfide",
+"sulfur","sulfuret","sulfurous","sulk","sulks",
+"sulky","sullen","sully","sulphate","sulphide",
+"sulphur","sulphuret","sulphurous","sultan","sultana",
+"sultanate","sultry","sum","sumac","sumach",
+"summarise","summarize","summary","summat","summation",
+"summer","summerhouse","summertime","summery","summit",
+"summon","summons","sump","sumptuary","sumptuous",
+"sun","sunbaked","sunbathe","sunbeam","sunblind",
+"sunbonnet","sunburn","sunburnt","sundae","sunday",
+"sundeck","sunder","sundew","sundial","sundown",
+"sundowner","sundrenched","sundries","sundry","sunfish",
+"sunflower","sung","sunglasses","sunk","sunken",
+"sunlamp","sunless","sunlight","sunlit","sunny",
+"sunray","sunrise","sunroof","sunset","sunshade",
+"sunshine","sunspot","sunstroke","suntan","suntrap",
+"sup","super","superabundance","superabundant","superannuate",
+"superannuated","superannuation","superb","supercharged","supercharger",
+"supercilious","superconductivity","superduper","superego","superficial",
+"superficies","superfine","superfluity","superfluous","superhuman",
+"superimpose","superintend","superintendent","superior","superlative",
+"superlatively","superman","supermarket","supernal","supernatural",
+"supernova","supernumerary","superscription","supersede","supersession",
+"supersonic","superstar","superstition","superstitious","superstructure",
+"supertax","supervene","supervise","supervisory","supine",
+"supper","supplant","supple","supplement","supplementary",
+"suppliant","supplicant","supplicate","supplier","supplies",
+"supply","support","supportable","supporter","supportive",
+"suppose","supposed","supposedly","supposing","supposition",
+"suppository","suppress","suppression","suppressive","suppressor",
+"suppurate","supranational","supremacist","supremacy","supreme",
+"surcharge","surcoat","surd","sure","surefire",
+"surefooted","surely","surety","surf","surface",
+"surfboard","surfboat","surfeit","surfer","surge",
+"surgeon","surgery","surgical","surly","surmise",
+"surmount","surname","surpass","surpassing","surplice",
+"surplus","surprise","surprising","surreal","surrealism",
+"surrealist","surrealistic","surrender","surreptitious","surrey",
+"surrogate","surround","surrounding","surroundings","surtax",
+"surveillance","survey","surveyor","survival","survive",
+"survivor","susceptibilities","susceptibility","susceptible","suspect",
+"suspend","suspender","suspenders","suspense","suspension",
+"suspicion","suspicious","sustain","sustenance","suttee",
+"suture","suzerain","suzerainty","svelte","swab",
+"swaddle","swag","swagger","swain","swallow",
+"swallowtailed","swam","swami","swamp","swampy",
+"swan","swank","swanky","swansdown","swansong",
+"swap","sward","swarf","swarm","swarthy",
+"swashbuckler","swashbuckling","swastika","swat","swatch",
+"swath","swathe","swatter","sway","swayback",
+"swear","swearword","sweat","sweatband","sweated",
+"sweater","sweatshirt","sweatshop","sweaty","swede",
+"sweep","sweeper","sweeping","sweepings","sweepstake",
+"sweepstakes","sweet","sweetbread","sweetbriar","sweetbrier",
+"sweeten","sweetener","sweetening","sweetheart","sweetie",
+"sweetish","sweetmeat","sweets","swell","swelling",
+"swelter","sweltering","swept","swerve","swift",
+"swig","swill","swim","swimming","swimmingly",
+"swindle","swine","swineherd","swing","swingeing",
+"swinger","swinging","swinish","swipe","swirl",
+"swish","switch","switchback","switchblade","switchboard",
+"switchgear","switchman","swivel","swiz","swizzle",
+"swollen","swoon","swoop","swop","sword",
+"swordfish","swordplay","swordsman","swordsmanship","swordstick",
+"swore","sworn","swot","swum","swung",
+"sybarite","sybaritic","sycamore","sycophant","sycophantic",
+"sylabub","syllabary","syllabic","syllabify","syllable",
+"syllabub","syllabus","syllogism","syllogistic","sylph",
+"sylphlike","sylvan","symbiosis","symbol","symbolic",
+"symbolise","symbolism","symbolist","symbolize","symmetrical",
+"symmetry","sympathetic","sympathies","sympathise","sympathize",
+"sympathy","symphonic","symphony","symposium","symptom",
+"symptomatic","synagogue","sync","synch","synchonise",
+"synchromesh","synchronize","synchrotron","syncopate","syncope",
+"syndic","syndicalism","syndicate","syndrome","synod",
+"synonym","synonymous","synopsis","synoptic","syntactic",
+"syntax","synthesis","synthesise","synthesiser","synthesize",
+"synthesizer","synthetic","syphilis","syphilitic","syphon",
+"syringe","syrup","syrupy","system","systematic",
+"systematise","systematize","systemic","tab","tabard",
+"tabasco","tabby","tabernacle","table","tableau",
+"tablecloth","tableland","tablemat","tablespoon","tablespoonful",
+"tablet","tableware","tabloid","taboo","tabor",
+"tabular","tabulate","tabulator","tacit","taciturn",
+"tack","tackiness","tackle","tacky","tact",
+"tactic","tactical","tactician","tactics","tactile",
+"tactual","tadpole","taffeta","taffrail","taffy",
+"tag","tail","tailback","tailboard","tailcoat",
+"taillight","tailor","tailpiece","tails","tailspin",
+"tailwind","taint","take","takeaway","takeoff",
+"takeover","taking","takings","talc","tale",
+"talebearer","talent","talented","talisman","talk",
+"talkative","talker","talkie","talks","tall",
+"tallboy","tallow","tally","tallyho","tallyman",
+"talmud","talon","tamale","tamarind","tamarisk",
+"tambour","tambourine","tame","tammany","tamp",
+"tamper","tampon","tan","tandem","tang",
+"tangent","tangential","tangerine","tangible","tangle",
+"tango","tank","tankard","tanker","tanner",
+"tannery","tannin","tanning","tannoy","tansy",
+"tantalise","tantalize","tantalus","tantamount","tantrum",
+"taoism","tap","tape","taper","tapestry",
+"tapeworm","tapioca","tapir","tappet","taproom",
+"taproot","taps","tar","tarantella","tarantula",
+"tarboosh","tardy","target","tariff","tarmac",
+"tarn","tarnish","taro","tarot","tarpaulin",
+"tarragon","tarry","tarsal","tarsus","tart",
+"tartan","tartar","task","taskmaster","tassel",
+"taste","tasteful","tasteless","taster","tasty",
+"tat","tatas","tatter","tattered","tatters",
+"tatting","tattle","tattoo","tattooist","tatty",
+"taught","taunt","taurus","taut","tautological",
+"tautology","tavern","tawdry","tawny","tawse",
+"tax","taxation","taxi","taxidermist","taxidermy",
+"taximeter","taxonomy","tea","teabag","teacake",
+"teach","teacher","teaching","teacup","teacupful",
+"teagarden","teahouse","teak","teakettle","teal",
+"tealeaf","team","teamster","teamwork","teapot",
+"tear","tearaway","teardrop","tearful","teargas",
+"tearjerker","tearless","tearoom","tease","teasel",
+"teaser","teaspoon","teaspoonful","teat","teatime",
+"teazle","tech","technical","technicality","technician",
+"technique","technocracy","technocrat","technological","technologist",
+"technology","techy","tedious","tedium","tee",
+"teem","teeming","teenage","teenager","teens",
+"teenybopper","teeter","teeth","teethe","teetotal",
+"teetotaler","teetotaller","teflon","tegument","tele",
+"telecast","telecommunications","telegram","telegraph","telegrapher",
+"telegraphese","telegraphic","telemarketing","telemeter","telemetry",
+"teleology","telepathic","telepathist","telepathy","telephone",
+"telephonist","telephony","telephotograph","telephotography","teleprinter",
+"teleprompter","telescope","telescopic","televise","television",
+"televisual","telex","telfer","tell","teller",
+"telling","telltale","telly","telpher","telstar",
+"temerity","temp","temper","tempera","temperament",
+"temperamental","temperance","temperate","temperature","tempest",
+"tempestuous","template","temple","templet","tempo",
+"temporal","temporary","temporise","temporize","tempt",
+"temptation","ten","tenable","tenacious","tenacity",
+"tenancy","tenant","tenantry","tench","tend",
+"tendency","tendentious","tender","tenderfoot","tenderhearted",
+"tenderise","tenderize","tenderloin","tendon","tendril",
+"tenement","tenet","tenner","tennis","tenon",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java
new file mode 100644
index 00000000000..001a4657aa7
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemData8.java
@@ -0,0 +1,614 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This algorithm is adapted from the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/** A list of words used by Kstem
+ */
+class KStemData8 {
+ private KStemData8() {
+ }
+ static String[] data = {
+"tenor","tenpin","tense","tensile","tension",
+"tent","tentacle","tentative","tenterhooks","tenuity",
+"tenuous","tenure","tepee","tepid","tequila",
+"tercentenary","tercentennial","term","termagant","terminable",
+"terminal","terminate","termination","terminology","terminus",
+"termite","terms","tern","terpsichorean","terrace",
+"terracotta","terrain","terrapin","terrestrial","terrible",
+"terribly","terrier","terrific","terrifically","terrify",
+"territorial","territory","terror","terrorise","terrorism",
+"terrorize","terrycloth","terse","tertian","tertiary",
+"terylene","tessellated","test","testament","testamentary",
+"testate","testator","tester","testicle","testify",
+"testimonial","testimony","testis","testy","tetanus",
+"tetchy","tether","teutonic","text","textbook",
+"textile","textual","texture","thalidomide","than",
+"thane","thank","thankful","thankless","thanks",
+"thanksgiving","thankyou","that","thatch","thaw",
+"the","theater","theatergoer","theatre","theatregoer",
+"theatrical","theatricals","thee","theft","thegn",
+"their","theirs","theism","them","theme",
+"themselves","then","thence","thenceforth","theocracy",
+"theocratic","theodolite","theologian","theology","theorem",
+"theoretical","theoretically","theorise","theorist","theorize",
+"theory","theosophy","therapeutic","therapeutics","therapist",
+"therapy","there","thereabouts","thereafter","thereby",
+"therefore","therein","thereinafter","thereof","thereon",
+"thereto","thereunder","thereupon","therm","thermal",
+"thermionic","thermionics","thermodynamics","thermometer","thermonuclear",
+"thermoplastic","thermos","thermosetting","thermostat","thesaurus",
+"these","thesis","thespian","thews","they",
+"thick","thicken","thickener","thicket","thickheaded",
+"thickness","thickset","thief","thieve","thieving",
+"thievish","thigh","thimble","thimbleful","thin",
+"thine","thing","thingamajig","thingamujig","things",
+"think","thinkable","thinking","thinner","third",
+"thirst","thirsty","thirteen","thirty","this",
+"thistle","thistledown","thither","thole","thong",
+"thorax","thorn","thorny","thorough","thoroughbred",
+"thoroughfare","thoroughgoing","those","thou","though",
+"thought","thoughtful","thoughtless","thousand","thraldom",
+"thrall","thralldom","thrash","thrashing","thread",
+"threadbare","threadlike","threat","threaten","three",
+"threepence","threnody","thresh","thresher","threshold",
+"threw","thrice","thrift","thrifty","thrill",
+"thriller","thrive","throat","throaty","throb",
+"throes","thrombosis","throne","throng","throstle",
+"throttle","through","throughout","throughput","throughway",
+"throw","throwaway","throwback","thru","thrum",
+"thrush","thrust","thruster","thruway","thud",
+"thug","thuggery","thumb","thumbnail","thumbscrew",
+"thumbtack","thump","thumping","thunder","thunderbolt",
+"thunderclap","thundercloud","thundering","thunderous","thunderstorm",
+"thunderstruck","thundery","thurible","thursday","thus",
+"thwack","thwart","thy","thyme","thyroid",
+"thyself","tiara","tibia","tic","tick",
+"ticker","tickertape","ticket","ticking","tickle",
+"tickler","ticklish","tidal","tidbit","tiddler",
+"tiddley","tiddleywinks","tiddly","tiddlywinks","tide",
+"tidemark","tidewater","tideway","tidings","tidy",
+"tie","tiebreaker","tiepin","tier","tiff",
+"tiffin","tig","tiger","tigerish","tight",
+"tighten","tightfisted","tightrope","tights","tightwad",
+"tigress","tike","tilde","tile","till",
+"tillage","tiller","tilt","timber","timbered",
+"timberline","timbre","timbrel","time","timekeeper",
+"timeless","timely","timepiece","timer","times",
+"timesaving","timeserver","timeserving","timetable","timework",
+"timeworn","timid","timing","timorous","timothy",
+"timpani","timpanist","tin","tincture","tinder",
+"tinderbox","tinfoil","ting","tingaling","tinge",
+"tingle","tinker","tinkle","tinny","tinplate",
+"tinsel","tint","tintack","tintinnabulation","tiny",
+"tip","tippet","tipple","tipstaff","tipster",
+"tipsy","tiptoe","tirade","tire","tired",
+"tireless","tiresome","tiro","tissue","tit",
+"titan","titanic","titanium","titbit","titfer",
+"tithe","titillate","titivate","title","titled",
+"titleholder","titmouse","titter","tittivate","tittle",
+"titty","titular","tizzy","tnt","toad",
+"toadstool","toady","toast","toaster","toastmaster",
+"tobacco","tobacconist","toboggan","toccata","tocsin",
+"tod","today","toddle","toddler","toddy",
+"toe","toehold","toenail","toff","toffee",
+"toffy","tog","toga","together","togetherness",
+"toggle","togs","toil","toilet","toiletries",
+"toiletry","toils","tokay","token","told",
+"tolerable","tolerably","tolerance","tolerant","tolerate",
+"toleration","toll","tollgate","tollhouse","tomahawk",
+"tomato","tomb","tombola","tomboy","tombstone",
+"tomcat","tome","tomfoolery","tommyrot","tomorrow",
+"tomtit","ton","tonal","tonality","tone",
+"toneless","tong","tongs","tongue","tonic",
+"tonight","tonnage","tonne","tonsil","tonsilitis",
+"tonsillitis","tonsorial","tonsure","tontine","too",
+"took","tool","toot","tooth","toothache",
+"toothbrush","toothcomb","toothpaste","toothpick","toothsome",
+"toothy","tootle","toots","tootsie","top",
+"topaz","topcoat","topdressing","topee","topgallant",
+"topi","topiary","topic","topical","topicality",
+"topknot","topless","topmast","topmost","topographer",
+"topographical","topography","topper","topping","topple",
+"tops","topsail","topside","topsoil","topspin",
+"toque","tor","torch","torchlight","tore",
+"toreador","torment","tormentor","torn","tornado",
+"torpedo","torpid","torpor","torque","torrent",
+"torrential","torrid","torsion","torso","tort",
+"tortilla","tortoise","tortoiseshell","tortuous","torture",
+"tory","toss","tot","total","totalisator",
+"totalitarian","totalitarianism","totality","totalizator","tote",
+"totem","totter","tottery","toucan","touch",
+"touchdown","touched","touching","touchline","touchstone",
+"touchy","tough","toughen","toupee","tour",
+"tourism","tourist","tournament","tourney","tourniquet",
+"tousle","tout","tow","towards","towel",
+"toweling","towelling","tower","towering","towline",
+"town","townscape","township","townsman","townspeople",
+"towpath","toxaemia","toxemia","toxic","toxicologist",
+"toxicology","toxin","toy","toyshop","trace",
+"tracer","tracery","trachea","trachoma","tracing",
+"track","trackless","tracksuit","tract","tractable",
+"traction","tractor","trad","trade","trademark",
+"trader","trades","tradesman","tradespeople","tradition",
+"traditional","traditionalism","traduce","traffic","trafficator",
+"trafficker","tragedian","tragedienne","tragedy","tragic",
+"tragicomedy","trail","trailer","train","trainbearer",
+"trainee","training","trainman","traipse","trait",
+"traitor","traitorous","trajectory","tram","tramline",
+"trammel","trammels","tramp","trample","trampoline",
+"trance","tranny","tranquil","tranquiliser","tranquillise",
+"tranquillize","tranquillizer","transact","transaction","transactions",
+"transalpine","transatlantic","transcend","transcendence","transcendent",
+"transcendental","transcendentalism","transcontinental","transcribe","transcript",
+"transcription","transept","transfer","transference","transfiguration",
+"transfigure","transfix","transform","transformation","transformer",
+"transfuse","transgress","tranship","transience","transient",
+"transistor","transistorise","transistorize","transit","transition",
+"transitive","translate","translator","transliterate","translucence",
+"translucent","transmigration","transmission","transmit","transmitter",
+"transmogrify","transmute","transoceanic","transom","transparency",
+"transparent","transpiration","transpire","transplant","transpolar",
+"transport","transportation","transporter","transpose","transship",
+"transubstantiation","transverse","transvestism","transvestite","trap",
+"trapdoor","trapeze","trapezium","trapezoid","trapper",
+"trappings","trappist","trapse","trapshooting","trash",
+"trashcan","trashy","trauma","traumatic","travail",
+"travel","traveled","traveler","travelled","traveller",
+"travelog","travelogue","travels","travelsick","traverse",
+"travesty","trawl","trawler","tray","treacherous",
+"treachery","treacle","treacly","tread","treadle",
+"treadmill","treason","treasonable","treasure","treasurer",
+"treasury","treat","treatise","treatment","treaty",
+"treble","tree","trefoil","trek","trellis",
+"tremble","tremendous","tremolo","tremor","tremulous",
+"trench","trenchant","trencher","trencherman","trend",
+"trendsetter","trendy","trepan","trephine","trepidation",
+"trespass","tresses","trestle","trews","triad",
+"trial","triangle","triangular","tribal","tribalism",
+"tribe","tribesman","tribulation","tribunal","tribune",
+"tributary","tribute","trice","triceps","trichinosis",
+"trick","trickery","trickle","trickster","tricky",
+"tricolor","tricolour","tricycle","trident","triennial",
+"trier","trifle","trifler","trifling","trigger",
+"trigonometry","trike","trilateral","trilby","trilingual",
+"trill","trillion","trilobite","trilogy","trim",
+"trimaran","trimester","trimmer","trimming","trinitrotoluene",
+"trinity","trinket","trio","trip","tripartite",
+"triple","triplet","triplex","triplicate","tripod",
+"tripos","tripper","tripping","triptych","tripwire",
+"trireme","trisect","trite","triumph","triumphal",
+"triumphant","triumvir","triumvirate","trivet","trivia",
+"trivial","trivialise","triviality","trivialize","trochaic",
+"trochee","trod","trodden","troglodyte","troika",
+"trojan","troll","trolley","trolleybus","trollop",
+"trombone","trombonist","troop","trooper","troops",
+"troopship","trope","trophy","tropic","tropical",
+"tropics","trot","troth","trotskyist","trotter",
+"troubadour","trouble","troublemaker","troubleshooter","troublesome",
+"trough","trounce","troupe","trouper","trouser",
+"trousers","trousseau","trout","trove","trowel",
+"truancy","truant","truce","truck","trucking",
+"truckle","truculence","truculent","trudge","true",
+"trueborn","truehearted","truelove","truffle","trug",
+"truism","truly","trump","trumpery","trumpet",
+"trumps","truncate","truncheon","trundle","trunk",
+"trunks","truss","trust","trustee","trusteeship",
+"trustful","trustworthy","trusty","truth","truthful",
+"try","tryst","tsar","tsarina","tsp",
+"tub","tuba","tubby","tube","tubeless",
+"tuber","tubercular","tuberculosis","tubful","tubing",
+"tubular","tuck","tucker","tuckerbag","tuesday",
+"tuft","tug","tugboat","tuition","tulip",
+"tulle","tumble","tumbledown","tumbler","tumbleweed",
+"tumbrel","tumbril","tumescent","tumid","tummy",
+"tumor","tumour","tumult","tumultuous","tumulus",
+"tun","tuna","tundra","tune","tuneful",
+"tuneless","tuner","tungsten","tunic","tunnel",
+"tunny","tup","tuppence","tuppenny","turban",
+"turbid","turbine","turbojet","turboprop","turbot",
+"turbulence","turbulent","turd","tureen","turf",
+"turgid","turkey","turmeric","turmoil","turn",
+"turnabout","turncoat","turncock","turner","turning",
+"turnip","turnkey","turnout","turnover","turnpike",
+"turnstile","turntable","turpentine","turpitude","turquoise",
+"turret","turtle","turtledove","turtleneck","tush",
+"tusk","tusker","tussle","tussock","tut",
+"tutelage","tutelary","tutor","tutorial","tutu",
+"tuxedo","twaddle","twain","twang","twat",
+"tweak","twee","tweed","tweeds","tweedy",
+"tweet","tweeter","tweezers","twelfth","twelve",
+"twelvemonth","twenty","twerp","twice","twiddle",
+"twig","twilight","twill","twin","twinge",
+"twinkle","twinkling","twirl","twirp","twist",
+"twister","twit","twitch","twitter","twixt",
+"two","twofaced","twopence","twopenny","twosome",
+"tycoon","tyke","tympanum","type","typecast",
+"typeface","typescript","typesetter","typewriter","typewritten",
+"typhoid","typhoon","typhus","typical","typically",
+"typify","typist","typographer","typographic","typography",
+"tyrannical","tyrannise","tyrannize","tyrannosaurus","tyranny",
+"tyrant","tyre","tyro","tzar","tzarina",
+"ubiquitous","ucca","udder","ufo","ugh",
+"ugly","uhf","ukulele","ulcer","ulcerate",
+"ulcerous","ullage","ulna","ult","ulterior",
+"ultimate","ultimately","ultimatum","ultimo","ultramarine",
+"ultrasonic","ultraviolet","umber","umbrage","umbrella",
+"umlaut","umpire","umpteen","unabashed","unabated",
+"unable","unabridged","unaccompanied","unaccountable","unaccustomed",
+"unadopted","unadulterated","unadvised","unaffected","unalloyed",
+"unanimous","unannounced","unanswerable","unapproachable","unarmed",
+"unasked","unassuming","unattached","unattended","unavailing",
+"unawares","unbalance","unbar","unbearable","unbearably",
+"unbeknown","unbelief","unbelievable","unbeliever","unbelieving",
+"unbend","unbending","unbidden","unbind","unblushing",
+"unborn","unbosom","unbounded","unbowed","unbridled",
+"unbuckle","unburden","unbuttoned","uncanny","unceremonious",
+"uncertain","uncertainty","uncharitable","uncharted","unchecked",
+"unchristian","unclad","uncle","unclean","unclouded",
+"uncolored","uncoloured","uncomfortable","uncommitted","uncommonly",
+"uncompromising","unconcerned","unconditional","unconscionable","unconscious",
+"unconsidered","uncork","uncouple","uncouth","uncover",
+"uncritical","uncrowned","uncrushable","unction","unctuous",
+"uncut","undaunted","undeceive","undecided","undeclared",
+"undeniable","under","underact","underarm","underbelly",
+"underbrush","undercarriage","undercharge","underclothes","undercoat",
+"undercover","undercurrent","undercut","underdog","underdone",
+"underestimate","underfelt","underfloor","underfoot","undergarment",
+"undergo","undergraduate","underground","undergrowth","underhand",
+"underhanded","underhung","underlay","underlie","underline",
+"underling","underlying","undermanned","undermentioned","undermine",
+"underneath","undernourish","underpants","underpass","underpin",
+"underplay","underprivileged","underproof","underquote","underrate",
+"underscore","undersecretary","undersell","undersexed","undershirt",
+"underside","undersigned","undersized","underslung","understaffed",
+"understand","understanding","understate","understatement","understudy",
+"undertake","undertaker","undertaking","undertone","undertow",
+"underwater","underwear","underweight","underwent","underworld",
+"underwrite","underwriter","undesirable","undeveloped","undies",
+"undischarged","undistinguished","undivided","undo","undoing",
+"undomesticated","undone","undoubted","undress","undressed",
+"undue","undulate","undulation","unduly","undying",
+"unearth","unearthly","unease","uneasy","uneconomic",
+"uneducated","unemployed","unemployment","unenlightened","unenviable",
+"unequal","unequaled","unequalled","unequivocal","unerring",
+"unesco","uneven","uneventful","unexampled","unexceptionable",
+"unfailing","unfaithful","unfaltering","unfathomable","unfathomed",
+"unfavorable","unfavourable","unfeeling","unfettered","unfit",
+"unflagging","unflappable","unflinching","unfold","unforeseen",
+"unforgettable","unfortunate","unfortunately","unfounded","unfrequented",
+"unfrock","unfurl","ungainly","ungenerous","ungodly",
+"ungovernable","ungracious","ungrateful","ungrudging","unguarded",
+"unguent","unhallowed","unhand","unhappily","unhappy",
+"unhealthy","unheard","unhinge","unholy","unhook",
+"unhorse","unicef","unicorn","unidentified","unification",
+"uniform","uniformed","unify","unilateral","unimpeachable",
+"uninformed","uninhabitable","uninhibited","uninterested","uninterrupted",
+"union","unionise","unionism","unionist","unionize",
+"unique","unisex","unison","unit","unitarian",
+"unite","united","unity","universal","universally",
+"universe","university","unkempt","unkind","unkindly",
+"unknowing","unknown","unlawful","unlearn","unleash",
+"unleavened","unless","unlettered","unlike","unlikely",
+"unload","unlock","unloose","unloosen","unmade",
+"unmannerly","unmarried","unmask","unmatched","unmeasured",
+"unmentionable","unmentionables","unmindful","unmistakable","unmitigated",
+"unmoved","unnatural","unnecessary","unnerve","unnumbered",
+"uno","unobtrusive","unofficial","unorthodox","unpack",
+"unparalleled","unparliamentary","unperson","unpick","unplaced",
+"unplayable","unpleasant","unplumbed","unpracticed","unpractised",
+"unprecedented","unprejudiced","unpretentious","unprincipled","unprintable",
+"unprofessional","unprompted","unprovoked","unqualified","unquestionable",
+"unquestioning","unquiet","unquote","unravel","unreadable",
+"unreal","unreasonable","unreasoning","unrelenting","unrelieved",
+"unremitting","unrequited","unreserved","unrest","unrestrained",
+"unrip","unrivaled","unrivalled","unroll","unruffled",
+"unruly","unsaddle","unsaid","unsavory","unsavoury",
+"unsay","unscathed","unschooled","unscramble","unscrew",
+"unscripted","unscrupulous","unseat","unseeing","unseemly",
+"unseen","unserviceable","unsettle","unsettled","unsex",
+"unsexed","unshakable","unshakeable","unshod","unsightly",
+"unskilled","unsociable","unsocial","unsophisticated","unsound",
+"unsparing","unspeakable","unspotted","unstop","unstrung",
+"unstuck","unstudied","unsullied","unsung","unswerving",
+"untangle","untapped","untenable","unthinkable","unthinking",
+"untie","until","untimely","untinged","untiring",
+"unto","untold","untouchable","untoward","untruth",
+"untruthful","untutored","unused","unusual","unusually",
+"unutterable","unvarnished","unveil","unversed","unvoiced",
+"unwarranted","unwed","unwell","unwieldy","unwind",
+"unwitting","unwonted","unzip","upbeat","upbraid",
+"upbringing","upcoming","update","upend","upgrade",
+"upheaval","uphill","uphold","upholster","upholsterer",
+"upholstery","upkeep","upland","uplift","upon",
+"upper","uppercut","uppermost","uppish","uppity",
+"upright","uprising","uproar","uproarious","uproot",
+"upset","upshot","upstage","upstairs","upstanding",
+"upstart","upstream","upsurge","upswing","uptake",
+"uptight","uptown","upturn","upturned","upward",
+"upwards","uranium","uranus","urban","urbane",
+"urbanise","urbanize","urchin","urge","urgent",
+"uric","urinal","urinary","urinate","urine",
+"urn","usage","use","useful","usefulness",
+"useless","user","usher","usherette","ussr",
+"usual","usually","usurer","usurious","usurp",
+"usury","utensil","uterine","uterus","utilise",
+"utilitarian","utilitarianism","utility","utilize","utmost",
+"utopia","utopian","utter","utterance","utterly",
+"uvula","uvular","uxorious","vac","vacancy",
+"vacant","vacate","vacation","vaccinate","vaccination",
+"vaccine","vacillate","vacuity","vacuous","vacuum",
+"vagabond","vagary","vagina","vaginal","vagrancy",
+"vagrant","vague","vain","vainglorious","vainglory",
+"valance","vale","valediction","valedictory","valency",
+"valentine","valerian","valet","valetudinarian","valiant",
+"valiantly","valid","validate","valise","valley",
+"valor","valour","valse","valuable","valuation",
+"value","valuer","valve","valvular","vamoose",
+"vamp","vampire","van","vanadium","vandal",
+"vandalise","vandalism","vandalize","vane","vanguard",
+"vanilla","vanish","vanity","vanquish","vantagepoint",
+"vapid","vapidity","vapor","vaporise","vaporize",
+"vaporous","vapors","vapour","vapours","variability",
+"variable","variance","variant","variation","varicolored",
+"varicoloured","varicose","varied","variegated","variegation",
+"variety","variform","variorum","various","variously",
+"varlet","varmint","varnish","varsity","vary",
+"vascular","vase","vasectomy","vaseline","vassal",
+"vassalage","vast","vastly","vastness","vat",
+"vatican","vaudeville","vault","vaulted","vaulting",
+"vaunt","veal","vector","veer","veg",
+"vegan","vegetable","vegetarian","vegetarianism","vegetate",
+"vegetation","vehement","vehicle","vehicular","veil",
+"veiled","vein","veined","veining","velar",
+"velarize","veld","veldt","vellum","velocipede",
+"velocity","velour","velours","velvet","velveteen",
+"velvety","venal","vend","vendee","vender",
+"vendetta","vendor","veneer","venerable","venerate",
+"venereal","vengeance","vengeful","venial","venison",
+"venom","venomous","venous","vent","ventilate",
+"ventilation","ventilator","ventricle","ventriloquism","ventriloquist",
+"venture","venturer","venturesome","venue","veracious",
+"veracity","veranda","verandah","verb","verbal",
+"verbalise","verbalize","verbally","verbatim","verbena",
+"verbiage","verbose","verbosity","verdant","verdict",
+"verdigris","verdure","verge","verger","verify",
+"verily","verisimilitude","veritable","verity","vermicelli",
+"vermiculite","vermiform","vermifuge","vermilion","vermin",
+"verminous","vermouth","vernacular","vernal","veronal",
+"veronica","verruca","versatile","verse","versed",
+"versification","versify","version","verso","versus",
+"vertebra","vertebrate","vertex","vertical","vertiginous",
+"vertigo","verve","very","vesicle","vesicular",
+"vesper","vespers","vessel","vest","vestibule",
+"vestige","vestigial","vestment","vestry","vestryman",
+"vesture","vet","vetch","veteran","veterinary",
+"veto","vex","vexation","vexatious","vhf",
+"via","viable","viaduct","vial","viands",
+"vibes","vibrancy","vibrant","vibraphone","vibrate",
+"vibration","vibrato","vibrator","vicar","vicarage",
+"vicarious","vice","vicelike","viceregal","vicereine",
+"viceroy","vicinity","vicious","vicissitudes","victim",
+"victimise","victimize","victor","victorian","victorious",
+"victory","victual","victualer","victualler","victuals",
+"vicuaa","vicuana","vide","videlicet","video",
+"videotape","vie","view","viewer","viewfinder",
+"viewless","viewpoint","vigil","vigilance","vigilant",
+"vigilante","vignette","vigor","vigorous","vigour",
+"viking","vile","vilification","vilify","villa",
+"village","villager","villain","villainies","villainous",
+"villainy","villein","villeinage","villenage","vim",
+"vinaigrette","vindicate","vindication","vindictive","vine",
+"vinegar","vinegary","vinery","vineyard","vino",
+"vinous","vintage","vintner","vinyl","viol",
+"viola","violate","violence","violent","violet",
+"violin","violoncello","vip","viper","virago",
+"virgin","virginal","virginals","virginia","virginity",
+"virgo","virgule","virile","virility","virologist",
+"virology","virtu","virtual","virtually","virtue",
+"virtuosity","virtuoso","virtuous","virulence","virulent",
+"virus","visa","visage","viscera","visceral",
+"viscosity","viscount","viscountcy","viscountess","viscous",
+"vise","visibility","visible","visibly","vision",
+"visionary","visit","visitant","visitation","visiting",
+"visitor","visor","vista","visual","visualise",
+"visualize","visually","vital","vitalise","vitality",
+"vitalize","vitally","vitals","vitamin","vitiate",
+"viticulture","vitreous","vitrify","vitriol","vitriolic",
+"vituperate","vituperation","vituperative","vivace","vivacious",
+"vivarium","vivid","viviparous","vivisect","vivisection",
+"vivisectionist","vixen","vixenish","vizier","vocab",
+"vocabulary","vocal","vocalise","vocalist","vocalize",
+"vocation","vocational","vocative","vociferate","vociferation",
+"vociferous","vodka","vogue","voice","voiceless",
+"void","voile","vol","volatile","volcanic",
+"volcano","vole","volition","volitional","volley",
+"volleyball","volt","voltage","voluble","volume",
+"volumes","voluminous","voluntary","volunteer","voluptuary",
+"voluptuous","volute","vomit","voodoo","voracious",
+"vortex","votary","vote","voter","votive",
+"vouch","voucher","vouchsafe","vow","vowel",
+"voyage","voyager","voyages","voyeur","vtol",
+"vulcanise","vulcanite","vulcanize","vulgar","vulgarian",
+"vulgarise","vulgarism","vulgarity","vulgarize","vulgate",
+"vulnerable","vulpine","vulture","vulva","wac",
+"wack","wacky","wad","wadding","waddle",
+"wade","wader","wadge","wadi","wady",
+"wafer","waffle","waft","wag","wage",
+"wager","wages","waggery","waggish","waggle",
+"waggon","waggoner","waggonette","wagon","wagoner",
+"wagonette","wagtail","waif","wail","wain",
+"wainscot","waist","waistband","waistcoat","waistline",
+"wait","waiter","waits","waive","waiver",
+"wake","wakeful","waken","waking","walk",
+"walkabout","walkaway","walker","walking","walkout",
+"walkover","wall","walla","wallaby","wallah",
+"wallet","wallflower","wallop","walloping","wallow",
+"wallpaper","walnut","walrus","waltz","wampum",
+"wan","wand","wander","wanderer","wandering",
+"wanderings","wanderlust","wane","wangle","wank",
+"wanker","want","wanting","wanton","wants",
+"wapiti","war","warble","warbler","ward",
+"warden","warder","wardrobe","wardroom","warehouse",
+"wares","warfare","warhead","warhorse","warily",
+"warlike","warlock","warlord","warm","warmonger",
+"warmth","warn","warning","warp","warpath",
+"warrant","warrantee","warrantor","warranty","warren",
+"warrior","warship","wart","warthog","wartime",
+"wary","was","wash","washable","washbasin",
+"washboard","washbowl","washcloth","washday","washer",
+"washerwoman","washhouse","washing","washout","washroom",
+"washstand","washwoman","washy","wasp","waspish",
+"wassail","wast","wastage","waste","wasteful",
+"waster","wastrel","watch","watchband","watchdog",
+"watches","watchful","watchmaker","watchman","watchtower",
+"watchword","water","waterborne","watercolor","watercolour",
+"watercourse","watercress","waterfall","waterfowl","waterfront",
+"waterhole","waterline","waterlogged","waterloo","waterman",
+"watermark","watermelon","watermill","waterpower","waterproof",
+"waters","watershed","waterside","waterspout","watertight",
+"waterway","waterwheel","waterwings","waterworks","watery",
+"watt","wattage","wattle","wave","wavelength",
+"waver","wavy","wax","waxen","waxworks",
+"waxy","way","waybill","wayfarer","wayfaring",
+"waylay","ways","wayside","wayward","weak",
+"weaken","weakling","weakness","weal","weald",
+"wealth","wealthy","wean","weapon","weaponry",
+"wear","wearing","wearisome","weary","weasel",
+"weather","weatherboard","weathercock","weatherglass","weatherman",
+"weatherproof","weathers","weave","weaver","web",
+"webbed","webbing","wed","wedded","wedding",
+"wedge","wedged","wedgwood","wedlock","wednesday",
+"wee","weed","weeds","weedy","week",
+"weekday","weekend","weekender","weekly","weeknight",
+"weeny","weep","weeping","weepy","weevil",
+"weft","weigh","weighbridge","weight","weighted",
+"weighting","weightless","weighty","weir","weird",
+"weirdie","weirdo","welch","welcome","weld",
+"welder","welfare","welkin","well","wellbeing",
+"wellborn","wellington","wellspring","welsh","welt",
+"weltanschauung","welter","welterweight","wen","wench",
+"wend","wensleydale","went","wept","were",
+"werewolf","wert","wesleyan","west","westbound",
+"westerly","western","westerner","westernise","westernize",
+"westernmost","westward","westwards","wet","wether",
+"wetting","whack","whacked","whacker","whacking",
+"whale","whalebone","whaler","whaling","wham",
+"wharf","what","whatever","whatnot","wheat",
+"wheaten","wheedle","wheel","wheelbarrow","wheelbase",
+"wheelchair","wheelhouse","wheeling","wheels","wheelwright",
+"wheeze","wheezy","whelk","whelp","when",
+"whence","whenever","where","whereabouts","whereas",
+"whereat","whereby","wherefore","wherefores","wherein",
+"whereof","whereon","wheresoever","whereto","whereupon",
+"wherever","wherewithal","wherry","whet","whether",
+"whetstone","whew","whey","which","whichever",
+"whiff","whiffy","whig","while","whim",
+"whimper","whimsey","whimsical","whimsicality","whimsy",
+"whin","whine","whiner","whinny","whip",
+"whipcord","whiplash","whippersnapper","whippet","whipping",
+"whippoorwill","whippy","whir","whirl","whirligig",
+"whirlpool","whirlwind","whirlybird","whirr","whisk",
+"whisker","whiskered","whiskers","whiskey","whisky",
+"whisper","whist","whistle","whit","white",
+"whitebait","whitehall","whiten","whitening","whites",
+"whitethorn","whitethroat","whitewash","whither","whiting",
+"whitlow","whitsun","whitsuntide","whittle","whiz",
+"whizz","who","whoa","whodunit","whoever",
+"whole","wholemeal","wholesale","wholesaler","wholesome",
+"wholly","whom","whoop","whoopee","whoosh",
+"whop","whopper","whopping","whore","whorehouse",
+"whoremonger","whorl","whortleberry","whose","whosoever",
+"why","whys","wick","wicked","wicker",
+"wickerwork","wicket","wide","widely","widen",
+"widespread","widgeon","widow","widowed","widower",
+"widowhood","width","wield","wife","wifely",
+"wig","wigged","wigging","wiggle","wight",
+"wigwam","wilco","wild","wildcat","wildebeest",
+"wilderness","wildfire","wildfowl","wildlife","wildly",
+"wile","wiles","wilful","wiliness","will",
+"willful","willies","willing","willow","willowy",
+"willpower","wilt","wily","wimple","wimpy",
+"win","wince","winceyette","winch","wind",
+"windbag","windbreak","windcheater","windfall","windily",
+"winding","windjammer","windlass","windless","windmill",
+"window","windowpane","windowsill","windpipe","windscreen",
+"windshield","windsock","windstorm","windswept","windward",
+"windy","wine","winebibbing","wineglass","winepress",
+"wineskin","wing","winger","wings","wingspan",
+"wink","winkers","winkle","winner","winning",
+"winnings","winnow","winsome","winter","wintergreen",
+"wintertime","wintry","wipe","wiper","wire",
+"wirecutters","wireless","wiretap","wireworm","wiring",
+"wiry","wisdom","wise","wisecrack","wish",
+"wishbone","wisp","wispy","wisteria","wistful",
+"wit","witch","witchcraft","witchdoctor","witchery",
+"witching","with","withal","withdraw","withdrawal",
+"withdrawn","withe","wither","withering","withers",
+"withhold","within","without","withstand","withy",
+"witless","witness","witticism","witting","witty",
+"wives","wizard","wizardry","wizened","woad",
+"wobble","wobbly","woe","woebegone","woeful",
+"wog","woke","woken","wold","wolf",
+"wolfhound","wolfram","wolfsbane","woman","womanhood",
+"womanise","womanish","womanize","womankind","womanly",
+"womb","wombat","womenfolk","won","wonder",
+"wonderful","wonderland","wonderment","wonders","wondrous",
+"wonky","wont","wonted","woo","wood",
+"woodbine","woodblock","woodcock","woodcraft","woodcut",
+"woodcutter","wooded","wooden","woodenheaded","woodland",
+"woodlouse","woodpecker","woodpile","woodshed","woodsman",
+"woodwind","woodwork","woodworm","woody","wooer",
+"woof","woofer","wool","woolen","woolens",
+"woolgather","woolgathering","woollen","woollens","woolly",
+"woolsack","woozy","wop","word","wording",
+"wordless","wordplay","words","wordy","wore",
+"work","workable","workaday","workbag","workbasket",
+"workbench","workbook","workday","worker","workhorse",
+"workhouse","working","workings","workman","workmanlike",
+"workmanship","workout","workpeople","workroom","works",
+"workshop","worktop","world","worldly","worldshaking",
+"worldwide","worm","wormhole","wormwood","wormy",
+"worn","worried","worrisome","worry","worse",
+"worsen","worship","worshipful","worst","worsted",
+"wort","worth","worthless","worthwhile","worthy",
+"wot","wotcher","would","wouldst","wound",
+"wove","woven","wow","wrac","wrack",
+"wraith","wrangle","wrangler","wrap","wrapper",
+"wrapping","wrath","wreak","wreath","wreathe",
+"wreck","wreckage","wrecker","wren","wrench",
+"wrest","wrestle","wretch","wretched","wriggle",
+"wright","wring","wringer","wrinkle","wrist",
+"wristband","wristlet","wristwatch","wristy","writ",
+"write","writer","writhe","writing","writings",
+"written","wrong","wrongdoing","wrongful","wrongheaded",
+"wrote","wroth","wrought","wrung","wry",
+"wurst","wyvern","xenon","xenophobia","xerox",
+"xylophone","yacht","yachting","yachtsman","yahoo",
+"yak","yam","yammer","yang","yank",
+"yankee","yap","yard","yardage","yardarm",
+"yardstick","yarn","yarrow","yashmak","yaw",
+"yawl","yawn","yaws","yea","yeah",
+"year","yearbook","yearling","yearlong","yearly",
+"yearn","yearning","years","yeast","yeasty",
+"yell","yellow","yelp","yen","yeoman",
+"yeomanry","yes","yesterday","yet","yeti",
+"yew","yid","yiddish","yield","yielding",
+"yin","yippee","yobbo","yodel","yoga",
+"yoghurt","yogi","yogurt","yoke","yokel",
+"yolk","yonder","yonks","yore","yorker",
+"you","young","younger","youngster","your",
+"yours","yourself","youth","youthful","yowl",
+"yoyo","yucca","yule","yuletide","zany",
+"zeal","zealot","zealotry","zealous","zebra",
+"zebu","zed","zeitgeist","zen","zenana",
+"zenith","zephyr","zeppelin","zero","zest",
+"ziggurat","zigzag","zinc","zinnia","zionism",
+"zip","zipper","zippy","zither","zizz",
+"zodiac","zombi","zombie","zonal","zone",
+"zoning","zonked","zoo","zoologist","zoology",
+"zoom","zoophyte","zouave","zucchini","zulu",
+};
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java
new file mode 100644
index 00000000000..9169a1d335f
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/KStemmer.java
@@ -0,0 +1,1426 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This algorithm is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
+ * under the terms of the Apache License, Version 2.0, which was adapted from
+ * the kstemmer code base which is Copyright 2003, CIIR University of Massachusetts
+ * Amherst (http://ciir.cs.umass.edu) and Licensed under the terms of a modified old-style BSD license.
+ */
+package com.yahoo.language.simple.kstem;
+
+/**
+ * A stemmer implementing the Kstem algorithm by Bob Krovetz.
+ */
+public class KStemmer {
+
+ static private final int MaxWordLen = 50;
+
+ /*
+ * Words ending in 'e' that initializeDictHash() stores as exception entries:
+ * each word is inserted with itself as root and with exception == true.
+ */
+ static private final String[] exceptionWords = {"aide", "bathe", "caste",
+ "cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage",
+ "grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane",
+ "pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune",
+ "sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite",
+ "swinge", "tare", "tine", "tope", "tripe", "twine"};
+
+ /*
+ * Direct {variant, root} mappings. initializeDictHash() keys the dictionary on
+ * the first element and stores a non-exception entry whose root is the second.
+ * Covers short or irregular forms such as "going" -> "go" and "fled" -> "flee".
+ */
+ static private final String[][] directConflations = { {"aging", "age"},
+ {"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"},
+ {"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"},
+ {"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"},
+ {"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"},
+ {"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"},
+ {"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"},
+ {"aeronautical", "aeronautics"}, {"mathematical", "mathematics"},
+ {"political", "politics"}, {"metaphysical", "metaphysics"},
+ {"cylindrical", "cylinder"}, {"nazism", "nazi"},
+ {"ambiguity", "ambiguous"}, {"barbarity", "barbarous"},
+ {"credulity", "credulous"}, {"generosity", "generous"},
+ {"spontaneity", "spontaneous"}, {"unanimity", "unanimous"},
+ {"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}};
+
+ /*
+ * {variant, root} pairs relating nationality/demonym forms and place names.
+ * initializeDictHash() keys the dictionary on the first element and stores a
+ * non-exception entry whose root is the second element. Most pairs map a
+ * demonym to its country, but a few run the other way or map between names
+ * (e.g. "holland" -> "dutch", "slovakia" -> "slovak", "siam" -> "thailand").
+ * The spellings are data used at runtime and must not be "corrected" casually.
+ */
+ static private final String[][] countryNationality = {
+ {"afghan", "afghanistan"}, {"african", "africa"},
+ {"albanian", "albania"}, {"algerian", "algeria"},
+ {"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"},
+ {"arabian", "arabia"}, {"argentine", "argentina"},
+ {"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"},
+ {"austrian", "austria"}, {"azerbaijani", "azerbaijan"},
+ {"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"},
+ {"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"},
+ {"bosnian", "bosnia"}, {"botswanan", "botswana"},
+ {"brazilian", "brazil"}, {"british", "britain"},
+ {"bulgarian", "bulgaria"}, {"burmese", "burma"},
+ {"californian", "california"}, {"cambodian", "cambodia"},
+ {"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"},
+ {"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"},
+ {"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"},
+ {"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"},
+ {"egyptian", "egypt"}, {"equadorian", "equador"},
+ {"eritrean", "eritrea"}, {"estonian", "estonia"},
+ {"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"},
+ {"filipino", "philippines"}, {"finnish", "finland"},
+ {"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"},
+ {"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"},
+ {"grenadan", "grenada"}, {"guamian", "guam"},
+ {"guatemalan", "guatemala"}, {"guinean", "guinea"},
+ {"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"},
+ {"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"},
+ {"icelandic", "iceland"}, {"indonesian", "indonesia"},
+ {"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"},
+ {"irish", "ireland"}, {"israeli", "israel"},
+ {"italian", "italy"},
+ {"jamaican", "jamaica"},
+ {"japanese", "japan"},
+ {"jordanian", "jordan"},
+ {"kampuchean", "cambodia"},
+ {"kenyan", "kenya"},
+ {"korean", "korea"},
+ {"kuwaiti", "kuwait"},
+ {"lankan", "lanka"},
+ {"laotian", "laos"},
+ {"latvian", "latvia"},
+ {"lebanese", "lebanon"},
+ {"liberian", "liberia"},
+ {"libyan", "libya"},
+ {"lithuanian", "lithuania"},
+ {"macedonian", "macedonia"},
+ {"madagascan", "madagascar"},
+ {"malaysian", "malaysia"},
+ {"maltese", "malta"},
+ {"mauritanian", "mauritania"},
+ {"mexican", "mexico"},
+ {"micronesian", "micronesia"},
+ {"moldovan", "moldova"},
+ {"monacan", "monaco"},
+ {"mongolian", "mongolia"},
+ {"montenegran", "montenegro"},
+ {"moroccan", "morocco"},
+ {"myanmar", "burma"},
+ {"namibian", "namibia"},
+ {"nepalese", "nepal"},
+ // {"netherlands", "dutch"},
+ {"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"},
+ {"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"},
+ {"panamanian", "panama"}, {"papuan", "papua"},
+ {"paraguayan", "paraguay"}, {"peruvian", "peru"},
+ {"portuguese", "portugal"}, {"romanian", "romania"},
+ {"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"},
+ {"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"},
+ {"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"},
+ {"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"},
+ {"slovenian", "slovenia"}, {"somali", "somalia"},
+ {"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"},
+ {"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"},
+ {"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"},
+ {"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"},
+ {"ukrainian", "ukraine"}, {"uruguayan", "uruguay"},
+ {"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"},
+ {"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"},
+ {"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"},
+ {"zambian", "zambia"}, {"zealander", "zealand"},
+ {"zimbabwean", "zimbabwe"}};
+
+ /*
+ * Extra headwords added to the dictionary on top of the KStemData tables.
+ * initializeDictHash() stores them with the shared default entry (no root,
+ * not an exception).
+ */
+ static private final String[] supplementDict = {"aids", "applicator",
+ "capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere",
+ "extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph",
+ "toroid", "superconduct", "backscatter", "connectionism"};
+
+ /*
+ * Lower-cased proper nouns added to the dictionary as plain headwords.
+ * initializeDictHash() stores them with the shared default entry (no root,
+ * not an exception), so a dictionary lookup succeeds on them as written.
+ */
+ static private final String[] properNouns = {"abrams", "achilles",
+ "acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred",
+ "algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles",
+ "annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher",
+ "ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor",
+ "barbados", "barger", "bering", "brahms", "brandeis", "brussels",
+ "bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker",
+ "charles", "cheops", "ching", "christmas", "cocos", "collins",
+ "columbus", "confucius", "conners", "connolly", "copernicus", "cramer",
+ "cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies",
+ "davis", "decker", "denning", "dennis", "descartes", "dickens", "doris",
+ "douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries",
+ "ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans",
+ "everglades", "fairbanks", "federales", "fisher", "fitzsimmons",
+ "fleming", "forbes", "fowler", "france", "francis", "goering",
+ "goodling", "goths", "grenadines", "guiness", "hades", "harding",
+ "harris", "hastings", "hawkes", "hawking", "hayes", "heights",
+ "hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras",
+ "hopkins", "hughes", "humphreys", "illinois", "indianapolis",
+ "inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james",
+ "jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius",
+ "kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds",
+ "levis", "leviticus", "lewis", "louis", "maccabees", "madras",
+ "maimonides", "maldive", "massachusetts", "matthews", "mauritius",
+ "memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed",
+ "moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing",
+ "nantes", "naples", "neal", "netherlands", "nevis", "nostradamus",
+ "oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker",
+ "pauling", "peking", "pershing", "peter", "peters", "philippines",
+ "phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais",
+ "ramses", "reynolds", "rhesus", "rhodes", "richards", "robins",
+ "rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates",
+ "texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius",
+ "wales", "warner", "wilkins", "williams", "wyoming", "xmas", "yonkers",
+ "zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares",
+ "aquinas", "arcturus", "ares", "artemis", "augustus", "ayers",
+ "barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing",
+ "boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius",
+ "cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester",
+ "connally", "conner", "coors", "cummings", "curtis", "daedalus",
+ "dionysus", "dobbs", "dolores", "edmonds"};
+
+ /**
+  * Value stored in the stemming dictionary for one headword.
+  *
+  * Instances are immutable: a single default entry is shared by every plain
+  * headword in the static dictionary, so the fields must never be reassigned
+  * after construction.
+  */
+ static class DictEntry {
+   /** True for words registered from the exception list. */
+   final boolean exception;
+   /** Root form this headword maps to, or null when no mapping is recorded. */
+   final String root;
+
+   /**
+    * @param root        root form to record for the headword; may be null
+    * @param isException whether the headword comes from the exception list
+    */
+   DictEntry(String root, boolean isException) {
+     this.root = root;
+     this.exception = isException;
+   }
+ }
+
+ /* Dictionary shared by all instances; populated once by initializeDictHash(). */
+ private static final CharArrayMap<DictEntry> dict_ht = initializeDictHash();
+
+
+ /* Buffer holding the word being stemmed; the suffix routines edit it in place. */
+ private final OpenStringBuilder word = new OpenStringBuilder();
+ private int j; /* index of final letter in stem (within word); set by the endsIn() methods */
+ private int k; /*
+ * INDEX of final letter in word. You must add 1 to k to get
+ * the current length of word. When you want the length of
+ * word, use the method wordLength, which returns (k+1).
+ */
+
+ /*
+ * private void initializeStemHash() { if (maxCacheSize > 0) cache = new
+ * CharArrayMap<String>(maxCacheSize,false); }
+ ***/
+
+ private char finalChar() {
+ return word.charAt(k);
+ }
+
+ private char penultChar() {
+ return word.charAt(k - 1);
+ }
+
+ private boolean isVowel(int index) {
+ return !isCons(index);
+ }
+
+ private boolean isCons(int index) {
+ char ch;
+
+ ch = word.charAt(index);
+
+ if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false;
+ if ((ch != 'y') || (index == 0)) return true;
+ else return (!isCons(index - 1));
+ }
+
+ /**
+  * Builds the dictionary used by all stemmer instances.
+  *
+  * Entries are inserted in this order: exception words (each is its own root,
+  * flagged as an exception), direct conflations, country/nationality pairs,
+  * the KStemData1..8 headword tables, the supplement list and the proper
+  * nouns. All plain headwords share one default entry with no root.
+  *
+  * @return the populated dictionary
+  * @throws RuntimeException if any key occurs more than once across the tables
+  */
+ private static CharArrayMap<DictEntry> initializeDictHash() {
+   CharArrayMap<DictEntry> d = new CharArrayMap<>(1000, false);
+
+   for (String exceptionWord : exceptionWords) {
+     putUnique(d, exceptionWord, new DictEntry(exceptionWord, true), 1);
+   }
+   putConflations(d, directConflations, 2);
+   putConflations(d, countryNationality, 3);
+
+   DictEntry defaultEntry = new DictEntry(null, false);
+
+   // All eight generated tables report duplicates under the same number (4).
+   putAllUnique(d, KStemData1.data, defaultEntry, 4);
+   putAllUnique(d, KStemData2.data, defaultEntry, 4);
+   putAllUnique(d, KStemData3.data, defaultEntry, 4);
+   putAllUnique(d, KStemData4.data, defaultEntry, 4);
+   putAllUnique(d, KStemData5.data, defaultEntry, 4);
+   putAllUnique(d, KStemData6.data, defaultEntry, 4);
+   putAllUnique(d, KStemData7.data, defaultEntry, 4);
+   putAllUnique(d, KStemData8.data, defaultEntry, 4);
+
+   putAllUnique(d, supplementDict, defaultEntry, 5);
+   putAllUnique(d, properNouns, defaultEntry, 6);
+
+   return d;
+ }
+
+ /* Inserts key -> entry, failing if key is already present in the dictionary. */
+ private static void putUnique(CharArrayMap<DictEntry> d, String key, DictEntry entry, int dictionaryNumber) {
+   if (d.containsKey(key)) {
+     throw new RuntimeException("Warning: Entry [" + key
+         + "] already in dictionary " + dictionaryNumber);
+   }
+   d.put(key, entry);
+ }
+
+ /* Inserts every word of words with the same shared entry, rejecting duplicates. */
+ private static void putAllUnique(CharArrayMap<DictEntry> d, String[] words, DictEntry entry, int dictionaryNumber) {
+   for (String w : words) {
+     putUnique(d, w, entry, dictionaryNumber);
+   }
+ }
+
+ /* Inserts each {variant, root} pair as a non-exception entry keyed on the variant. */
+ private static void putConflations(CharArrayMap<DictEntry> d, String[][] pairs, int dictionaryNumber) {
+   for (String[] pair : pairs) {
+     putUnique(d, pair[0], new DictEntry(pair[1], false), dictionaryNumber);
+   }
+ }
+
+ /* True only for the lower-case letters 'a'..'z'; terms must be lowercased already. */
+ private boolean isAlpha(char ch) {
+   return 'a' <= ch && ch <= 'z';
+ }
+
+ /* Length of the stem within word: j is the index of its last letter. */
+ private int stemLength() {
+   return 1 + j;
+ }
+
+ private boolean endsIn(char[] s) {
+ if (s.length > k) return false;
+
+ int r = word.length() - s.length; /* length of word before this suffix */
+ j = k;
+ for (int r1 = r, i = 0; i < s.length; i++, r1++) {
+ if (s[i] != word.charAt(r1)) return false;
+ }
+ j = r - 1; /* index of the character BEFORE the posfix */
+ return true;
+ }
+
+ private boolean endsIn(char a, char b) {
+ if (2 > k) return false;
+ // check left to right since the endings have often already matched
+ if (word.charAt(k - 1) == a && word.charAt(k) == b) {
+ j = k - 2;
+ return true;
+ }
+ return false;
+ }
+
+ private boolean endsIn(char a, char b, char c) {
+ if (3 > k) return false;
+ if (word.charAt(k - 2) == a && word.charAt(k - 1) == b
+ && word.charAt(k) == c) {
+ j = k - 3;
+ return true;
+ }
+ return false;
+ }
+
+ private boolean endsIn(char a, char b, char c, char d) {
+ if (4 > k) return false;
+ if (word.charAt(k - 3) == a && word.charAt(k - 2) == b
+ && word.charAt(k - 1) == c && word.charAt(k) == d) {
+ j = k - 4;
+ return true;
+ }
+ return false;
+ }
+
+ private DictEntry wordInDict() {
+ /***
+ * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0,
+ * word.size()) != matchedEntry) {
+ * System.out.println("Uh oh... cached entry doesn't match"); } return
+ * matchedEntry; }
+ ***/
+ if (matchedEntry != null) return matchedEntry;
+ DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
+ if (e != null && !e.exception) {
+ matchedEntry = e; // only cache if it's not an exception.
+ }
+ // lookups.add(word.toString());
+ return e;
+ }
+
+ /*
+ * Convert plurals to singular form, and '-ies' to 'y'.
+ *
+ * Only acts when the word ends in 's'. Three cases, tried in order:
+ * -ies: drop the final 's' and keep the result if it is in the dictionary,
+ * otherwise replace "ies" with "y".
+ * -es: try dropping "s" (skipped when the stem is a single letter or ends in
+ * "ss"), then try dropping "es", otherwise keep the form ending in "e".
+ * -s: drop the "s" unless the word has 3 letters or fewer, ends in "ss",
+ * or ends in "ous".
+ * The word buffer, j and k are modified in place; lookup() records the match.
+ */
+ private void plural() {
+ if (word.charAt(k) == 's') {
+ if (endsIn('i', 'e', 's')) {
+ /* j is now the index before "ies"; keep "ie" by cutting only the 's' */
+ word.setLength(j + 3);
+ k--;
+ if (lookup()) /* ensure calories -> calorie */
+ return;
+ /* not a word: put the 's' back, then swap the whole "ies" for "y" */
+ k++;
+ word.unsafeWrite('s');
+ setSuffix("y");
+ lookup();
+ } else if (endsIn('e', 's')) {
+ /* try just removing the "s" */
+ word.setLength(j + 2);
+ k--;
+
+ /*
+ * note: don't check for exceptions here. So, `aides' -> `aide', but
+ * `aided' -> `aid'. The exception for double s is used to prevent
+ * crosses -> crosse. This is actually correct if crosses is a plural
+ * noun (a type of racket used in lacrosse), but the verb is much more
+ * common
+ */
+
+ /****
+ * YCS: this was the one place where lookup was not followed by return.
+ * So restructure it. if ((j>0)&&(lookup(word.toString())) &&
+ * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return;
+ *****/
+ /* tryE: the form ending in "e" is only looked up when the stem does not end in "ss" */
+ boolean tryE = j > 0
+ && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
+ if (tryE && lookup()) return;
+
+ /* try removing the "es" */
+
+ word.setLength(j + 1);
+ k--;
+ if (lookup()) return;
+
+ /* the default is to retain the "e" */
+ word.unsafeWrite('e');
+ k++;
+
+ if (!tryE) lookup(); // if we didn't try the "e" ending before
+ return;
+ } else {
+ if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) {
+ /* unless the word ends in "ous" or a double "s", remove the final "s" */
+
+ word.setLength(k);
+ k--;
+ lookup();
+ }
+ }
+ }
+ }
+
+ private void setSuffix(String s) {
+ setSuff(s, s.length());
+ }
+
+ /*
+  * Replaces the old suffix with the first len characters of s: the word is
+  * cut back to the stem (indices 0..j), the characters are appended, and k is
+  * moved to the new last index.
+  */
+ private void setSuff(String s, int len) {
+   word.setLength(j + 1);
+   int copied = 0;
+   while (copied < len) {
+     word.unsafeWrite(s.charAt(copied));
+     copied++;
+   }
+   k = j + len;
+ }
+
+ /*
+ * Dictionary entry recorded by the most recent lookup() (null when that lookup
+ * missed); wordInDict() also stores non-exception hits here.
+ * Almost all uses of lookup() return immediately and are followed by another
+ * lookup in the dict. Storing the match avoids this double lookup.
+ */
+ DictEntry matchedEntry = null;
+
+ private boolean lookup() {
+ matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
+ return matchedEntry != null;
+ }
+
+ // Set<String> lookups = new HashSet<>();
+
+ /*
+ * Convert past tense (-ed) to present, and `-ied' to `y'.
+ *
+ * Words of 4 letters or fewer are left alone. For "-ied": keep "-ie" if that
+ * is a dictionary word, otherwise use "-y". For "-ed" with a vowel in the stem,
+ * try in order: stem + "e" (unless it is an exception entry), bare stem, stem
+ * with a doubled final consonant undoubled; words starting with "un" are
+ * restored unchanged; otherwise the default is stem + "e".
+ * The word buffer, j and k are modified in place.
+ */
+ private void pastTense() {
+ /*
+ * Handle words less than 5 letters with a direct mapping This prevents
+ * (fled -> fl).
+ */
+ if (word.length() <= 4) return;
+
+ if (endsIn('i', 'e', 'd')) {
+ /* cut only the 'd', leaving the form ending in "ie" */
+ word.setLength(j + 3);
+ k--;
+ if (lookup()) /* we almost always want to convert -ied to -y, but */
+ return; /* this isn't true for short words (died->die) */
+ k++; /* I don't know any long words that this applies to, */
+ word.unsafeWrite('d'); /* but just in case... */
+ setSuffix("y");
+ lookup();
+ return;
+ }
+
+ /* the vowelInStem() is necessary so we don't stem acronyms */
+ if (endsIn('e', 'd') && vowelInStem()) {
+ /* see if the root ends in `e' */
+ word.setLength(j + 2);
+ k = j + 1;
+
+ DictEntry entry = wordInDict();
+ if (entry != null) if (!entry.exception) /* in the dictionary and not an exception */
+ return;
+
+ /* try removing the "ed" */
+ word.setLength(j + 1);
+ k = j;
+ if (lookup()) return;
+
+ /*
+ * try removing a doubled consonant. if the root isn't found in the
+ * dictionary, the default is to leave it doubled. This will correctly
+ * capture `backfilled' -> `backfill' instead of `backfill' ->
+ * `backfille', and seems correct most of the time
+ */
+
+ if (doubleC(k)) {
+ word.setLength(k);
+ k--;
+ if (lookup()) return;
+ /* undoubled form not found: re-append the dropped consonant */
+ word.unsafeWrite(word.charAt(k));
+ k++;
+ lookup();
+ return;
+ }
+
+ /* if we have a `un-' prefix, then leave the word alone */
+ /* (this will sometimes screw up with `under-', but we */
+ /* will take care of that later) */
+
+ if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) {
+ /* restore the original "ed" ending */
+ word.unsafeWrite('e');
+ word.unsafeWrite('d');
+ k = k + 2;
+ // nolookup()
+ return;
+ }
+
+ /*
+ * it wasn't found by just removing the `d' or the `ed', so prefer to end
+ * with an `e' (e.g., `microcoded' -> `microcode').
+ */
+
+ word.setLength(j + 1);
+ word.unsafeWrite('e');
+ k = j + 1;
+ // nolookup() - we already tried the "e" ending
+ return;
+ }
+ }
+
+ /*
+  * Returns true if the characters at i - 1 and i are the same consonant,
+  * i.e. the word ends with a double consonant when i is its last index.
+  */
+ private boolean doubleC(int i) {
+   return i >= 1
+       && word.charAt(i) == word.charAt(i - 1)
+       && isCons(i);
+ }
+
+ private boolean vowelInStem() {
+ for (int i = 0; i < stemLength(); i++) {
+ if (isVowel(i)) return true;
+ }
+ return false;
+ }
+
+ /*
+ * Handle `-ing' endings.
+ *
+ * Words of 5 letters or fewer are left alone. For "-ing" with a vowel in the
+ * stem, try in order: stem + "e" (unless it is an exception entry), bare stem,
+ * stem with a doubled final consonant undoubled (default: keep it doubled).
+ * If nothing is found, the default is the bare stem when it ends in two
+ * consonants, otherwise stem + "e".
+ * The word buffer, j and k are modified in place.
+ */
+ private void aspect() {
+ /*
+ * handle short words (aging -> age) via a direct mapping. This prevents
+ * (thing -> the) in the version of this routine that ignores inflectional
+ * variants that are mentioned in the dictionary (when the root is also
+ * present)
+ */
+
+ if (word.length() <= 5) return;
+
+ /* the vowelinstem() is necessary so we don't stem acronyms */
+ if (endsIn('i', 'n', 'g') && vowelInStem()) {
+
+ /* try adding an `e' to the stem and check against the dictionary */
+ /* (the 'i' of "ing" is overwritten with 'e' and the rest cut off) */
+ word.setCharAt(j + 1, 'e');
+ word.setLength(j + 2);
+ k = j + 1;
+
+ DictEntry entry = wordInDict();
+ if (entry != null) {
+ if (!entry.exception) /* if it's in the dictionary and not an exception */
+ return;
+ }
+
+ /* adding on the `e' didn't work, so remove it */
+ word.setLength(k);
+ k--; /* note that `ing' has also been removed */
+
+ if (lookup()) return;
+
+ /* if I can remove a doubled consonant and get a word, then do so */
+ if (doubleC(k)) {
+ k--;
+ word.setLength(k + 1);
+ if (lookup()) return;
+ word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */
+
+ /* the default is to leave the consonant doubled */
+ /* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */
+ /* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */
+ /* Without making the algorithm significantly more complicated, this */
+ /* is the best I can do */
+ k++;
+ lookup();
+ return;
+ }
+
+ /*
+ * the word wasn't in the dictionary after removing the stem, and then
+ * checking with and without a final `e'. The default is to add an `e'
+ * unless the word ends in two consonants, so `microcoding' ->
+ * `microcode'. The two consonants restriction wouldn't normally be
+ * necessary, but is needed because we don't try to deal with prefixes and
+ * compounds, and most of the time it is correct (e.g., footstamping ->
+ * footstamp, not footstampe; however, decoupled -> decoupl). We can
+ * prevent almost all of the incorrect stems if we try to do some prefix
+ * analysis first
+ */
+
+ if ((j > 0) && isCons(j) && isCons(j - 1)) {
+ k = j;
+ word.setLength(k + 1);
+ // nolookup() because we already did according to the comment
+ return;
+ }
+
+ word.setLength(j + 1);
+ word.unsafeWrite('e');
+ k = j + 1;
+ // nolookup(); we already tried an 'e' ending
+ return;
+ }
+ }
+
+ /*
+ * This routine deals with -ity endings. It first tries the bare stem and the
+ * stem + "e" against the dictionary. Failing that, it accepts -ility, -ivity
+ * and -ality without checking the dictionary because they are so productive:
+ * -ility is mapped to -le (ability -> able, ibility -> ible), -ivity to -ive,
+ * and the -ity is removed for -ality. Otherwise the original word is kept if
+ * it is in the dictionary, and -ity is removed as the default.
+ * The word buffer, j and k are modified in place.
+ */
+ private void ityEndings() {
+ int old_k = k;
+
+ if (endsIn('i', 't', 'y')) {
+ word.setLength(j + 1); /* try just removing -ity */
+ k = j;
+ if (lookup()) return;
+ word.unsafeWrite('e'); /* try removing -ity and adding -e */
+ k = j + 1;
+ if (lookup()) return;
+ /* neither worked: turn the 'e' back into 'i' and re-append "ty" */
+ word.setCharAt(j + 1, 'i');
+ word.append("ty");
+ k = old_k;
+ /*
+ * the -ability and -ibility endings are highly productive, so just accept
+ * them (the test only looks at the "il" immediately before -ity)
+ */
+ if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) {
+ word.setLength(j - 1);
+ word.append("le"); /* convert to -ble */
+ k = j;
+ lookup();
+ return;
+ }
+
+ /* ditto for -ivity */
+ if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) {
+ word.setLength(j + 1);
+ word.unsafeWrite('e'); /* convert to -ive */
+ k = j + 1;
+ lookup();
+ return;
+ }
+ /* ditto for -ality */
+ if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) {
+ word.setLength(j + 1);
+ k = j;
+ lookup();
+ return;
+ }
+
+ /*
+ * if the root isn't in the dictionary, and the variant *is* there, then
+ * use the variant. This allows `immunity'->`immune', but prevents
+ * `capacity'->`capac'. If neither the variant nor the root form are in
+ * the dictionary, then remove the ending as a default
+ */
+
+ if (lookup()) return;
+
+ /* the default is to remove -ity altogether */
+ word.setLength(j + 1);
+ k = j;
+ // nolookup(), we already did it.
+ return;
+ }
+ }
+
+ /*
+ * Handle -ence and -ance. Tries, in order, replacing the ending with -e and
+ * removing the ending altogether; the first candidate found by lookup() is
+ * kept. If neither is found, the original word and k are restored.
+ */
+ private void nceEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+ char word_char; /* the vowel in front of -nce, kept so it can be restored */
+
+ if (endsIn('n', 'c', 'e')) {
+ word_char = word.charAt(j);
+ if (!((word_char == 'e') || (word_char == 'a'))) return; /* only -ence/-ance are handled */
+ word.setLength(j);
+ word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */
+ k = j;
+ if (lookup()) return;
+ word.setLength(j); /* try removing -e/ance altogether (disappearance/disappear) */
+ k = j - 1;
+ if (lookup()) return;
+ word.unsafeWrite(word_char); /* restore the original ending */
+ word.append("nce");
+ k = old_k;
+ // nolookup() because we restored the original ending
+ }
+ return;
+ }
+
+ /*
+  * Strips a trailing -ness. The ending is so productive that the shortened
+  * word is accepted whether or not lookup() finds it; a stem left ending in
+  * 'i' gets that 'i' turned into 'y' before the lookup.
+  */
+ private void nessEndings() {
+   if (!endsIn('n', 'e', 's', 's')) {
+     return;
+   }
+
+   k = j;
+   word.setLength(k + 1);
+   if (word.charAt(k) == 'i') {
+     word.setCharAt(k, 'y');
+   }
+   lookup();
+ }
+
+ /*
+  * Strips a trailing -ism. The ending is so productive that the shortened
+  * word is accepted whether or not lookup() finds it.
+  */
+ private void ismEndings() {
+   if (!endsIn('i', 's', 'm')) {
+     return;
+   }
+
+   k = j;
+   word.setLength(k + 1);
+   lookup();
+ }
+
+ /* this routine deals with -ment endings. */
+ private void mentEndings() {
+ int old_k = k;
+
+ if (endsIn('m', 'e', 'n', 't')) {
+ word.setLength(j + 1);
+ k = j;
+ if (lookup()) return;
+ word.append("ment");
+ k = old_k;
+ // nolookup
+ }
+ return;
+ }
+
+ /*
+ * This routine deals with -ize endings. It tries, in order: removing -ize,
+ * removing -ize plus one letter of a doubled consonant, and replacing -ize
+ * with -e. The first candidate found by lookup() is kept; if none is found
+ * the original word and k are restored.
+ */
+ private void izeEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+
+ if (endsIn('i', 'z', 'e')) {
+ word.setLength(j + 1); /* try removing -ize entirely */
+ k = j;
+ if (lookup()) return;
+ word.unsafeWrite('i'); /* put the 'i' of the ending back */
+
+ if (doubleC(j)) { /* allow for a doubled consonant */
+ word.setLength(j);
+ k = j - 1;
+ if (lookup()) return;
+ word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
+ }
+
+ word.setLength(j + 1);
+ word.unsafeWrite('e'); /* try removing -ize and adding -e */
+ k = j + 1;
+ if (lookup()) return;
+ word.setLength(j + 1);
+ word.append("ize"); /* restore the original ending */
+ k = old_k;
+ // nolookup()
+ }
+ return;
+ }
+
+ /*
+ * Handle -ency and -ancy. The ending is converted to -ent/-ant if lookup()
+ * finds that form; otherwise it is converted to -ence/-ance, which is kept
+ * whether or not lookup() finds it.
+ */
+ private void ncyEndings() {
+ if (endsIn('n', 'c', 'y')) {
+ if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return; /* only -ency/-ancy are handled */
+ word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */
+ word.setLength(j + 3);
+ k = j + 2;
+
+ if (lookup()) return;
+
+ word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */
+ word.unsafeWrite('e');
+ k = j + 3;
+ lookup(); /* return value ignored: -nce is the default stem */
+ }
+ return;
+ }
+
+ /*
+ * Handle -able and -ible. Tries, in order: removing the ending, removing it
+ * plus one letter of a doubled consonant, replacing it with -e, and
+ * replacing it with -ate. The first candidate found by lookup() is kept; if
+ * none is found the original word and k are restored.
+ */
+ private void bleEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+ char word_char; /* the vowel in front of -ble, kept so it can be restored */
+
+ if (endsIn('b', 'l', 'e')) {
+ if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return; /* only -able/-ible are handled */
+ word_char = word.charAt(j);
+ word.setLength(j); /* try just removing the ending */
+ k = j - 1;
+ if (lookup()) return;
+ if (doubleC(k)) { /* allow for a doubled consonant */
+ word.setLength(k);
+ k--;
+ if (lookup()) return;
+ k++;
+ word.unsafeWrite(word.charAt(k - 1)); /* restore the doubled consonant */
+ }
+ word.setLength(j);
+ word.unsafeWrite('e'); /* try removing -a/ible and adding -e */
+ k = j;
+ if (lookup()) return;
+ word.setLength(j);
+ word.append("ate"); /* try removing -able and adding -ate */
+ /* (e.g., compensable/compensate) */
+ k = j + 2;
+ if (lookup()) return;
+ word.setLength(j);
+ word.unsafeWrite(word_char); /* restore the original values */
+ word.append("ble");
+ k = old_k;
+ // nolookup()
+ }
+ return;
+ }
+
+ /*
+ * handle -ic endings. This is fairly straightforward, but this is also the
+ * only place we try *expanding* an ending, -ic -> -ical. This is to handle
+ * cases like `canonic' -> `canonical'.
+ *
+ * Candidates are tried in this order: -ical, -y, -e, and plain removal of
+ * -ic. The first one found by lookup() is kept; if none is found the -ic
+ * ending and k are restored.
+ */
+ private void icEndings() {
+ if (endsIn('i', 'c')) {
+ word.setLength(j + 3); /* j + 3 is the length of the stem plus -ic */
+ word.append("al"); /* try converting -ic to -ical */
+ k = j + 4;
+ if (lookup()) return;
+
+ word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */
+ word.setLength(j + 2);
+ k = j + 1;
+ if (lookup()) return;
+
+ word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */
+ if (lookup()) return;
+
+ word.setLength(j + 1); /* try removing -ic altogether */
+ k = j;
+ if (lookup()) return;
+ word.append("ic"); /* restore the original ending */
+ k = j + 2;
+ // nolookup()
+ }
+ return;
+ }
+
+ private static char[] ization = "ization".toCharArray();
+ private static char[] ition = "ition".toCharArray();
+ private static char[] ation = "ation".toCharArray();
+ private static char[] ication = "ication".toCharArray();
+
+ /* handle some derivational endings */
+ /*
+ * this routine deals with -ion, -ition, -ation, -ization, and -ication. The
+ * -ization ending is always converted to -ize.
+ *
+ * The more specific endings are tried first. Each attempt that lookup() does
+ * not find restores the word and k before the next ending is tried, so the
+ * final plain -ion step always starts from the original word.
+ */
+ private void ionEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+ if (!endsIn('i', 'o', 'n')) {
+ return;
+ }
+
+ if (endsIn(ization)) { /* the -ize ending is very productive, so simply accept it as the root */
+ word.setLength(j + 3);
+ word.unsafeWrite('e');
+ k = j + 3;
+ lookup(); /* return value ignored: -ize is accepted either way */
+ return;
+ }
+
+ if (endsIn(ition)) {
+ word.setLength(j + 1);
+ word.unsafeWrite('e');
+ k = j + 1;
+ if (lookup()) /* remove -ition and add `e', and check against the dictionary */
+ return; /* (e.g., definition->define, opposition->oppose) */
+
+ /* restore original values */
+ word.setLength(j + 1);
+ word.append("ition");
+ k = old_k;
+ // nolookup()
+ } else if (endsIn(ation)) {
+ word.setLength(j + 3);
+ word.unsafeWrite('e');
+ k = j + 3;
+ if (lookup()) /* remove -ion and add `e', and check against the dictionary */
+ return; /* (elimination -> eliminate) */
+
+ word.setLength(j + 1);
+ word.unsafeWrite('e'); /* remove -ation and add `e', and check against the dictionary */
+ k = j + 1;
+ if (lookup()) return;
+
+ word.setLength(j + 1);/* just remove -ation (resignation->resign) and check dictionary */
+ k = j;
+ if (lookup()) return;
+
+ /* restore original values */
+ word.setLength(j + 1);
+ word.append("ation");
+ k = old_k;
+ // nolookup()
+
+ }
+
+ /*
+ * test -ication after -ation is attempted (e.g., `complication->complicate'
+ * rather than `complication->comply')
+ */
+
+ if (endsIn(ication)) {
+ word.setLength(j + 1);
+ word.unsafeWrite('y');
+ k = j + 1;
+ if (lookup()) /* remove -ication and add `y', and check against the dictionary */
+ return; /* (e.g., amplification -> amplify) */
+
+ /* restore original values */
+ word.setLength(j + 1);
+ word.append("ication");
+ k = old_k;
+ // nolookup()
+ }
+
+ // if (endsIn(ion)) {
+ if (true) { // we checked for this earlier... just need to set "j"
+ j = k - 3; // YCS: index of the last character in front of -ion
+
+ word.setLength(j + 1);
+ word.unsafeWrite('e');
+ k = j + 1;
+ if (lookup()) /* remove -ion and add `e', and check against the dictionary */
+ return;
+
+ word.setLength(j + 1);
+ k = j;
+ if (lookup()) /* remove -ion, and if it's found, treat that as the root */
+ return;
+
+ /* restore original values */
+ word.setLength(j + 1);
+ word.append("ion");
+ k = old_k;
+ // nolookup()
+ }
+
+ // nolookup(); all of the other paths restored original values
+ return;
+ }
+
+ /*
+ * this routine deals with -er, -or, -ier, and -eer. The -izer ending is
+ * always converted to -ize.
+ *
+ * For -er/-or the candidates are tried in this order: stem without one
+ * letter of a doubled consonant, -ier -> -y, -eer -> stem, dropping only the
+ * final -r, dropping -er/-or, and replacing -er/-or with -e. The first one
+ * found by lookup() is kept; if none is found the original word and k are
+ * restored.
+ */
+ private void erAndOrEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+
+ if (word.charAt(k) != 'r') return; // YCS: cheap rejection before the endsIn() tests
+
+ char word_char; /* so we can remember if it was -er or -or */
+
+ if (endsIn('i', 'z', 'e', 'r')) { /* -ize is very productive, so accept it as the root */
+ word.setLength(j + 4); /* keep the stem plus -ize, dropping only the -r */
+ k = j + 3;
+ lookup(); /* return value ignored: -ize is accepted either way */
+ return;
+ }
+
+ if (endsIn('e', 'r') || endsIn('o', 'r')) {
+ word_char = word.charAt(j + 1);
+ if (doubleC(j)) {
+ word.setLength(j);
+ k = j - 1;
+ if (lookup()) return;
+ word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
+ }
+
+ if (word.charAt(j) == 'i') { /* do we have a -ier ending? */
+ word.setCharAt(j, 'y');
+ word.setLength(j + 1);
+ k = j;
+ if (lookup()) /* yes, so check against the dictionary */
+ return;
+ word.setCharAt(j, 'i'); /* restore the endings */
+ word.unsafeWrite('e');
+ }
+
+ if (word.charAt(j) == 'e') { /* handle -eer */
+ word.setLength(j);
+ k = j - 1;
+ if (lookup()) return;
+ word.unsafeWrite('e');
+ }
+
+ word.setLength(j + 2); /* remove the -r ending */
+ k = j + 1;
+ if (lookup()) return;
+ word.setLength(j + 1); /* try removing -er/-or */
+ k = j;
+ if (lookup()) return;
+ word.unsafeWrite('e'); /* try removing -or and adding -e */
+ k = j + 1;
+ if (lookup()) return;
+ word.setLength(j + 1);
+ word.unsafeWrite(word_char);
+ word.unsafeWrite('r'); /* restore the word to the way it was */
+ k = old_k;
+ // nolookup()
+ }
+
+ }
+
+ /*
+ * this routine deals with -ly endings. The -ally ending is always converted
+ * to -al. Sometimes this will temporarily leave us with a non-word (e.g.,
+ * heuristically maps to heuristical), but then the -al is removed in the next
+ * step.
+ *
+ * Order of attempts: -ly -> -le, plain removal of -ly, the unconditional
+ * -ally -> -al and -ably -> -able conversions, and -ily -> -y. If nothing
+ * matches, the -ly is removed as the default.
+ */
+ private void lyEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+
+ if (endsIn('l', 'y')) {
+
+ word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */
+
+ if (lookup()) return;
+ word.setCharAt(j + 2, 'y'); /* not found: put the 'y' back */
+
+ word.setLength(j + 1); /* try just removing the -ly */
+ k = j;
+
+ if (lookup()) return;
+
+ if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) /* always convert -ally to -al */
+ return;
+ word.append("ly"); /* restore the original ending */
+ k = old_k;
+
+ if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) { // always convert 'ably' to 'able'
+ word.setCharAt(j + 2, 'e');
+ k = j + 2;
+ return;
+ }
+
+ if (word.charAt(j) == 'i') { /* e.g., militarily -> military */
+ word.setLength(j);
+ word.unsafeWrite('y');
+ k = j;
+ if (lookup()) return;
+ word.setLength(j);
+ word.append("ily"); /* not found: restore the original ending */
+ k = old_k;
+ }
+
+ word.setLength(j + 1); /* the default is to remove -ly */
+
+ k = j;
+ // nolookup()... we already tried removing the "ly" variant
+ }
+ return;
+ }
+
+ /*
+ * this routine deals with -al endings. Some of the endings from the previous
+ * routine are finished up here.
+ *
+ * Words shorter than four characters are left alone. Otherwise the
+ * candidates are tried in this order: removing -al, removing -al plus one
+ * letter of a doubled consonant, -al -> -e, and -al -> -um. If none is found
+ * the word is restored, after which -ical and -ial get special treatment.
+ */
+ private void alEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+
+ if (word.length() < 4) return;
+ if (endsIn('a', 'l')) {
+ word.setLength(j + 1);
+ k = j;
+ if (lookup()) /* try just removing the -al */
+ return;
+
+ if (doubleC(j)) { /* allow for a doubled consonant */
+ word.setLength(j);
+ k = j - 1;
+ if (lookup()) return;
+ word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
+ }
+
+ word.setLength(j + 1);
+ word.unsafeWrite('e'); /* try removing the -al and adding -e */
+ k = j + 1;
+ if (lookup()) return;
+
+ word.setLength(j + 1);
+ word.append("um"); /* try converting -al to -um */
+ /* (e.g., optimal - > optimum ) */
+ k = j + 2;
+ if (lookup()) return;
+
+ word.setLength(j + 1);
+ word.append("al"); /* restore the ending to the way it was */
+ k = old_k;
+
+ if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) {
+ word.setLength(j - 1); /* try removing -ical */
+ k = j - 2;
+ if (lookup()) return;
+
+ word.setLength(j - 1);
+ word.unsafeWrite('y');/* try turning -ical to -y (e.g., bibliographical) */
+ k = j - 1;
+ if (lookup()) return;
+
+ word.setLength(j - 1);
+ word.append("ic"); /* the default is to convert -ical to -ic */
+ k = j;
+ // nolookup() ... converting ical to ic means removing "al" which we
+ // already tried
+ // ERROR (marker left by the original author; the lookup below is performed anyway)
+ lookup();
+ return;
+ }
+
+ if (word.charAt(j) == 'i') { /* sometimes -ial endings should be removed */
+ word.setLength(j); /* (sometimes it gets turned into -y, but we */
+ k = j - 1; /* aren't dealing with that case for now) */
+ if (lookup()) return;
+ word.append("ial"); /* not found: restore the original ending */
+ k = old_k;
+ lookup(); /* look up the restored word itself; the return value is ignored */
+ }
+
+ }
+ return;
+ }
+
+ /*
+ * this routine deals with -ive endings. It normalizes some of the -ative
+ * endings directly, and also maps some -ive endings to -ion.
+ *
+ * Order of attempts: removing -ive, -ive -> -e, then for -ative words
+ * -ative -> -e and plain removal of -ative, and finally -ive -> -ion. If
+ * none is found by lookup() the original word and k are restored.
+ */
+ private void iveEndings() {
+ int old_k = k; /* original end index, needed to restore the word */
+
+ if (endsIn('i', 'v', 'e')) {
+ word.setLength(j + 1); /* try removing -ive entirely */
+ k = j;
+ if (lookup()) return;
+
+ word.unsafeWrite('e'); /* try removing -ive and adding -e */
+ k = j + 1;
+ if (lookup()) return;
+ word.setLength(j + 1);
+ word.append("ive"); /* restore the original ending */
+ if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) {
+ word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */
+ word.setLength(j); /* (e.g., determinative -> determine) */
+ k = j - 1;
+ if (lookup()) return;
+ word.setLength(j - 1); /* try just removing -ative */
+ if (lookup()) return; /* NOTE(review): k is still j - 1 here although the word now ends at j - 2 - confirm this is harmless */
+
+ word.append("ative"); /* restore the original ending */
+ k = old_k;
+ }
+
+ /* try mapping -ive to -ion (e.g., injunctive/injunction) */
+ word.setCharAt(j + 2, 'o');
+ word.setCharAt(j + 3, 'n');
+ if (lookup()) return;
+
+ word.setCharAt(j + 2, 'v'); /* restore the original values */
+ word.setCharAt(j + 3, 'e');
+ k = old_k;
+ // nolookup()
+ }
+ return;
+ }
+
+ /** Creates a stemmer; the constructor itself performs no set-up. */
+ public KStemmer() {
+ }
+
+ /**
+  * Stems a single term.
+  *
+  * @param term the term to stem
+  * @return the stem as reported by {@link #asString()}, or {@code term} itself
+  *         when the character-array variant of stem reports no change
+  */
+ public String stem(String term) {
+   char[] chars = term.toCharArray();
+   return stem(chars, term.length()) ? asString() : term;
+ }
+
+ /**
+  * Returns the outcome of the last stem call as a String, assuming the word
+  * was changed: the recorded root if there is one, otherwise the contents of
+  * the word buffer.
+  */
+ String asString() {
+   String root = getString();
+   return (root != null) ? root : word.toString();
+ }
+
+ /**
+  * Returns the outcome of the last stem call without copying: the recorded
+  * root if there is one, otherwise the word buffer itself.
+  */
+ CharSequence asCharSequence() {
+   if (result == null) {
+     return word;
+   }
+   return result;
+ }
+
+ String getString() {
+ return result;
+ }
+
+ char[] getChars() {
+ return word.getArray();
+ }
+
+ int getLength() {
+ return word.length();
+ }
+
+ String result; // root taken from a dictionary entry by stem(char[], int); null means the word buffer holds the stem
+
+ /**
+  * Tells whether a dictionary entry is currently recorded in matchedEntry.
+  * No lookup is performed here; this only inspects the recorded entry.
+  */
+ private boolean matched() {
+   if (matchedEntry == null) {
+     return false;
+   }
+   return true;
+ }
+
+ /**
+ * Stems the text in the token. Returns true if changed.
+ *
+ * The outcome is left in this object: in "result" when a dictionary entry
+ * supplies a root, otherwise in the word buffer (see asString()).
+ *
+ * Returns false without stemming when len is at most 2 or at least
+ * MaxWordLen, when the term is a dictionary entry without a root, or when
+ * isAlpha() rejects one of its characters. Every other path returns true,
+ * including the one where no ending rule altered the word.
+ *
+ * @param term characters of the term; must already be lowercased
+ * @param len number of characters of term to use
+ */
+ boolean stem(char[] term, int len) {
+
+ result = null;
+
+ k = len - 1; // index of the last character of the word
+ if ((k <= 1) || (k >= MaxWordLen - 1)) {
+ return false; // don't stem
+ }
+
+ // first check the stemmer dictionaries, and avoid using the
+ // cache if it's in there.
+ DictEntry entry = dict_ht.get(term, 0, len);
+ if (entry != null) {
+ if (entry.root != null) {
+ result = entry.root;
+ return true;
+ }
+ return false; // entry without a root: the term is reported as unchanged
+ }
+
+ /***
+ * caching off is normally faster if (cache == null) initializeStemHash();
+ *
+ * // now check the cache, before we copy chars to "word" if (cache != null)
+ * { String val = cache.get(term, 0, len); if (val != null) { if (val !=
+ * SAME) { result = val; return true; } return false; } }
+ ***/
+
+ word.reset();
+ // allocate enough space so that an expansion is never needed
+ word.reserve(len + 10);
+ for (int i = 0; i < len; i++) {
+ char ch = term[i];
+ if (!isAlpha(ch)) return false; // don't stem
+ // don't lowercase... it's a requirement that lowercase filter be
+ // used before this stemmer.
+ word.unsafeWrite(ch);
+ }
+
+ matchedEntry = null;
+ /***
+ * lookups.clear(); lookups.add(word.toString());
+ ***/
+
+ /*
+ * This while loop will never be executed more than one time; it is here
+ * only to allow the break statement to be used to escape as soon as a word
+ * is recognized
+ */
+ while (true) {
+ // YCS: extra lookup()s were inserted so we don't need to
+ // do an extra wordInDict() here.
+ plural();
+ if (matched()) break;
+ pastTense();
+ if (matched()) break;
+ aspect();
+ if (matched()) break;
+ ityEndings();
+ if (matched()) break;
+ nessEndings();
+ if (matched()) break;
+ ionEndings();
+ if (matched()) break;
+ erAndOrEndings();
+ if (matched()) break;
+ lyEndings();
+ if (matched()) break;
+ alEndings();
+ if (matched()) break;
+ entry = wordInDict(); // NOTE(review): this value is overwritten below before it is read; wordInDict() is defined outside this section, so whether the call matters for a side effect is unconfirmed
+ iveEndings();
+ if (matched()) break;
+ izeEndings();
+ if (matched()) break;
+ mentEndings();
+ if (matched()) break;
+ bleEndings();
+ if (matched()) break;
+ ismEndings();
+ if (matched()) break;
+ icEndings();
+ if (matched()) break;
+ ncyEndings();
+ if (matched()) break;
+ nceEndings();
+ matched(); // return value unused; the loop is left on the next line regardless
+ break;
+ }
+
+ /*
+ * try for a direct mapping (allows for cases like `Italian'->`Italy' and
+ * `Italians'->`Italy')
+ */
+ entry = matchedEntry;
+ if (entry != null) {
+ result = entry.root; // may be null, which means that "word" is the stem
+ }
+
+ // no entry matched means result is "word"
+ return true; // reported as changed even when no rule above altered the word
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java
new file mode 100644
index 00000000000..4f10c058424
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/OpenStringBuilder.java
@@ -0,0 +1,136 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This is adapted from the Lucene code base which is Copyright 2008 Apache Software Foundation and Licensed
+ * under the terms of the Apache License, Version 2.0.
+ */
+package com.yahoo.language.simple.kstem;
+
+/**
+ * A StringBuilder that allows one to access the array.
+ */
+public class OpenStringBuilder implements Appendable, CharSequence {
+
+ protected char[] buf;
+ protected int len;
+
+ public OpenStringBuilder() {
+ this(32);
+ }
+
+ public OpenStringBuilder(int size) {
+ buf = new char[size];
+ }
+
+ public void setLength(int len) { this.len = len; }
+
+ public void set(char[] arr, int end) {
+ this.buf = arr;
+ this.len = end;
+ }
+
+ public char[] getArray() { return buf; }
+ public int size() { return len; }
+ @Override
+ public int length() { return len; }
+ public int capacity() { return buf.length; }
+
+ @Override
+ public Appendable append(CharSequence csq) {
+ return append(csq, 0, csq.length());
+ }
+
+ @Override
+ public Appendable append(CharSequence csq, int start, int end) {
+ reserve(end-start);
+ for (int i=start; i<end; i++) {
+ unsafeWrite(csq.charAt(i));
+ }
+ return this;
+ }
+
+ @Override
+ public Appendable append(char c) {
+ write(c);
+ return this;
+ }
+
+ @Override
+ public char charAt(int index) {
+ return buf[index];
+ }
+
+ public void setCharAt(int index, char ch) {
+ buf[index] = ch;
+ }
+
+ @Override
+ public CharSequence subSequence(int start, int end) {
+ throw new UnsupportedOperationException(); // todo
+ }
+
+ public void unsafeWrite(char b) {
+ buf[len++] = b;
+ }
+
+ public void unsafeWrite(char b[], int off, int len) {
+ System.arraycopy(b, off, buf, this.len, len);
+ this.len += len;
+ }
+
+ protected void resize(int len) {
+ char newbuf[] = new char[Math.max(buf.length << 1, len)];
+ System.arraycopy(buf, 0, newbuf, 0, size());
+ buf = newbuf;
+ }
+
+ public void reserve(int num) {
+ if (len + num > buf.length) resize(len + num);
+ }
+
+ public void write(char b) {
+ if (len >= buf.length) {
+ resize(len +1);
+ }
+ unsafeWrite(b);
+ }
+
+ public void write(int b) { write((char)b); }
+
+ public final void write(char[] b) {
+ write(b,0,b.length);
+ }
+
+ public void write(char b[], int off, int len) {
+ reserve(len);
+ unsafeWrite(b, off, len);
+ }
+
+ public final void write(OpenStringBuilder arr) {
+ write(arr.buf, 0, len);
+ }
+
+ public void write(String s) {
+ reserve(s.length());
+ s.getChars(0,s.length(),buf, len);
+ len +=s.length();
+ }
+
+ public void flush() {
+ }
+
+ public final void reset() {
+ len =0;
+ }
+
+ public char[] toCharArray() {
+ char newbuf[] = new char[size()];
+ System.arraycopy(buf, 0, newbuf, 0, size());
+ return newbuf;
+ }
+
+ @Override
+ public String toString() {
+ return new String(buf, 0, size());
+ }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/package-info.java b/linguistics/src/main/java/com/yahoo/language/simple/package-info.java
new file mode 100644
index 00000000000..722002d6bcc
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/package-info.java
@@ -0,0 +1,9 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+package com.yahoo.language.simple;
+
+import com.yahoo.osgi.annotation.ExportPackage;
+
+/**
+ * A set of simple dependency-free linguistics processors suitable for testing.
+ */
diff --git a/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java b/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java
new file mode 100644
index 00000000000..c99c4009c4c
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java
@@ -0,0 +1,107 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language;
+
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.*;
+
+/**
+ * @author Rich Pito
+ */
+public class LanguageTestCase {
+
+ @Test
+ public void requireThatSpecificLanguagesAreCjk() {
+ List<Language> cjk = Arrays.asList(Language.CHINESE_SIMPLIFIED,
+ Language.CHINESE_TRADITIONAL,
+ Language.JAPANESE,
+ Language.KOREAN,
+ Language.THAI);
+ for (Language language : cjk) {
+ assertTrue(language.toString(), language.isCjk());
+ }
+ for (Language language : Language.values()) {
+ if (cjk.contains(language)) {
+ continue;
+ }
+ assertFalse(language.toString(), language.isCjk());
+ }
+ }
+
+ @Test
+ public void requireThatLanguageTagsAreRecognized() {
+ assertLanguage(Language.ARABIC, "ar");
+ assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-hans");
+ assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-Hans");
+ assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-foo-CN");
+ assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-CN");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh-foo");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh-hant");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant-TW");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant-HK");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh-foo-TW");
+ assertLanguage(Language.CHINESE_TRADITIONAL, "zh-TW");
+ assertLanguage(Language.CROATIAN, "hr");
+ assertLanguage(Language.DANISH, "da");
+ assertLanguage(Language.DUTCH, "nl");
+ assertLanguage(Language.ENGLISH, "en");
+ assertLanguage(Language.ENGLISH, "en-CA");
+ assertLanguage(Language.ENGLISH, "en-GB");
+ assertLanguage(Language.ENGLISH, "en-US");
+ assertLanguage(Language.ENGLISH, "en-Latn-i-oed-1992");
+ assertLanguage(Language.FINNISH, "fi");
+ assertLanguage(Language.FRENCH, "fr");
+ assertLanguage(Language.FRENCH, "fr-FR");
+ assertLanguage(Language.GERMAN, "de");
+ assertLanguage(Language.GERMAN, "de-DE");
+ assertLanguage(Language.GREEK, "el");
+ assertLanguage(Language.ITALIAN, "it");
+ assertLanguage(Language.ITALIAN, "it-IT");
+ assertLanguage(Language.JAPANESE, "ja");
+ assertLanguage(Language.KOREAN, "ko");
+ assertLanguage(Language.NORWEGIAN_BOKMAL, "no");
+ assertLanguage(Language.NORWEGIAN_BOKMAL, "nb");
+ assertLanguage(Language.POLISH, "pl");
+ assertLanguage(Language.PORTUGUESE, "pt");
+ assertLanguage(Language.ROMANIAN, "ro");
+ assertLanguage(Language.RUSSIAN, "ru");
+ assertLanguage(Language.SPANISH, "es");
+ assertLanguage(Language.SPANISH, "es-ES");
+ assertLanguage(Language.SPANISH, "es-419");
+ assertLanguage(Language.SWEDISH, "sv");
+ assertLanguage(Language.THAI, "th");
+ assertLanguage(Language.TURKISH, "tr");
+ assertLanguage(Language.VIETNAMESE, "vi");
+
+ assertLanguage(Language.UNKNOWN, null);
+ assertLanguage(Language.UNKNOWN, "");
+ assertLanguage(Language.UNKNOWN, "und");
+ assertLanguage(Language.UNKNOWN, "z-foo");
+ assertLanguage(Language.UNKNOWN, "ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe");
+ assertLanguage(Language.UNKNOWN, "#$_^@#$_@%#$)%@$%^--@&&&#-%^_^%");
+ }
+
+ @Test
+ public void requireThatLanguageIsGuessedCorrectlyFromEncodings() {
+ assertSame(Language.UNKNOWN, Language.fromEncoding(null));
+ assertSame(Language.UNKNOWN, Language.fromEncoding("lkij"));
+ assertSame(Language.UNKNOWN, Language.fromEncoding("(/)(###)"));
+
+ assertSame(Language.CHINESE_SIMPLIFIED, Language.fromEncoding("GB2312"));
+ assertSame(Language.CHINESE_TRADITIONAL, Language.fromEncoding("BIG5"));
+ assertSame(Language.JAPANESE, Language.fromEncoding("EUC-jp"));
+ assertSame(Language.JAPANESE, Language.fromEncoding("ISO-2022-jp"));
+ assertSame(Language.JAPANESE, Language.fromEncoding("Shift-JIS"));
+ assertSame(Language.KOREAN, Language.fromEncoding("EUC-kr"));
+ }
+
+ private static void assertLanguage(Language expected, String str) {
+ assertSame(expected, Language.fromLanguageTag(str));
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java b/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java
new file mode 100644
index 00000000000..910627584ce
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java
@@ -0,0 +1,52 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language;
+
+import org.junit.Test;
+
+import java.util.Locale;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class LocaleFactoryTestCase {
+
+ @Test
+ public void requireThatLocaleCanBeCreatedFromLanguageTag() {
+ assertLocale("zh", "zh", "", "");
+ assertLocale("zh-CN", "zh", "", "CN");
+ assertLocale("zh-foo-CN", "zh", "", "CN");
+ assertLocale("zh-Hans", "zh", "Hans", "");
+ assertLocale("zh-TW", "zh", "", "TW");
+ assertLocale("zh-foo-TW", "zh", "", "TW");
+ assertLocale("zh-Hant", "zh", "Hant", "");
+ assertLocale("ja", "ja", "", "");
+ assertLocale("ko", "ko", "", "");
+ assertLocale("en", "en", "", "");
+ assertLocale("en-NO", "en", "", "NO");
+ assertLocale("de", "de", "", "");
+ assertLocale("es", "es", "", "");
+ assertLocale("es-419", "es", "", "419");
+
+ try {
+ LocaleFactory.fromLanguageTag(null);
+ fail();
+ } catch (NullPointerException e) {
+
+ }
+
+ assertLocale("", "", "", "");
+ assertLocale("z-foo", "", "", "");
+ assertLocale("ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe", "", "", "");
+ }
+
+ private static void assertLocale(String tag, String language, String variant, String country) {
+ Locale locale = LocaleFactory.fromLanguageTag(tag);
+ assertEquals(language, locale.getLanguage());
+ assertEquals(country, locale.getCountry());
+ assertEquals(variant, locale.getVariant());
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java
new file mode 100644
index 00000000000..aa8102fe9f2
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.detect;
+
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+
+import static org.junit.Assert.*;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class AbstractDetectorTestCase {
+
+ private static final Detection DETECTION = new Detection(Language.ARABIC, "encoding", true);
+ private static final Charset UTF8 = Charset.forName("UTF-8");
+
+ @Test
+ public void requireThatDetectStringForwardsUtf8Bytes() {
+ Hint hint = Hint.newCountryHint("no");
+ MyDetector detector = new MyDetector();
+ Detection detection = detector.detect("69", hint);
+ assertSame(DETECTION, detection);
+ assertArrayEquals("69".getBytes(UTF8), detector.input);
+ assertEquals(0, detector.offset);
+ assertEquals(2, detector.length);
+ assertSame(hint, detector.hint);
+ }
+
+ @Test
+ public void requireThatDetectByteBufferForwardsUtf8Bytes() {
+ byte[] buf = new byte[] { 6, 9 };
+ Hint hint = Hint.newCountryHint("no");
+ MyDetector detector = new MyDetector();
+ Detection detection = detector.detect(ByteBuffer.wrap(buf), hint);
+ assertSame(DETECTION, detection);
+ assertArrayEquals(buf, detector.input);
+ assertEquals(0, detector.offset);
+ assertEquals(2, detector.length);
+ assertSame(hint, detector.hint);
+ }
+
+ private static class MyDetector extends AbstractDetector {
+
+ byte[] input;
+ int offset;
+ int length;
+ Hint hint;
+
+ @Override
+ public Detection detect(byte[] input, int offset, int length, Hint hint) {
+ this.input = input;
+ this.offset = offset;
+ this.length = length;
+ this.hint = hint;
+ return DETECTION;
+ }
+ }
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java
new file mode 100644
index 00000000000..3cb82572976
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java
@@ -0,0 +1,66 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.Linguistics;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public abstract class AbstractTokenizerTestCase {
+
+ private boolean accentDrop = false;
+ private Language language = Language.ENGLISH;
+ private Linguistics linguistics;
+ private StemMode stemMode = StemMode.NONE;
+
+ public void assertTokenStrings(String input, List<String> expectedTokenStrings) {
+ List<String> actual = new ArrayList<>();
+ for (Token token : tokenize(input)) {
+ findTokenStrings(token, actual);
+ }
+ assertEquals(expectedTokenStrings, actual);
+ }
+
+ public List<String> findTokenStrings(Token token, List<String> out) {
+ int numComponents = token.getNumComponents();
+ if (token.isSpecialToken() || numComponents == 0) {
+ out.add(token.getTokenString());
+ } else {
+ for (int i = 0; i < numComponents; ++i) {
+ findTokenStrings(token.getComponent(i), out);
+ }
+ }
+ return out;
+ }
+
+ public Iterable<Token> tokenize(String input) {
+ return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop);
+ }
+
+ public AbstractTokenizerTestCase setAccentDrop(boolean accentDrop) {
+ this.accentDrop = accentDrop;
+ return this;
+ }
+
+ public AbstractTokenizerTestCase setLanguage(Language language) {
+ this.language = language;
+ return this;
+ }
+
+ public AbstractTokenizerTestCase setLinguistics(Linguistics linguistics) {
+ this.linguistics = linguistics;
+ return this;
+ }
+
+ public AbstractTokenizerTestCase setStemMode(StemMode stemMode) {
+ this.stemMode = stemMode;
+ return this;
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
new file mode 100644
index 00000000000..8233ef1b8f0
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -0,0 +1,150 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.simple.SimpleLinguistics;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.*;
+
+/**
+ * @author bratseth
+ */
+public class GramSplitterTestCase {
+
+ private static final GramSplitter gramSplitter = new SimpleLinguistics().getGramSplitter();
+
+ @Test
+ public void testNoSpaces() {
+ // Input without whitespace: every run of gramSize consecutive characters becomes a gram.
+ assertGramSplit("engulbillesang", 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]");
+ assertGramSplit("engulbillesang", 2, "[en, ng, gu, ul, lb, bi, il, ll, le, es, sa, an, ng]");
+ assertGramSplit("engulbillesang", 3, "[eng, ngu, gul, ulb, lbi, bil, ill, lle, les, esa, san, ang]");
+ }
+
+ @Test
+ public void testWithSpaces() {
+ // Grams never span a space, and a word shorter than gramSize is returned whole ("en" at size 3).
+ assertGramSplit("en gul bille sang", 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]");
+ assertGramSplit("en gul bille sang", 2, "[en, gu, ul, bi, il, ll, le, sa, an, ng]");
+ assertGramSplit("en gul bille sang", 3, "[en, gul, bil, ill, lle, san, ang]");
+ }
+
+ @Test
+ public void testCornerCases() {
+ // Empty input yields no grams; input shorter than gramSize yields the whole input as one gram.
+ assertGramSplit("", 1, "[]");
+ assertGramSplit("", 2, "[]");
+ assertGramSplit("e", 1, "[e]");
+ assertGramSplit("e", 2, "[e]");
+ assertGramSplit("en", 1, "[e, n]");
+ assertGramSplit("en", 2, "[en]");
+ assertGramSplit("en", 3, "[en]");
+ }
+
+ @Test
+ public void testSpaceCornerCases() {
+ // Leading and trailing spaces produce no grams, and short words come out whole at any gramSize.
+ assertGramSplit("e en e", 1, "[e, e, n, e]");
+ assertGramSplit("e en e", 2, "[e, en, e]");
+ assertGramSplit("e en e", 3, "[e, en, e]");
+ assertGramSplit(" e en e ", 1, "[e, e, n, e]");
+ assertGramSplit(" e en e ", 2, "[e, en, e]");
+ assertGramSplit(" e en e ", 3, "[e, en, e]");
+ assertGramSplit(" e en e ", 1, "[e, e, n, e]");
+ assertGramSplit(" e en e ", 2, "[e, en, e]");
+ assertGramSplit(" e en e ", 3, "[e, en, e]");
+ assertGramSplit("a b c", 4, "[a, b, c]");
+ }
+
+ @Test
+ public void testWithCasing() {
+ assertGramSplit("This is the Black Eyed Peas", 2,
+ "[Th, hi, is, is, th, he, Bl, la, ac, ck, Ey, ye, ed, Pe, ea, as]");
+ assertGramSplit("This is the Black Eyed Peas", 3,
+ "[Thi, his, is, the, Bla, lac, ack, Eye, yed, Pea, eas]");
+ assertGramSplit("This is the Black Eyed Peas", 4,
+ "[This, is, the, Blac, lack, Eyed, Peas]");
+ assertGramSplit("This is the Black Eyed Peas", 5,
+ "[This, is, the, Black, Eyed, Peas]");
+ assertGramSplit("This is the Black Eyed Peas", 6,
+ "[This, is, the, Black, Eyed, Peas]");
+ }
+
+ @Test
+ public void testWithPunctuation() {
+ assertGramSplit("this is, in a sense, more than the sum of parts!", 2,
+ "[th, hi, is, is, in, a, se, en, ns, se, mo, or, re, th, ha, an, th, he, su, um, of, pa, ar, rt, ts]");
+ assertGramSplit("this is, in a sense, more than the sum of parts!", 3,
+ "[thi, his, is, in, a, sen, ens, nse, mor, ore, tha, han, the, sum, of, par, art, rts]");
+ assertGramSplit("this is, in a sense, more than the sum of parts!", 4,
+ "[this, is, in, a, sens, ense, more, than, the, sum, of, part, arts]");
+ assertGramSplit("this is, in a sense, more than the sum of parts!", 5,
+ "[this, is, in, a, sense, more, than, the, sum, of, parts]");
+ assertGramSplit("this is, in a sense, more than the sum of parts!", 6,
+ "[this, is, in, a, sense, more, than, the, sum, of, parts]");
+ }
+
+ @Test
+ public void testAccents() {
+ assertGramSplit("caf\u00e9 de l'h\u00f4tel", 2, "[ca, af, f\u00e9, de, l, h\u00f4, \u00f4t, te, el]");
+ assertGramSplit("caf\u00e9 de l'h\u00f4tel", 3, "[caf, af\u00e9, de, l, h\u00f4t, \u00f4te, tel]");
+ assertGramSplit("caf\u00e9 de l'h\u00f4tel", 4, "[caf\u00e9, de, l, h\u00f4te, \u00f4tel]");
+ assertGramSplit("caf\u00e9 de l'h\u00f4tel", 5, "[caf\u00e9, de, l, h\u00f4tel]");
+ assertGramSplit("caf\u00e9 de l'h\u00f4tel", 6, "[caf\u00e9, de, l, h\u00f4tel]");
+ }
+
+ @Test
+ public void testChinese() {
+ String input = "\u77f3\u5ba4\u8a69\u58eb\u65bd\u6c0f\uff0c\u55dc\u7345\uff0c\u8a93\u98df\u5341\u7345\u3002" +
+ "\u65bd\u6c0f\u6642\u6642\u9069\u5e02\u8996\u7345\uff0c\u5341\u6642\uff0c\u9069\u5341\u7345" +
+ "\u9069\u5e02\u3002";
+ assertGramSplit(input, 2, "[\u77f3\u5ba4, \u5ba4\u8a69, \u8a69\u58eb, \u58eb\u65bd, \u65bd\u6c0f, " +
+ "\u55dc\u7345, \u8a93\u98df, \u98df\u5341, \u5341\u7345, \u65bd\u6c0f, " +
+ "\u6c0f\u6642, \u6642\u6642, \u6642\u9069, \u9069\u5e02, \u5e02\u8996, " +
+ "\u8996\u7345, \u5341\u6642, \u9069\u5341, \u5341\u7345, \u7345\u9069, " +
+ "\u9069\u5e02]");
+ assertGramSplit(input, 3, "[\u77f3\u5ba4\u8a69, \u5ba4\u8a69\u58eb, \u8a69\u58eb\u65bd, \u58eb\u65bd\u6c0f, " +
+ "\u55dc\u7345, \u8a93\u98df\u5341, \u98df\u5341\u7345, \u65bd\u6c0f\u6642, " +
+ "\u6c0f\u6642\u6642, \u6642\u6642\u9069, \u6642\u9069\u5e02, \u9069\u5e02\u8996, " +
+ "\u5e02\u8996\u7345, \u5341\u6642, \u9069\u5341\u7345, \u5341\u7345\u9069, " +
+ "\u7345\u9069\u5e02]");
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testInvalidSplitSize() {
+ gramSplitter.split("en", 0);
+ }
+
+ @Test(expected = NullPointerException.class)
+ public void testInvalidSplitNull() {
+ gramSplitter.split(null, 1);
+ }
+
+ @Test
+ public void testUnusualIteratorUse() {
+ String text = "en gul bille sang";
+ Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 3);
+
+ assertThat(grams.next().extractFrom(text), is("en"));
+ assertTrue(grams.hasNext());
+ assertTrue(grams.hasNext());
+ assertThat(grams.next().extractFrom(text), is("gul"));
+ assertThat(grams.next().extractFrom(text), is("bil"));
+ assertThat(grams.next().extractFrom(text), is("ill"));
+ assertThat(grams.next().extractFrom(text), is("lle"));
+ assertTrue(grams.hasNext());
+ assertTrue(grams.hasNext());
+ assertThat(grams.next().extractFrom(text), is("san"));
+ assertThat(grams.next().extractFrom(text), is("ang"));
+ assertFalse(grams.hasNext());
+ assertFalse(grams.hasNext());
+ }
+
+ private void assertGramSplit(String input, int gramSize, String expected) {
+ assertThat(gramSplitter.split(input, gramSize).toExtractedList().toString(), is(expected));
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java
new file mode 100644
index 00000000000..771487d0e71
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java
@@ -0,0 +1,35 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.simple.SimpleLinguistics;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public class NormalizationTestCase {
+
+ private final Normalizer normalizer = new SimpleLinguistics().getNormalizer();
+
+ @Test
+ public void testEmptyStringNormalization() {
+ assertEquals("", normalizer.normalize(""));
+ }
+
+ @Test
+ public void testDoubleWidthAscii() {
+ assertNormalize("\uff41\uff42\uff43\uff44\uff45\uff46\uff47\uff48\uff49", "abcdefghi");
+ }
+
+ @Test
+ public void testLigature() {
+ assertNormalize("\uFB01nance", "finance");
+ }
+
+ private void assertNormalize(String input, String exp) {
+ assertEquals(exp, normalizer.normalize(input));
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java
new file mode 100644
index 00000000000..a70a3dc24c5
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java
@@ -0,0 +1,27 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class ProcessingExceptionTestCase {
+
+ @Test
+ public void requireThatMessageCanBeSet() {
+ assertEquals("foo", new ProcessingException("foo").getMessage());
+ }
+
+ @Test
+ public void requireThatMessageAndCauseCanBeSet() {
+ Throwable t = new Throwable();
+ ProcessingException e = new ProcessingException("bar", t);
+ assertEquals("bar", e.getMessage());
+ assertSame(t, e.getCause());
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
new file mode 100644
index 00000000000..8e7e52358f9
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleTokenizer;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SegmenterImplTestCase {
+
+ private final static Segmenter SEGMENTER = new SegmenterImpl(new SimpleTokenizer(new SimpleNormalizer()));
+
+ @Test
+ public void requireThatNonIndexableCharactersAreDelimiters() {
+ assertSegments("i've", Arrays.asList("i", "ve"));
+ assertSegments("foo bar. baz", Arrays.asList("foo", "bar", "baz"));
+ assertSegments("1,2, 3 4", Arrays.asList("1", "2", "3", "4"));
+ }
+
+ @Test
+ public void requireThatAdjacentIndexableTokenTypesAreNotSplit() {
+ assertSegments("a1,2b,c3,4d", Arrays.asList("a1", "2b", "c3", "4d"));
+ }
+
+ @Test
+ public void requireThatSegmentationReturnsOriginalForm() {
+ assertSegments("a\u030A", Arrays.asList("a\u030A"));
+ assertSegments("FOO BAR", Arrays.asList("FOO", "BAR"));
+ }
+
+ private static void assertSegments(String input, List<String> expectedSegments) {
+ assertEquals(expectedSegments, SEGMENTER.segment(input, Language.ENGLISH));
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java
new file mode 100644
index 00000000000..9a592781998
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java
@@ -0,0 +1,73 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import static org.junit.Assert.*;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Functional testing of StemList.
+ *
+ * @author steinar
+ */
+public class StemListTestCase {
+
+ private StemList stems;
+
+ @Before
+ public void setUp() throws Exception {
+ stems = new StemList();
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ stems = null;
+ }
+
+ @Test
+ public void testSize() {
+ assertEquals(0, stems.size());
+ stems.add("a");
+ stems.add("b");
+ stems.add("a");
+ assertEquals(2, stems.size());
+ }
+
+ @Test
+ public void testSet() {
+ stems.add("a");
+ stems.add("b");
+ stems.add("c");
+ stems.add("d");
+ assertEquals("a", stems.set(2, "a"));
+ assertEquals("c", stems.get(2));
+ assertEquals("c", stems.set(2, "z"));
+ assertEquals("z", stems.get(2));
+ }
+
+ @Test
+ public void testAdd() {
+ stems.add("a");
+ stems.add("b");
+ stems.add("c");
+ stems.add("d");
+ assertEquals(4, stems.size());
+ stems.add("a");
+ assertEquals(4, stems.size());
+ stems.add("z");
+ assertEquals(5, stems.size());
+ }
+
+ @Test
+ public void testremove() {
+ stems.add("a");
+ stems.add("b");
+ stems.add("c");
+ stems.add("d");
+ assertEquals("c", stems.remove(2));
+ assertEquals(3, stems.size());
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java
new file mode 100644
index 00000000000..13cd8a82e36
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java
@@ -0,0 +1,27 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class StemModeTestCase {
+
+ @Test
+ @SuppressWarnings("deprecation")
+ public void requireThatValueOfWorks() {
+ for (StemMode mode : StemMode.values()) {
+ assertEquals(mode, StemMode.valueOf(mode.getValue()));
+ }
+ }
+
+ @Test
+ @SuppressWarnings("deprecation")
+ public void requireThatValueOfUnknownIsNone() {
+ assertEquals(StemMode.NONE, StemMode.valueOf(-1));
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java
new file mode 100644
index 00000000000..d81aaaafcc8
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java
@@ -0,0 +1,68 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleToken;
+import com.yahoo.language.simple.SimpleTokenizer;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.ArrayList;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class StemmerImplTestCase {
+
+ @Test
+ public void requireThatStemIsNormalizedAndLowerCased() {
+ assertStem("FOO", Arrays.asList("foo"));
+ assertStem("a\u030A", Arrays.asList("\u00E5"));
+ }
+
+ @Test
+ public void requireThatOnlyIndexableTokensAreReturned() {
+ assertStem("foo. (bar)!", Arrays.asList("foo", "bar"));
+ }
+
+ @Test
+ public void requireThatSpecialTokensAreNotDecompounded() {
+ SimpleToken token = new SimpleToken("c++").setType(TokenType.ALPHABETIC)
+ .setTokenString("c++")
+ .addComponent(new SimpleToken("c").setType(TokenType.ALPHABETIC)
+ .setTokenString("c"))
+ .addComponent(new SimpleToken("p").setType(TokenType.ALPHABETIC)
+ .setTokenString("p"))
+ .addComponent(new SimpleToken("p").setType(TokenType.ALPHABETIC)
+ .setTokenString("p"));
+ Tokenizer tokenizer = Mockito.mock(Tokenizer.class);
+ Mockito.when(tokenizer.tokenize(Mockito.anyString(), Mockito.<Language>any(), Mockito.<StemMode>any(),
+ Mockito.anyBoolean()))
+ .thenReturn(Arrays.<Token>asList(token));
+ Stemmer stemmer = new StemmerImpl(tokenizer);
+
+ token.setSpecialToken(false);
+ assertEquals(Arrays.asList(new StemList("c"),
+ new StemList("p"),
+ new StemList("p")),
+ stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
+
+ token.setSpecialToken(true);
+ assertEquals(Arrays.asList(new StemList("c++")),
+ stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
+ }
+
+ private static void assertStem(String input, List<String> expectedStems) {
+ Stemmer stemmer = new StemmerImpl(new SimpleTokenizer(new SimpleNormalizer()));
+ List<String> got = new ArrayList<>();
+ for (StemList word : stemmer.stem(input, StemMode.ALL, Language.ENGLISH)) {
+ got.add(word.get(0));
+ }
+ assertEquals(expectedStems, got);
+ }
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
new file mode 100644
index 00000000000..1a92f5a750e
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class TokenTypeTestCase {
+
+ @Test
+ @SuppressWarnings("deprecation")
+ public void requireThatValueOfWorks() {
+ for (TokenType type : TokenType.values()) {
+ assertEquals(type, TokenType.valueOf(type.getValue()));
+ }
+ }
+
+ @Test
+ @SuppressWarnings("deprecation")
+ public void requireThatValueOfUnknownIsUnknown() {
+ assertEquals(TokenType.UNKNOWN, TokenType.valueOf(-1));
+ }
+
+ @Test
+ public void requireThatOnlyAlphaNumericsAreIndexable() {
+ for (TokenType type : TokenType.values()) {
+ if (type == TokenType.ALPHABETIC || type == TokenType.NUMERIC) {
+ assertTrue(type.isIndexable());
+ } else {
+ assertFalse(type.isIndexable());
+ }
+ }
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
new file mode 100644
index 00000000000..6506b41fc79
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
@@ -0,0 +1,233 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.simple.SimpleTokenizer;
+import org.junit.Test;
+
+import java.util.*;
+
+import static com.yahoo.language.LinguisticsCase.toLowerCase;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.*;
+
+/**
+ * Test of tokenization, with stemming and accent removal
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public class TokenizationTestCase {
+
+ private final Tokenizer tokenizer = new SimpleTokenizer();
+
+ @Test
+ public void testTokenizer() {
+ assertTokenize("This is a test, 123",
+ Arrays.asList("this", "is", "a", "test", "123"),
+ Arrays.asList("This", " ", "is", " ", "a", " ", "test", ",", " ", "123"));
+ }
+
+ @Test
+ public void testUnderScoreTokenization() {
+ assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, Arrays.asList("ugcapi", "1"), null);
+ }
+
+ @Test
+ public void testPhrasesWithPunctuation() {
+ assertTokenize("PHY_101.html a space/time or space-time course", Language.ENGLISH, StemMode.NONE,
+ false,
+ Arrays.asList("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"),
+ null);
+ assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("phy", "101"), null);
+ assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("101", "3"), null);
+ }
+
+ @Test
+ public void testDoubleWidthTokenization() {
+ // "sony"
+ assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false,
+ Arrays.asList("sony"), null);
+ assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false,
+ Arrays.asList("sony"), null);
+ // "SONY"
+ assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false,
+ Arrays.asList("sony"), null);
+ assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false,
+ Arrays.asList("sony"), null);
+ // "on"
+ assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false,
+ Arrays.asList("on"), null);
+ assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false,
+ Arrays.asList("on"), null);
+ // "ON"
+ assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false,
+ Arrays.asList("on"), null);
+ assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
+ Arrays.asList("on"), null);
+ }
+
+ @Test
+ public void testLargeTextTokenization() {
+     StringBuilder sb = new StringBuilder();
+     String s = "teststring ";
+     for (int i = 0; i < 100000; i++) {
+         sb.append(s);
+     }
+     String input = sb.toString();
+
+     int numTokens = 0;
+     List<Long> pos = new ArrayList<>(); // offset of every 100th token
+     for (Token t : tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false)) {
+         numTokens++;
+         if ((numTokens % 100) == 0) {
+             pos.add(t.getOffset());
+         }
+     }
+
+     // JUnit takes (message, expected, actual): one word token and one space token per repetition.
+     assertEquals("Check that all tokens have been tokenized", 200000, numTokens);
+     assertTrue("Increasing token pos", assertMonoIncr(pos));
+ }
+
+ @Test
+ public void testLargeTokenGuard() {
+ StringBuilder str = new StringBuilder();
+ for (int i = 0; i < 128 * 256; i++) {
+ str.append("ab");
+ }
+ Iterator<Token> it = tokenizer.tokenize(str.toString(), Language.ENGLISH, StemMode.NONE, false).iterator();
+ assertTrue(it.hasNext());
+ assertNotNull(it.next().getTokenString());
+ assertFalse(it.hasNext());
+ }
+
+ @Test
+ public void testTokenIterator() {
+ Iterator<Token> it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator();
+ assertFalse(it.hasNext());
+ try {
+ it.next();
+ fail();
+ } catch (NoSuchElementException e) {
+ // expected: next() on an exhausted iterator must throw NoSuchElementException
+ }
+
+ it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator();
+ assertFalse(it.hasNext());
+
+ it = tokenizer.tokenize("one two three", Language.ENGLISH, StemMode.NONE, false).iterator();
+ assertNotNull(it.next());
+ assertNotNull(it.next());
+ assertNotNull(it.next());
+ assertNotNull(it.next());
+ assertNotNull(it.next());
+ assertFalse(it.hasNext());
+ }
+
+ @Test
+ public void testGetOffsetLength() {
+     String input = "Deka-Chef Weber r\u00e4umt Kommunikationsfehler ein";
+     long[] expOffset = { 0, 4, 5, 9, 10, 15, 16, 21, 22, 42, 43 };
+     int[] len = { 4, 1, 4, 1, 5, 1, 5, 1, 20, 1, 3 };
+     int idx = 0;
+     for (Token token : tokenizer.tokenize(input, Language.GERMAN, StemMode.SHORTEST, false)) {
+         assertThat("Token offset for token #" + idx, token.getOffset(), is(expOffset[idx]));
+         assertThat("Token len for token #" + idx, token.getOrig().length(), is(len[idx]));
+         idx++;
+     }
+     assertThat("Number of tokens", idx, is(expOffset.length)); // catches a tokenizer returning too few tokens
+ }
+
+ @Test
+ public void testRecursiveDecompose() {
+ for (Token t : tokenizer.tokenize("\u00a510%", Language.ENGLISH, StemMode.SHORTEST, false)) {
+ recurseDecompose(t);
+ }
+ }
+
+ @Test
+ public void testIndexability() {
+ String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
+ for (StemMode stemMode : new StemMode[] { StemMode.NONE,
+ StemMode.SHORTEST }) {
+ for (Language l : new Language[] { Language.INDONESIAN,
+ Language.ENGLISH, Language.ARABIC }) {
+ for (boolean accentDrop : new boolean[] { true, false }) {
+ for (Token token : tokenizer.tokenize(input,
+ l, stemMode, accentDrop)) {
+ if (token.getTokenString().length() == 0) {
+ assertFalse(token.isIndexable());
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private void recurseDecompose(Token t) {
+ assertTrue(t.getOffset() >= 0);
+ assertTrue(t.getOrig().length() >= 0);
+
+ int numComp = t.getNumComponents();
+ for (int i = 0; i < numComp; i++) {
+ Token comp = t.getComponent(i);
+ recurseDecompose(comp);
+ }
+ }
+
+ private boolean assertMonoIncr(Iterable<Long> n) {
+ long trailing = -1;
+ for (long i : n) {
+ if (i < trailing) {
+ return false;
+ }
+ trailing = i;
+ }
+ return true;
+ }
+
+ private void assertTokenize(String input, List<String> indexed, List<String> orig) {
+ assertTokenize(input, Language.ENGLISH, StemMode.NONE, false, indexed, orig);
+ }
+
+ /**
+ * <p>Tokenizes the input and compares the result with an "indexed" truth and an optional "orig" truth. The
+ * number of compared entries must match the size of each truth list as well.</p>
+ *
+ * @param input The text to process, passed to tokenizer.
+ * @param language The language tag, passed to tokenizer.
+ * @param stemMode The stem mode, passed to tokenizer as is; this method has no special handling for it.
+ * @param accentDrop Passed to the tokenizer.
+ * @param indexed Expected lowercased "TokenString" of each indexable token or token component, in order.
+ * @param orig Expected "Orig" of every top-level token, in order; null skips this comparison.
+ */
+ private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop,
+ List<String> indexed, List<String> orig) {
+ int i = 0;
+ int j = 0;
+ for (Token token : tokenizer.tokenize(input, language, stemMode, accentDrop)) {
+ // A token with components is checked through its direct components, any other token directly.
+ // Only indexable token types are compared with (and consume an entry of) the indexed list.
+ if (token.getNumComponents() > 0) {
+ for (int comp = 0; comp < token.getNumComponents(); comp++) {
+ Token t = token.getComponent(comp);
+ if (t.getType().isIndexable()) {
+ assertThat("comp index: " + i, toLowerCase(t.getTokenString()), is(indexed.get(i++)));
+ }
+ }
+ } else {
+ if (token.getType().isIndexable()) {
+ assertThat("exp index: " + i, toLowerCase(token.getTokenString()), is(indexed.get(i++)));
+ }
+ }
+ if (orig != null) {
+ assertThat("orig index: " + j, token.getOrig(), is(orig.get(j++)));
+ }
+ }
+ assertThat("indexed length", i, is(indexed.size()));
+ if (orig != null) {
+ assertThat("orig length", j, is(orig.size()));
+ }
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
new file mode 100644
index 00000000000..66eee3f73d4
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -0,0 +1,89 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.text.Utf8;
+import org.junit.Test;
+
+import java.nio.charset.Charset;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class SimpleDetectorTestCase {
+
+ @Test
+ public void requireThatLanguageCanDetected() {
+ assertLanguage(Language.UNKNOWN, "Hello!");
+
+ // "Chinese language"
+ assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input
+ "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002");
+
+ // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
+ assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_TRADITIONAL input
+ "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u50B7\u8EAB\u9AD4\u3002");
+
+ // four katakana characters from this web page: http://www.japanese-online.com/language/lessons/katakana.htm
+ assertLanguage(Language.JAPANESE, "\u30ab\u30bf\u30ab\u30ca");
+
+ // four hiragana characters taken from this web page: http://www.japanese-online.com/language/lessons/hiragana.htm
+ assertLanguage(Language.JAPANESE, "\u3072\u3089\u304c\u306a");
+
+ // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
+ // This is a good test because this string contains not only japanese but chinese characters, so we need to look
+ // through it to find the japanese ones.
+ assertLanguage(Language.JAPANESE,
+ "\u79c1\u306f\u30ac\u30e9\u30b9\u3092\u98df\u3079\u3089\u308c\u307e\u3059" +
+ "\u3002\u305d\u308c\u306f\u79c1\u3092\u50b7\u3064\u3051\u307e\u305b\u3093" +
+ "\u3002");
+
+ // an introduction on an adobe web page. What it means I don't know.
+ assertLanguage(Language.KOREAN, "\ud55c\uae00\uacfc");
+
+ // for the sound of "A"
+ assertLanguage(Language.KOREAN, "\u314f");
+
+ // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
+ assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
+ "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+ }
+
+ @Test
+ public void testEncodingGuess() {
+ // just some arbitrary data above 127 which is not valid as UTF-8
+ byte[] b = new byte[] { (byte)196, (byte)197, (byte)198 };
+ Detection d = new SimpleDetector().detect(b, 0, b.length, null);
+ assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding());
+
+ // a string from http://www.columbia.edu/kermit/utf8.html that says
+ // "I can eat glass (and it doesn't hurt me)".
+ b = Utf8.toBytes("\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
+ "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+ d = new SimpleDetector().detect(b, 0, b.length, null);
+ assertEquals(Utf8.getCharset(), d.getEncoding());
+
+ // arbitrary ascii
+ b = new byte[] { 31, 32, 33 };
+ d = new SimpleDetector().detect(b, 0, b.length, null);
+ assertEquals(Charset.forName("US-ASCII"), d.getEncoding());
+
+ // character which is not valid in UTF-8
+ b = new byte[] { -1 };
+ d = new SimpleDetector().detect(b, 0, b.length, null);
+ assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding());
+
+ // UTF-8 which requires more bytes than available
+ b = new byte[] { Utf8.toBytes("\u00E5")[0] };
+ d = new SimpleDetector().detect(b, 0, b.length, null);
+ assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding());
+ }
+
+ private static void assertLanguage(Language language, String input) {
+ assertEquals(language, new SimpleDetector().detect(input, null).getLanguage());
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java
new file mode 100644
index 00000000000..9c9c8b8fcc5
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java
@@ -0,0 +1,34 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.process.Normalizer;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SimpleNormalizerTestCase {
+
+ private static final Normalizer NORMALIZER = new SimpleNormalizer();
+
+ @Test
+ public void requireThatInputIsNfkcNormalized() {
+ assertNormalize("\u212B", "\u00C5");
+ assertNormalize("\u2126", "\u03A9");
+ assertNormalize("\u00C5", "\u00C5");
+ assertNormalize("\u00F4", "\u00F4");
+ assertNormalize("\u1E69", "\u1E69");
+ assertNormalize("\u1E0B\u0323", "\u1E0D\u0307");
+ assertNormalize("\u0071\u0307\u0323", "q\u0323\u0307");
+ assertNormalize("\uFB01", "fi");
+ assertNormalize("\u0032\u2075", "25");
+ assertNormalize("\u1E9B\u0323", "\u1E69");
+ }
+
+ private static void assertNormalize(String input, String expectedNormalForm) {
+ assertEquals(expectedNormalForm, NORMALIZER.normalize(input));
+ }
+
+} \ No newline at end of file
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java
new file mode 100644
index 00000000000..b27b70b4dc9
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java
@@ -0,0 +1,194 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.process.TokenScript;
+import com.yahoo.language.process.TokenType;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SimpleTokenTestCase { // exercises the accessors, equals(), hashCode() and toString() of SimpleToken
+
+ @Test
+ public void requireThatOrigAccessorsWork() { // getOrig() returns the constructor argument, and orig takes part in equals()
+ SimpleToken token = new SimpleToken("foo");
+ assertEquals("foo", token.getOrig());
+
+ assertEquals(token, new SimpleToken("foo"));
+ assertFalse(token.equals(new SimpleToken("bar")));
+ }
+
+ @Test
+ public void requireThatComponentAccessorsWork() { // components are counted, returned in insertion order, and compared by equals()
+ SimpleToken token = new SimpleToken("foo");
+ assertEquals(0, token.getNumComponents());
+ SimpleToken bar = new SimpleToken("bar");
+ SimpleToken baz = new SimpleToken("baz");
+ token.addComponent(bar);
+ token.addComponent(baz);
+ assertEquals(2, token.getNumComponents());
+ assertSame(bar, token.getComponent(0));
+ assertSame(baz, token.getComponent(1));
+
+ SimpleToken other = new SimpleToken("foo"); // built up step by step; expected equal only once both components are added
+ assertFalse(token.equals(other));
+ other.addComponent(bar);
+ assertFalse(token.equals(other));
+ other.addComponent(baz);
+ assertEquals(token, other);
+
+ other = new SimpleToken("foo"); // same components added in the opposite order: expected NOT equal
+ other.addComponent(baz);
+ other.addComponent(bar);
+ assertFalse(token.equals(other));
+ }
+
+ @Test
+ public void requireThatStemAccessorsWork() { // a new token reports no stems; after setTokenString() it reports exactly one
+ SimpleToken token = new SimpleToken("foo");
+ assertEquals(0, token.getNumStems());
+ assertNull(token.getStem(0));
+ token.setTokenString("bar");
+ assertEquals(1, token.getNumStems());
+ assertEquals("bar", token.getStem(0)); // stem 0 is expected to be the token string just set
+ }
+
+ @Test
+ public void requireThatTokenStringAccessorsWork() { // token string starts out null and takes part in equals()
+ SimpleToken token = new SimpleToken("foo");
+ assertNull(token.getTokenString());
+ token.setTokenString("bar");
+ assertEquals("bar", token.getTokenString());
+ SimpleToken other = new SimpleToken("foo");
+ assertFalse(token.equals(other));
+ other.setTokenString("bar");
+ assertEquals(token, other);
+ }
+
+ @Test
+ public void requireThatTypeAccessorsWork() { // type starts as UNKNOWN, round-trips every TokenType, and takes part in equals()
+ SimpleToken token = new SimpleToken("foo");
+ assertEquals(TokenType.UNKNOWN, token.getType());
+ for (TokenType type : TokenType.values()) {
+ token.setType(type);
+ assertEquals(type, token.getType());
+ }
+
+ SimpleToken other = new SimpleToken("foo"); // 'token' still holds the last TokenType assigned by the loop above
+ for (TokenType type : TokenType.values()) {
+ other.setType(type);
+ if (type == token.getType()) {
+ assertEquals(token, other);
+ } else {
+ assertFalse(token.equals(other));
+ }
+ }
+ }
+
+ @Test
+ public void requireThatScriptAccessorsWork() { // script starts as UNKNOWN, round-trips every TokenScript, and takes part in equals()
+ SimpleToken token = new SimpleToken("foo");
+ assertEquals(TokenScript.UNKNOWN, token.getScript());
+ for (TokenScript script : TokenScript.values()) {
+ token.setScript(script);
+ assertEquals(script, token.getScript());
+ }
+
+ SimpleToken other = new SimpleToken("foo"); // 'token' still holds the last TokenScript assigned by the loop above
+ for (TokenScript script : TokenScript.values()) {
+ other.setScript(script);
+ if (script == token.getScript()) {
+ assertEquals(token, other);
+ } else {
+ assertFalse(token.equals(other));
+ }
+ }
+ }
+
+ @Test
+ public void requireThatSpecialTokenAccessorsWork() { // the special-token flag starts false, can be toggled, and takes part in equals()
+ SimpleToken token = new SimpleToken("foo");
+ assertFalse(token.isSpecialToken());
+ token.setSpecialToken(true);
+ assertTrue(token.isSpecialToken());
+ token.setSpecialToken(false);
+ assertFalse(token.isSpecialToken());
+
+ SimpleToken other = new SimpleToken("foo");
+ other.setSpecialToken(true);
+ assertFalse(token.equals(other));
+ other.setSpecialToken(false);
+ assertEquals(token, other);
+ }
+
+ @Test
+ public void requireThatOffsetAccessorsWork() { // offset starts at 0 and takes part in equals()
+ SimpleToken token = new SimpleToken("foo");
+ assertEquals(0, token.getOffset());
+ token.setOffset(69);
+ assertEquals(69, token.getOffset());
+
+ SimpleToken other = new SimpleToken("foo");
+ assertFalse(token.equals(other));
+ other.setOffset(69);
+ assertEquals(token, other);
+ }
+
+ @Test
+ public void requireThatToStringIsExpressive() { // pins the exact toString() text of a token that has two components
+ SimpleToken token = new SimpleToken("my_orig");
+ token.addComponent(new SimpleToken("my_component_1"));
+ token.addComponent(new SimpleToken("my_component_2"));
+ token.setTokenString("my_token_string");
+ token.setType(TokenType.ALPHABETIC);
+ token.setScript(TokenScript.ARABIC);
+ token.setOffset(1);
+
+ String expected = "token : SimpleToken {\n" +
+ " components : {\n" +
+ " [0] : SimpleToken {\n" +
+ " components : {\n" +
+ " }\n" +
+ " offset : 0\n" +
+ " orig : 'my_component_1'\n" +
+ " script : UNKNOWN\n" +
+ " special : false\n" +
+ " token string : null\n" +
+ " type : UNKNOWN\n" +
+ " }\n" +
+ " [1] : SimpleToken {\n" +
+ " components : {\n" +
+ " }\n" +
+ " offset : 0\n" +
+ " orig : 'my_component_2'\n" +
+ " script : UNKNOWN\n" +
+ " special : false\n" +
+ " token string : null\n" +
+ " type : UNKNOWN\n" +
+ " }\n" +
+ " }\n" +
+ " offset : 1\n" +
+ " orig : 'my_orig'\n" +
+ " script : ARABIC\n" +
+ " special : false\n" +
+ " token string : 'my_token_string'\n" +
+ " type : ALPHABETIC\n" +
+ "}";
+ assertEquals(expected, token.toString()); // compared as a whole string, so any layout change fails this test
+ }
+
+ @Test
+ public void requireThatHashCodeIsImplemented() { // two tokens built from the same string must have equal hash codes
+ assertEquals(new SimpleToken("foo").hashCode(), new SimpleToken("foo").hashCode());
+ }
+
+ @Test
+ public void requireThatEqualsIsImplemented() { // unequal to an unrelated Object, equal to a token built from the same string
+ assertFalse(new SimpleToken("foo").equals(new Object()));
+ assertEquals(new SimpleToken("foo"), new SimpleToken("foo"));
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java
new file mode 100644
index 00000000000..2d258be7af0
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.process.TokenType;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Check simple token types.
+ *
+ * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ */
+public class SimpleTokenTypeTestCase {
+
+ @Test
+ public final void test() { // passes sample characters to SimpleTokenType.valueOf and checks the TokenType returned
+ assertEquals(TokenType.ALPHABETIC, tokenType('a'));
+ assertEquals(TokenType.ALPHABETIC, tokenType('\u02c1')); // U+02C1 MODIFIER LETTER REVERSED GLOTTAL STOP
+ assertEquals(TokenType.ALPHABETIC, tokenType('\u02c1')); // NOTE(review): identical to the previous line - was another code point intended?
+ assertEquals(TokenType.ALPHABETIC, tokenType('\u01c0')); // U+01C0 LATIN LETTER DENTAL CLICK
+ assertEquals(TokenType.SYMBOL, tokenType('\u20dd')); // U+20DD COMBINING ENCLOSING CIRCLE
+ assertEquals(TokenType.ALPHABETIC, tokenType('\u0912')); // U+0912 DEVANAGARI LETTER SHORT O
+ assertEquals(TokenType.NUMERIC, tokenType('1'));
+ assertEquals(TokenType.PUNCTUATION, tokenType('.'));
+ assertEquals(TokenType.PUNCTUATION, tokenType('\u0f3b')); // U+0F3B TIBETAN MARK GUG RTAGS GYAS
+ assertEquals(TokenType.PUNCTUATION, tokenType('\u0f3c')); // U+0F3C TIBETAN MARK ANG KHANG GYON
+ assertEquals(TokenType.PUNCTUATION, tokenType('\u203f')); // U+203F UNDERTIE
+ assertEquals(TokenType.SYMBOL, tokenType('\u2044')); // U+2044 FRACTION SLASH
+ assertEquals(TokenType.SYMBOL, tokenType('$'));
+ assertEquals(TokenType.ALPHABETIC, tokenType('\u2132')); // U+2132 TURNED CAPITAL F
+ assertEquals(TokenType.ALPHABETIC, tokenType('\uD800', '\uDFC8')); // surrogate pair for U+103C8 OLD PERSIAN SIGN AURAMAZDAA
+ }
+
+ private static TokenType tokenType(char c) { // classifies a single char by delegating to SimpleTokenType.valueOf
+ return SimpleTokenType.valueOf(c);
+ }
+
+ private static TokenType tokenType(char high, char low) { // classifies the code point formed by a high/low surrogate pair
+ return SimpleTokenType.valueOf(Character.toCodePoint(high, low));
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
new file mode 100644
index 00000000000..8760da56415
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -0,0 +1,36 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.process.AbstractTokenizerTestCase;
+import com.yahoo.language.process.StemMode;
+import org.junit.Test;
+
+/**
+ * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author bratseth
+ */
+public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
+
+ @Test
+ public void testTokenizingNoStemming() { // StemMode.NONE: every word is expected unstemmed ("offended" stays "offended")
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.NONE); // expected list below: lowercased words, 'a' + U+030A as U+00E5, spaces and punctuation as separate tokens
+ tester.assertTokens("a\u030a tralalala n4lle. \uD800\uDFC8 (old Persian sign Auramazda, sorry if " +
+ "anyone 1s offended by ancien7 gods.Running)",
+ "\u00E5", " ", "tralalala"," ","n4lle", ".", " ","\uD800\uDFC8", " ", "(",
+ "old", " ", "persian", " ", "sign", " ", "auramazda", ",", " ", "sorry", " ",
+ "if", " ", "anyone", " ", "1s", " ", "offended", " ", "by", " ", "ancien7",
+ " ", "gods", ".", "running", ")");
+ }
+
+ @Test
+ public void testTokenizingStemming() { // StemMode.ALL: same input as above; the only expected difference is "offended" -> "offend"
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
+ tester.assertTokens("a\u030a tralalala n4lle. \uD800\uDFC8 (old Persian sign Auramazda, sorry if " +
+ "anyone 1s offended by ancien7 gods.Running)",
+ "\u00E5", " ", "tralalala"," ","n4lle", ".", " ","\uD800\uDFC8", " ", "(",
+ "old", " ", "persian", " ", "sign", " ", "auramazda", ",", " ", "sorry", " ",
+ "if", " ", "anyone", " ", "1s", " ", "offend", " ", "by", " ", "ancien7",
+ " ", "gods", ".", "running", ")");
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java
new file mode 100644
index 00000000000..ea4b85e4bd1
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java
@@ -0,0 +1,40 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.process.Transformer;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SimpleTransformerTestCase {
+
+    private static final Transformer TRANSFORMER = new SimpleTransformer(); // shared by every assertion in this class
+
+    @Test
+    public void requireThatNonAccentsRemain() {
+        assertTransform("foo", "foo"); // plain ASCII text is expected back unchanged
+    }
+
+    @Test
+    public void requireThatTransformerRemovesAccents() {
+        assertTransform("\u212B", "A"); // U+212B ANGSTROM SIGN -> "A"
+        assertTransform("\u2126", "\u03A9"); // U+2126 OHM SIGN -> U+03A9 GREEK CAPITAL LETTER OMEGA
+        assertTransform("\u00C5", "A"); // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE -> "A"
+        assertTransform("\u00F4", "o"); // U+00F4 LATIN SMALL LETTER O WITH CIRCUMFLEX -> "o"
+        assertTransform("\u1E69", "s"); // U+1E69 LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE -> "s"
+        assertTransform("\u1E0B\u0323", "d"); // d-with-dot-above + COMBINING DOT BELOW -> "d"
+        assertTransform("\u0071\u0307\u0323", "q"); // q + two combining dots -> "q"
+        assertTransform("\uFB01", "\uFB01"); // U+FB01 LATIN SMALL LIGATURE FI is expected unchanged
+        assertTransform("2\u2075", "2\u2075"); // digit 2 + U+2075 SUPERSCRIPT FIVE is expected unchanged
+        assertTransform("\u1E9B\u0323", "\u017F"); // long-s-with-dot-above + COMBINING DOT BELOW -> U+017F LATIN SMALL LETTER LONG S
+    }
+
+    private static void assertTransform(String text, String expected) { // always transforms with Language.ENGLISH
+        assertEquals(expected, TRANSFORMER.accentDrop(text, Language.ENGLISH));
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java
new file mode 100644
index 00000000000..bb59788b26e
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java
@@ -0,0 +1,69 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author bratseth
+ */
+public class TokenizerTester {
+
+    private boolean accentDrop = false;
+    private Language language = Language.ENGLISH;
+    private Linguistics linguistics = new SimpleLinguistics();
+    private StemMode stemMode = StemMode.NONE;
+
+    public void assertTokens(String input, String ... expectedTokenStrings) { // tokenizes input and compares the flattened token strings
+        List<String> flattened = new ArrayList<>();
+        for (Token current : tokenize(input)) {
+            findTokenStrings(current, flattened);
+        }
+        assertEquals(Arrays.asList(expectedTokenStrings), flattened);
+    }
+
+    public List<String> findTokenStrings(Token token, List<String> out) { // appends to 'out' and returns that same list
+        int componentCount = token.getNumComponents();
+        if (token.isSpecialToken() || componentCount == 0) {
+            out.add(token.getTokenString()); // special or component-less token: emit its own string, do not descend
+            return out;
+        }
+        for (int index = 0; index < componentCount; ++index) {
+            findTokenStrings(token.getComponent(index), out); // recurse into each component in index order
+        }
+        return out;
+    }
+
+    public Iterable<Token> tokenize(String input) { // uses the currently configured linguistics, language, stem mode and accent-drop flag
+        return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop);
+    }
+
+    public TokenizerTester setAccentDrop(boolean accentDrop) {
+        this.accentDrop = accentDrop;
+        return this; // returns this tester so setter calls can be chained
+    }
+
+    public TokenizerTester setLanguage(Language language) {
+        this.language = language;
+        return this;
+    }
+
+    public TokenizerTester setLinguistics(Linguistics linguistics) {
+        this.linguistics = linguistics;
+        return this;
+    }
+
+    public TokenizerTester setStemMode(StemMode stemMode) {
+        this.stemMode = stemMode;
+        return this;
+    }
+
+}