diff options
author | Jefim Matskin <jefimm@wix.com> | 2018-07-17 15:43:44 +0300 |
---|---|---|
committer | Jefim Matskin <jefimm@wix.com> | 2018-07-17 15:43:44 +0300 |
commit | c8c45e7c9afcd5b8e9a7daed54aa8b1c290eede7 (patch) | |
tree | b1cb65200f03490cce360a900b019c64d43eb7c4 /linguistics/src/test | |
parent | 3f59a7da59991ef74adfd5bc334d96095945c575 (diff) |
add lang detection and opennlp stemmers
https://github.com/vespa-engine/vespa/issues/6403
Diffstat (limited to 'linguistics/src/test')
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java | 3 | ||||
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java | 3 |
2 files changed, 6 insertions, 0 deletions
```diff
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
index e36d90b3206..27cfc12da5e 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
@@ -64,6 +64,9 @@ public class TokenizationTestCase {
                        Arrays.asList("on"), null);
         assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
                        Arrays.asList("on"), null);
+
+        assertTokenize("наименование", Language.RUSSIAN, StemMode.SHORTEST, false,
+                       Arrays.asList("наименован"), null);
     }
 
     @Test
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
index f9912f6b7a2..e1cac896525 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -50,6 +50,9 @@ public class SimpleDetectorTestCase {
         // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
         assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
                        "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+
+        // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
+        assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
     }
 
     @Test
```