summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/test
diff options
context:
space:
mode:
author: Jefim Matskin <jefimm@wix.com> 2018-07-17 15:43:44 +0300
committer: Jefim Matskin <jefimm@wix.com> 2018-07-17 15:43:44 +0300
commit: c8c45e7c9afcd5b8e9a7daed54aa8b1c290eede7 (patch)
tree: b1cb65200f03490cce360a900b019c64d43eb7c4 /linguistics/src/test
parent: 3f59a7da59991ef74adfd5bc334d96095945c575 (diff)
add lang detection and opennlp stemmers
https://github.com/vespa-engine/vespa/issues/6403
Diffstat (limited to 'linguistics/src/test')
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java 3
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java 3
2 files changed, 6 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
index e36d90b3206..27cfc12da5e 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
@@ -64,6 +64,9 @@ public class TokenizationTestCase {
Arrays.asList("on"), null);
assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
Arrays.asList("on"), null);
+
+ assertTokenize("наименование", Language.RUSSIAN, StemMode.SHORTEST, false,
+ Arrays.asList("наименован"), null);
}
@Test
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
index f9912f6b7a2..e1cac896525 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -50,6 +50,9 @@ public class SimpleDetectorTestCase {
// a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
"\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+
+ // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
+ assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
}
@Test