Publish

author: Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
committer: Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
commit: 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree: 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
1 files changed, 89 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
new file mode 100644
index 00000000000..66eee3f73d4
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -0,0 +1,89 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.text.Utf8;
+import org.junit.Test;
+
+import java.nio.charset.Charset;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class SimpleDetectorTestCase {
+
+    @Test
+    public void requireThatLanguageCanDetected() {
+        assertLanguage(Language.UNKNOWN, "Hello!");
+
+        // same sentence as the next case but with two simplified-script characters; CHINESE_TRADITIONAL is still expected
+        assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input
+                       "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002");
+
+        // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
+        assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_TRADITIONAL input
+                       "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u50B7\u8EAB\u9AD4\u3002");
+
+        // four katakana characters from this web page: http://www.japanese-online.com/language/lessons/katakana.htm
+        assertLanguage(Language.JAPANESE, "\u30ab\u30bf\u30ab\u30ca");
+
+        // four hiragana characters taken from this web page: http://www.japanese-online.com/language/lessons/hiragana.htm
+        assertLanguage(Language.JAPANESE, "\u3072\u3089\u304c\u306a");
+
+        // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
+        // This is a good test because this string contains not only Japanese but Chinese characters, so we need to look
+        // through it to find the Japanese ones.
+        assertLanguage(Language.JAPANESE,
+                       "\u79c1\u306f\u30ac\u30e9\u30b9\u3092\u98df\u3079\u3089\u308c\u307e\u3059" +
+                       "\u3002\u305d\u308c\u306f\u79c1\u3092\u50b7\u3064\u3051\u307e\u305b\u3093" +
+                       "\u3002");
+
+        // an introduction on an Adobe web page.  What it means I don't know.
+        assertLanguage(Language.KOREAN, "\ud55c\uae00\uacfc");
+
+        // for the sound of "A"
+        assertLanguage(Language.KOREAN, "\u314f");
+
+        // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
+        assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
+                                        "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+    }
+
+    @Test
+    public void testEncodingGuess() {
+        // just some arbitrary data above 127 which is not valid as UTF-8
+        byte[] b = new byte[] { (byte)196, (byte)197, (byte)198 };
+        Detection d = new SimpleDetector().detect(b, 0, b.length, null);
+        assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding());
+
+        // a string from http://www.columbia.edu/kermit/utf8.html that says
+        // "I can eat glass (and it doesn't hurt me)".
+        b = Utf8.toBytes("\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
+                         "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+        d = new SimpleDetector().detect(b, 0, b.length, null);
+        assertEquals(Utf8.getCharset(), d.getEncoding());
+
+        // arbitrary bytes that all lie in the 7-bit ASCII range
+        b = new byte[] { 31, 32, 33 };
+        d = new SimpleDetector().detect(b, 0, b.length, null);
+        assertEquals(Charset.forName("US-ASCII"), d.getEncoding());
+
+        // a single byte (-1, i.e. 0xFF) which is not valid in UTF-8
+        b = new byte[] { -1 };
+        d = new SimpleDetector().detect(b, 0, b.length, null);
+        assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding());
+
+        // truncated UTF-8: only the first encoded byte of a non-ASCII character, so more bytes are required than available
+        b = new byte[] { Utf8.toBytes("\u00E5")[0] };
+        d = new SimpleDetector().detect(b, 0, b.length, null);
+        assertEquals(Charset.forName("ISO-8859-1"), d.getEncoding());
+    }
+
+    private static void assertLanguage(Language language, String input) {
+        assertEquals(language, new SimpleDetector().detect(input, null).getLanguage());
+    }
+
+}
author	Jon Bratseth <bratseth@yahoo-inc.com>	2016-06-15 23:09:44 +0200
committer	Jon Bratseth <bratseth@yahoo-inc.com>	2016-06-15 23:09:44 +0200
commit	72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree	2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java