summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
committerJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
commit72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
Publish
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java179
1 files changed, 179 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
new file mode 100644
index 00000000000..eca35772296
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -0,0 +1,179 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.detect.Hint;
+import com.yahoo.text.Utf8;
+
+import java.nio.ByteBuffer;
+
+/**
+ * <p>Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese,
+ * Japanese, Korean and Thai are supported. There are two ways to guess a String's langCode: by encoding and by
+ * character set. If the encoding is available this is a very good indication of the langCode. If the encoding is not
+ * available, then the actual characters in the string can be used to make an educated guess at the String's langCode.
+ * Recall a String in Java is Unicode. Therefore, we can simply look at the Unicode blocks of the characters in the
+ * string. Unfortunately, it's not 100% foolproof. From what I've been able to determine, Korean characters do not
+ * overlap with Japanese or Chinese characters, so their presence is a good indication of Korean. If a string contains
+ * phonetic Japanese, this is a good indication of Japanese. However, Japanese and Chinese characters occupy many of
+ * the same character blocks, so if there are no definitive signs of Japanese then it is assumed that the String is
+ * Chinese.</p>
+
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ */
+public class SimpleDetector implements Detector {
+
+    /**
+     * Guesses the language and the encoding of the bytes in {@code input[offset .. offset + length)}.
+     * Both guesses are restricted to that range; bytes outside it are never inspected.
+     *
+     * @param input  the array holding the text bytes
+     * @param offset index of the first byte to inspect
+     * @param length number of bytes to inspect
+     * @param hint   not used by this implementation
+     * @return a detection carrying the guessed language and the guessed encoding name
+     */
+    @Override
+    public Detection detect(byte[] input, int offset, int length, Hint hint) {
+        return new Detection(guessLanguage(input, offset, length), guessEncoding(input, offset, length), false);
+    }
+
+    /**
+     * Guesses the language and the encoding of the remaining bytes of {@code input}.
+     * The remaining bytes are read out of the buffer, so its position is advanced to its limit.
+     *
+     * @param input the buffer whose remaining bytes are inspected
+     * @param hint  not used by this implementation
+     * @return a detection carrying the guessed language and the guessed encoding name
+     */
+    @Override
+    public Detection detect(ByteBuffer input, Hint hint) {
+        byte[] buf = new byte[input.remaining()];
+        input.get(buf, 0, buf.length);
+        return detect(buf, 0, buf.length, hint);
+    }
+
+    /**
+     * Guesses the language of a string. The encoding of the returned detection is always the
+     * name of the charset returned by {@code Utf8.getCharset()}.
+     *
+     * @param input the text to inspect
+     * @param hint  not used by this implementation
+     * @return a detection carrying the guessed language
+     */
+    @Override
+    public Detection detect(String input, Hint hint) {
+        return new Detection(guessLanguage(input), Utf8.getCharset().name(), false);
+    }
+
+    /**
+     * Decodes {@code buf[offset .. offset + length)} with {@code Utf8.toString} and guesses the
+     * language of the resulting string.
+     *
+     * @param buf    the array holding the text bytes
+     * @param offset index of the first byte to decode
+     * @param length number of bytes to decode
+     * @return the guessed language, see {@link #guessLanguage(String)}
+     */
+    public static Language guessLanguage(byte[] buf, int offset, int length) {
+        return guessLanguage(Utf8.toString(buf, offset, length));
+    }
+
+    /**
+     * Guesses the language of a string from the Unicode blocks of its characters.
+     *
+     * <p>The string is scanned code point by code point. The first code point that is conclusive
+     * decides the result: Hangul gives {@code KOREAN}, kana or kanbun gives {@code JAPANESE},
+     * Bopomofo gives {@code CHINESE_TRADITIONAL}, and Thai gives {@code THAI}. CJK ideographs and
+     * punctuation are shared between Chinese and Japanese, so they only set a tentative
+     * {@code CHINESE_TRADITIONAL} guess that is returned if nothing conclusive follows.</p>
+     *
+     * @param input the text to inspect, may be null
+     * @return the guessed language, or {@code Language.UNKNOWN} if the input is null, empty or
+     *         contains none of the recognized characters
+     */
+    public static Language guessLanguage(String input) {
+        if (input == null || input.length() == 0) {
+            return Language.UNKNOWN;
+        }
+
+        // Current best guess, used for characters that are ambiguous between Chinese and Japanese.
+        Language soFar = Language.UNKNOWN;
+
+        // Iterate by code point, not by char: the supplementary-plane CJK blocks checked below
+        // (Extension B, Compatibility Ideographs Supplement) are encoded as surrogate pairs and
+        // can never be recognized from a single UTF-16 code unit.
+        for (int i = 0; i < input.length(); ) {
+            int c = input.codePointAt(i);
+            i += Character.charCount(c);
+            Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
+
+            // Check some special cases for Korean. Korean doesn't
+            // overlap with Japanese or Chinese, so this is a good test.
+            if ((c >= 0x3200 && c < 0x3220) ||  // parenthesized hangul
+                (c >= 0x3260 && c < 0x3280) ||  // circled hangul
+                (c >= 0xFFA0 && c < 0xFFE0) ||  // halfwidth hangul
+                (c == 0x302E || c == 0x302F) || // hangul tone mark
+
+                // standard Hangul character blocks
+                block == Character.UnicodeBlock.HANGUL_SYLLABLES ||
+                block == Character.UnicodeBlock.HANGUL_JAMO ||
+                block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) {
+                return Language.KOREAN;
+            }
+
+            // Katakana phonetic extensions (see http://www.unicode.org/charts/PDF/U31F0.pdf) are
+            // matched by explicit range rather than by block. The original author noted that
+            // Character.UnicodeBlock classified this range as unassigned although it is assigned
+            // per the chart above. The remaining tests are the standard Japanese-only blocks.
+            if ((0x31f0 <= c && c <= 0x31ff) ||
+                block == Character.UnicodeBlock.HIRAGANA ||
+                block == Character.UnicodeBlock.KATAKANA ||
+                block == Character.UnicodeBlock.KANBUN) {
+                return Language.JAPANESE;
+            }
+
+            if (block == Character.UnicodeBlock.CJK_COMPATIBILITY ||
+                block == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS ||
+                block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS ||
+                block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT ||
+                block == Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT ||
+                block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION ||
+                block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
+                block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A ||
+                block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B) {
+                // seeing one of these chars, we assume that the text is Chinese, until more concrete evidence is found
+                soFar = Language.CHINESE_TRADITIONAL;
+            }
+            if (block == Character.UnicodeBlock.BOPOMOFO ||
+                block == Character.UnicodeBlock.BOPOMOFO_EXTENDED) {
+                return Language.CHINESE_TRADITIONAL;
+            }
+            if (block == Character.UnicodeBlock.THAI) {
+                return Language.THAI;
+            }
+        }
+        // got to the end, so return the current best guess
+        return soFar;
+    }
+
+    /** Returns whether the byte has the bit pattern {@code 10xxxxxx} of a UTF-8 trailing octet. */
+    private static boolean isTrailingOctet(byte b) {
+        return (b & 0xC0) == 0x80;
+    }
+
+    /**
+     * If the byte is a UTF-8 leading octet, returns how many trailing octets are expected after it.
+     * The legacy 5- and 6-octet forms ({@code 111110xx}, {@code 1111110x}) are accepted as well.
+     *
+     * @return 0 for a 7-bit byte, 1 to 5 for a multi-octet leading byte, or -1 if the byte cannot
+     *         start a sequence (a trailing octet, {@code 0xFE} or {@code 0xFF})
+     */
+    private static int isLeadingFor(byte c) {
+        int i = c & 0xff;
+        if ((i & 0x80) == 0) {
+            return 0;                       // 0xxxxxxx
+        } else if ((i & 0xE0) == 0xC0) {
+            return 1;                       // 110xxxxx
+        } else if ((i & 0xF0) == 0xE0) {
+            return 2;                       // 1110xxxx
+        } else if ((i & 0xF8) == 0xF0) {
+            return 3;                       // 11110xxx
+        } else if ((i & 0xFC) == 0xF8) {
+            return 4;                       // 111110xx
+        } else if ((i & 0xFE) == 0xFC) {
+            return 5;                       // 1111110x
+        } else {
+            return -1;
+        }
+    }
+
+    /**
+     * Guesses the encoding of {@code input[offset .. offset + length)} from its byte structure.
+     * Only the leading/trailing octet structure is checked; decoded values are not validated.
+     *
+     * @param input  the array holding the text bytes
+     * @param offset index of the first byte to inspect
+     * @param length number of bytes to inspect
+     * @return {@code "US-ASCII"} if every byte in the range is 7-bit (including an empty range),
+     *         the name of {@code Utf8.getCharset()} if the range contains at least one multi-octet
+     *         sequence and all sequences are structurally well-formed,
+     *         and {@code "ISO-8859-1"} otherwise
+     */
+    private static String guessEncoding(byte[] input, int offset, int length) {
+        final int end = offset + length;
+        boolean hasHighs = false;
+        int i = offset;
+        while (i < end) {
+            final int trailing = isLeadingFor(input[i]);
+            if (trailing == 0) { // plain 7-bit byte
+                i++;
+                continue;
+            }
+            // Not a valid leading octet, or the sequence would run past the end of the range.
+            if (trailing < 0 || i + trailing >= end) {
+                return "ISO-8859-1";
+            }
+            for (int k = 1; k <= trailing; k++) {
+                if (!isTrailingOctet(input[i + k])) {
+                    return "ISO-8859-1";
+                }
+            }
+            hasHighs = true;
+            i += trailing + 1;
+        }
+        return hasHighs ? Utf8.getCharset().name() : "US-ASCII";
+    }
+}