diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java |
Publish
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java | 55 |
1 files changed, 55 insertions, 0 deletions
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

/**
 * Determines the class of a given character. Use this rather than java.lang.Character.
 *
 * <p>All methods take a Unicode code point. None of them throw for code points which are
 * unassigned or outside the valid Unicode range; such code points are simply not members
 * of any class.</p>
 *
 * @author bratseth
 */
public class CharacterClasses {

    /** The last code point of the Unicode Basic Latin block, which spans U+0000..U+007F. */
    private static final int LAST_BASIC_LATIN = 0x7F;

    /** The first of the CJK punctuation code points treated as letters: U+3008. */
    private static final int FIRST_CJK_WORD_PUNCTUATION = 0x3008;

    /** The last of the CJK punctuation code points treated as letters: U+3011. */
    private static final int LAST_CJK_WORD_PUNCTUATION = 0x3011;

    /**
     * Returns true for code points which are letters according to java.lang.Character.isLetter,
     * plus some additional characters which are useful to view as letters even though they are not
     * defined as such in unicode:
     * <ul>
     *   <li>digits outside the basic latin block,</li>
     *   <li>the CJK punctuation characters U+3008 through U+3011,</li>
     *   <li>non-spacing, combining spacing and enclosing marks.</li>
     * </ul>
     *
     * @param c the code point to classify
     * @return true if the code point should be treated as a letter
     */
    public boolean isLetter(int c) {
        if (Character.isLetter(c)) return true;

        // Not considering these digits as digits, so treat them as letters
        if (Character.isDigit(c) && ! isLatin(c)) return true;

        // Ticket 3864695, some CJK punctuation YST defined as word characters
        if (c >= FIRST_CJK_WORD_PUNCTUATION && c <= LAST_CJK_WORD_PUNCTUATION) return true;

        int type = Character.getType(c);
        return type == Character.NON_SPACING_MARK ||
               type == Character.COMBINING_SPACING_MARK ||
               type == Character.ENCLOSING_MARK;
    }

    /**
     * Returns true for code points which should be considered digits - same as java.lang.Character.isDigit
     *
     * @param c the code point to classify
     * @return true if the code point is a digit
     */
    public boolean isDigit(int c) {
        return Character.isDigit(c);
    }

    /**
     * Returns true if this is a latin digit (other digits are not consistently parsed into numbers by Java)
     *
     * @param c the code point to classify
     * @return true if the code point is a digit in the basic latin block
     */
    public boolean isLatinDigit(int c) {
        return Character.isDigit(c) && isLatin(c);
    }

    /**
     * Returns true if this is a latin character, that is a code point in the Unicode
     * Basic Latin block (U+0000..U+007F).
     *
     * <p>This is a plain range check rather than a lookup through Character.UnicodeBlock.of,
     * as that returns null for code points which belong to no block and throws for invalid
     * code points. Both cases return false here.</p>
     *
     * @param c the code point to classify
     * @return true if the code point is in the basic latin block
     */
    public boolean isLatin(int c) {
        return c >= 0 && c <= LAST_BASIC_LATIN;
    }

    /**
     * Convenience, returns isLetter(c) || isDigit(c)
     *
     * @param c the code point to classify
     * @return true if the code point is a letter or a digit
     */
    public boolean isLetterOrDigit(int c) {
        return isLetter(c) || isDigit(c);
    }

}