blob: 0e1327aabcf8cdc95878a704baf324fa1a4c71df (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;
/**
 * Determines the class of a given character. Use this rather than java.lang.Character.
 *
 * <p>All methods take a Unicode code point as an {@code int}. None of them throw for
 * invalid or unassigned code points; such input is simply classified as "not a member".
 *
 * @author bratseth
 */
public class CharacterClasses {

    /** Last code point of the Unicode "Basic Latin" block (the block starts at U+0000). */
    private static final int BASIC_LATIN_LAST = 0x7F;

    /** First of the CJK bracket punctuation code points treated as word characters (U+3008). */
    private static final int CJK_BRACKET_FIRST = 0x3008;

    /** Last of the CJK bracket punctuation code points treated as word characters (U+3011). */
    private static final int CJK_BRACKET_LAST = 0x3011;

    /**
     * Returns true for code points which are letters according to {@link Character#isLetter(int)},
     * plus some additional characters which are useful to view as letters even though they are
     * not defined as such in unicode:
     * <ul>
     *   <li>digits outside the Basic Latin block (they are not parsed as numbers, see
     *       {@link #isLatinDigit(int)}),</li>
     *   <li>the CJK bracket punctuation U+3008 through U+3011 (ticket 3864695: some CJK
     *       punctuation YST defined as word characters),</li>
     *   <li>non-spacing, combining spacing and enclosing marks.</li>
     * </ul>
     * The underscore is not considered a letter.
     *
     * @param c the code point to classify
     * @return true if the code point should be treated as a letter
     */
    public boolean isLetter(int c) {
        if (Character.isLetter(c)) {
            return true;
        }
        // Non-latin digits are not considered digits here, so treat them as letters
        if (Character.isDigit(c) && !isLatin(c)) {
            return true;
        }
        // The listed CJK punctuation marks form one contiguous range
        if (c >= CJK_BRACKET_FIRST && c <= CJK_BRACKET_LAST) {
            return true;
        }
        // Marks attach to a preceding letter and must stay part of the same word
        int type = Character.getType(c);
        return type == Character.NON_SPACING_MARK
                || type == Character.COMBINING_SPACING_MARK
                || type == Character.ENCLOSING_MARK;
    }

    /**
     * Returns true for code points which should be considered digits - same as
     * {@link Character#isDigit(int)}.
     *
     * @param c the code point to classify
     * @return true if the code point is a digit in any script
     */
    public boolean isDigit(int c) {
        return Character.isDigit(c);
    }

    /**
     * Returns true if this is a latin digit (other digits are not consistently parsed
     * into numbers by Java).
     *
     * @param c the code point to classify
     * @return true if the code point is a digit within the Basic Latin block
     */
    public boolean isLatinDigit(int c) {
        return Character.isDigit(c) && isLatin(c);
    }

    /**
     * Returns true if this is a latin character, i.e. a code point in the Unicode
     * Basic Latin block (U+0000 through U+007F).
     *
     * <p>This is a plain range check rather than a {@code Character.UnicodeBlock.of} lookup:
     * that lookup returns null for code points which belong to no block and throws for
     * invalid code points, whereas this method returns false in both cases.
     *
     * @param c the code point to classify
     * @return true if the code point is in the Basic Latin block
     */
    public boolean isLatin(int c) {
        return c >= 0 && c <= BASIC_LATIN_LAST;
    }

    /**
     * Convenience, returns isLetter(c) || isDigit(c)
     *
     * @param c the code point to classify
     * @return true if the code point is a letter or a digit
     */
    public boolean isLetterOrDigit(int c) {
        return isLetter(c) || isDigit(c);
    }

}
|