summary refs log tree commit diff stats
path: root/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java 48
1 files changed, 48 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java
new file mode 100644
index 00000000000..d7e3f88ae8d
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java
@@ -0,0 +1,48 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+class WordCharDetector { // Classifies Unicode code points as word or non-word characters.
+ public static boolean isWordChar(int codepoint) { // Returns true only when the code point's general category is a letter category.
+ int unicodeGeneralCategory = Character.getType(codepoint); // int overload: takes a full code point, not a UTF-16 char
+ switch (unicodeGeneralCategory) {
+ case Character.LOWERCASE_LETTER:
+ case Character.OTHER_LETTER:
+ case Character.TITLECASE_LETTER:
+ case Character.UPPERCASE_LETTER:
+ case Character.MODIFIER_LETTER:
+ return true; // the five letter categories (Ll, Lo, Lt, Lu, Lm) are the only word characters
+/*
+ * The remaining general categories are listed here for reference only. None of them has a
+ * case label, so they all reach the default branch and are currently non-word characters:
+ case Character.CONNECTOR_PUNCTUATION:
+ case Character.CONTROL:
+ case Character.CURRENCY_SYMBOL:
+ case Character.DASH_PUNCTUATION:
+ case Character.ENCLOSING_MARK:
+ case Character.END_PUNCTUATION:
+ case Character.FINAL_QUOTE_PUNCTUATION:
+ case Character.FORMAT:
+ case Character.INITIAL_QUOTE_PUNCTUATION:
+ case Character.MATH_SYMBOL:
+ case Character.MODIFIER_SYMBOL:
+ case Character.NON_SPACING_MARK:
+ case Character.OTHER_PUNCTUATION:
+ case Character.OTHER_SYMBOL:
+ case Character.PRIVATE_USE:
+ case Character.START_PUNCTUATION:
+ case Character.SURROGATE:
+ case Character.UNASSIGNED:
+ case Character.DECIMAL_DIGIT_NUMBER:
+ case Character.LETTER_NUMBER:
+ case Character.OTHER_NUMBER:
+ case Character.COMBINING_SPACING_MARK:
+ case Character.LINE_SEPARATOR:
+ case Character.SPACE_SEPARATOR:
+ case Character.PARAGRAPH_SEPARATOR:
+ * Note that the number and mark categories are in this list, so digits and combining marks are not word characters.
+ */
+ default: // any category without a case label above, including all those named in the comment
+ return false;
+ }
+ }
+}