summaryrefslogtreecommitdiffstats
path: root/vespajlib/src/main/java/com/yahoo/text/Text.java
blob: 2b670e5d727ae50f601c5225596f89dda2c6c35b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
package com.yahoo.text;

/**
 * Text utility functions.
 * 
 * @author bratseth
 */
public final class Text {

    /** Lookup table for the 7-bit ASCII range: true at the index of each character accepted as text. */
    private static final boolean[] allowedAsciiChars = new boolean[0x80];

    static {
        // Array elements default to false, so only the accepted characters are set here.
        // All other control characters below 0x20 (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F) stay disallowed.
        allowedAsciiChars[0x9] = true;  //tab
        allowedAsciiChars[0xA] = true;  //nl
        allowedAsciiChars[0xD] = true;  //cr
        for (int i = 0x20; i < 0x7F; i++) {
            allowedAsciiChars[i] = true;  //printable ascii chars
        }
        allowedAsciiChars[0x7F] = true;  //del - discouraged, but allowed
    }

    /** No instantiation */
    private Text() {}

    /**
     * Returns whether the given codepoint is a valid text character, potentially suitable for
     * purposes such as indexing and display, see http://www.w3.org/TR/2006/REC-xml11-20060816/#charsets
     * <p>
     * The following are <b>not</b> text characters:
     * <ul>
     *   <li>values outside the Unicode range, i.e. negative or above 0x10FFFF</li>
     *   <li>ASCII control characters other than tab, newline and carriage return</li>
     *   <li>surrogate code points, 0xD800-0xDFFF</li>
     *   <li>the noncharacters 0xFDD0-0xFDDF</li>
     *   <li>the last two code points of every plane, 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ... 0x10FFFE, 0x10FFFF</li>
     * </ul>
     *
     * @param codepoint the Unicode code point to check; any int value is accepted
     * @return true if the code point is a valid text character, false otherwise
     */
    public static boolean isTextCharacter(int codepoint) {
        // The link above notes that 0x7F-0x84 and 0x86-0x9F are discouraged, but they are still allowed -
        // see http://www.w3.org/International/questions/qa-controls

        // Not a code point at all; must be checked before indexing into the ASCII table
        if (codepoint <  0)                         return false;
        if (codepoint <  0x80)                      return allowedAsciiChars[codepoint];
        if (codepoint >  Character.MAX_CODE_POINT)  return false;

        // A surrogate is only meaningful as half of a UTF-16 pair, never as a character by itself
        if (codepoint >= Character.MIN_SURROGATE
            && codepoint <= Character.MAX_SURROGATE) return false;

        if (codepoint >= 0xFDD0 && codepoint <= 0xFDDF) return false;

        // The two last code points of each of the 17 planes (xxFFFE and xxFFFF) are noncharacters
        return (codepoint & 0xFFFF) < 0xFFFE;
    }

}