summary refs log tree commit diff stats
path: root/vespalib/src/vespa/fastlib/text/unicodeutil.h
diff options
context:
space:
mode:
Diffstat (limited to 'vespalib/src/vespa/fastlib/text/unicodeutil.h')
-rw-r--r-- vespalib/src/vespa/fastlib/text/unicodeutil.h 243
1 files changed, 243 insertions, 0 deletions
diff --git a/vespalib/src/vespa/fastlib/text/unicodeutil.h b/vespalib/src/vespa/fastlib/text/unicodeutil.h
new file mode 100644
index 00000000000..e155af134fb
--- /dev/null
+++ b/vespalib/src/vespa/fastlib/text/unicodeutil.h
@@ -0,0 +1,243 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Unicode utilities.
+ */
+#pragma once
+
+#include <sys/types.h>
+
+/** ucs4_t holds one UCS4 character; assumes unsigned int is 4 bytes wide (not checked in this header). */
+typedef unsigned int ucs4_t;
+
+/**
+ * Utility class for unicode character handling.
+ * Used to examine properties of unicode characters, and
+ * provide fast conversion methods between often used encodings.
+ */
+class Fast_UnicodeUtil {
+private:
+ /**
+ * NOTE(review): this comment originally described an "initialized" flag
+ * set by InitTables and guarded by _initMutex. No such member is declared
+ * in this class, so that description appears to be stale.
+ */
+
+ /** Two-level lowercase table: 256 pages of 256 elements, read by ToLower as
+ * [c >> 8][c & 255]; a 0 entry means "no mapping". Defined in
+ * unicode-lowercase.cpp, which is autogenerated by the extcase application. */
+ static unsigned short *_compLowerCase[256];
+
+ /** Two-level character property table: 256 pages of 256 elements, each a
+ * bitmask of the property bits in the enum below. Defined in
+ * unicode-charprops.cpp, which is autogenerated by the extprop application. */
+ static unsigned char *_compCharProps[256];
+
+
+ /** Bit flags stored in _compCharProps entries; one bit per character property. */
+ enum {
+ _spaceProp = 1,
+ _wordcharProp = 2,
+ _ideographicProp = 4,
+ _decimalDigitCharProp = 8,
+ _ignorableControlCharProp = 16,
+ _terminalPunctuationCharProp = 32
+ };
+
+public:
+ virtual ~Fast_UnicodeUtil() { }
+ /** Initialize the ISO 8859-1 static tables. */
+ static void InitTables();
+
+ /** Sentinel documented as the result of GetUTF8Char/GetUTF8CharNonAscii on invalid UTF-8. */
+ enum { _BadUTF8Char = 0xfffffffeu };
+
+ /**
+ * Test for word character by checking the _wordcharProp bit of the
+ * _compCharProps entry for testchar. Values >= 65536 always yield false.
+ * Per the original note, characters with the custom _FASTWordProp are
+ * also regarded as word characters, and the previous range in
+ * _privateUseProp is included in the _FASTWordProp set of ranges.
+ * @param testchar the UCS4 character to test.
+ * @return true if testchar is a word character, i.e. if it has
+ * one or more of the properties alphabetic, ideographic,
+ * combining char, decimal digit char, private use, extender.
+ */
+ static bool IsWordChar(ucs4_t testchar) {
+ return (testchar < 65536 &&
+ (_compCharProps[testchar >> 8][testchar & 255] &
+ _wordcharProp) != 0);
+ }
+
+ /**
+ * Get the next UCS4 character from a UTF-8 string buffer.
+ * Modifies the src pointer so that repeated calls walk the string.
+ * Neither overload takes a length argument, so the caller passes no bound;
+ * the char overload only casts and forwards to the unsigned char one.
+ * @param src Reference to a pointer to the current position
+ * in the UTF-8 string.
+ * @return The next UCS4 character, or _BadUTF8Char if the
+ * next character is invalid.
+ */
+ static ucs4_t GetUTF8Char(const unsigned char *& src);
+ static ucs4_t GetUTF8Char(const char *& src) {
+ const unsigned char *temp = reinterpret_cast<const unsigned char *>(src);
+ ucs4_t res = GetUTF8Char(temp);
+ src = reinterpret_cast<const char *>(temp);
+ return res;
+ }
+
+ /**
+ * Write a UCS4 character to dst as 1 to 6 UTF-8 bytes (same length as utf8clen(i)).
+ * @param dst The destination buffer; no bound is checked, no terminator written.
+ * @param i The UCS4 character.
+ * @return Pointer to the position in dst just past the written byte(s).
+ */
+ static char *utf8cput(char *dst, ucs4_t i) {
+ if (i < 128)  // fits in 7 bits: one byte, stored as-is
+ *dst++ = i;
+ else if (i < 0x800) {  // up to 11 bits: two bytes
+ *dst++ = (i >> 6) | 0xc0;
+ *dst++ = (i & 63) | 0x80;
+ } else if (i < 0x10000) {  // up to 16 bits: three bytes
+ *dst++ = (i >> 12) | 0xe0;
+ *dst++ = ((i >> 6) & 63) | 0x80;
+ *dst++ = (i & 63) | 0x80;
+ } else if (i < 0x200000) {  // up to 21 bits: four bytes
+ *dst++ = (i >> 18) | 0xf0;
+ *dst++ = ((i >> 12) & 63) | 0x80;
+ *dst++ = ((i >> 6) & 63) | 0x80;
+ *dst++ = (i & 63) | 0x80;
+ } else if (i < 0x4000000) {  // up to 26 bits: five bytes
+ *dst++ = (i >> 24) | 0xf8;
+ *dst++ = ((i >> 18) & 63) | 0x80;
+ *dst++ = ((i >> 12) & 63) | 0x80;
+ *dst++ = ((i >> 6) & 63) | 0x80;
+ *dst++ = (i & 63) | 0x80;
+ } else {  // every remaining value: six bytes
+ *dst++ = (i >> 30) | 0xfc;
+ *dst++ = ((i >> 24) & 63) | 0x80;
+ *dst++ = ((i >> 18) & 63) | 0x80;
+ *dst++ = ((i >> 12) & 63) | 0x80;
+ *dst++ = ((i >> 6) & 63) | 0x80;
+ *dst++ = (i & 63) | 0x80;
+ }
+ return dst;
+ }
+
+ /**
+ * Copy a UTF-8 string into a UCS4 string (implemented outside this header).
+ * @param dst The UCS4 destination buffer; the signature takes no size for it.
+ * @param src The UTF-8 source buffer.
+ * @return A pointer to the destination string.
+ */
+ static ucs4_t *ucs4copy(ucs4_t *dst, const char *src);
+
+ /**
+ * Get the length of the UTF-8 representation of a UCS4 character.
+ * @param i The UCS4 character.
+ * @return The number of bytes (1 to 6); uses the same thresholds as utf8cput.
+ */
+ static size_t utf8clen(ucs4_t i) {
+ if (i < 128)
+ return 1;
+ else if (i < 0x800)
+ return 2;
+ else if (i < 0x10000)
+ return 3;
+ else if (i < 0x200000)
+ return 4;
+ else if (i < 0x4000000)
+ return 5;
+ else
+ return 6;
+ }
+
+ /**
+ * Lowercase a UCS4 character via the _compLowerCase table.
+ * @param testchar The character to lowercase.
+ * @return The table entry if non-zero; otherwise (or if testchar >= 65536) the input.
+ */
+ static ucs4_t ToLower(ucs4_t testchar)
+ {
+ ucs4_t ret;
+ if (testchar < 65536) {
+ ret = _compLowerCase[testchar >> 8][testchar & 255];
+ if (ret == 0)
+ return testchar;
+ return ret;
+ } else
+ return testchar;
+ }
+
+ /** Move forwards or backwards a number of characters within a UTF-8 buffer.
+ * Modifies pos to yield the new position if possible.
+ * @param start A pointer to the start of the UTF-8 buffer.
+ * @param length The length of the UTF-8 buffer (presumably in bytes; confirm in the .cpp).
+ * @param pos A pointer to the current position within the UTF-8 buffer,
+ * updated to reflect the new position upon return.
+ * @param offset An offset (+/-) in number of UTF-8 characters.
+ * Offset 0 means move to the start of the current character.
+ * @return Number of bytes moved, or -1 if out of range.
+ */
+ static int UTF8move(unsigned const char* start, size_t length,
+ unsigned const char*& pos, off_t offset);
+
+ /**
+ * Find the number of characters in a UCS4 string.
+ * @param str The UCS4 string (presumably zero-terminated; no length is passed).
+ * @return The number of characters.
+ */
+ static size_t ucs4strlen(const ucs4_t *str);
+
+ /**
+ * Convert UCS4 to UTF-8, bounded by max lengths.
+ * @param dst The destination buffer for the UTF-8 string.
+ * @param src The source UCS4 string.
+ * @param maxdst The maximum number of bytes to put into dst.
+ * @param maxsrc The maximum number of characters to convert from src.
+ * @return A pointer to the destination.
+ */
+ static char *utf8ncopy(char *dst, const ucs4_t *src, int maxdst, int maxsrc);
+
+
+ /**
+ * Compare a UTF-8 string to a UCS4 string, analogous to strcmp(3).
+ * @param s1 The UTF-8 string.
+ * @param s2 The UCS4 string.
+ * @return An integer less than, equal to, or greater than zero,
+ * if s1 is, respectively, less than, matching, or greater than s2.
+ * NB: only used in a local test.
+ */
+ static int utf8cmp(const char *s1, const ucs4_t *s2);
+
+ /**
+ * Test for terminal punctuation via the _compCharProps table.
+ * @param testchar the UCS4 character to test.
+ * @return true if testchar < 65536 and its table entry has the
+ * _terminalPunctuationCharProp bit set; false otherwise.
+ */
+ static bool IsTerminalPunctuationChar(ucs4_t testchar) {
+ return (testchar < 65536 &&
+ (_compCharProps[testchar >> 8][testchar & 255] &
+ _terminalPunctuationCharProp) != 0);
+ }
+
+ /**
+ * Get the next UCS4 character from a UTF-8 string buffer.
+ * We assume that the first character in the UTF-8 string is >= 0x80 (non-ascii).
+ * Modifies the src pointer so that repeated calls walk the string.
+ * @param src Reference to a pointer to the current position
+ * in the UTF-8 string.
+ * @return The next UCS4 character, or _BadUTF8Char if the
+ * next character is invalid.
+ */
+ static ucs4_t GetUTF8CharNonAscii(unsigned const char *&src);
+
+ // Convenience overload for char pointers: casts, forwards to the overload above, writes src back.
+ static ucs4_t GetUTF8CharNonAscii(const char *&src) {
+ unsigned const char *temp = reinterpret_cast<unsigned const char *>(src);
+ ucs4_t res = GetUTF8CharNonAscii(temp);
+ src = reinterpret_cast<const char *>(temp);
+ return res;
+ }
+};