1 files changed, 243 insertions, 0 deletions
diff --git a/vespalib/src/vespa/fastlib/text/unicodeutil.h b/vespalib/src/vespa/fastlib/text/unicodeutil.h
new file mode 100644
index 00000000000..e155af134fb
--- /dev/null
+++ b/vespalib/src/vespa/fastlib/text/unicodeutil.h
@@ -0,0 +1,243 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Unicode utilities.
+ */
+#pragma once
+
+#include <sys/types.h>
+
+/** ucs4_t is the type of the 4-byte UCS4 characters */
+typedef unsigned int ucs4_t;
+
+/**
+ * Utility class for unicode character handling.
+ * Used to examine properties of unicode characters, and
+ * provide fast conversion methods between often used encodings.
+ */
+class Fast_UnicodeUtil {
+private:
+    /*
+     * Both tables below are two-level: the high byte of a 16-bit code
+     * point selects a page and the low byte selects the element within
+     * that page, as done by the inline lookups further down.
+     */
+
+    /** Two-level lowercase table: 256 pages of 256 elements each. An element
+     * of 0 means "no mapping" (see ToLower). Defined in unicode-lowercase.cpp,
+     * which is autogenerated by the extcase application. */
+    static unsigned short *_compLowerCase[256];
+
+    /** Two-level character property table: 256 pages of 256 elements each,
+     * every element an OR of the property bits below. Defined in
+     * unicode-charprops.cpp, which is autogenerated by the extprop application. */
+    static unsigned char *_compCharProps[256];
+
+
+    /** Property bits stored in the elements of _compCharProps. */
+    enum {
+        _spaceProp = 0x01,
+        _wordcharProp = 0x02,
+        _ideographicProp = 0x04,
+        _decimalDigitCharProp = 0x08,
+        _ignorableControlCharProp = 0x10,
+        _terminalPunctuationCharProp = 0x20
+    };
+
+public:
+    virtual ~Fast_UnicodeUtil() { }
+    /** Initialize the ISO 8859-1 static tables. */
+    static void InitTables();
+
+    /** Indicates an invalid UTF-8 character sequence. */
+    enum { _BadUTF8Char = 0xfffffffeu };
+
+    /**
+     * Test for word character. A character is a word character when the
+     * _wordcharProp bit is set in its character property table element.
+     * Code points of 65536 and above are never word characters.
+     * NOTE(review): the property list below is the documented intent; the
+     * actual set is decided by the generated table.
+     * @param testchar the UCS4 character to test.
+     * @return true if testchar is a word character, i.e. if it has
+     * one or more of the properties alphabetic, ideographic,
+     * combining char, decimal digit char, private use, extender.
+     */
+    static bool IsWordChar(ucs4_t testchar) {
+        if (testchar >= 65536) { return false; }  // not covered by the table
+        const unsigned char props = _compCharProps[testchar >> 8][testchar & 255];
+        return (props & _wordcharProp) != 0;
+    }
+
+    /**
+     * Get the next UCS4 character from an UTF-8 string buffer.
+     * Modify the src pointer to allow future calls.
+     * The char overload only converts the pointer type and forwards to
+     * the unsigned char overload.
+     * @param src The address of a pointer to the current position
+     *            in the UTF-8 string.
+     * @return The next UCS4 character, or _BadUTF8Char if the
+     *         next character is invalid.
+     */
+    static ucs4_t GetUTF8Char(const unsigned char *& src);
+    static ucs4_t GetUTF8Char(const char *& src) {
+        const unsigned char *cursor = reinterpret_cast<const unsigned char *>(src);
+        const ucs4_t decoded = GetUTF8Char(cursor);
+        src = reinterpret_cast<const char *>(cursor);  // hand the updated position back
+        return decoded;
+    }
+
+    /**
+     * Put an UCS4 character into a buffer as an UTF-8 representation.
+     * @param dst The destination buffer; utf8clen(i) bytes are written, unchecked.
+     * @param i The UCS4 character.
+     * @return Pointer to the next position in dst after the written byte(s).
+     */
+    static char *utf8cput(char *dst, ucs4_t i) {
+        if (i < 128) {                   // 1 byte
+            *dst++ = static_cast<char>(i);
+        } else if (i < 0x800) {          // 2 bytes: 11 payload bits
+            *dst++ = static_cast<char>((i >> 6) | 0xc0);
+            *dst++ = static_cast<char>((i & 63) | 0x80);
+        } else if (i < 0x10000) {        // 3 bytes: 16 payload bits
+            *dst++ = static_cast<char>((i >> 12) | 0xe0);
+            *dst++ = static_cast<char>(((i >> 6) & 63) | 0x80);
+            *dst++ = static_cast<char>((i & 63) | 0x80);
+        } else if (i < 0x200000) {       // 4 bytes: 21 payload bits
+            *dst++ = static_cast<char>((i >> 18) | 0xf0);
+            *dst++ = static_cast<char>(((i >> 12) & 63) | 0x80);
+            *dst++ = static_cast<char>(((i >> 6) & 63) | 0x80);
+            *dst++ = static_cast<char>((i & 63) | 0x80);
+        } else if (i < 0x4000000) {      // 5 bytes: 26 payload bits
+            *dst++ = static_cast<char>((i >> 24) | 0xf8);
+            *dst++ = static_cast<char>(((i >> 18) & 63) | 0x80);
+            *dst++ = static_cast<char>(((i >> 12) & 63) | 0x80);
+            *dst++ = static_cast<char>(((i >> 6) & 63) | 0x80);
+            *dst++ = static_cast<char>((i & 63) | 0x80);
+        } else {  // 6 bytes; NOTE(review): i >= 0x80000000 yields lead byte 0xfe/0xff
+            *dst++ = static_cast<char>((i >> 30) | 0xfc);
+            *dst++ = static_cast<char>(((i >> 24) & 63) | 0x80);
+            *dst++ = static_cast<char>(((i >> 18) & 63) | 0x80);
+            *dst++ = static_cast<char>(((i >> 12) & 63) | 0x80);
+            *dst++ = static_cast<char>(((i >> 6) & 63) | 0x80);
+            *dst++ = static_cast<char>((i & 63) | 0x80);
+        }
+        return dst;
+    }
+
+    /**
+     * Copy an UTF-8 string into an UCS4 string.
+     * @param dst The UCS4 destination buffer.
+     * @param src The UTF-8 source buffer.
+     * @return A pointer to the destination string.
+     */
+    static ucs4_t *ucs4copy(ucs4_t *dst, const char *src);
+
+    /**
+     * Get the length of the UTF-8 representation of an UCS4 character.
+     * @param i The UCS4 character.
+     * @return The number of bytes (1 to 6) that utf8cput writes for i.
+     */
+    static size_t utf8clen(ucs4_t i) {
+        // Guard clauses in ascending order; each bound is the first value needing one more byte.
+        if (i < 0x80)
+            return 1;
+        if (i < 0x800)
+            return 2;
+        if (i < 0x10000)
+            return 3;
+        if (i < 0x200000)
+            return 4;
+        if (i < 0x4000000)
+            return 5;
+        return 6;
+    }
+
+    /**
+     * Lowercase an UCS4 character.
+     * @param testchar The character to lowercase.
+     * @return The lowercase of the input, if defined. Else the input character.
+     */
+    static ucs4_t ToLower(ucs4_t testchar)
+    {
+        // The table only covers 16-bit code points; leave the rest alone.
+        if (testchar >= 65536) {
+            return testchar;
+        }
+        const ucs4_t lowered = _compLowerCase[testchar >> 8][testchar & 255];
+        // A zero element means the table has no lowercase mapping for
+        // this character, so the input is returned unchanged.
+        return (lowered != 0) ? lowered : testchar;
+    }
+
+    /** Move forwards or backwards a number of characters within an UTF8 buffer
+     * Modify pos to yield new position if possible
+     * @param start A pointer to the start of the UTF8 buffer
+     * @param length The length of the UTF8 buffer
+     * @param pos A pointer to the current position within the UTF8 buffer,
+     *            updated to reflect new position upon return
+     * @param offset An offset (+/-) in number of UTF8 characters.
+     *        Offset 0 means move to the start of the current character.
+     * @return Number of bytes moved, or -1 if out of range
+     */
+    static int UTF8move(unsigned const char* start, size_t length,
+                        unsigned const char*& pos, off_t offset);
+
+    /**
+     * Find the number of characters in an UCS4 string.
+     * @param str The UCS4 string.
+     * @return The number of characters.
+     */
+    static size_t ucs4strlen(const ucs4_t *str);
+
+    /**
+     * Convert UCS4 to UTF-8, bounded by max lengths.
+     * @param dst The destination buffer for the UTF-8 string.
+     * @param src The source UCS4 string.
+     * @param maxdst The maximum number of bytes to put into dst.
+     * @param maxsrc The maximum number of characters to convert from src.
+     * @return A pointer to the destination.
+     */
+    static char *utf8ncopy(char *dst, const ucs4_t *src, int maxdst, int maxsrc);
+
+
+    /**
+     * Compare an UTF-8 string to a UCS4 string, analogous to strcmp(3).
+     * @param s1 The UTF-8 string.
+     * @param s2 The UCS4 string.
+     * @return An integer less than, equal to, or greater than zero,
+     *        if s1 is, respectively, less than, matching, or greater than s2.
+     * NB Only used in local test
+     */
+    static int utf8cmp(const char *s1, const ucs4_t *s2);
+
+    /**
+     * Test for terminal punctuation.
+     * @param testchar the UCS4 character to test.
+     * @return true if testchar is below 65536 and its property table
+     *    element has the _terminalPunctuationCharProp bit set.
+     */
+    static bool IsTerminalPunctuationChar(ucs4_t testchar) {
+        if (testchar >= 65536) { return false; }  // not covered by the table
+        const unsigned char props = _compCharProps[testchar >> 8][testchar & 255];
+        return (props & _terminalPunctuationCharProp) != 0;
+    }
+
+    /**
+     * Get the next UCS4 character from an UTF-8 string buffer.
+     * We assume that the first character in the UTF-8 string is >= 0x80 (non-ascii).
+     * Modify the src pointer to allow future calls.
+     * @param src The address of a pointer to the current position
+     *            in the UTF-8 string.
+     * @return The next UCS4 character, or _BadUTF8Char if the
+     *         next character is invalid.
+     */
+    static ucs4_t GetUTF8CharNonAscii(unsigned const char *&src);
+
+    // char overload: converts the pointer type and forwards to the function above
+    static ucs4_t GetUTF8CharNonAscii(const char *&src) {
+        unsigned const char *cursor = reinterpret_cast<unsigned const char *>(src);
+        const ucs4_t decoded = GetUTF8CharNonAscii(cursor);
+        src = reinterpret_cast<const char *>(cursor);  // hand the updated position back
+        return decoded;
+    }
+};