diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-12-21 10:56:49 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-12-21 10:56:49 +0000 |
commit | 543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch) | |
tree | f62fb82c65a6152fabe944caa4e719051f4ab032 /vespalib | |
parent | ef3db955e75e6df68a2a358feb5b95e44979377f (diff) |
- Provide separate methods for plain lowercasing and for combined lowercasing and folding.
- Make the character lookup tables private and expose them through accessor methods.
- Minor code cleanup.
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/src/vespa/fastlib/text/normwordfolder.cpp | 23 | ||||
-rw-r--r-- | vespalib/src/vespa/fastlib/text/normwordfolder.h | 98 | ||||
-rw-r--r-- | vespalib/src/vespa/fastlib/text/wordfolder.h | 8 |
3 files changed, 61 insertions, 68 deletions
Review note: in normwordfolder.h below, lowercase_and_fold_ascii() reads _foldCase and
lowercase_ascii() reads _lowerCase, so that each accessor uses the table its name refers to
(matching lowercase_and_fold() and lowercase() in the same hunk). The page this was taken
from had the two table names the other way round. Indentation and blank context lines were
lost in extraction and are reconstructed; line counts agree with the hunk headers.

diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
index ef6d17e20f1..f9dbf202fcb 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
@@ -13,7 +13,9 @@ bool Fast_NormalizeWordFolder::_doMulticharExpansion = false;
 
 bool Fast_NormalizeWordFolder::_isWord[128];
 ucs4_t Fast_NormalizeWordFolder::_foldCase[767]; // Up to Latin Extended B (0x0250)
+ucs4_t Fast_NormalizeWordFolder::_lowerCase[767];
 ucs4_t Fast_NormalizeWordFolder::_foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00)
+ucs4_t Fast_NormalizeWordFolder::_lowerCaseHighAscii[256];
 ucs4_t Fast_NormalizeWordFolder::_kanaMap[192];
 ucs4_t Fast_NormalizeWordFolder::_halfwidth_fullwidthMap[240];
 
@@ -43,11 +45,10 @@ Fast_NormalizeWordFolder::Initialize()
         for (i = 0; i < 128; i++)
             _isWord[i] = Fast_UnicodeUtil::IsWordChar(i);
         for (i = 0; i < 767; i++) {
-            _foldCase[i] = Fast_UnicodeUtil::ToLower(i);
+            _foldCase[i] = _lowerCase[i] = Fast_UnicodeUtil::ToLower(i);
         }
-
         for (i = 0x1E00; i < 0x1F00; i++) {
-            _foldCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
+            _foldCaseHighAscii[i - 0x1E00] = _lowerCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
         }
 
         if (_doAccentRemoval) {
@@ -394,17 +395,11 @@ Fast_NormalizeWordFolder::Fast_NormalizeWordFolder()
 }
 
 
-Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder(void)
-{
-}
+Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder() = default;
 
 const char*
-Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
-                                       const char *bufend,
-                                       ucs4_t *dstbuf,
-                                       ucs4_t *dstbufend,
-                                       const char*& origstart,
-                                       size_t& tokenlen) const
+Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+                                       ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const
 {
 
     ucs4_t c;
@@ -451,7 +446,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
                     if (repllen > 0)
                         q = Fast_UnicodeUtil::ucs4copy(q,repl);
                 } else {
-                    c = ToFold(c);
+                    c = lowercase_and_fold(c);
                     *q++ = c;
                 }
             }
@@ -563,7 +558,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
                     if (repllen > 0)
                         q = Fast_UnicodeUtil::ucs4copy(q,repl);
                 } else {
-                    c = ToFold(c);
+                    c = lowercase_and_fold(c);
                     *q++ = c;
                 }
                 if (q >= eq) { // Junk rest of word
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h
index d7b07b698c9..c596b0fd2b4 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h
@@ -11,21 +11,6 @@
  */
 class Fast_NormalizeWordFolder : public Fast_WordFolder
 {
-private:
-    static bool _isInitialized;
-
-    /** Features */
-    static bool _doAccentRemoval;
-    static bool _doSharpSSubstitution;
-    static bool _doLigatureSubstitution;
-    static bool _doMulticharExpansion;
-
-    /**
-     * Freeze the config, either from call to Setup, environment
-     * or defaults.
-     */
-    static void Initialize();
-
 public:
     enum {
         DO_ACCENT_REMOVAL = 0x1 << 0,
@@ -37,6 +22,10 @@ public:
         DO_LIGATURE_SUBSTITUTION = 0x1 << 6,
         DO_MULTICHAR_EXPANSION = 0x1 << 7
     };
+    Fast_NormalizeWordFolder();
+    ~Fast_NormalizeWordFolder() override;
+    const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+                             ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
     /**
      * Setup behaviour prior to constructing an object.
      * Not needed if default behaviour is wanted. The default is
@@ -46,33 +35,38 @@ public:
      * added together.
      */
     static void Setup(uint32_t flags);
-
-public:
-    /** character tables */
-    static bool _isWord[128];
-    static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusize (0x02FF)
-    static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese)
-private:
-    /** Map the values from range 0x3040 (0) - 0x30FF (191). */
-    static ucs4_t _kanaMap[192];
-    static ucs4_t _halfwidth_fullwidthMap[240];
-public:
-    static ucs4_t ToFold(ucs4_t testchar) {
-        if (testchar < 767)
-            return _foldCase[testchar];
-        else if (testchar >= 0x1E00 && testchar < 0x1F00)
-            return _foldCaseHighAscii[testchar - 0x1E00];
+    static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _foldCase[c]; }
+    static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _lowerCase[c]; }
+    static bool is_wordchar_ascii7bit(ucs4_t c) noexcept { return _isWord[c]; }
+    static ucs4_t lowercase(ucs4_t c) {
+        if (c < 767)
+            return _lowerCase[c];
+        else if (c >= 0x1E00 && c < 0x1F00)
+            return _lowerCaseHighAscii[c - 0x1E00];
+        else
+            if (c >= 0x3040 && c < 0x3100)
+                return _kanaMap[c - 0x3040];
+            else
+                if (c >= 0xFF00 && c < 0xFFF0)
+                    return _halfwidth_fullwidthMap[c - 0xFF00];
+                else
+                    return Fast_UnicodeUtil::ToLower(c);
+    }
+    static ucs4_t lowercase_and_fold(ucs4_t c) {
+        if (c < 767)
+            return _foldCase[c];
+        else if (c >= 0x1E00 && c < 0x1F00)
+            return _foldCaseHighAscii[c - 0x1E00];
         else
-            if (testchar >= 0x3040 && testchar < 0x3100)
-                return _kanaMap[testchar - 0x3040];
+            if (c >= 0x3040 && c < 0x3100)
+                return _kanaMap[c - 0x3040];
             else
-                if (testchar >= 0xFF00 && testchar < 0xFFF0)
-                    return _halfwidth_fullwidthMap[testchar - 0xFF00];
+                if (c >= 0xFF00 && c < 0xFFF0)
+                    return _halfwidth_fullwidthMap[c - 0xFF00];
                 else
-                    return Fast_UnicodeUtil::ToLower(testchar);
+                    return Fast_UnicodeUtil::ToLower(c);
     }
 
-public:
     static const char *ReplacementString(ucs4_t testchar) {
         if (testchar < 0xc4 || testchar > 0x1f3) {
             return nullptr;
@@ -150,18 +144,26 @@ public:
     }
 private:
     /**
-     * Check if the given char is a word character or used
-     * for interlinear annotation.
-     * @param c The character to check.
-     * @return true if c is a word character, or interlinear annotation syntax characters.
+     * Freeze the config, either from call to Setup, environment
+     * or defaults.
      */
+    static void Initialize();
     static bool IsWordCharOrIA(ucs4_t c) {
-        return Fast_UnicodeUtil::IsWordChar(c)
-            || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB;
+        return Fast_UnicodeUtil::IsWordChar(c) || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB;
     }
-public:
-    Fast_NormalizeWordFolder();
-    ~Fast_NormalizeWordFolder() override;
-    const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
-                             ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
+
+    /** character tables */
+    static bool _isWord[128];
+    static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusize (0x02FF)
+    static ucs4_t _lowerCase[767];
+    static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese)
+    static ucs4_t _lowerCaseHighAscii[256];
+    /** Map the values from range 0x3040 (0) - 0x30FF (191). */
+    static ucs4_t _kanaMap[192];
+    static ucs4_t _halfwidth_fullwidthMap[240];
+    static bool _isInitialized;
+    static bool _doAccentRemoval;
+    static bool _doSharpSSubstitution;
+    static bool _doLigatureSubstitution;
+    static bool _doMulticharExpansion;
 };
diff --git a/vespalib/src/vespa/fastlib/text/wordfolder.h b/vespalib/src/vespa/fastlib/text/wordfolder.h
index e5412859f3e..ac8c590be7c 100644
--- a/vespalib/src/vespa/fastlib/text/wordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/wordfolder.h
@@ -7,10 +7,6 @@ class Fast_WordFolder
 {
 public:
     virtual ~Fast_WordFolder() = default;
-    virtual const char* UCS4Tokenize(const char *buf,
-                                     const char *bufend,
-                                     ucs4_t *dstbuf,
-                                     ucs4_t *dstbufend,
-                                     const char*& origstart,
-                                     size_t& tokenlen) const = 0;
+    virtual const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+                                     ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const = 0;
 };