- Separate methods for lowercasing, and lowercasing and folding.

- Hide implementations and use accessors.
- Minor code cleanup.
author: Henning Baldersheim <balder@yahoo-inc.com> 2023-12-21 10:56:49 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2023-12-21 10:56:49 +0000
commit: 543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch)
tree: f62fb82c65a6152fabe944caa4e719051f4ab032 /vespalib
parent: ef3db955e75e6df68a2a358feb5b95e44979377f (diff)
3 files changed, 61 insertions, 68 deletions
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
index ef6d17e20f1..f9dbf202fcb 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
@@ -13,7 +13,9 @@ bool Fast_NormalizeWordFolder::_doMulticharExpansion = false;
 bool Fast_NormalizeWordFolder::_isWord[128];
 
 ucs4_t Fast_NormalizeWordFolder::_foldCase[767]; // Up to Latin Extended B (0x0250)
+ucs4_t Fast_NormalizeWordFolder::_lowerCase[767];
 ucs4_t Fast_NormalizeWordFolder::_foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00)
+ucs4_t Fast_NormalizeWordFolder::_lowerCaseHighAscii[256];
 ucs4_t Fast_NormalizeWordFolder::_kanaMap[192];
 ucs4_t Fast_NormalizeWordFolder::_halfwidth_fullwidthMap[240];
 
@@ -43,11 +45,10 @@ Fast_NormalizeWordFolder::Initialize()
             for (i = 0; i < 128; i++)
                 _isWord[i] = Fast_UnicodeUtil::IsWordChar(i);
             for (i = 0; i < 767; i++) {
-                _foldCase[i] = Fast_UnicodeUtil::ToLower(i);
+                _foldCase[i] = _lowerCase[i] = Fast_UnicodeUtil::ToLower(i);
             }
-
             for (i = 0x1E00; i < 0x1F00; i++) {
-                _foldCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
+                _foldCaseHighAscii[i - 0x1E00] = _lowerCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
             }
 
             if (_doAccentRemoval) {
@@ -394,17 +395,11 @@ Fast_NormalizeWordFolder::Fast_NormalizeWordFolder()
 }
 
 
-Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder(void)
-{
-}
+Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder() = default;
 
 const char*
-Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
-                                   const char *bufend,
-                                   ucs4_t *dstbuf,
-                                   ucs4_t *dstbufend,
-                                   const char*& origstart,
-                                   size_t& tokenlen) const
+Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+                                       ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const
 {
 
     ucs4_t c;
@@ -451,7 +446,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
             if (repllen > 0)
                 q = Fast_UnicodeUtil::ucs4copy(q,repl);
         } else {
-            c = ToFold(c);
+            c = lowercase_and_fold(c);
             *q++ = c;
         }
     }
@@ -563,7 +558,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
                     if (repllen > 0)
                         q = Fast_UnicodeUtil::ucs4copy(q,repl);
                 } else {
-                    c = ToFold(c);
+                    c = lowercase_and_fold(c);
                     *q++ = c;
                 }
                 if (q >= eq) {		// Junk rest of word
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h
index d7b07b698c9..c596b0fd2b4 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h
@@ -11,21 +11,6 @@
  */
 class Fast_NormalizeWordFolder : public Fast_WordFolder
 {
-private:
-    static bool _isInitialized;
-
-    /** Features */
-    static bool _doAccentRemoval;
-    static bool _doSharpSSubstitution;
-    static bool _doLigatureSubstitution;
-    static bool _doMulticharExpansion;
-
-    /**
-     * Freeze the config, either from call to Setup, environment
-     * or defaults.
-     */
-    static void Initialize();
-
 public:
     enum {
         DO_ACCENT_REMOVAL =           0x1 << 0,
@@ -37,6 +22,10 @@ public:
         DO_LIGATURE_SUBSTITUTION =    0x1 << 6,
         DO_MULTICHAR_EXPANSION =      0x1 << 7
     };
+    Fast_NormalizeWordFolder();
+    ~Fast_NormalizeWordFolder() override;
+    const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+                             ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
     /**
      * Setup behaviour prior to constructing an object.
      * Not needed if default behaviour is wanted. The default is
@@ -46,33 +35,38 @@ public:
      *              added together.
      */
     static void Setup(uint32_t flags);
-
-public:
-    /** character tables */
-    static bool _isWord[128];
-    static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusize (0x02FF)
-    static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese)
-private:
-    /** Map the values from range 0x3040 (0) - 0x30FF (191). */
-    static ucs4_t _kanaMap[192];
-    static ucs4_t _halfwidth_fullwidthMap[240];
-public:
-    static ucs4_t ToFold(ucs4_t testchar) {
-        if (testchar < 767)
-            return _foldCase[testchar];
-        else if (testchar >= 0x1E00 && testchar < 0x1F00)
-            return _foldCaseHighAscii[testchar - 0x1E00];
+    static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _foldCase[c]; } // unchecked lookup in the table used by lowercase_and_fold(); c must be < 767
+    static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _lowerCase[c]; } // unchecked lookup in the table used by lowercase(); c must be < 767
+    static bool is_wordchar_ascii7bit(ucs4_t c) noexcept { return _isWord[c]; } // unchecked lookup; _isWord has 128 entries, so c must be < 128
+    /** Lowercase c via the _lowerCase tables for c < 767 and 0x1E00 - 0x1EFF.
+     *  0x3040 - 0x30FF and 0xFF00 - 0xFFEF go through _kanaMap and _halfwidth_fullwidthMap;
+     *  every other code point is delegated to Fast_UnicodeUtil::ToLower. */
+    static ucs4_t lowercase(ucs4_t c) {
+        if (c < 767)
+            return _lowerCase[c];
+        if (c >= 0x1E00 && c < 0x1F00)
+            return _lowerCaseHighAscii[c - 0x1E00];
+        if (c >= 0x3040 && c < 0x3100)
+            return _kanaMap[c - 0x3040];
+        if (c >= 0xFF00 && c < 0xFFF0)
+            return _halfwidth_fullwidthMap[c - 0xFF00];
+        return Fast_UnicodeUtil::ToLower(c);
+    }
+    static ucs4_t lowercase_and_fold(ucs4_t c) { // same range dispatch as lowercase(), but reads the _foldCase tables
+        if (c < 767) // _foldCase has 767 entries: indices 0 .. 766
+            return _foldCase[c];
+        else if (c >= 0x1E00 && c < 0x1F00) // 256-entry table, offset by 0x1E00
+            return _foldCaseHighAscii[c - 0x1E00];
         else
-            if (testchar >= 0x3040 && testchar < 0x3100)
-                return _kanaMap[testchar - 0x3040];
+            if (c >= 0x3040 && c < 0x3100) // _kanaMap has 192 entries: 0x3040 .. 0x30FF
+                return _kanaMap[c - 0x3040];
             else
-                if (testchar >= 0xFF00 && testchar < 0xFFF0)
-                    return _halfwidth_fullwidthMap[testchar - 0xFF00];
+                if (c >= 0xFF00 && c < 0xFFF0) // 240-entry table: 0xFF00 .. 0xFFEF
+                    return _halfwidth_fullwidthMap[c - 0xFF00];
                 else
-                    return Fast_UnicodeUtil::ToLower(testchar);
+                    return Fast_UnicodeUtil::ToLower(c); // no precomputed table covers this code point
     }
 
-public:
     static const char *ReplacementString(ucs4_t testchar) {
         if (testchar < 0xc4 || testchar > 0x1f3) {
             return nullptr;
@@ -150,18 +144,26 @@ public:
     }
 private:
     /**
-     * Check if the given char is a word character or used
-     * for interlinear annotation.
-     * @param c The character to check.
-     * @return true if c is a word character, or interlinear annotation syntax characters.
+     * Freeze the config, either from call to Setup, environment
+     * or defaults.
      */
+    static void Initialize();
     static bool IsWordCharOrIA(ucs4_t c) {
-        return Fast_UnicodeUtil::IsWordChar(c)
-            || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB;
+        return Fast_UnicodeUtil::IsWordChar(c) || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB; // true for a word character or an interlinear annotation syntax character
     }
-public:
-    Fast_NormalizeWordFolder();
-    ~Fast_NormalizeWordFolder() override;
-    const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
-                             ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
+
+    /** character tables */
+    static bool _isWord[128]; // Indexed by code points 0 - 127
+    static ucs4_t _foldCase[767]; // Code points 0x0000 - 0x02FE (through Spacing Modifier Letters, 0x02FF itself excluded)
+    static ucs4_t _lowerCase[767]; // Same index range as _foldCase
+    static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1EFF) (incl. Vietnamese)
+    static ucs4_t _lowerCaseHighAscii[256]; // Same index range as _foldCaseHighAscii
+    /** Map the values from range 0x3040 (0) - 0x30FF (191). */
+    static ucs4_t _kanaMap[192];
+    static ucs4_t _halfwidth_fullwidthMap[240]; // Indexed by (c - 0xFF00) for c in 0xFF00 - 0xFFEF
+    static bool   _isInitialized;
+    static bool   _doAccentRemoval;
+    static bool   _doSharpSSubstitution;
+    static bool   _doLigatureSubstitution;
+    static bool   _doMulticharExpansion;
 };
diff --git a/vespalib/src/vespa/fastlib/text/wordfolder.h b/vespalib/src/vespa/fastlib/text/wordfolder.h
index e5412859f3e..ac8c590be7c 100644
--- a/vespalib/src/vespa/fastlib/text/wordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/wordfolder.h
@@ -7,10 +7,6 @@ class Fast_WordFolder
 {
 public:
     virtual ~Fast_WordFolder() = default;
-    virtual const char* UCS4Tokenize(const char *buf,
-                                     const char *bufend,
-                                     ucs4_t *dstbuf,
-                                     ucs4_t *dstbufend,
-                                     const char*& origstart,
-                                     size_t& tokenlen) const = 0;
+    virtual const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+                                     ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const = 0;
 };
author	Henning Baldersheim <balder@yahoo-inc.com>	2023-12-21 10:56:49 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2023-12-21 10:56:49 +0000
commit	543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch)
tree	f62fb82c65a6152fabe944caa4e719051f4ab032 /vespalib
parent	ef3db955e75e6df68a2a358feb5b95e44979377f (diff)