summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2023-12-21 10:56:49 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2023-12-21 10:56:49 +0000
commit543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch)
treef62fb82c65a6152fabe944caa4e719051f4ab032 /vespalib
parentef3db955e75e6df68a2a358feb5b95e44979377f (diff)
- Separate methods for lowercasing, and lowercasing and folding.
- Hide implementations and use accessors.
- Minor code cleanup.
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/vespa/fastlib/text/normwordfolder.cpp23
-rw-r--r--vespalib/src/vespa/fastlib/text/normwordfolder.h98
-rw-r--r--vespalib/src/vespa/fastlib/text/wordfolder.h8
3 files changed, 61 insertions, 68 deletions
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
index ef6d17e20f1..f9dbf202fcb 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
@@ -13,7 +13,9 @@ bool Fast_NormalizeWordFolder::_doMulticharExpansion = false;
bool Fast_NormalizeWordFolder::_isWord[128];
ucs4_t Fast_NormalizeWordFolder::_foldCase[767]; // Up to Latin Extended B (0x0250)
+ucs4_t Fast_NormalizeWordFolder::_lowerCase[767];
ucs4_t Fast_NormalizeWordFolder::_foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00)
+ucs4_t Fast_NormalizeWordFolder::_lowerCaseHighAscii[256];
ucs4_t Fast_NormalizeWordFolder::_kanaMap[192];
ucs4_t Fast_NormalizeWordFolder::_halfwidth_fullwidthMap[240];
@@ -43,11 +45,10 @@ Fast_NormalizeWordFolder::Initialize()
for (i = 0; i < 128; i++)
_isWord[i] = Fast_UnicodeUtil::IsWordChar(i);
for (i = 0; i < 767; i++) {
- _foldCase[i] = Fast_UnicodeUtil::ToLower(i);
+ _foldCase[i] = _lowerCase[i] = Fast_UnicodeUtil::ToLower(i);
}
-
for (i = 0x1E00; i < 0x1F00; i++) {
- _foldCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
+ _foldCaseHighAscii[i - 0x1E00] = _lowerCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
}
if (_doAccentRemoval) {
@@ -394,17 +395,11 @@ Fast_NormalizeWordFolder::Fast_NormalizeWordFolder()
}
-Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder(void)
-{
-}
+Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder() = default;
const char*
-Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
- const char *bufend,
- ucs4_t *dstbuf,
- ucs4_t *dstbufend,
- const char*& origstart,
- size_t& tokenlen) const
+Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+ ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const
{
ucs4_t c;
@@ -451,7 +446,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
if (repllen > 0)
q = Fast_UnicodeUtil::ucs4copy(q,repl);
} else {
- c = ToFold(c);
+ c = lowercase_and_fold(c);
*q++ = c;
}
}
@@ -563,7 +558,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
if (repllen > 0)
q = Fast_UnicodeUtil::ucs4copy(q,repl);
} else {
- c = ToFold(c);
+ c = lowercase_and_fold(c);
*q++ = c;
}
if (q >= eq) { // Junk rest of word
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h
index d7b07b698c9..c596b0fd2b4 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h
@@ -11,21 +11,6 @@
*/
class Fast_NormalizeWordFolder : public Fast_WordFolder
{
-private:
- static bool _isInitialized;
-
- /** Features */
- static bool _doAccentRemoval;
- static bool _doSharpSSubstitution;
- static bool _doLigatureSubstitution;
- static bool _doMulticharExpansion;
-
- /**
- * Freeze the config, either from call to Setup, environment
- * or defaults.
- */
- static void Initialize();
-
public:
enum {
DO_ACCENT_REMOVAL = 0x1 << 0,
@@ -37,6 +22,10 @@ public:
DO_LIGATURE_SUBSTITUTION = 0x1 << 6,
DO_MULTICHAR_EXPANSION = 0x1 << 7
};
+ Fast_NormalizeWordFolder();
+ ~Fast_NormalizeWordFolder() override;
+ const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+ ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
/**
* Setup behaviour prior to constructing an object.
* Not needed if default behaviour is wanted. The default is
@@ -46,33 +35,38 @@ public:
* added together.
*/
static void Setup(uint32_t flags);
-
-public:
- /** character tables */
- static bool _isWord[128];
- static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusize (0x02FF)
- static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese)
-private:
- /** Map the values from range 0x3040 (0) - 0x30FF (191). */
- static ucs4_t _kanaMap[192];
- static ucs4_t _halfwidth_fullwidthMap[240];
-public:
- static ucs4_t ToFold(ucs4_t testchar) {
- if (testchar < 767)
- return _foldCase[testchar];
- else if (testchar >= 0x1E00 && testchar < 0x1F00)
- return _foldCaseHighAscii[testchar - 0x1E00];
+ static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _foldCase[c]; } // unchecked lookup in the fold table (same table lowercase_and_fold() uses); caller must guarantee c < 767
+ static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _lowerCase[c]; } // unchecked lookup in the lowercase table (same table lowercase() uses); caller must guarantee c < 767
+ static bool is_wordchar_ascii7bit(ucs4_t c) noexcept { return _isWord[c]; } // unchecked lookup; _isWord has 128 entries, so caller must guarantee c < 128
+ static ucs4_t lowercase(ucs4_t c) { // Lowercase c via the _lowerCase* tables where one covers c, otherwise via Fast_UnicodeUtil::ToLower
+ if (c < 767) // 0x0000 - 0x02FE: direct index into the 767-entry table
+ return _lowerCase[c];
+ else if (c >= 0x1E00 && c < 0x1F00) // 0x1E00 - 0x1EFF: table is offset by 0x1E00
+ return _lowerCaseHighAscii[c - 0x1E00];
+ else
+ if (c >= 0x3040 && c < 0x3100) // 0x3040 - 0x30FF; NOTE(review): this is the same _kanaMap that lowercase_and_fold() uses - confirm that is intended for plain lowercasing
+ return _kanaMap[c - 0x3040];
+ else
+ if (c >= 0xFF00 && c < 0xFFF0) // 0xFF00 - 0xFFEF; NOTE(review): also shared with lowercase_and_fold() - confirm
+ return _halfwidth_fullwidthMap[c - 0xFF00];
+ else
+ return Fast_UnicodeUtil::ToLower(c); // no table covers this code point: delegate to the generic routine
+ }
+ static ucs4_t lowercase_and_fold(ucs4_t c) { // Map c via the _foldCase* tables where one covers c, otherwise via Fast_UnicodeUtil::ToLower
+ if (c < 767) // 0x0000 - 0x02FE: direct index into the 767-entry table
+ return _foldCase[c];
+ else if (c >= 0x1E00 && c < 0x1F00) // 0x1E00 - 0x1EFF: table is offset by 0x1E00
+ return _foldCaseHighAscii[c - 0x1E00];
else
- if (testchar >= 0x3040 && testchar < 0x3100)
- return _kanaMap[testchar - 0x3040];
+ if (c >= 0x3040 && c < 0x3100) // 0x3040 - 0x30FF: table is offset by 0x3040
+ return _kanaMap[c - 0x3040];
else
- if (testchar >= 0xFF00 && testchar < 0xFFF0)
- return _halfwidth_fullwidthMap[testchar - 0xFF00];
+ if (c >= 0xFF00 && c < 0xFFF0) // 0xFF00 - 0xFFEF: table is offset by 0xFF00
+ return _halfwidth_fullwidthMap[c - 0xFF00];
else
- return Fast_UnicodeUtil::ToLower(testchar);
+ return Fast_UnicodeUtil::ToLower(c); // no table covers this code point: delegate to the generic routine
}
-public:
static const char *ReplacementString(ucs4_t testchar) {
if (testchar < 0xc4 || testchar > 0x1f3) {
return nullptr;
@@ -150,18 +144,26 @@ public:
}
private:
/**
- * Check if the given char is a word character or used
- * for interlinear annotation.
- * @param c The character to check.
- * @return true if c is a word character, or interlinear annotation syntax characters.
+ * Freeze the config, either from call to Setup, environment
+ * or defaults.
*/
+ static void Initialize();
static bool IsWordCharOrIA(ucs4_t c) {
- return Fast_UnicodeUtil::IsWordChar(c)
- || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB;
+ return Fast_UnicodeUtil::IsWordChar(c) || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB;
}
-public:
- Fast_NormalizeWordFolder();
- ~Fast_NormalizeWordFolder() override;
- const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
- ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
+
+ /** character tables */
+ static bool _isWord[128]; // indexed by code point 0x00 - 0x7F; filled from Fast_UnicodeUtil::IsWordChar in Initialize()
+ static ucs4_t _foldCase[767]; // Up to Spacing Modifiers: 767 entries cover code points 0x0000 - 0x02FE (0x02FF itself is not covered)
+ static ucs4_t _lowerCase[767]; // same index range as _foldCase
+ static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1EFF, indexed by c - 0x1E00) (incl. vietnamese)
+ static ucs4_t _lowerCaseHighAscii[256]; // same index range as _foldCaseHighAscii
+ /** Map the values from range 0x3040 (0) - 0x30FF (191). */
+ static ucs4_t _kanaMap[192];
+ static ucs4_t _halfwidth_fullwidthMap[240]; // indexed by c - 0xFF00 for code points 0xFF00 - 0xFFEF
+ static bool _isInitialized;
+ static bool _doAccentRemoval; // checked in Initialize() after the case tables are filled
+ static bool _doSharpSSubstitution;
+ static bool _doLigatureSubstitution;
+ static bool _doMulticharExpansion;
};
diff --git a/vespalib/src/vespa/fastlib/text/wordfolder.h b/vespalib/src/vespa/fastlib/text/wordfolder.h
index e5412859f3e..ac8c590be7c 100644
--- a/vespalib/src/vespa/fastlib/text/wordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/wordfolder.h
@@ -7,10 +7,6 @@ class Fast_WordFolder
{
public:
virtual ~Fast_WordFolder() = default;
- virtual const char* UCS4Tokenize(const char *buf,
- const char *bufend,
- ucs4_t *dstbuf,
- ucs4_t *dstbufend,
- const char*& origstart,
- size_t& tokenlen) const = 0;
+ virtual const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+ ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const = 0;
};