diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-12-21 10:56:49 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-12-21 10:56:49 +0000 |
commit | 543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch) | |
tree | f62fb82c65a6152fabe944caa4e719051f4ab032 | |
parent | ef3db955e75e6df68a2a358feb5b95e44979377f (diff) |
- Separate methods for lowercasing, and lowercasing and folding.
- Hide implementations and use accessors.
- Minor code cleanup.
8 files changed, 91 insertions, 99 deletions
diff --git a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp index c723470f0fb..3aa2bbe5a86 100644 --- a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp +++ b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp @@ -27,7 +27,7 @@ main(int argc, char ** argv) ref.getline(refBuf, 128); ucs4_t inputChar = getUCS4Char(inputBuf); ucs4_t refChar = getUCS4Char(refBuf); - ucs4_t lowerChar = wordFolder.ToFold(inputChar); + ucs4_t lowerChar = wordFolder.lowercase_and_fold(inputChar); Fast_UnicodeUtil::utf8ncopy(lowerBuf, &lowerChar, 128, 1); if (refChar != lowerChar) { printf("input(%s,%u,0x%X), lower(%s,%u,0x%X), ref(%s,%u,0x%X) \n", diff --git a/searchlib/src/vespa/searchlib/common/sortspec.cpp b/searchlib/src/vespa/searchlib/common/sortspec.cpp index 04bc87f1000..40e2616367f 100644 --- a/searchlib/src/vespa/searchlib/common/sortspec.cpp +++ b/searchlib/src/vespa/searchlib/common/sortspec.cpp @@ -30,7 +30,7 @@ LowercaseConverter::onConvert(const ConstBufferRef & src) const vespalib::Utf8Writer w(_buffer); while (r.hasMore()) { ucs4_t c = r.getChar(0xFFFD); - c = Fast_NormalizeWordFolder::ToFold(c); + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); w.putChar(c); } return {_buffer.begin(), _buffer.size()}; diff --git a/searchsummary/src/vespa/juniper/tokenizer.cpp b/searchsummary/src/vespa/juniper/tokenizer.cpp index cd3c9c410ce..211ffe7054a 100644 --- a/searchsummary/src/vespa/juniper/tokenizer.cpp +++ b/searchsummary/src/vespa/juniper/tokenizer.cpp @@ -8,11 +8,10 @@ #include <vespa/log/log.h> LOG_SETUP(".juniper.tokenizer"); -JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder, - const char* text, size_t len, ITokenProcessor* successor, - const juniper::SpecialTokenRegistry * registry) : +JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder, const char* text, size_t len, + ITokenProcessor* successor, const juniper::SpecialTokenRegistry * registry) : _wordfolder(wordfolder), _text(text), _len(len), _successor(successor), _registry(registry), - _charpos(0), _wordpos(0) + _charpos(0), _wordpos(0), _buffer() { } @@ -32,19 +31,19 @@ void JuniperTokenizer::scan() const char* src = _text; const char* src_end = _text + _len; - const char* startpos = NULL; + const char* startpos = nullptr; ucs4_t* dst = _buffer; ucs4_t* dst_end = dst + TOKEN_DSTLEN; size_t result_len; while (src < src_end) { - if (_registry == NULL) { + if (_registry == nullptr) { // explicit prefetching seems to have negative effect with many threads src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len); } else { const char * tmpSrc = _registry->tokenize(src, src_end, dst, dst_end, startpos, result_len); - if (tmpSrc == NULL) { + if (tmpSrc == nullptr) { src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len); } else { src = tmpSrc; @@ -63,6 +62,6 @@ void JuniperTokenizer::scan() } token.bytepos = _len; token.bytelen = 0; - token.token = NULL; + token.token = nullptr; _successor->handle_end(token); } diff --git a/searchsummary/src/vespa/juniper/tokenizer.h b/searchsummary/src/vespa/juniper/tokenizer.h index 68ef8118f5d..910da3f67ef 100644 --- a/searchsummary/src/vespa/juniper/tokenizer.h +++ b/searchsummary/src/vespa/juniper/tokenizer.h @@ -12,8 +12,8 @@ class JuniperTokenizer { public: JuniperTokenizer(const Fast_WordFolder* wordfolder, - const char* text, size_t len, ITokenProcessor* = NULL, - const juniper::SpecialTokenRegistry * registry = NULL); + const char* text, size_t len, ITokenProcessor* = nullptr, + const juniper::SpecialTokenRegistry * registry = nullptr); inline void SetSuccessor(ITokenProcessor* successor) { _successor = successor; } void setRegistry(const juniper::SpecialTokenRegistry * registry) { _registry = registry; } @@ -23,13 +23,13 @@ public: void scan(); private: const Fast_WordFolder* _wordfolder; - const char* _text; // The current input text - size_t _len; // Length of the text input - ITokenProcessor* _successor; + const char* _text; // The current input text + size_t _len; // Length of the text input + ITokenProcessor* _successor; const juniper::SpecialTokenRegistry * _registry; - off_t _charpos; // Last utf8 character position - off_t _wordpos; // Offset in numbering of words compared to input (as result of splits) - ucs4_t _buffer[TOKEN_DSTLEN]; // Temp. buffer to store folding result + off_t _charpos; // Last utf8 character position + off_t _wordpos; // Offset in numbering of words compared to input (as result of splits) + ucs4_t _buffer[TOKEN_DSTLEN]; // Temp. buffer to store folding result private: JuniperTokenizer(const JuniperTokenizer&); JuniperTokenizer& operator=(const JuniperTokenizer&); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index 4daea693e95..c31102ec0ab 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -25,8 +25,8 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * if (c < 128) { if (!c) { break; } p++; - if (__builtin_expect(Fast_NormalizeWordFolder::_isWord[c], false)) { - *q++ = Fast_NormalizeWordFolder::_foldCase[c]; + if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { + *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); c = 0; } else { c = *p; @@ -37,13 +37,13 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * if (Fast_UnicodeUtil::IsWordChar(c)) { _utf8Count[p-oldP-1]++; const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != NULL) { + if (repl != nullptr) { size_t repllen = strlen(repl); if (repllen > 0) { q = Fast_UnicodeUtil::ucs4copy(q,repl); } } else { - c = Fast_NormalizeWordFolder::ToFold(c); + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); *q++ = c; } break; @@ -63,10 +63,10 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * if (c < 128) { // Common case, ASCII if (!c) { break; } p++; - if (__builtin_expect(!Fast_NormalizeWordFolder::_isWord[c], false)) { + if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { c = 0; } else { - *q++ = Fast_NormalizeWordFolder::_foldCase[c]; + *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); c = *p; } } else { @@ -75,13 +75,13 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { _utf8Count[p-oldP-1]++; const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != NULL) { + if (repl != nullptr) { size_t repllen = strlen(repl); if (repllen > 0) { q = Fast_UnicodeUtil::ucs4copy(q,repl); } } else { - c = Fast_NormalizeWordFolder::ToFold(c); + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); *q++ = c; } @@ -144,9 +144,9 @@ UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt) bool equal(true); for (; equal && (n < e) && (term < eterm); term++) { if (*term < 0x80) { - equal = (*term == Fast_NormalizeWordFolder::_foldCase[*n++]); + equal = (*term == Fast_NormalizeWordFolder::lowercase_ascii(*n++)); } else { - cmptype_t c = Fast_NormalizeWordFolder::ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n)); + cmptype_t c = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(n)); equal = (*term == c); } } @@ -280,12 +280,12 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T if (c < 128) { p++; if (!isSeparatorCharacter(c)) { - dstbuf.onCharacter(Fast_NormalizeWordFolder::_foldCase[c], (oldP - b)); + dstbuf.onCharacter(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c), (oldP - b)); } } else { c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != NULL) { + if (repl != nullptr) { size_t repllen = strlen(repl); if (repllen > 0) { ucs4_t * buf = dstbuf.getBuf(); @@ -300,7 +300,7 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T } } } else { - c = Fast_NormalizeWordFolder::ToFold(c); + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); dstbuf.onCharacter(c, (oldP - b)); } if (c == Fast_UnicodeUtil::_BadUTF8Char) { diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp index ef6d17e20f1..f9dbf202fcb 100644 --- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp +++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp @@ -13,7 +13,9 @@ bool Fast_NormalizeWordFolder::_doMulticharExpansion = false; bool Fast_NormalizeWordFolder::_isWord[128]; ucs4_t Fast_NormalizeWordFolder::_foldCase[767]; // Up to Latin Extended B (0x0250) +ucs4_t Fast_NormalizeWordFolder::_lowerCase[767]; ucs4_t Fast_NormalizeWordFolder::_foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) +ucs4_t Fast_NormalizeWordFolder::_lowerCaseHighAscii[256]; ucs4_t Fast_NormalizeWordFolder::_kanaMap[192]; ucs4_t Fast_NormalizeWordFolder::_halfwidth_fullwidthMap[240]; @@ -43,11 +45,10 @@ Fast_NormalizeWordFolder::Initialize() for (i = 0; i < 128; i++) _isWord[i] = Fast_UnicodeUtil::IsWordChar(i); for (i = 0; i < 767; i++) { - _foldCase[i] = Fast_UnicodeUtil::ToLower(i); + _foldCase[i] = _lowerCase[i] = Fast_UnicodeUtil::ToLower(i); } - for (i = 0x1E00; i < 0x1F00; i++) { - _foldCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i); + _foldCaseHighAscii[i - 0x1E00] = _lowerCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i); } if (_doAccentRemoval) { @@ -394,17 +395,11 @@ Fast_NormalizeWordFolder::Fast_NormalizeWordFolder() } -Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder(void) -{ -} +Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder() = default; const char* -Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, - const char *bufend, - ucs4_t *dstbuf, - ucs4_t *dstbufend, - const char*& origstart, - size_t& tokenlen) const +Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf, + ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const { ucs4_t c; @@ -451,7 +446,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, if (repllen > 0) q = Fast_UnicodeUtil::ucs4copy(q,repl); } else { - c = ToFold(c); + c = lowercase_and_fold(c); *q++ = c; } } @@ -563,7 +558,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, if (repllen > 0) q = Fast_UnicodeUtil::ucs4copy(q,repl); } else { - c = ToFold(c); + c = lowercase_and_fold(c); *q++ = c; } if (q >= eq) { // Junk rest of word diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h index d7b07b698c9..c596b0fd2b4 100644 --- a/vespalib/src/vespa/fastlib/text/normwordfolder.h +++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h @@ -11,21 +11,6 @@ */ class Fast_NormalizeWordFolder : public Fast_WordFolder { -private: - static bool _isInitialized; - - /** Features */ - static bool _doAccentRemoval; - static bool _doSharpSSubstitution; - static bool _doLigatureSubstitution; - static bool _doMulticharExpansion; - - /** - * Freeze the config, either from call to Setup, environment - * or defaults. - */ - static void Initialize(); - public: enum { DO_ACCENT_REMOVAL = 0x1 << 0, @@ -37,6 +22,10 @@ public: DO_LIGATURE_SUBSTITUTION = 0x1 << 6, DO_MULTICHAR_EXPANSION = 0x1 << 7 }; + Fast_NormalizeWordFolder(); + ~Fast_NormalizeWordFolder() override; + const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf, + ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override; /** * Setup behaviour prior to constructing an object. * Not needed if default behaviour is wanted. The default is @@ -46,33 +35,38 @@ public: * added together. */ static void Setup(uint32_t flags); - -public: - /** character tables */ - static bool _isWord[128]; - static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusize (0x02FF) - static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese) -private: - /** Map the values from range 0x3040 (0) - 0x30FF (191). */ - static ucs4_t _kanaMap[192]; - static ucs4_t _halfwidth_fullwidthMap[240]; -public: - static ucs4_t ToFold(ucs4_t testchar) { - if (testchar < 767) - return _foldCase[testchar]; - else if (testchar >= 0x1E00 && testchar < 0x1F00) - return _foldCaseHighAscii[testchar - 0x1E00]; + static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _lowerCase[c]; } + static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _foldCase[c]; } + static bool is_wordchar_ascii7bit(ucs4_t c) noexcept { return _isWord[c]; } + static ucs4_t lowercase(ucs4_t c) { + if (c < 767) + return _lowerCase[c]; + else if (c >= 0x1E00 && c < 0x1F00) + return _lowerCaseHighAscii[c - 0x1E00]; + else + if (c >= 0x3040 && c < 0x3100) + return _kanaMap[c - 0x3040]; + else + if (c >= 0xFF00 && c < 0xFFF0) + return _halfwidth_fullwidthMap[c - 0xFF00]; + else + return Fast_UnicodeUtil::ToLower(c); + } + static ucs4_t lowercase_and_fold(ucs4_t c) { + if (c < 767) + return _foldCase[c]; + else if (c >= 0x1E00 && c < 0x1F00) + return _foldCaseHighAscii[c - 0x1E00]; else - if (testchar >= 0x3040 && testchar < 0x3100) - return _kanaMap[testchar - 0x3040]; + if (c >= 0x3040 && c < 0x3100) + return _kanaMap[c - 0x3040]; else - if (testchar >= 0xFF00 && testchar < 0xFFF0) - return _halfwidth_fullwidthMap[testchar - 0xFF00]; + if (c >= 0xFF00 && c < 0xFFF0) + return _halfwidth_fullwidthMap[c - 0xFF00]; else - return Fast_UnicodeUtil::ToLower(testchar); + return Fast_UnicodeUtil::ToLower(c); } -public: static const char *ReplacementString(ucs4_t testchar) { if (testchar < 0xc4 || testchar > 0x1f3) { return nullptr; @@ -150,18 +144,26 @@ public: } private: /** - * Check if the given char is a word character or used - * for interlinear annotation. - * @param c The character to check. - * @return true if c is a word character, or interlinear annotation syntax characters. + * Freeze the config, either from call to Setup, environment + * or defaults. */ + static void Initialize(); static bool IsWordCharOrIA(ucs4_t c) { - return Fast_UnicodeUtil::IsWordChar(c) - || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB; + return Fast_UnicodeUtil::IsWordChar(c) || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB; } -public: - Fast_NormalizeWordFolder(); - ~Fast_NormalizeWordFolder() override; - const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf, - ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override; + + /** character tables */ + static bool _isWord[128]; + static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusize (0x02FF) + static ucs4_t _lowerCase[767]; + static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese) + static ucs4_t _lowerCaseHighAscii[256]; + /** Map the values from range 0x3040 (0) - 0x30FF (191). */ + static ucs4_t _kanaMap[192]; + static ucs4_t _halfwidth_fullwidthMap[240]; + static bool _isInitialized; + static bool _doAccentRemoval; + static bool _doSharpSSubstitution; + static bool _doLigatureSubstitution; + static bool _doMulticharExpansion; }; diff --git a/vespalib/src/vespa/fastlib/text/wordfolder.h b/vespalib/src/vespa/fastlib/text/wordfolder.h index e5412859f3e..ac8c590be7c 100644 --- a/vespalib/src/vespa/fastlib/text/wordfolder.h +++ b/vespalib/src/vespa/fastlib/text/wordfolder.h @@ -7,10 +7,6 @@ class Fast_WordFolder { public: virtual ~Fast_WordFolder() = default; - virtual const char* UCS4Tokenize(const char *buf, - const char *bufend, - ucs4_t *dstbuf, - ucs4_t *dstbufend, - const char*& origstart, - size_t& tokenlen) const = 0; + virtual const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf, + ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const = 0; }; |