aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2023-12-21 10:56:49 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2023-12-21 10:56:49 +0000
commit543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch)
treef62fb82c65a6152fabe944caa4e719051f4ab032
parentef3db955e75e6df68a2a358feb5b95e44979377f (diff)
- Separate methods for lowercasing, and lowercasing and folding.
- Hide implementations and use accessors. - Minor code cleanup.
-rw-r--r--lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/common/sortspec.cpp2
-rw-r--r--searchsummary/src/vespa/juniper/tokenizer.cpp15
-rw-r--r--searchsummary/src/vespa/juniper/tokenizer.h16
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp26
-rw-r--r--vespalib/src/vespa/fastlib/text/normwordfolder.cpp23
-rw-r--r--vespalib/src/vespa/fastlib/text/normwordfolder.h98
-rw-r--r--vespalib/src/vespa/fastlib/text/wordfolder.h8
8 files changed, 91 insertions, 99 deletions
diff --git a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp
index c723470f0fb..3aa2bbe5a86 100644
--- a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp
+++ b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp
@@ -27,7 +27,7 @@ main(int argc, char ** argv)
ref.getline(refBuf, 128);
ucs4_t inputChar = getUCS4Char(inputBuf);
ucs4_t refChar = getUCS4Char(refBuf);
- ucs4_t lowerChar = wordFolder.ToFold(inputChar);
+ ucs4_t lowerChar = wordFolder.lowercase_and_fold(inputChar);
Fast_UnicodeUtil::utf8ncopy(lowerBuf, &lowerChar, 128, 1);
if (refChar != lowerChar) {
printf("input(%s,%u,0x%X), lower(%s,%u,0x%X), ref(%s,%u,0x%X) \n",
diff --git a/searchlib/src/vespa/searchlib/common/sortspec.cpp b/searchlib/src/vespa/searchlib/common/sortspec.cpp
index 04bc87f1000..40e2616367f 100644
--- a/searchlib/src/vespa/searchlib/common/sortspec.cpp
+++ b/searchlib/src/vespa/searchlib/common/sortspec.cpp
@@ -30,7 +30,7 @@ LowercaseConverter::onConvert(const ConstBufferRef & src) const
vespalib::Utf8Writer w(_buffer);
while (r.hasMore()) {
ucs4_t c = r.getChar(0xFFFD);
- c = Fast_NormalizeWordFolder::ToFold(c);
+ c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
w.putChar(c);
}
return {_buffer.begin(), _buffer.size()};
diff --git a/searchsummary/src/vespa/juniper/tokenizer.cpp b/searchsummary/src/vespa/juniper/tokenizer.cpp
index cd3c9c410ce..211ffe7054a 100644
--- a/searchsummary/src/vespa/juniper/tokenizer.cpp
+++ b/searchsummary/src/vespa/juniper/tokenizer.cpp
@@ -8,11 +8,10 @@
#include <vespa/log/log.h>
LOG_SETUP(".juniper.tokenizer");
-JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder,
- const char* text, size_t len, ITokenProcessor* successor,
- const juniper::SpecialTokenRegistry * registry) :
+JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder, const char* text, size_t len,
+ ITokenProcessor* successor, const juniper::SpecialTokenRegistry * registry) :
_wordfolder(wordfolder), _text(text), _len(len), _successor(successor), _registry(registry),
- _charpos(0), _wordpos(0)
+ _charpos(0), _wordpos(0), _buffer()
{ }
@@ -32,19 +31,19 @@ void JuniperTokenizer::scan()
const char* src = _text;
const char* src_end = _text + _len;
- const char* startpos = NULL;
+ const char* startpos = nullptr;
ucs4_t* dst = _buffer;
ucs4_t* dst_end = dst + TOKEN_DSTLEN;
size_t result_len;
while (src < src_end)
{
- if (_registry == NULL) {
+ if (_registry == nullptr) {
// explicit prefetching seems to have negative effect with many threads
src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
} else {
const char * tmpSrc = _registry->tokenize(src, src_end, dst, dst_end, startpos, result_len);
- if (tmpSrc == NULL) {
+ if (tmpSrc == nullptr) {
src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
} else {
src = tmpSrc;
@@ -63,6 +62,6 @@ void JuniperTokenizer::scan()
}
token.bytepos = _len;
token.bytelen = 0;
- token.token = NULL;
+ token.token = nullptr;
_successor->handle_end(token);
}
diff --git a/searchsummary/src/vespa/juniper/tokenizer.h b/searchsummary/src/vespa/juniper/tokenizer.h
index 68ef8118f5d..910da3f67ef 100644
--- a/searchsummary/src/vespa/juniper/tokenizer.h
+++ b/searchsummary/src/vespa/juniper/tokenizer.h
@@ -12,8 +12,8 @@ class JuniperTokenizer
{
public:
JuniperTokenizer(const Fast_WordFolder* wordfolder,
- const char* text, size_t len, ITokenProcessor* = NULL,
- const juniper::SpecialTokenRegistry * registry = NULL);
+ const char* text, size_t len, ITokenProcessor* = nullptr,
+ const juniper::SpecialTokenRegistry * registry = nullptr);
inline void SetSuccessor(ITokenProcessor* successor) { _successor = successor; }
void setRegistry(const juniper::SpecialTokenRegistry * registry) { _registry = registry; }
@@ -23,13 +23,13 @@ public:
void scan();
private:
const Fast_WordFolder* _wordfolder;
- const char* _text; // The current input text
- size_t _len; // Length of the text input
- ITokenProcessor* _successor;
+ const char* _text; // The current input text
+ size_t _len; // Length of the text input
+ ITokenProcessor* _successor;
const juniper::SpecialTokenRegistry * _registry;
- off_t _charpos; // Last utf8 character position
- off_t _wordpos; // Offset in numbering of words compared to input (as result of splits)
- ucs4_t _buffer[TOKEN_DSTLEN]; // Temp. buffer to store folding result
+ off_t _charpos; // Last utf8 character position
+ off_t _wordpos; // Offset in numbering of words compared to input (as result of splits)
+ ucs4_t _buffer[TOKEN_DSTLEN]; // Temp. buffer to store folding result
private:
JuniperTokenizer(const JuniperTokenizer&);
JuniperTokenizer& operator=(const JuniperTokenizer&);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 4daea693e95..c31102ec0ab 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -25,8 +25,8 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
if (c < 128) {
if (!c) { break; }
p++;
- if (__builtin_expect(Fast_NormalizeWordFolder::_isWord[c], false)) {
- *q++ = Fast_NormalizeWordFolder::_foldCase[c];
+ if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
+ *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
c = 0;
} else {
c = *p;
@@ -37,13 +37,13 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
if (Fast_UnicodeUtil::IsWordChar(c)) {
_utf8Count[p-oldP-1]++;
const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != NULL) {
+ if (repl != nullptr) {
size_t repllen = strlen(repl);
if (repllen > 0) {
q = Fast_UnicodeUtil::ucs4copy(q,repl);
}
} else {
- c = Fast_NormalizeWordFolder::ToFold(c);
+ c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
*q++ = c;
}
break;
@@ -63,10 +63,10 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
if (c < 128) { // Common case, ASCII
if (!c) { break; }
p++;
- if (__builtin_expect(!Fast_NormalizeWordFolder::_isWord[c], false)) {
+ if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
c = 0;
} else {
- *q++ = Fast_NormalizeWordFolder::_foldCase[c];
+ *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
c = *p;
}
} else {
@@ -75,13 +75,13 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
_utf8Count[p-oldP-1]++;
const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != NULL) {
+ if (repl != nullptr) {
size_t repllen = strlen(repl);
if (repllen > 0) {
q = Fast_UnicodeUtil::ucs4copy(q,repl);
}
} else {
- c = Fast_NormalizeWordFolder::ToFold(c);
+ c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
*q++ = c;
}
@@ -144,9 +144,9 @@ UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt)
bool equal(true);
for (; equal && (n < e) && (term < eterm); term++) {
if (*term < 0x80) {
- equal = (*term == Fast_NormalizeWordFolder::_foldCase[*n++]);
+ equal = (*term == Fast_NormalizeWordFolder::lowercase_ascii(*n++));
} else {
- cmptype_t c = Fast_NormalizeWordFolder::ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
+ cmptype_t c = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
equal = (*term == c);
}
}
@@ -280,12 +280,12 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
if (c < 128) {
p++;
if (!isSeparatorCharacter(c)) {
- dstbuf.onCharacter(Fast_NormalizeWordFolder::_foldCase[c], (oldP - b));
+ dstbuf.onCharacter(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c), (oldP - b));
}
} else {
c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != NULL) {
+ if (repl != nullptr) {
size_t repllen = strlen(repl);
if (repllen > 0) {
ucs4_t * buf = dstbuf.getBuf();
@@ -300,7 +300,7 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
}
}
} else {
- c = Fast_NormalizeWordFolder::ToFold(c);
+ c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
dstbuf.onCharacter(c, (oldP - b));
}
if (c == Fast_UnicodeUtil::_BadUTF8Char) {
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
index ef6d17e20f1..f9dbf202fcb 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
@@ -13,7 +13,9 @@ bool Fast_NormalizeWordFolder::_doMulticharExpansion = false;
bool Fast_NormalizeWordFolder::_isWord[128];
ucs4_t Fast_NormalizeWordFolder::_foldCase[767]; // Up to Latin Extended B (0x0250)
+ucs4_t Fast_NormalizeWordFolder::_lowerCase[767];
ucs4_t Fast_NormalizeWordFolder::_foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00)
+ucs4_t Fast_NormalizeWordFolder::_lowerCaseHighAscii[256];
ucs4_t Fast_NormalizeWordFolder::_kanaMap[192];
ucs4_t Fast_NormalizeWordFolder::_halfwidth_fullwidthMap[240];
@@ -43,11 +45,10 @@ Fast_NormalizeWordFolder::Initialize()
for (i = 0; i < 128; i++)
_isWord[i] = Fast_UnicodeUtil::IsWordChar(i);
for (i = 0; i < 767; i++) {
- _foldCase[i] = Fast_UnicodeUtil::ToLower(i);
+ _foldCase[i] = _lowerCase[i] = Fast_UnicodeUtil::ToLower(i);
}
-
for (i = 0x1E00; i < 0x1F00; i++) {
- _foldCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
+ _foldCaseHighAscii[i - 0x1E00] = _lowerCaseHighAscii[i - 0x1E00] = Fast_UnicodeUtil::ToLower(i);
}
if (_doAccentRemoval) {
@@ -394,17 +395,11 @@ Fast_NormalizeWordFolder::Fast_NormalizeWordFolder()
}
-Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder(void)
-{
-}
+Fast_NormalizeWordFolder::~Fast_NormalizeWordFolder() = default;
const char*
-Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
- const char *bufend,
- ucs4_t *dstbuf,
- ucs4_t *dstbufend,
- const char*& origstart,
- size_t& tokenlen) const
+Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+ ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const
{
ucs4_t c;
@@ -451,7 +446,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
if (repllen > 0)
q = Fast_UnicodeUtil::ucs4copy(q,repl);
} else {
- c = ToFold(c);
+ c = lowercase_and_fold(c);
*q++ = c;
}
}
@@ -563,7 +558,7 @@ Fast_NormalizeWordFolder::UCS4Tokenize(const char *buf,
if (repllen > 0)
q = Fast_UnicodeUtil::ucs4copy(q,repl);
} else {
- c = ToFold(c);
+ c = lowercase_and_fold(c);
*q++ = c;
}
if (q >= eq) { // Junk rest of word
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h
index d7b07b698c9..c596b0fd2b4 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h
@@ -11,21 +11,6 @@
*/
class Fast_NormalizeWordFolder : public Fast_WordFolder
{
-private:
- static bool _isInitialized;
-
- /** Features */
- static bool _doAccentRemoval;
- static bool _doSharpSSubstitution;
- static bool _doLigatureSubstitution;
- static bool _doMulticharExpansion;
-
- /**
- * Freeze the config, either from call to Setup, environment
- * or defaults.
- */
- static void Initialize();
-
public:
enum {
DO_ACCENT_REMOVAL = 0x1 << 0,
@@ -37,6 +22,10 @@ public:
DO_LIGATURE_SUBSTITUTION = 0x1 << 6,
DO_MULTICHAR_EXPANSION = 0x1 << 7
};
+ Fast_NormalizeWordFolder();
+ ~Fast_NormalizeWordFolder() override;
+ const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+ ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
/**
* Setup behaviour prior to constructing an object.
* Not needed if default behaviour is wanted. The default is
@@ -46,33 +35,38 @@ public:
* added together.
*/
static void Setup(uint32_t flags);
-
-public:
- /** character tables */
- static bool _isWord[128];
- static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusize (0x02FF)
- static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese)
-private:
- /** Map the values from range 0x3040 (0) - 0x30FF (191). */
- static ucs4_t _kanaMap[192];
- static ucs4_t _halfwidth_fullwidthMap[240];
-public:
- static ucs4_t ToFold(ucs4_t testchar) {
- if (testchar < 767)
- return _foldCase[testchar];
- else if (testchar >= 0x1E00 && testchar < 0x1F00)
- return _foldCaseHighAscii[testchar - 0x1E00];
+ static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _foldCase[c]; } // Unchecked lookup in the fold table (same table lowercase_and_fold() uses); c must be < 767.
+ static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _lowerCase[c]; } // Unchecked lookup in the lowercase-only table (same table lowercase() uses); c must be < 767.
+ static bool is_wordchar_ascii7bit(ucs4_t c) noexcept { return _isWord[c]; } // Unchecked lookup in the 128-entry word-character table; c must be < 128.
+ static ucs4_t lowercase(ucs4_t c) { // Lowercase c via the _lowerCase tables; dispatches on code point range.
+ if (c < 767)
+ return _lowerCase[c];
+ else if (c >= 0x1E00 && c < 0x1F00) // Latin Extended Additional range
+ return _lowerCaseHighAscii[c - 0x1E00];
+ else
+ if (c >= 0x3040 && c < 0x3100)
+ return _kanaMap[c - 0x3040]; // NOTE(review): same kana map as lowercase_and_fold(); confirm it is intended on the lowercase-only path.
+ else
+ if (c >= 0xFF00 && c < 0xFFF0)
+ return _halfwidth_fullwidthMap[c - 0xFF00]; // NOTE(review): same halfwidth/fullwidth map as lowercase_and_fold(); confirm it is intended here.
+ else
+ return Fast_UnicodeUtil::ToLower(c); // All other code points: generic lowercasing.
+ }
+ static ucs4_t lowercase_and_fold(ucs4_t c) { // Lookup through the _foldCase tables; same range dispatch as the former ToFold().
+ if (c < 767)
+ return _foldCase[c];
+ else if (c >= 0x1E00 && c < 0x1F00)
+ return _foldCaseHighAscii[c - 0x1E00];
else
- if (testchar >= 0x3040 && testchar < 0x3100)
- return _kanaMap[testchar - 0x3040];
+ if (c >= 0x3040 && c < 0x3100)
+ return _kanaMap[c - 0x3040];
else
- if (testchar >= 0xFF00 && testchar < 0xFFF0)
- return _halfwidth_fullwidthMap[testchar - 0xFF00];
+ if (c >= 0xFF00 && c < 0xFFF0)
+ return _halfwidth_fullwidthMap[c - 0xFF00];
else
- return Fast_UnicodeUtil::ToLower(testchar);
+ return Fast_UnicodeUtil::ToLower(c);
}
-public:
static const char *ReplacementString(ucs4_t testchar) {
if (testchar < 0xc4 || testchar > 0x1f3) {
return nullptr;
@@ -150,18 +144,26 @@ public:
}
private:
/**
- * Check if the given char is a word character or used
- * for interlinear annotation.
- * @param c The character to check.
- * @return true if c is a word character, or interlinear annotation syntax characters.
+ * Freeze the config, either from call to Setup, environment
+ * or defaults.
*/
+ static void Initialize();
static bool IsWordCharOrIA(ucs4_t c) {
- return Fast_UnicodeUtil::IsWordChar(c)
- || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB;
+ return Fast_UnicodeUtil::IsWordChar(c) || c == 0xFFF9 || c == 0xFFFA || c == 0xFFFB; // True for word characters and for the interlinear annotation syntax characters 0xFFF9-0xFFFB.
}
-public:
- Fast_NormalizeWordFolder();
- ~Fast_NormalizeWordFolder() override;
- const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
- ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const override;
+
+ /** character tables */
+ static bool _isWord[128];
+ static ucs4_t _foldCase[767]; // Up to Spacing Modifiers, inclusive (0x02FF)
+ static ucs4_t _lowerCase[767];
+ static ucs4_t _foldCaseHighAscii[256]; // Latin Extended Additional (0x1E00 - 0x1F00) (incl. vietnamese)
+ static ucs4_t _lowerCaseHighAscii[256];
+ /** Map the values from range 0x3040 (0) - 0x30FF (191). */
+ static ucs4_t _kanaMap[192];
+ static ucs4_t _halfwidth_fullwidthMap[240];
+ static bool _isInitialized;
+ static bool _doAccentRemoval;
+ static bool _doSharpSSubstitution;
+ static bool _doLigatureSubstitution;
+ static bool _doMulticharExpansion;
};
diff --git a/vespalib/src/vespa/fastlib/text/wordfolder.h b/vespalib/src/vespa/fastlib/text/wordfolder.h
index e5412859f3e..ac8c590be7c 100644
--- a/vespalib/src/vespa/fastlib/text/wordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/wordfolder.h
@@ -7,10 +7,6 @@ class Fast_WordFolder
{
public:
virtual ~Fast_WordFolder() = default;
- virtual const char* UCS4Tokenize(const char *buf,
- const char *bufend,
- ucs4_t *dstbuf,
- ucs4_t *dstbufend,
- const char*& origstart,
- size_t& tokenlen) const = 0;
+ virtual const char* UCS4Tokenize(const char *buf, const char *bufend, ucs4_t *dstbuf,
+ ucs4_t *dstbufend, const char*& origstart, size_t& tokenlen) const = 0;
};