From 02c5bce07737a899726097e577c6dd1121ca5a7c Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 9 Jan 2024 07:35:14 +0000 Subject: Simplify ancient carefully hand optimized code in favour of simple readable code --- .../vsm/searcher/utf8stringfieldsearcherbase.cpp | 119 ++++----------------- 1 file changed, 21 insertions(+), 98 deletions(-) (limited to 'streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp') diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index 1148083b042..ce63f55ea63 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -1,7 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8stringfieldsearcherbase.h" -#include #include using search::streaming::QueryTerm; @@ -10,107 +9,36 @@ using search::byte; namespace vsm { -const byte * -UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen) -{ - if (maxSz > 0) { - maxSz--; - } - ucs4_t c(*p); - ucs4_t *q(dstbuf); - const byte * end(p+maxSz); - - // Skip non-word characters between words - for (; p < end; ) { - if (c < 128) { - if (!c) { break; } - p++; - if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = 0; - } else { - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (Fast_UnicodeUtil::IsWordChar(c)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - break; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - c = *p; - } - } - } - - c = *p; // Next char - for (; p < end;) { - if (c < 128) { // Common case, ASCII - if (!c) { break; } - p++; - if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - c = 0; - } else { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - - c = *p; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - break; - } +template +void +UTF8StringFieldSearcherBase::tokenize(Reader & reader) { + ucs4_t c(0); + Normalizing norm_mode = normalize_mode(); + while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); + + if (Fast_UnicodeUtil::IsWordChar(c)) { + reader.normalize(c, norm_mode); + while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { + reader.normalize(c, norm_mode); } } - *q = 0; - tokenlen = q - dstbuf; - return p; } size_t UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) { termcount_t words(0); - const byte * n = reinterpret_cast (f.data()); - // __builtin_prefetch(n, 0, 0); const cmptype_t * term; termsize_t tsz = qt.term(term); - const byte * e = n + f.size(); if ( f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); + cmptype_t * fn = _buf->data(); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { const cmptype_t *tt=term, *et=term+tsz; for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); @@ -185,22 +113,17 @@ size_t UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) { termcount_t words = 0; - const byte * srcbuf = reinterpret_cast (f.data()); - const byte * srcend = srcbuf + f.size(); const cmptype_t * term; termsize_t tsz = qt.term(term); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; + cmptype_t * dstbuf = _buf->data(); - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { addHit(qt, words); } -- cgit v1.2.3