aboutsummaryrefslogtreecommitdiffstats
path: root/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-01-09 07:35:14 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2024-01-10 08:12:55 +0000
commit02c5bce07737a899726097e577c6dd1121ca5a7c (patch)
treee6c73d2df7f9f2c55322330cbc4ba644a2bbb8e0 /streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
parent4388490c151581bc6e04059baa04b580c80577d3 (diff)
Simplify ancient carefully hand optimized code in favour of simple readable code
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp')
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp119
1 files changed, 21 insertions, 98 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 1148083b042..ce63f55ea63 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,7 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8stringfieldsearcherbase.h"
-#include <vespa/fastlib/text/normwordfolder.h>
#include <cassert>
using search::streaming::QueryTerm;
@@ -10,107 +9,36 @@ using search::byte;
namespace vsm {
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
- if (maxSz > 0) {
- maxSz--;
- }
- ucs4_t c(*p);
- ucs4_t *q(dstbuf);
- const byte * end(p+maxSz);
-
- // Skip non-word characters between words
- for (; p < end; ) {
- if (c < 128) {
- if (!c) { break; }
- p++;
- if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
- *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
- c = 0;
- } else {
- c = *p;
- }
- } else {
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (Fast_UnicodeUtil::IsWordChar(c)) {
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *q++ = c;
- }
- break;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- }
- c = *p;
- }
- }
- }
-
- c = *p; // Next char
- for (; p < end;) {
- if (c < 128) { // Common case, ASCII
- if (!c) { break; }
- p++;
- if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
- c = 0;
- } else {
- *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
- c = *p;
- }
- } else {
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *q++ = c;
- }
-
- c = *p;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- }
- break;
- }
+template<typename Reader>
+void
+UTF8StringFieldSearcherBase::tokenize(Reader & reader) {
+ ucs4_t c(0);
+ Normalizing norm_mode = normalize_mode();
+ while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next()));
+
+ if (Fast_UnicodeUtil::IsWordChar(c)) {
+ reader.normalize(c, norm_mode);
+ while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) {
+ reader.normalize(c, norm_mode);
}
}
- *q = 0;
- tokenlen = q - dstbuf;
- return p;
}
size_t
UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
{
termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- // __builtin_prefetch(n, 0, 0);
const cmptype_t * term;
termsize_t tsz = qt.term(term);
- const byte * e = n + f.size();
if ( f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
+ cmptype_t * fn = _buf->data();
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t fl = reader.complete();
if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
const cmptype_t *tt=term, *et=term+tsz;
for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -185,22 +113,17 @@ size_t
UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
{
termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
const cmptype_t * term;
termsize_t tsz = qt.term(term);
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
+ cmptype_t * dstbuf = _buf->data();
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t tokenlen = reader.complete();
if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
addHit(qt, words);
}