From 02c5bce07737a899726097e577c6dd1121ca5a7c Mon Sep 17 00:00:00 2001
From: Henning Baldersheim <balder@yahoo-inc.com>
Date: Tue, 9 Jan 2024 07:35:14 +0000
Subject: Simplify ancient carefully hand optimized code in favour of simple
 readable code

---
 .../vsm/searcher/utf8stringfieldsearcherbase.cpp   | 119 ++++-----------------
 1 file changed, 21 insertions(+), 98 deletions(-)

(limited to 'streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp')
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 1148083b042..ce63f55ea63 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,7 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "utf8stringfieldsearcherbase.h"
-#include <vespa/fastlib/text/normwordfolder.h>
 #include <cassert>
 
 using search::streaming::QueryTerm;
@@ -10,107 +9,36 @@ using search::byte;
 
 namespace vsm {
 
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
-    if (maxSz > 0) {
-        maxSz--;
-    }
-    ucs4_t c(*p);
-    ucs4_t *q(dstbuf);
-    const byte * end(p+maxSz);
-
-    // Skip non-word characters between words
-    for (; p < end; ) {
-        if (c < 128) {
-            if (!c) { break; }
-            p++;
-            if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
-                *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
-                c = 0;
-            } else {
-                c = *p;
-            }
-        } else {
-            c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
-            if (Fast_UnicodeUtil::IsWordChar(c)) {
-                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-                if (repl != nullptr) {
-                    size_t repllen = strlen(repl);
-                    if (repllen > 0) {
-                        q = Fast_UnicodeUtil::ucs4copy(q,repl);
-                    }
-                } else {
-                    c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                    *q++ = c;
-                }
-                break;
-            } else {
-                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
-                    _badUtf8Count++;
-                }
-                c = *p;
-            }
-        }
-    }
-
-    c = *p;  // Next char
-    for (; p < end;) {
-        if (c < 128) {             // Common case, ASCII
-            if (!c) { break; }
-            p++;
-            if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
-                c = 0;
-            } else {
-                *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
-                c = *p;
-            }
-        } else {
-            c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
-            if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
-                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-                if (repl != nullptr) {
-                    size_t repllen = strlen(repl);
-                    if (repllen > 0) {
-                        q = Fast_UnicodeUtil::ucs4copy(q,repl);
-                    }
-                } else {
-                    c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                    *q++ = c;
-                }
-
-                c = *p;
-            } else {
-                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
-                    _badUtf8Count++;
-                }
-                break;
-            }
+template<typename Reader>
+void
+UTF8StringFieldSearcherBase::tokenize(Reader & reader) {
+    ucs4_t c(0);
+    Normalizing norm_mode = normalize_mode();
+    while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next()));
+
+    if (Fast_UnicodeUtil::IsWordChar(c)) {
+        reader.normalize(c, norm_mode);
+        while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) {
+            reader.normalize(c, norm_mode);
         }
     }
-    *q = 0;
-    tokenlen = q - dstbuf;
-    return p;
 }
 
 size_t
 UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
 {
     termcount_t words(0);
-    const byte * n = reinterpret_cast<const byte *> (f.data());
-    // __builtin_prefetch(n, 0, 0);
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
-    const byte * e = n + f.size();
     if ( f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
-    cmptype_t * fn = &(*_buf.get())[0];
-    size_t fl(0);
+    cmptype_t * fn = _buf->data();
 
-    for( ; n < e; ) {
-        if (!*n) { _zeroCount++; n++; }
-        n = tokenize(n, _buf->capacity(), fn, fl);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t fl = reader.complete();
         if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
             const cmptype_t *tt=term, *et=term+tsz;
             for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -185,22 +113,17 @@ size_t
 UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
 {
     termcount_t words = 0;
-    const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
-    const byte * srcend = srcbuf + f.size();
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
     if (f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
-    cmptype_t * dstbuf = &(*_buf.get())[0];
-    size_t tokenlen = 0;
+    cmptype_t * dstbuf = _buf->data();
 
-    for( ; srcbuf < srcend; ) {
-        if (*srcbuf == 0) {
-            ++_zeroCount;
-            ++srcbuf;
-        }
-        srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t tokenlen = reader.complete();
         if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
             addHit(qt, words);
         }
-- 
cgit v1.2.3