Split out tokenizer and test it explicitly.

author: Henning Baldersheim <balder@yahoo-inc.com> 2024-01-11 13:49:52 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2024-01-11 13:49:52 +0000
commit: 8a14af615bee86a178ea4838cc91d2079d9007aa (patch)
tree: 27aca677675d33cebefec09367f580f61f31a54b /streamingvisitors/src/vespa
parent: b4b5bd584110601471abf51bc59f29752e295fca (diff)
7 files changed, 75 insertions, 57 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
index 1a9238346b0..40aad418b22 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
@@ -17,6 +17,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT
     intfieldsearcher.cpp
     nearest_neighbor_field_searcher.cpp
     strchrfieldsearcher.cpp
+    tokenizereader.cpp
     utf8flexiblestringfieldsearcher.cpp
     utf8strchrfieldsearcher.cpp
     utf8stringfieldsearcherbase.cpp
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
new file mode 100644
index 00000000000..d8a6091fe11
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
@@ -0,0 +1,21 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokenizereader.h"
+
+namespace vsm {
+
+void // Appends the folded form of c at the output position _q; called by normalize() for LOWERCASE_AND_FOLD.
+TokenizeReader::fold(ucs4_t c) {
+    const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); // nullptr selects the single-character path below
+    if (repl != nullptr) {
+        size_t repllen = strlen(repl);
+        if (repllen > 0) { // an empty replacement writes nothing, so c contributes no output
+            _q = Fast_UnicodeUtil::ucs4copy(_q,repl); // presumably returns the position just past the copied characters - TODO confirm
+        }
+    } else {
+        c = Fast_NormalizeWordFolder::lowercase_and_fold(c); // one-to-one character mapping
+        *_q++ = c; // NOTE(review): neither branch checks the remaining capacity of the output buffer
+    }
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
new file mode 100644
index 00000000000..76ca2e8d24b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
@@ -0,0 +1,50 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/searchlib/query/streaming/querynoderesultbase.h>
+#include <vespa/searchlib/query/base.h>
+#include <vespa/fastlib/text/normwordfolder.h>
+
+namespace vsm {
+
+class TokenizeReader { // Decodes characters from a UTF-8 byte range and writes normalized ucs4_t characters into a buffer supplied by the caller.
+public:
+    using byte = search::byte;
+    using Normalizing = search::streaming::Normalizing;
+    TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept // p: start of input, len: input length in bytes, q: start of output buffer
+        : _p(p),
+          _p_end(p + len),
+          _q(q),
+          _q_start(q)
+    {}
+    ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } // decodes the character at _p; presumably advances _p past it - TODO confirm
+    void normalize(ucs4_t c, Normalizing normalize_mode) { // appends c to the output, transformed as selected by normalize_mode
+        switch (normalize_mode) {
+            case Normalizing::LOWERCASE:
+                c = Fast_NormalizeWordFolder::lowercase(c); // lowercase only, then share the store below
+                [[fallthrough]];
+            case Normalizing::NONE:
+                *_q++ = c; // NONE stores c unchanged
+                break;
+            case Normalizing::LOWERCASE_AND_FOLD:
+                fold(c); // may write zero, one or several output characters
+                break;
+        }
+    }
+    bool hasNext() const noexcept { return _p < _p_end; } // true while the read position is before the end of the input
+    const byte * p() const noexcept { return _p; } // current read position in the input
+    size_t complete() noexcept { // terminates the current token, rewinds the output and returns the token length
+        *_q = 0; // zero terminator; needs one element of room after the token
+        size_t token_len = _q - _q_start; // counted in ucs4_t elements, terminator excluded
+        _q = _q_start; // rewind so the next token is written over this one
+        return token_len;
+    }
+private:
+    void fold(ucs4_t c); // defined in tokenizereader.cpp
+    const byte *_p; // read position in the input
+    const byte *_p_end; // one past the last input byte (p + len)
+    ucs4_t     *_q; // write position in the output; NOTE(review): no capacity is tracked, presumably the caller sizes the buffer - verify
+    ucs4_t     *_q_start; // start of the output buffer, restored by complete()
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index fa1fc83728c..37dc4ffb99c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -1,5 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "utf8strchrfieldsearcher.h"
+#include "tokenizereader.h"
 
 using search::streaming::QueryTerm;
 using search::streaming::QueryTermList;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index ce63f55ea63..d9ac47a3431 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,6 +1,7 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "utf8stringfieldsearcherbase.h"
+#include "tokenizereader.h"
 #include <cassert>
 
 using search::streaming::QueryTerm;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 115cddce619..b196f2795a4 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -2,7 +2,6 @@
 #pragma once
 
 #include "strchrfieldsearcher.h"
-#include <vespa/fastlib/text/normwordfolder.h>
 
 namespace vsm {
 
@@ -61,62 +60,6 @@ public:
 protected:
     SharedSearcherBuf _buf;
 
-    using byte = search::byte;
-
-    class TokenizeReader {
-    public:
-        TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept
-            : _p(p),
-              _p_end(p + len),
-              _q(q),
-              _q_start(q)
-        {}
-        ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
-        void normalize(ucs4_t c, Normalizing normalize_mode) {
-            switch (normalize_mode) {
-                case Normalizing::LOWERCASE:
-                    c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                    [[fallthrough]];
-                case Normalizing::NONE:
-                    *_q++ = c;
-                    break;
-                case Normalizing::LOWERCASE_AND_FOLD:
-                    fold(c);
-                    break;
-            }
-        }
-        bool hasNext() const noexcept { return _p < _p_end; }
-        const byte * p() const noexcept { return _p; }
-        size_t complete() noexcept {
-            *_q = 0;
-            size_t token_len = _q - _q_start;
-            _q = _q_start;
-            return token_len;
-        }
-    private:
-        void fold(ucs4_t c) {
-            const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-            if (repl != nullptr) {
-                size_t repllen = strlen(repl);
-                if (repllen > 0) {
-                    _q = Fast_UnicodeUtil::ucs4copy(_q,repl);
-                }
-            } else {
-                c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                *_q++ = c;
-            }
-        }
-        void lowercase(ucs4_t c) {
-            c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-            *_q++ = c;
-        }
-        const byte *_p;
-        const byte *_p_end;
-        ucs4_t     *_q;
-        ucs4_t     *_q_start;
-    };
-
-
     template<typename Reader>
     void tokenize(Reader & reader);
 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index 4318d5fe1a3..8bbacf168cf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -1,5 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "utf8suffixstringfieldsearcher.h"
+#include "tokenizereader.h"
 
 using search::byte;
 using search::streaming::QueryTerm;
author	Henning Baldersheim <balder@yahoo-inc.com>	2024-01-11 13:49:52 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2024-01-11 13:49:52 +0000
commit	8a14af615bee86a178ea4838cc91d2079d9007aa (patch)
tree	27aca677675d33cebefec09367f580f61f31a54b /streamingvisitors/src/vespa
parent	b4b5bd584110601471abf51bc59f29752e295fca (diff)