Split out tokenizer and test it explicitly.

author: Henning Baldersheim <balder@yahoo-inc.com> 2024-01-11 13:49:52 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2024-01-11 13:49:52 +0000
commit: 8a14af615bee86a178ea4838cc91d2079d9007aa (patch)
tree: 27aca677675d33cebefec09367f580f61f31a54b /streamingvisitors/src
parent: b4b5bd584110601471abf51bc59f29752e295fca (diff)
8 files changed, 96 insertions, 57 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 74d8fdc4bf3..6ed9ee9dace 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -15,6 +15,7 @@
 #include <vespa/vsm/searcher/utf8substringsearcher.h>
 #include <vespa/vsm/searcher/utf8substringsnippetmodifier.h>
 #include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h>
+#include <vespa/vsm/searcher/tokenizereader.h>
 #include <vespa/vsm/vsm/snippetmodifier.h>
 
 using namespace document;
@@ -871,4 +872,24 @@ TEST("counting of words") {
     assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits()));
 }
 
+vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization"; // shared test input: mixed case plus the non-ASCII letters å and Ø
+
+void
+verifyNormalization(Normalizing normalizing, size_t expected_len, const char * expected) { // feeds NormalizationInput through TokenizeReader in the given mode and checks length and content
+    ucs4_t buf[256]; // output buffer for the normalized characters; TokenizeReader is not given its size, so it must be large enough for the whole result plus terminator
+    TokenizeReader reader(reinterpret_cast<const search::byte *>(NormalizationInput.c_str()), NormalizationInput.size(), buf);
+    while (reader.hasNext()) { // consume the entire input, one decoded character per iteration
+        reader.normalize(reader.next(), normalizing);
+    }
+    size_t len = reader.complete(); // zero-terminates buf and returns the number of ucs4_t elements written
+    EXPECT_EQUAL(expected_len, len);
+    EXPECT_EQUAL(0,  Fast_UnicodeUtil::utf8cmp(expected, buf)); // expected is a char string while buf holds ucs4_t; the test expects 0, presumably meaning "equal" - confirm against utf8cmp
+}
+
+TEST("test normalizing") { // checks each Normalizing mode of TokenizeReader against the shared NormalizationInput
+    verifyNormalization(Normalizing::NONE, 52, NormalizationInput.c_str()); // NONE: output is expected to equal the input, 52 characters
+    verifyNormalization(Normalizing::LOWERCASE, 52, "test that somehing happens with during nårmølization"); // LOWERCASE: only case changes, å/ø are kept, length stays 52
+    verifyNormalization(Normalizing::LOWERCASE_AND_FOLD, 54, "test that somehing happens with during naarmoelization"); // folding expands å -> aa and ø -> oe, hence 2 extra characters
+}
+
 TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
index 1a9238346b0..40aad418b22 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
@@ -17,6 +17,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT
     intfieldsearcher.cpp
     nearest_neighbor_field_searcher.cpp
     strchrfieldsearcher.cpp
+    tokenizereader.cpp
     utf8flexiblestringfieldsearcher.cpp
     utf8strchrfieldsearcher.cpp
     utf8stringfieldsearcherbase.cpp
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
new file mode 100644
index 00000000000..d8a6091fe11
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
@@ -0,0 +1,21 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokenizereader.h"
+
+namespace vsm {
+
+void
+TokenizeReader::fold(ucs4_t c) { // appends the lowercased-and-folded form of c to the output position _q
+    const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); // non-null when the folder supplies a replacement string for c
+    if (repl != nullptr) {
+        size_t repllen = strlen(repl);
+        if (repllen > 0) { // an empty replacement writes nothing, so c is dropped from the output
+            _q = Fast_UnicodeUtil::ucs4copy(_q,repl); // continue writing from the returned position; presumably just past the copied characters - confirm against ucs4copy
+        }
+    } else {
+        c = Fast_NormalizeWordFolder::lowercase_and_fold(c); // no replacement string: c maps to exactly one output character
+        *_q++ = c;
+    }
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
new file mode 100644
index 00000000000..76ca2e8d24b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
@@ -0,0 +1,50 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/searchlib/query/streaming/querynoderesultbase.h>
+#include <vespa/searchlib/query/base.h>
+#include <vespa/fastlib/text/normwordfolder.h>
+
+namespace vsm {
+
+class TokenizeReader { // reads UTF-8 characters from a byte range and writes their normalized form into a caller-provided ucs4_t buffer
+public:
+    using byte = search::byte;
+    using Normalizing = search::streaming::Normalizing;
+    TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept // p/len: input bytes; q: output buffer (its size is not passed, so writes are never bounds-checked)
+        : _p(p),
+          _p_end(p + len),
+          _q(q),
+          _q_start(q)
+    {}
+    ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } // decodes the character at _p; presumably advances _p past it - confirm against GetUTF8Char
+    void normalize(ucs4_t c, Normalizing normalize_mode) { // appends c to the output, transformed according to normalize_mode
+        switch (normalize_mode) {
+            case Normalizing::LOWERCASE:
+                c = Fast_NormalizeWordFolder::lowercase(c);
+                [[fallthrough]]; // LOWERCASE shares the single-character store with NONE
+            case Normalizing::NONE:
+                *_q++ = c;
+                break;
+            case Normalizing::LOWERCASE_AND_FOLD:
+                fold(c); // may write zero, one or several characters
+                break;
+        }
+    }
+    bool hasNext() const noexcept { return _p < _p_end; } // true while unread input bytes remain
+    const byte * p() const noexcept { return _p; } // current read position in the input
+    size_t complete() noexcept { // finishes the current token and returns its length in ucs4_t elements
+        *_q = 0; // zero-terminate; the buffer needs one slot beyond the token for this
+        size_t token_len = _q - _q_start;
+        _q = _q_start; // rewind so the next token is written from the start of the buffer
+        return token_len;
+    }
+private:
+    void fold(ucs4_t c);
+    const byte *_p; // next input byte to read
+    const byte *_p_end; // one past the last input byte
+    ucs4_t     *_q; // next output slot to write
+    ucs4_t     *_q_start; // start of the output buffer
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index fa1fc83728c..37dc4ffb99c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -1,5 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "utf8strchrfieldsearcher.h"
+#include "tokenizereader.h"
 
 using search::streaming::QueryTerm;
 using search::streaming::QueryTermList;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index ce63f55ea63..d9ac47a3431 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,6 +1,7 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "utf8stringfieldsearcherbase.h"
+#include "tokenizereader.h"
 #include <cassert>
 
 using search::streaming::QueryTerm;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 115cddce619..b196f2795a4 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -2,7 +2,6 @@
 #pragma once
 
 #include "strchrfieldsearcher.h"
-#include <vespa/fastlib/text/normwordfolder.h>
 
 namespace vsm {
 
@@ -61,62 +60,6 @@ public:
 protected:
     SharedSearcherBuf _buf;
 
-    using byte = search::byte;
-
-    class TokenizeReader {
-    public:
-        TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept
-            : _p(p),
-              _p_end(p + len),
-              _q(q),
-              _q_start(q)
-        {}
-        ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
-        void normalize(ucs4_t c, Normalizing normalize_mode) {
-            switch (normalize_mode) {
-                case Normalizing::LOWERCASE:
-                    c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                    [[fallthrough]];
-                case Normalizing::NONE:
-                    *_q++ = c;
-                    break;
-                case Normalizing::LOWERCASE_AND_FOLD:
-                    fold(c);
-                    break;
-            }
-        }
-        bool hasNext() const noexcept { return _p < _p_end; }
-        const byte * p() const noexcept { return _p; }
-        size_t complete() noexcept {
-            *_q = 0;
-            size_t token_len = _q - _q_start;
-            _q = _q_start;
-            return token_len;
-        }
-    private:
-        void fold(ucs4_t c) {
-            const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-            if (repl != nullptr) {
-                size_t repllen = strlen(repl);
-                if (repllen > 0) {
-                    _q = Fast_UnicodeUtil::ucs4copy(_q,repl);
-                }
-            } else {
-                c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                *_q++ = c;
-            }
-        }
-        void lowercase(ucs4_t c) {
-            c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-            *_q++ = c;
-        }
-        const byte *_p;
-        const byte *_p_end;
-        ucs4_t     *_q;
-        ucs4_t     *_q_start;
-    };
-
-
     template<typename Reader>
     void tokenize(Reader & reader);
 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index 4318d5fe1a3..8bbacf168cf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -1,5 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "utf8suffixstringfieldsearcher.h"
+#include "tokenizereader.h"
 
 using search::byte;
 using search::streaming::QueryTerm;
author	Henning Baldersheim <balder@yahoo-inc.com>	2024-01-11 13:49:52 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2024-01-11 13:49:52 +0000
commit	8a14af615bee86a178ea4838cc91d2079d9007aa (patch)
tree	27aca677675d33cebefec09367f580f61f31a54b /streamingvisitors/src
parent	b4b5bd584110601471abf51bc59f29752e295fca (diff)