aboutsummaryrefslogtreecommitdiffstats
path: root/streamingvisitors/src/vespa
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-01-11 13:49:52 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2024-01-11 13:49:52 +0000
commit8a14af615bee86a178ea4838cc91d2079d9007aa (patch)
tree27aca677675d33cebefec09367f580f61f31a54b /streamingvisitors/src/vespa
parentb4b5bd584110601471abf51bc59f29752e295fca (diff)
Split out tokenizer and test it explicit.
Diffstat (limited to 'streamingvisitors/src/vespa')
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt1
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp21
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h50
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp1
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp1
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h57
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp1
7 files changed, 75 insertions, 57 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
index 1a9238346b0..40aad418b22 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
@@ -17,6 +17,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT
intfieldsearcher.cpp
nearest_neighbor_field_searcher.cpp
strchrfieldsearcher.cpp
+ tokenizereader.cpp
utf8flexiblestringfieldsearcher.cpp
utf8strchrfieldsearcher.cpp
utf8stringfieldsearcherbase.cpp
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
new file mode 100644
index 00000000000..d8a6091fe11
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
@@ -0,0 +1,21 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokenizereader.h"
+
+namespace vsm {
+
+// Writes the folded form of c to the output: its replacement string when one exists, otherwise the lowercased and folded code point.
+void
+TokenizeReader::fold(ucs4_t c) {
+    const char *replacement = Fast_NormalizeWordFolder::ReplacementString(c);
+    if (replacement == nullptr) {
+        *_q++ = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+        return;
+    }
+    if (replacement[0] != '\0') {
+        // An empty replacement writes nothing; otherwise continue from the position ucs4copy returns.
+        _q = Fast_UnicodeUtil::ucs4copy(_q, replacement);
+    }
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
new file mode 100644
index 00000000000..76ca2e8d24b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
@@ -0,0 +1,50 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/searchlib/query/streaming/querynoderesultbase.h>
+#include <vespa/searchlib/query/base.h>
+#include <vespa/fastlib/text/normwordfolder.h>
+
+namespace vsm {
+
+/** Decodes UTF-8 input and builds normalized UCS-4 tokens in a caller-supplied buffer. */
+class TokenizeReader {
+public:
+    using byte = search::byte;
+    using Normalizing = search::streaming::Normalizing;
+    /// Reads from [p, p + len); q is the caller-owned output buffer (its capacity is not checked here).
+    TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept
+        : _p(p), _p_end(p + len), _q(q), _q_start(q)
+    {}
+    /// Decodes one UTF-8 character at the read position (GetUTF8Char presumably advances it - confirm).
+    ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
+    /// Writes c to the output according to normalize_mode: as is, lowercased, or folded via fold().
+    void normalize(ucs4_t c, Normalizing normalize_mode) {
+        if (normalize_mode == Normalizing::LOWERCASE_AND_FOLD) {
+            fold(c);
+        } else if (normalize_mode == Normalizing::LOWERCASE) {
+            *_q++ = Fast_NormalizeWordFolder::lowercase(c);
+        } else if (normalize_mode == Normalizing::NONE) {
+            *_q++ = c;
+        }
+    }
+    /// True while the read position is before the end of the input.
+    bool hasNext() const noexcept { return _p < _p_end; }
+    /// Current read position in the input.
+    const byte * p() const noexcept { return _p; }
+    /// Zero-terminates the token, rewinds the write position to the buffer start and returns the token length.
+    size_t complete() noexcept {
+        ucs4_t * const token_end = _q;
+        *token_end = 0;
+        _q = _q_start;
+        return token_end - _q_start;
+    }
+private:
+    void fold(ucs4_t c);
+    const byte *_p;       // read position in the input
+    const byte *_p_end;   // one past the last input byte
+    ucs4_t *_q;           // write position in the output buffer
+    ucs4_t *_q_start;     // start of the output buffer
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index fa1fc83728c..37dc4ffb99c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -1,5 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8strchrfieldsearcher.h"
+#include "tokenizereader.h"
using search::streaming::QueryTerm;
using search::streaming::QueryTermList;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index ce63f55ea63..d9ac47a3431 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8stringfieldsearcherbase.h"
+#include "tokenizereader.h"
#include <cassert>
using search::streaming::QueryTerm;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 115cddce619..b196f2795a4 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -2,7 +2,6 @@
#pragma once
#include "strchrfieldsearcher.h"
-#include <vespa/fastlib/text/normwordfolder.h>
namespace vsm {
@@ -61,62 +60,6 @@ public:
protected:
SharedSearcherBuf _buf;
- using byte = search::byte;
-
- class TokenizeReader {
- public:
- TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept
- : _p(p),
- _p_end(p + len),
- _q(q),
- _q_start(q)
- {}
- ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
- void normalize(ucs4_t c, Normalizing normalize_mode) {
- switch (normalize_mode) {
- case Normalizing::LOWERCASE:
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- [[fallthrough]];
- case Normalizing::NONE:
- *_q++ = c;
- break;
- case Normalizing::LOWERCASE_AND_FOLD:
- fold(c);
- break;
- }
- }
- bool hasNext() const noexcept { return _p < _p_end; }
- const byte * p() const noexcept { return _p; }
- size_t complete() noexcept {
- *_q = 0;
- size_t token_len = _q - _q_start;
- _q = _q_start;
- return token_len;
- }
- private:
- void fold(ucs4_t c) {
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- _q = Fast_UnicodeUtil::ucs4copy(_q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *_q++ = c;
- }
- }
- void lowercase(ucs4_t c) {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *_q++ = c;
- }
- const byte *_p;
- const byte *_p_end;
- ucs4_t *_q;
- ucs4_t *_q_start;
- };
-
-
template<typename Reader>
void tokenize(Reader & reader);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index 4318d5fe1a3..8bbacf168cf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -1,5 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8suffixstringfieldsearcher.h"
+#include "tokenizereader.h"
using search::byte;
using search::streaming::QueryTerm;