diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-11 13:49:52 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-11 13:49:52 +0000 |
commit | 8a14af615bee86a178ea4838cc91d2079d9007aa (patch) | |
tree | 27aca677675d33cebefec09367f580f61f31a54b /streamingvisitors/src | |
parent | b4b5bd584110601471abf51bc59f29752e295fca (diff) |
Split out tokenizer and test it explicitly.
Diffstat (limited to 'streamingvisitors/src')
8 files changed, 96 insertions, 57 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 74d8fdc4bf3..6ed9ee9dace 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -15,6 +15,7 @@ #include <vespa/vsm/searcher/utf8substringsearcher.h> #include <vespa/vsm/searcher/utf8substringsnippetmodifier.h> #include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h> +#include <vespa/vsm/searcher/tokenizereader.h> #include <vespa/vsm/vsm/snippetmodifier.h> using namespace document; @@ -871,4 +872,24 @@ TEST("counting of words") { assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits())); } +vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization"; + +void +verifyNormalization(Normalizing normalizing, size_t expected_len, const char * expected) { + ucs4_t buf[256]; + TokenizeReader reader(reinterpret_cast<const search::byte *>(NormalizationInput.c_str()), NormalizationInput.size(), buf); + while (reader.hasNext()) { + reader.normalize(reader.next(), normalizing); + } + size_t len = reader.complete(); + EXPECT_EQUAL(expected_len, len); + EXPECT_EQUAL(0, Fast_UnicodeUtil::utf8cmp(expected, buf)); +} + +TEST("test normalizing") { + verifyNormalization(Normalizing::NONE, 52, NormalizationInput.c_str()); + verifyNormalization(Normalizing::LOWERCASE, 52, "test that somehing happens with during nårmølization"); + verifyNormalization(Normalizing::LOWERCASE_AND_FOLD, 54, "test that somehing happens with during naarmoelization"); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt index 1a9238346b0..40aad418b22 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt +++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt @@ -17,6 +17,7 @@ 
vespa_add_library(vsm_vsmsearcher OBJECT intfieldsearcher.cpp nearest_neighbor_field_searcher.cpp strchrfieldsearcher.cpp + tokenizereader.cpp utf8flexiblestringfieldsearcher.cpp utf8strchrfieldsearcher.cpp utf8stringfieldsearcherbase.cpp diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp new file mode 100644 index 00000000000..d8a6091fe11 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp @@ -0,0 +1,21 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokenizereader.h" + +namespace vsm { + +void +TokenizeReader::fold(ucs4_t c) { + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); + if (repl != nullptr) { + size_t repllen = strlen(repl); + if (repllen > 0) { + _q = Fast_UnicodeUtil::ucs4copy(_q,repl); + } + } else { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h new file mode 100644 index 00000000000..76ca2e8d24b --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h @@ -0,0 +1,50 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include <vespa/searchlib/query/streaming/querynoderesultbase.h> +#include <vespa/searchlib/query/base.h> +#include <vespa/fastlib/text/normwordfolder.h> + +namespace vsm { + +class TokenizeReader { +public: + using byte = search::byte; + using Normalizing = search::streaming::Normalizing; + TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept + : _p(p), + _p_end(p + len), + _q(q), + _q_start(q) + {} + ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } + void normalize(ucs4_t c, Normalizing normalize_mode) { + switch (normalize_mode) { + case Normalizing::LOWERCASE: + c = Fast_NormalizeWordFolder::lowercase(c); + [[fallthrough]]; + case Normalizing::NONE: + *_q++ = c; + break; + case Normalizing::LOWERCASE_AND_FOLD: + fold(c); + break; + } + } + bool hasNext() const noexcept { return _p < _p_end; } + const byte * p() const noexcept { return _p; } + size_t complete() noexcept { + *_q = 0; + size_t token_len = _q - _q_start; + _q = _q_start; + return token_len; + } +private: + void fold(ucs4_t c); + const byte *_p; + const byte *_p_end; + ucs4_t *_q; + ucs4_t *_q_start; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index fa1fc83728c..37dc4ffb99c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "utf8strchrfieldsearcher.h" +#include "tokenizereader.h" using search::streaming::QueryTerm; using search::streaming::QueryTermList; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index ce63f55ea63..d9ac47a3431 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8stringfieldsearcherbase.h" +#include "tokenizereader.h" #include <cassert> using search::streaming::QueryTerm; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index 115cddce619..b196f2795a4 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -2,7 +2,6 @@ #pragma once #include "strchrfieldsearcher.h" -#include <vespa/fastlib/text/normwordfolder.h> namespace vsm { @@ -61,62 +60,6 @@ public: protected: SharedSearcherBuf _buf; - using byte = search::byte; - - class TokenizeReader { - public: - TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept - : _p(p), - _p_end(p + len), - _q(q), - _q_start(q) - {} - ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } - void normalize(ucs4_t c, Normalizing normalize_mode) { - switch (normalize_mode) { - case Normalizing::LOWERCASE: - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - [[fallthrough]]; - case Normalizing::NONE: - *_q++ = c; - break; - case Normalizing::LOWERCASE_AND_FOLD: - fold(c); - break; - } - } - bool hasNext() const noexcept { return _p < _p_end; } - const byte * p() const noexcept { return _p; } - size_t complete() noexcept { - *_q = 0; 
- size_t token_len = _q - _q_start; - _q = _q_start; - return token_len; - } - private: - void fold(ucs4_t c) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - _q = Fast_UnicodeUtil::ucs4copy(_q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *_q++ = c; - } - } - void lowercase(ucs4_t c) { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *_q++ = c; - } - const byte *_p; - const byte *_p_end; - ucs4_t *_q; - ucs4_t *_q_start; - }; - - template<typename Reader> void tokenize(Reader & reader); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp index 4318d5fe1a3..8bbacf168cf 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8suffixstringfieldsearcher.h" +#include "tokenizereader.h" using search::byte; using search::streaming::QueryTerm; |