From 8a14af615bee86a178ea4838cc91d2079d9007aa Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Thu, 11 Jan 2024 13:49:52 +0000 Subject: Split out tokenizer and test it explicitly. --- .../src/tests/searcher/searcher_test.cpp | 21 ++++++++ .../src/vespa/vsm/searcher/CMakeLists.txt | 1 + .../src/vespa/vsm/searcher/tokenizereader.cpp | 21 ++++++++ .../src/vespa/vsm/searcher/tokenizereader.h | 50 +++++++++++++++++++ .../vespa/vsm/searcher/utf8strchrfieldsearcher.cpp | 1 + .../vsm/searcher/utf8stringfieldsearcherbase.cpp | 1 + .../vsm/searcher/utf8stringfieldsearcherbase.h | 57 ---------------------- .../vsm/searcher/utf8suffixstringfieldsearcher.cpp | 1 + 8 files changed, 96 insertions(+), 57 deletions(-) create mode 100644 streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp create mode 100644 streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h (limited to 'streamingvisitors') diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 74d8fdc4bf3..6ed9ee9dace 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include using namespace document; @@ -871,4 +872,24 @@ TEST("counting of words") { assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits())); } +vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization"; + +void +verifyNormalization(Normalizing normalizing, size_t expected_len, const char * expected) { + ucs4_t buf[256]; + TokenizeReader reader(reinterpret_cast(NormalizationInput.c_str()), NormalizationInput.size(), buf); + while (reader.hasNext()) { + reader.normalize(reader.next(), normalizing); + } + size_t len = reader.complete(); + EXPECT_EQUAL(expected_len, len); + EXPECT_EQUAL(0, Fast_UnicodeUtil::utf8cmp(expected, buf)); +} + +TEST("test normalizing") { +
verifyNormalization(Normalizing::NONE, 52, NormalizationInput.c_str()); + verifyNormalization(Normalizing::LOWERCASE, 52, "test that somehing happens with during nårmølization"); + verifyNormalization(Normalizing::LOWERCASE_AND_FOLD, 54, "test that somehing happens with during naarmoelization"); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt index 1a9238346b0..40aad418b22 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt +++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt @@ -17,6 +17,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT intfieldsearcher.cpp nearest_neighbor_field_searcher.cpp strchrfieldsearcher.cpp + tokenizereader.cpp utf8flexiblestringfieldsearcher.cpp utf8strchrfieldsearcher.cpp utf8stringfieldsearcherbase.cpp diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp new file mode 100644 index 00000000000..d8a6091fe11 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp @@ -0,0 +1,21 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokenizereader.h" + +namespace vsm { + +void +TokenizeReader::fold(ucs4_t c) { + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); + if (repl != nullptr) { + size_t repllen = strlen(repl); + if (repllen > 0) { + _q = Fast_UnicodeUtil::ucs4copy(_q,repl); + } + } else { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h new file mode 100644 index 00000000000..76ca2e8d24b --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h @@ -0,0 +1,50 @@ +// Copyright Vespa.ai. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include +#include +#include + +namespace vsm { + +class TokenizeReader { +public: + using byte = search::byte; + using Normalizing = search::streaming::Normalizing; + TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept + : _p(p), + _p_end(p + len), + _q(q), + _q_start(q) + {} + ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } + void normalize(ucs4_t c, Normalizing normalize_mode) { + switch (normalize_mode) { + case Normalizing::LOWERCASE: + c = Fast_NormalizeWordFolder::lowercase(c); + [[fallthrough]]; + case Normalizing::NONE: + *_q++ = c; + break; + case Normalizing::LOWERCASE_AND_FOLD: + fold(c); + break; + } + } + bool hasNext() const noexcept { return _p < _p_end; } + const byte * p() const noexcept { return _p; } + size_t complete() noexcept { + *_q = 0; + size_t token_len = _q - _q_start; + _q = _q_start; + return token_len; + } +private: + void fold(ucs4_t c); + const byte *_p; + const byte *_p_end; + ucs4_t *_q; + ucs4_t *_q_start; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index fa1fc83728c..37dc4ffb99c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "utf8strchrfieldsearcher.h" +#include "tokenizereader.h" using search::streaming::QueryTerm; using search::streaming::QueryTermList; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index ce63f55ea63..d9ac47a3431 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8stringfieldsearcherbase.h" +#include "tokenizereader.h" #include using search::streaming::QueryTerm; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index 115cddce619..b196f2795a4 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -2,7 +2,6 @@ #pragma once #include "strchrfieldsearcher.h" -#include namespace vsm { @@ -61,62 +60,6 @@ public: protected: SharedSearcherBuf _buf; - using byte = search::byte; - - class TokenizeReader { - public: - TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept - : _p(p), - _p_end(p + len), - _q(q), - _q_start(q) - {} - ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } - void normalize(ucs4_t c, Normalizing normalize_mode) { - switch (normalize_mode) { - case Normalizing::LOWERCASE: - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - [[fallthrough]]; - case Normalizing::NONE: - *_q++ = c; - break; - case Normalizing::LOWERCASE_AND_FOLD: - fold(c); - break; - } - } - bool hasNext() const noexcept { return _p < _p_end; } - const byte * p() const noexcept { return _p; } - size_t complete() noexcept { - *_q = 0; - size_t token_len = _q - _q_start; - _q = 
_q_start; - return token_len; - } - private: - void fold(ucs4_t c) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - _q = Fast_UnicodeUtil::ucs4copy(_q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *_q++ = c; - } - } - void lowercase(ucs4_t c) { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *_q++ = c; - } - const byte *_p; - const byte *_p_end; - ucs4_t *_q; - ucs4_t *_q_start; - }; - - template void tokenize(Reader & reader); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp index 4318d5fe1a3..8bbacf168cf 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8suffixstringfieldsearcher.h" +#include "tokenizereader.h" using search::byte; using search::streaming::QueryTerm; -- cgit v1.2.3