From 796b4c88c5b990b9446e3166394d8248080bcb05 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 27 Mar 2024 12:29:36 +0100 Subject: Move UTF8StringFieldSearcherBase tokenize member function to TokenizeReader. Move anonymous normalize_mode function to a public static FieldSearchSpecMap::convert_normalize_mode member function. --- .../src/vespa/vsm/searcher/tokenizereader.cpp | 33 ++++++++++++++++++++++ .../src/vespa/vsm/searcher/tokenizereader.h | 4 +++ .../vespa/vsm/searcher/utf8strchrfieldsearcher.cpp | 3 +- .../vsm/searcher/utf8stringfieldsearcherbase.cpp | 21 ++------------ .../vsm/searcher/utf8stringfieldsearcherbase.h | 3 -- .../vsm/searcher/utf8suffixstringfieldsearcher.cpp | 3 +- .../src/vespa/vsm/vsm/fieldsearchspec.cpp | 9 +++--- .../src/vespa/vsm/vsm/fieldsearchspec.h | 1 + 8 files changed, 47 insertions(+), 30 deletions(-) diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp index d8a6091fe11..5988bdd912f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp @@ -4,6 +4,19 @@ namespace vsm { +namespace { + +template inline bool is_word_char(ucs4_t c); + +template <> +inline bool is_word_char(ucs4_t c) { return Fast_UnicodeUtil::IsWordChar(c); } + +// All characters are treated as word characters for exact match +template <> +inline constexpr bool is_word_char(ucs4_t) { return true; } + +} + void TokenizeReader::fold(ucs4_t c) { const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); @@ -18,4 +31,24 @@ TokenizeReader::fold(ucs4_t c) { } } +template +size_t +TokenizeReader::tokenize_helper(Normalizing norm_mode) +{ + ucs4_t c(0); + while (hasNext()) { + if (is_word_char(c = next())) { + normalize(c, norm_mode); + while (hasNext() && is_word_char(c = next())) { + normalize(c, norm_mode); + } + break; + } + } + return complete(); +} + +template size_t 
TokenizeReader::tokenize_helper(Normalizing); +template size_t TokenizeReader::tokenize_helper(Normalizing); + } diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h index 2bb5e62e0aa..f680d9b6c47 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h @@ -43,6 +43,10 @@ public: _q = _q_start; return token_len; } + template + size_t tokenize_helper(Normalizing norm_mode); + size_t tokenize(Normalizing norm_mode) { return tokenize_helper(norm_mode); } + size_t tokenize_exact_match(Normalizing norm_mode) { return tokenize_helper(norm_mode); } private: void fold(ucs4_t c); const byte *_p; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index 37dc4ffb99c..c860178d583 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -26,8 +26,7 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) TokenizeReader reader(reinterpret_cast (f.data()), f.size(), fn); while ( reader.hasNext() ) { - tokenize(reader); - size_t fl = reader.complete(); + size_t fl = reader.tokenize(normalize_mode()); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index 5036e9bedb1..f016d08ece8 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -10,21 +10,6 @@ using search::byte; namespace vsm { -template -void -UTF8StringFieldSearcherBase::tokenize(Reader & reader) { - ucs4_t c(0); - Normalizing norm_mode = 
normalize_mode(); - while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); - - if (Fast_UnicodeUtil::IsWordChar(c)) { - reader.normalize(c, norm_mode); - while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { - reader.normalize(c, norm_mode); - } - } -} - size_t UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) { @@ -38,8 +23,7 @@ UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt TokenizeReader reader(reinterpret_cast (f.data()), f.size(), fn); while ( reader.hasNext() ) { - tokenize(reader); - size_t fl = reader.complete(); + size_t fl = reader.tokenize(normalize_mode()); if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { const cmptype_t *tt=term, *et=term+tsz; for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); @@ -127,8 +111,7 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) TokenizeReader reader(reinterpret_cast (f.data()), f.size(), dstbuf); while ( reader.hasNext() ) { - tokenize(reader); - size_t tokenlen = reader.complete(); + size_t tokenlen = reader.tokenize(normalize_mode()); if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { addHit(qt, words); } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index b196f2795a4..c217a7b8866 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -60,9 +60,6 @@ public: protected: SharedSearcherBuf _buf; - template - void tokenize(Reader & reader); - /** * Matches the given query term against the words in the given field reference * using exact or prefix match strategy. 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp index 8bbacf168cf..d5bf4e4238a 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -26,8 +26,7 @@ UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) TokenizeReader reader(reinterpret_cast (f.data()), f.size(), dstbuf); while ( reader.hasNext() ) { - tokenize(reader); - size_t tokenlen = reader.complete(); + size_t tokenlen = reader.tokenize(normalize_mode()); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 1ab1b16cb86..1986db79972 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -273,8 +273,11 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch return ifm; } +} + search::Normalizing -normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { +FieldSearchSpecMap::convert_normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) +{ switch (normalize_mode) { case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::Normalizing::NONE; case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::Normalizing::LOWERCASE; @@ -283,8 +286,6 @@ normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { return search::Normalizing::LOWERCASE_AND_FOLD; } -} - void FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf, const search::fef::IIndexEnvironment& index_env) { @@ -292,7 +293,7 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf, const search:: for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) { 
LOG(spam, "Parsing %s", cfs.name.c_str()); FieldIdT fieldId = specMap().size(); - FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength); + FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, convert_normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength); _specMap[fieldId] = std::move(fss); _nameIdMap.add(cfs.name, fieldId); LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str()); diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h index 5b5a6b9a783..8bab0cad3b6 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h @@ -101,6 +101,7 @@ public: static vespalib::string stripNonFields(vespalib::stringref rawIndex); search::attribute::DistanceMetric get_distance_metric(const vespalib::string& name) const; + static search::Normalizing convert_normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode); private: FieldSearchSpecMapT _specMap; // mapping from field id to field search spec -- cgit v1.2.3