diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-12 08:29:43 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-12 08:29:43 +0000 |
commit | 679087c481fbb1aa02b21162f5a96e3c9ce56abc (patch) | |
tree | 28bf2f0f6b7b89dd4c8f4c3fd94fa4d65e2319ce /streamingvisitors/src/vespa/vsm/searcher | |
parent | 13d6a727e8c2c23d04e8bca980588abbd0424d69 (diff) |
Also handle different normalization during query time.
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/searcher')
3 files changed, 21 insertions, 14 deletions
```diff
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
index a7cb2300b74..70cef08428a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
@@ -7,6 +7,13 @@ using search::streaming::QueryTermList;

 namespace vsm {

+UTF8ExactStringFieldSearcher::UTF8ExactStringFieldSearcher(FieldIdT fId)
+    : UTF8StringFieldSearcherBase(fId)
+{
+    match_type(EXACT);
+    normalize_mode(Normalizing::LOWERCASE);
+}
+
 std::unique_ptr<FieldSearcher>
 UTF8ExactStringFieldSearcher::duplicate() const
 {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
index 7ed4c125573..48480fac684 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
@@ -17,11 +17,7 @@ protected:

 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    explicit UTF8ExactStringFieldSearcher(FieldIdT fId)
-        : UTF8StringFieldSearcherBase(fId)
-    {
-        match_type(EXACT);
-    }
+    explicit UTF8ExactStringFieldSearcher(FieldIdT fId);
 };

 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index d9ac47a3431..5036e9bedb1 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -55,22 +55,26 @@ UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt
 size_t
 UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt)
 {
-    const byte * n = reinterpret_cast<const byte *> (f.data());
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
     const cmptype_t * eterm = term+tsz;
-    const byte * e = n + f.size();
+    if ( f.size() >= _buf->size()) {
+        _buf->reserve(f.size() + 1);
+    }
+    cmptype_t * fn = _buf->data();
     if (tsz <= f.size()) {
         bool equal(true);
-        for (; equal && (n < e) && (term < eterm); term++) {
-            if (*term < 0x80) {
-                equal = (*term == Fast_NormalizeWordFolder::lowercase_ascii(*n++));
-            } else {
-                cmptype_t c = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
-                equal = (*term == c);
+        Normalizing norm_mode = normalize_mode();
+        TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+        while (equal && reader.hasNext() && (term < eterm)) {
+            reader.normalize(reader.next(), norm_mode);
+            size_t len = reader.complete();
+            for (size_t i(0); i < len; i++) {
+                equal = (term[i] == fn[i]);
             }
+            term += len;
         }
-        if (equal && (term == eterm) && (qt.isPrefix() || (n == e))) {
+        if (equal && (term == eterm) && (qt.isPrefix() || ! reader.hasNext())) {
             addHit(qt,0);
         }
     }
```