diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-03 10:03:12 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-05 08:29:15 +0000 |
commit | 192af4443cb572791c8f11520e8ebec4ee4e5a8e (patch) | |
tree | 755a603c0fe1b28116a24749f4f919ffee756c84 /searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp | |
parent | d8b50e4eaea708fed984c7c6ccdd06ac48b358bf (diff) |
- Fold query for streaming search based on either query item type or field definition.
- This ensures that query processing and document processing are symmetric for streaming search.
No longer rely on Java query processing being symmetric with the backend C++ variant.
- Indexed search does no normalization in the backend and uses the query as is.
Diffstat (limited to 'searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp')
-rw-r--r-- | searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp | 104 |
1 files changed, 89 insertions, 15 deletions
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index fe6f73367d7..3950a179d67 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "queryterm.h" +#include <vespa/fastlib/text/normwordfolder.h> #include <vespa/vespalib/objects/visit.h> #include <cmath> @@ -34,7 +35,7 @@ CharInfo::CharInfo() _charInfo[uint8_t('E')] = 0x05; } -CharInfo _G_charTable; +CharInfo G_charTable; } @@ -54,20 +55,93 @@ QueryTerm::visitMembers(vespalib::ObjectVisitor & visitor) const visit(visitor, "uniqueid", _uniqueId); } -QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, const string & termS, const string & indexS, Type type) : - QueryTermUCS4(termS, type), - _index(indexS), - _encoding(0x01), - _result(org.release()), - _hitList(), - _weight(100), - _uniqueId(0), - _fieldInfo() +namespace { + +using Type = QueryTermSimple::Type; + +Normalizing +requireFold(Type type, Normalizing normalizing) { + if (normalizing == Normalizing::NONE) return Normalizing::NONE; + if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; + if (type == Type::EXACTSTRINGTERM) return Normalizing::LOWERCASE; + return ((type == Type::WORD) || (type == Type::SUBSTRINGTERM) || + (type == Type::PREFIXTERM) || (type == Type::SUFFIXTERM)) + ? 
Normalizing::LOWERCASE_AND_FOLD + : Normalizing::NONE; +} + +vespalib::string +fold(vespalib::stringref s) { + const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++)); + } else { + c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4); + if (repl != nullptr) { + size_t repllen = strlen(repl); + folded.append(repl, repllen); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + } + return folded; +} + +vespalib::string +lowercase(vespalib::stringref s) { + const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++))); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + return folded; +} + +vespalib::string +optional_fold(vespalib::stringref s, Type type, Normalizing normalizing) { + switch ( requireFold(type, normalizing)) { + case Normalizing::NONE: return s; + case Normalizing::LOWERCASE: return lowercase(s); + case Normalizing::LOWERCASE_AND_FOLD: return fold(s); + } + return s; +} + +} + +QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, stringref termS, const string & indexS, + Type type, Normalizing normalizing) + : QueryTermUCS4(optional_fold(termS, type, normalizing), type), 
+ _index(indexS), + _encoding(0x01), + _result(org.release()), + _hitList(), + _weight(100), + _uniqueId(0), + _fieldInfo() { - if (!termS.empty()) { + if (!empty()) { uint8_t enc(0xff); - for (char c : termS) { - enc &= _G_charTable.get(c); + for (char c : getTermString()) { + enc &= G_charTable.get(c); } _encoding = EncodingBitMap(enc); } @@ -75,8 +149,8 @@ QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, const string & te void QueryTerm::getPhrases(QueryNodeRefList & tl) { (void) tl; } void QueryTerm::getPhrases(ConstQueryNodeRefList & tl) const { (void) tl; } -void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); } -void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); } +void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); } +void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); } bool QueryTerm::evaluate() const { return !_hitList.empty(); } void QueryTerm::reset() { _hitList.clear(); } const HitList & QueryTerm::evaluateHits(HitList &) const { return _hitList; } |