aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-01-03 10:03:12 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2024-01-05 08:29:15 +0000
commit192af4443cb572791c8f11520e8ebec4ee4e5a8e (patch)
tree755a603c0fe1b28116a24749f4f919ffee756c84 /searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
parentd8b50e4eaea708fed984c7c6ccdd06ac48b358bf (diff)
- Fold query for streaming search based on either query item type, or field definition.
- This ensures that query processing and document processing are symmetric for streaming search. It no longer relies on the Java query processing being symmetric with the backend C++ variant. - Indexed search does no normalization in the backend and uses the query as is.
Diffstat (limited to 'searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp')
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp104
1 files changed, 89 insertions, 15 deletions
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index fe6f73367d7..3950a179d67 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "queryterm.h"
+#include <vespa/fastlib/text/normwordfolder.h>
#include <vespa/vespalib/objects/visit.h>
#include <cmath>
@@ -34,7 +35,7 @@ CharInfo::CharInfo()
_charInfo[uint8_t('E')] = 0x05;
}
-CharInfo _G_charTable;
+CharInfo G_charTable;
}
@@ -54,20 +55,93 @@ QueryTerm::visitMembers(vespalib::ObjectVisitor & visitor) const
visit(visitor, "uniqueid", _uniqueId);
}
-QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, const string & termS, const string & indexS, Type type) :
- QueryTermUCS4(termS, type),
- _index(indexS),
- _encoding(0x01),
- _result(org.release()),
- _hitList(),
- _weight(100),
- _uniqueId(0),
- _fieldInfo()
+namespace {
+
+using Type = QueryTermSimple::Type;
+// Picks the normalization to apply to a term, given the term type and the requested normalizing mode.
+Normalizing
+requireFold(Type type, Normalizing normalizing) {
+ if (normalizing == Normalizing::NONE) return Normalizing::NONE; // NONE is passed through regardless of term type
+ if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; // LOWERCASE is passed through regardless of term type
+ if (type == Type::EXACTSTRINGTERM) return Normalizing::LOWERCASE; // exact-string terms are lowercased only, never folded
+ return ((type == Type::WORD) || (type == Type::SUBSTRINGTERM) ||
+ (type == Type::PREFIXTERM) || (type == Type::SUFFIXTERM))
+ ? Normalizing::LOWERCASE_AND_FOLD // word, substring, prefix and suffix terms get lowercase + fold
+ : Normalizing::NONE; // every other term type is left as is
+}
+// Returns a lowercased and folded copy of s, built character by character with Fast_NormalizeWordFolder.
+vespalib::string
+fold(vespalib::stringref s) {
+ const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); // walk the input as raw unsigned bytes
+ const unsigned char * end = curr + s.size();
+ vespalib::string folded;
+ for (; curr < end;) { // no increment here: curr is advanced inside the body
+ uint32_t c_ucs4 = *curr;
+ if (c_ucs4 < 0x80) { // single-byte character: handled by the ASCII variant, one byte consumed
+ folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++));
+ } else {
+ c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); // presumably decodes one code point and advances curr - confirm in fastlib
+ const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4);
+ if (repl != nullptr) { // a replacement string exists: append it in place of the character
+ size_t repllen = strlen(repl);
+ folded.append(repl, repllen);
+ } else { // no replacement string: lowercase/fold the code point and append it re-encoded
+ c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4);
+ char tmp[6]; // scratch buffer that utf8cput writes the encoded character into
+ const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4);
+ folded.append(tmp, tmp_end - tmp); // tmp_end - tmp is the number of bytes written
+ }
+ }
+ }
+ return folded;
+}
+// Returns a lowercased copy of s, built character by character with Fast_NormalizeWordFolder (no replacement-string step).
+vespalib::string
+lowercase(vespalib::stringref s) {
+ const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); // walk the input as raw unsigned bytes
+ const unsigned char * end = curr + s.size();
+ vespalib::string folded;
+ for (; curr < end;) { // no increment here: curr is advanced inside the body
+ uint32_t c_ucs4 = *curr;
+ if (c_ucs4 < 0x80) { // single-byte character: handled by the ASCII variant, one byte consumed
+ folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++)));
+ } else {
+ c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); // presumably GetUTF8CharNonAscii advances curr - confirm in fastlib
+ char tmp[6]; // scratch buffer that utf8cput writes the encoded character into
+ const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4);
+ folded.append(tmp, tmp_end - tmp); // tmp_end - tmp is the number of bytes written
+ }
+ }
+ return folded;
+}
+// Applies the normalization chosen by requireFold(type, normalizing) to s and returns the resulting string.
+vespalib::string
+optional_fold(vespalib::stringref s, Type type, Normalizing normalizing) {
+ switch ( requireFold(type, normalizing)) {
+ case Normalizing::NONE: return s; // term is returned unchanged
+ case Normalizing::LOWERCASE: return lowercase(s);
+ case Normalizing::LOWERCASE_AND_FOLD: return fold(s);
+ }
+ return s; // reached only if requireFold yields a value with no case above
+}
+
+}
+// Constructs a term for index indexS; termS is passed through optional_fold() before reaching the base class.
+QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, stringref termS, const string & indexS,
+ Type type, Normalizing normalizing)
+ : QueryTermUCS4(optional_fold(termS, type, normalizing), type), // base class receives the normalized term, not the raw termS
+ _index(indexS),
+ _encoding(0x01),
+ _result(org.release()), // stores the raw pointer released from org
+ _hitList(),
+ _weight(100),
+ _uniqueId(0),
+ _fieldInfo()
{
- if (!termS.empty()) {
+ if (!empty()) { // checks the constructed term rather than the raw termS argument
uint8_t enc(0xff);
- for (char c : termS) {
- enc &= _G_charTable.get(c);
+ for (char c : getTermString()) { // scans the string from getTermString() instead of termS
+ enc &= G_charTable.get(c); // AND together the table bits of every character
}
_encoding = EncodingBitMap(enc);
}
@@ -75,8 +149,8 @@ QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, const string & te
void QueryTerm::getPhrases(QueryNodeRefList & tl) { (void) tl; }
void QueryTerm::getPhrases(ConstQueryNodeRefList & tl) const { (void) tl; }
-void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); }
-void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); }
+void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); }
+void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); }
bool QueryTerm::evaluate() const { return !_hitList.empty(); }
void QueryTerm::reset() { _hitList.clear(); }
const HitList & QueryTerm::evaluateHits(HitList &) const { return _hitList; }