From 192af4443cb572791c8f11520e8ebec4ee4e5a8e Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Wed, 3 Jan 2024 10:03:12 +0000 Subject: - Fold query for streaming search based on either query item type, or field definition. - This ensures that query processing and document processing are symmetric for streaming search. No longer rely on Java query processing being symmetric with backend C++ variant. - Indexed search does no normalization in backend and uses query as is. --- searchlib/src/tests/query/streaming_query_test.cpp | 5 +- .../searchlib/query/streaming/dot_product_term.cpp | 2 +- .../vespa/searchlib/query/streaming/in_term.cpp | 5 +- .../src/vespa/searchlib/query/streaming/in_term.h | 3 +- .../vespa/searchlib/query/streaming/multi_term.cpp | 17 ++-- .../vespa/searchlib/query/streaming/multi_term.h | 7 +- .../streaming/nearest_neighbor_query_node.cpp | 2 +- .../vespa/searchlib/query/streaming/querynode.cpp | 39 ++++---- .../vespa/searchlib/query/streaming/querynode.h | 5 +- .../query/streaming/querynoderesultbase.h | 15 ++- .../vespa/searchlib/query/streaming/queryterm.cpp | 104 ++++++++++++++++++--- .../vespa/searchlib/query/streaming/queryterm.h | 10 +- .../src/tests/searcher/searcher_test.cpp | 19 ++-- .../src/vespa/searchvisitor/querytermdata.h | 8 +- .../src/vespa/searchvisitor/searchvisitor.cpp | 16 ++++ .../src/vespa/searchvisitor/searchvisitor.h | 1 + .../src/vespa/vsm/searcher/fieldsearcher.h | 2 +- .../vsm/searcher/utf8exactstringfieldsearcher.h | 11 ++- vespalib/src/vespa/fastlib/text/normwordfolder.cpp | 13 ++- vespalib/src/vespa/fastlib/text/normwordfolder.h | 4 +- 20 files changed, 205 insertions(+), 83 deletions(-) diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp index f5370785167..c4ef2028123 100644 --- a/searchlib/src/tests/query/streaming_query_test.cpp +++ b/searchlib/src/tests/query/streaming_query_test.cpp @@ -27,6 +27,7 @@ void assertHit(const Hit & h, size_t 
expWordpos, size_t expContext, int32_t weig EXPECT_EQ(h.weight(), weight); } + TEST(StreamingQueryTest, test_query_language) { QueryNodeResultFactory factory; @@ -297,7 +298,7 @@ class AllowRewrite : public QueryNodeResultFactory { public: explicit AllowRewrite(vespalib::stringref index) noexcept : _allowedIndex(index) {} - bool getRewriteFloatTerms(vespalib::stringref index) const noexcept override { return index == _allowedIndex; } + bool allow_float_terms_rewrite(vespalib::stringref index) const noexcept override { return index == _allowedIndex; } private: vespalib::string _allowedIndex; }; @@ -905,7 +906,7 @@ TEST(StreamingQueryTest, test_in_term) { auto term_vector = std::make_unique(1); term_vector->addTerm("7"); - search::streaming::InTerm term({}, "index", std::move(term_vector)); + search::streaming::InTerm term({}, "index", std::move(term_vector), Normalizing::NONE); SimpleTermData td; td.addField(10); td.addField(11); diff --git a/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp index d2c1ba872f5..9bb6d8c3342 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp @@ -11,7 +11,7 @@ using search::fef::MatchData; namespace search::streaming { DotProductTerm::DotProductTerm(std::unique_ptr result_base, const string & index, uint32_t num_terms) - : MultiTerm(std::move(result_base), index, Type::WORD, num_terms) + : MultiTerm(std::move(result_base), index, num_terms) { } diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp index 36303d4e991..3e75f4a5114 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp @@ -12,8 +12,9 @@ using search::query::TermVector; namespace search::streaming { -InTerm::InTerm(std::unique_ptr 
result_base, const string & index, std::unique_ptr terms) - : MultiTerm(std::move(result_base), index, Type::WORD, std::move(terms)) +InTerm::InTerm(std::unique_ptr result_base, const string & index, + std::unique_ptr terms, Normalizing normalize_mode) + : MultiTerm(std::move(result_base), index, std::move(terms), normalize_mode) { } diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.h b/searchlib/src/vespa/searchlib/query/streaming/in_term.h index 7d03ed989c7..7b388b3f6e6 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/in_term.h +++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.h @@ -11,7 +11,8 @@ namespace search::streaming { */ class InTerm : public MultiTerm { public: - InTerm(std::unique_ptr result_base, const string& index, std::unique_ptr terms); + InTerm(std::unique_ptr result_base, const string& index, + std::unique_ptr terms, Normalizing normalize_mode); ~InTerm() override; void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override; }; diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp index ad5857b8c41..dd34b9b7e73 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp @@ -9,19 +9,20 @@ using search::query::TermVector; namespace search::streaming { -MultiTerm::MultiTerm(std::unique_ptr result_base, const string & index, Type type, uint32_t num_terms) - : QueryTerm(std::move(result_base), "", index, type), +MultiTerm::MultiTerm(std::unique_ptr result_base, const string & index, uint32_t num_terms) + : QueryTerm(std::move(result_base), "", index, Type::WORD, Normalizing::NONE), _terms() { _terms.reserve(num_terms); } -MultiTerm::MultiTerm(std::unique_ptr result_base, const string & index, Type type, std::unique_ptr terms) - : MultiTerm(std::move(result_base), index, type, terms->size()) 
+MultiTerm::MultiTerm(std::unique_ptr result_base, const string & index, + std::unique_ptr terms, Normalizing normalizing) + : MultiTerm(std::move(result_base), index, terms->size()) { auto num_terms = terms->size(); for (uint32_t i = 0; i < num_terms; ++i) { - add_term(std::make_unique(std::unique_ptr(), terms->getAsString(i).first, "", QueryTermSimple::Type::WORD)); + add_term(std::make_unique(std::unique_ptr(), terms->getAsString(i).first, "", Type::WORD, normalizing)); } } @@ -33,12 +34,6 @@ MultiTerm::add_term(std::unique_ptr term) _terms.emplace_back(std::move(term)); } -MultiTerm* -MultiTerm::as_multi_term() noexcept -{ - return this; -} - void MultiTerm::reset() { diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h index 4c3f1ea5b5a..3bb69e29693 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h +++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h @@ -24,11 +24,12 @@ class MultiTerm : public QueryTerm { protected: std::vector> _terms; public: - MultiTerm(std::unique_ptr result_base, const string & index, Type type, uint32_t num_terms); - MultiTerm(std::unique_ptr result_base, const string & index, Type type, std::unique_ptr terms); + MultiTerm(std::unique_ptr result_base, const string & index, uint32_t num_terms); + MultiTerm(std::unique_ptr result_base, const string & index, + std::unique_ptr terms, Normalizing normalizing); ~MultiTerm() override; void add_term(std::unique_ptr term); - MultiTerm* as_multi_term() noexcept override; + MultiTerm* as_multi_term() noexcept override { return this; } void reset() override; bool evaluate() const override; const HitList& evaluateHits(HitList& hl) const override; diff --git a/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp b/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp index f710218297d..1317d1c0651 100644 --- 
a/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp @@ -9,7 +9,7 @@ NearestNeighborQueryNode::NearestNeighborQueryNode(std::unique_ptr(factory.create(), - queryRep.getTerm(), - queryRep.getIndexName(), - QueryTerm::Type::GEO_LOCATION); + qn = std::make_unique(factory.create(), queryRep.getTerm(), queryRep.getIndexName(), + QueryTerm::Type::GEO_LOCATION, Normalizing::NONE); break; case ParseItem::ITEM_NEAREST_NEIGHBOR: qn = build_nearest_neighbor_query_node(factory, queryRep); @@ -149,18 +147,19 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor // But it will do for now as only correct sddocname queries are sent down. qn = std::make_unique(); } else { - auto qt = std::make_unique(factory.create(), ssTerm, ssIndex, sTerm); + Normalizing normalize_mode = factory.normalizing_mode(ssIndex); + auto qt = std::make_unique(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode); qt->setWeight(queryRep.GetWeight()); qt->setUniqueId(queryRep.getUniqueId()); if (qt->isFuzzy()) { qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance()); qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength()); } - if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.getRewriteFloatTerms(ssIndex)) { + if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.allow_float_terms_rewrite(ssIndex)) { auto phrase = std::make_unique(); auto dotPos = ssTerm.find('.'); - phrase->addChild(std::make_unique(factory.create(), ssTerm.substr(0, dotPos), ssIndex, TermType::WORD)); - phrase->addChild(std::make_unique(factory.create(), ssTerm.substr(dotPos + 1), ssIndex, TermType::WORD)); + phrase->addChild(std::make_unique(factory.create(), ssTerm.substr(0, dotPos), ssIndex, TermType::WORD, normalize_mode)); + phrase->addChild(std::make_unique(factory.create(), ssTerm.substr(dotPos + 1), ssIndex, TermType::WORD, normalize_mode)); auto orqn = 
std::make_unique(); orqn->addChild(std::move(qt)); orqn->addChild(std::move(phrase)); @@ -183,8 +182,11 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor } break; case ParseItem::ITEM_STRING_IN: + qn = std::make_unique(factory.create(), queryRep.getIndexName(), queryRep.get_terms(), + factory.normalizing_mode(queryRep.getIndexName())); + break; case ParseItem::ITEM_NUMERIC_IN: - qn = std::make_unique(factory.create(), queryRep.getIndexName(), queryRep.get_terms()); + qn = std::make_unique(factory.create(), queryRep.getIndexName(), queryRep.get_terms(), Normalizing::NONE); break; case ParseItem::ITEM_DOT_PRODUCT: qn = build_dot_product_term(factory, queryRep); @@ -210,17 +212,12 @@ QueryNode::build_nearest_neighbor_query_node(const QueryNodeResultFactory& facto auto weight = query_rep.GetWeight(); uint32_t target_hits = query_rep.getTargetHits(); double distance_threshold = query_rep.getDistanceThreshold(); - return std::make_unique(factory.create(), - query_tensor_name, - field_name, - target_hits, - distance_threshold, - unique_id, - weight); + return std::make_unique(factory.create(), query_tensor_name, field_name, + target_hits, distance_threshold, unique_id, weight); } void -QueryNode::populate_multi_term(MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep) +QueryNode::populate_multi_term(Normalizing string_normalize_mode, MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep) { char buf[24]; vespalib::string subterm; @@ -229,13 +226,15 @@ QueryNode::populate_multi_term(MultiTerm& mt, SimpleQueryStackDumpIterator& quer std::unique_ptr term; switch (queryRep.getType()) { case ParseItem::ITEM_PURE_WEIGHTED_STRING: - term = std::make_unique(std::unique_ptr(), queryRep.getTerm(), "", QueryTermSimple::Type::WORD); + term = std::make_unique(std::unique_ptr(), queryRep.getTerm(), "", + QueryTermSimple::Type::WORD, string_normalize_mode); break; case ParseItem::ITEM_PURE_WEIGHTED_LONG: { auto res = std::to_chars(buf, buf + 
sizeof(buf), queryRep.getIntergerTerm(), 10); subterm.assign(buf, res.ptr - buf); - term = std::make_unique(std::unique_ptr(), subterm, "", QueryTermSimple::Type::WORD); + term = std::make_unique(std::unique_ptr(), subterm, "", + QueryTermSimple::Type::WORD, Normalizing::NONE); } break; default: @@ -255,7 +254,7 @@ QueryNode::build_dot_product_term(const QueryNodeResultFactory& factory, SimpleQ auto dp =std::make_unique(factory.create(), queryRep.getIndexName(), queryRep.getArity()); dp->setWeight(queryRep.GetWeight()); dp->setUniqueId(queryRep.getUniqueId()); - populate_multi_term(*dp, queryRep); + populate_multi_term(factory.normalizing_mode(dp->index()), *dp, queryRep); return dp; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.h b/searchlib/src/vespa/searchlib/query/streaming/querynode.h index bfc840e4603..576d614e58b 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.h +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.h @@ -2,8 +2,7 @@ #pragma once #include "hit.h" -#include -#include +#include "querynoderesultbase.h" namespace search { class SimpleQueryStackDumpIterator; } @@ -30,7 +29,7 @@ using ConstQueryTermList = std::vector; class QueryNode { static std::unique_ptr build_nearest_neighbor_query_node(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep); - static void populate_multi_term(MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep); + static void populate_multi_term(Normalizing string_normalize_mode, MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep); static std::unique_ptr build_dot_product_term(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep); static void skip_unknown(SimpleQueryStackDumpIterator& queryRep); public: diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h index d7704fb60e1..74f872ad187 100644 --- 
a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h +++ b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h @@ -18,10 +18,23 @@ public: virtual QueryNodeResultBase * clone() const = 0; }; +enum class Normalizing { + NONE, + LOWERCASE, + LOWERCASE_AND_FOLD +}; + class QueryNodeResultFactory { public: virtual ~QueryNodeResultFactory() = default; - virtual bool getRewriteFloatTerms(vespalib::stringref index) const noexcept { (void) index; return false; } + virtual bool allow_float_terms_rewrite(vespalib::stringref index) const noexcept { + (void) index; + return false; + } + virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept { + (void) index; + return Normalizing::NONE; + } virtual std::unique_ptr create() const { return {}; } }; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index fe6f73367d7..3950a179d67 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "queryterm.h" +#include #include #include @@ -34,7 +35,7 @@ CharInfo::CharInfo() _charInfo[uint8_t('E')] = 0x05; } -CharInfo _G_charTable; +CharInfo G_charTable; } @@ -54,20 +55,93 @@ QueryTerm::visitMembers(vespalib::ObjectVisitor & visitor) const visit(visitor, "uniqueid", _uniqueId); } -QueryTerm::QueryTerm(std::unique_ptr org, const string & termS, const string & indexS, Type type) : - QueryTermUCS4(termS, type), - _index(indexS), - _encoding(0x01), - _result(org.release()), - _hitList(), - _weight(100), - _uniqueId(0), - _fieldInfo() +namespace { + +using Type = QueryTermSimple::Type; + +Normalizing +requireFold(Type type, Normalizing normalizing) { + if (normalizing == Normalizing::NONE) return Normalizing::NONE; + if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; + if (type == Type::EXACTSTRINGTERM) return Normalizing::LOWERCASE; + return ((type == Type::WORD) || (type == Type::SUBSTRINGTERM) || + (type == Type::PREFIXTERM) || (type == Type::SUFFIXTERM)) + ? 
Normalizing::LOWERCASE_AND_FOLD + : Normalizing::NONE; +} + +vespalib::string +fold(vespalib::stringref s) { + const auto * curr = reinterpret_cast(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++)); + } else { + c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4); + if (repl != nullptr) { + size_t repllen = strlen(repl); + folded.append(repl, repllen); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + } + return folded; +} + +vespalib::string +lowercase(vespalib::stringref s) { + const auto * curr = reinterpret_cast(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(static_cast(Fast_NormalizeWordFolder::lowercase_ascii(*curr++))); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + return folded; +} + +vespalib::string +optional_fold(vespalib::stringref s, Type type, Normalizing normalizing) { + switch ( requireFold(type, normalizing)) { + case Normalizing::NONE: return s; + case Normalizing::LOWERCASE: return lowercase(s); + case Normalizing::LOWERCASE_AND_FOLD: return fold(s); + } + return s; +} + +} + +QueryTerm::QueryTerm(std::unique_ptr org, stringref termS, const string & indexS, + Type type, Normalizing normalizing) + : QueryTermUCS4(optional_fold(termS, type, normalizing), type), + _index(indexS), + _encoding(0x01), + _result(org.release()), + 
_hitList(), + _weight(100), + _uniqueId(0), + _fieldInfo() { - if (!termS.empty()) { + if (!empty()) { uint8_t enc(0xff); - for (char c : termS) { - enc &= _G_charTable.get(c); + for (char c : getTermString()) { + enc &= G_charTable.get(c); } _encoding = EncodingBitMap(enc); } @@ -75,8 +149,8 @@ QueryTerm::QueryTerm(std::unique_ptr org, const string & te void QueryTerm::getPhrases(QueryNodeRefList & tl) { (void) tl; } void QueryTerm::getPhrases(ConstQueryNodeRefList & tl) const { (void) tl; } -void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); } -void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); } +void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); } +void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); } bool QueryTerm::evaluate() const { return !_hitList.empty(); } void QueryTerm::reset() { _hitList.clear(); } const HitList & QueryTerm::evaluateHits(HitList &) const { return _hitList; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 2d1156a9c51..743998a630e 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -31,9 +31,6 @@ public: bool isFloat() const { return _enc & Float; } bool isBase10Integer() const { return _enc & Base10Integer; } bool isAscii7Bit() const { return _enc & Ascii7Bit; } - void setBase10Integer(bool v) { if (v) _enc |= Base10Integer; else _enc &= ~Base10Integer; } - void setAscii7Bit(bool v) { if (v) _enc |= Ascii7Bit; else _enc &= ~Ascii7Bit; } - void setFloat(bool v) { if (v) _enc |= Float; else _enc &= ~Float; } private: enum { Ascii7Bit=0x01, Base10Integer=0x02, Float=0x04 }; uint8_t _enc; @@ -54,7 +51,12 @@ public: uint32_t _hitCount; uint32_t _fieldLength; }; - QueryTerm(std::unique_ptr resultBase, const string & term, const string & index, Type type); + 
QueryTerm(std::unique_ptr resultBase, stringref term, const string & index, Type type) + : QueryTerm(std::move(resultBase), term, index, type, (type == Type::EXACTSTRINGTERM) + ? Normalizing::LOWERCASE + : Normalizing::LOWERCASE_AND_FOLD) + {} + QueryTerm(std::unique_ptr resultBase, stringref term, const string & index, Type type, Normalizing normalizing); QueryTerm(const QueryTerm &) = delete; QueryTerm & operator = (const QueryTerm &) = delete; QueryTerm(QueryTerm &&) = delete; diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 1ce285c2103..83b84fffa11 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -21,6 +21,7 @@ using namespace document; using search::streaming::HitList; using search::streaming::QueryNodeResultFactory; using search::streaming::QueryTerm; +using search::streaming::Normalizing; using search::streaming::QueryTermList; using TermType = QueryTerm::Type; using namespace vsm; @@ -56,11 +57,11 @@ public: class Query { private: - void setupQuery(const StringList & terms) { + void setupQuery(const StringList & terms, Normalizing normalizing) { for (const auto & term : terms) { ParsedQueryTerm pqt = parseQueryTerm(term); ParsedTerm pt = parseTerm(pqt.second); - qtv.push_back(std::make_unique(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second)); + qtv.push_back(std::make_unique(eqnr.create(), pt.first, pqt.first.empty() ? 
"index" : pqt.first, pt.second, normalizing)); } for (const auto & i : qtv) { qtl.push_back(i.get()); @@ -72,7 +73,9 @@ public: QueryNodeResultFactory eqnr; std::vector qtv; QueryTermList qtl; - explicit Query(const StringList & terms); + + explicit Query(const StringList & terms) : Query(terms, Normalizing::LOWERCASE_AND_FOLD) {} + Query(const StringList & terms, Normalizing normalizing); ~Query(); static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) { size_t i = queryTerm.find(':'); @@ -94,8 +97,8 @@ public: } }; -Query::Query(const StringList & terms) : eqnr(), qtv(), qtl() { - setupQuery(terms); +Query::Query(const StringList & terms, Normalizing normalizing) : eqnr(), qtv(), qtl() { + setupQuery(terms, normalizing); } Query::~Query() = default; @@ -286,8 +289,8 @@ bool assertMatchTermSuffix(const std::string & term, const std::string & word) { QueryNodeResultFactory eqnr; - QueryTerm qa(eqnr.create(), term, "index", TermType::WORD); - QueryTerm qb(eqnr.create(), word, "index", TermType::WORD); + QueryTerm qa(eqnr.create(), term, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD); + QueryTerm qb(eqnr.create(), word, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD); const ucs4_t * a; size_t alen = qa.term(a); const ucs4_t * b; @@ -308,7 +311,7 @@ assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & f std::vector performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv) { - Query q(query); + Query q(query, fs.exact() ? 
Normalizing::LOWERCASE : Normalizing::LOWERCASE_AND_FOLD); // prepare field searcher test::MockFieldSearcherEnv env; diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h index 36176f70d1d..38d0e942fbc 100644 --- a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h +++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h @@ -22,17 +22,23 @@ public: class SearchMethodInfo { public: + using Normalizing = search::streaming::Normalizing; virtual ~SearchMethodInfo() = default; virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0; + virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0; }; class QueryTermDataFactory final : public search::streaming::QueryNodeResultFactory { public: + using Normalizing = search::streaming::Normalizing; QueryTermDataFactory(const SearchMethodInfo * searchMethodInfo) noexcept : _searchMethodInfo(searchMethodInfo) {} std::unique_ptr create() const override { return std::make_unique(); } - bool getRewriteFloatTerms(vespalib::stringref index ) const noexcept override { + Normalizing normalizing_mode(vespalib::stringref index) const noexcept override { + return _searchMethodInfo ? 
_searchMethodInfo->normalizing_mode(index) : Normalizing::LOWERCASE_AND_FOLD; + } + bool allow_float_terms_rewrite(vespalib::stringref index ) const noexcept override { return _searchMethodInfo && _searchMethodInfo->is_text_matching(index); } private: diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp index 49604135afc..4161adaf21f 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -326,6 +326,22 @@ SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept { return false; } +SearchMethodInfo::Normalizing +SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept { + StringFieldIdTMap fieldIdMap; + _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap); + size_t num_exact = 0; + for (const auto & fieldId : fieldIdMap.map()) { + auto found = _fieldSearchSpecMap.specMap().find(fieldId.second); + if ((found != _fieldSearchSpecMap.specMap().end()) && found->second.searcher().exact()) { + num_exact++; + } + } + return ((num_exact == 0) || (num_exact != fieldIdMap.map().size())) + ? 
Normalizing::LOWERCASE_AND_FOLD + : Normalizing::LOWERCASE; +} + void SearchVisitor::init(const Parameters & params) { diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h index 709564bcf02..ce40b5ba742 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h @@ -487,6 +487,7 @@ private: void setupAttributeVector(const vsm::FieldPath &fieldPath); bool is_text_matching(vespalib::stringref index) const noexcept override; + Normalizing normalizing_mode(vespalib::stringref index) const noexcept override; }; class SearchVisitorFactory : public storage::VisitorFactory { diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index 43443bd9cf4..e64c41f814f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -77,7 +77,7 @@ private: void onStructStart(const Content & c) override; public: - explicit IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {} + explicit IteratorHandler(FieldSearcher & searcher) noexcept : _searcher(searcher) {} }; friend class IteratorHandler; // to allow calls to onValue(); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h index 997bed74787..dd6f31581a0 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h @@ -12,13 +12,16 @@ namespace vsm class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase { protected: - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t 
matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr duplicate() const override; - UTF8ExactStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + UTF8ExactStringFieldSearcher(FieldIdT fId) + : UTF8StringFieldSearcherBase(fId) + { + setMatchType(EXACT); + } }; } diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp index 8d3ccad9900..97b4b5aabb7 100644 --- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp +++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp @@ -5,7 +5,7 @@ #include bool Fast_NormalizeWordFolder::_isInitialized = false; -std::mutex _initMutex; + bool Fast_NormalizeWordFolder::_doAccentRemoval = false; bool Fast_NormalizeWordFolder::_doSharpSSubstitution = false; bool Fast_NormalizeWordFolder::_doLigatureSubstitution = false; @@ -19,12 +19,19 @@ ucs4_t Fast_NormalizeWordFolder::_lowerCaseHighAscii[256]; ucs4_t Fast_NormalizeWordFolder::_kanaMap[192]; ucs4_t Fast_NormalizeWordFolder::_halfwidth_fullwidthMap[240]; +namespace { + +std::mutex G_initMutex; +Fast_NormalizeWordFolder G_forceWorldFolderInit; +} + + void Fast_NormalizeWordFolder::Setup(uint32_t flags) { // Only allow setting these when not initialized or initializing... 
{ - std::lock_guard initGuard(_initMutex); + std::lock_guard initGuard(G_initMutex); _doAccentRemoval = (DO_ACCENT_REMOVAL & flags) != 0; _doSharpSSubstitution = (DO_SHARP_S_SUBSTITUTION & flags) != 0; _doLigatureSubstitution = (DO_LIGATURE_SUBSTITUTION & flags) != 0; @@ -39,7 +46,7 @@ Fast_NormalizeWordFolder::Initialize() { unsigned int i; if (!_isInitialized) { - std::lock_guard initGuard(_initMutex); + std::lock_guard initGuard(G_initMutex); if (!_isInitialized) { for (i = 0; i < 128; i++) diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h index 121a83e260d..5a77fe73e01 100644 --- a/vespalib/src/vespa/fastlib/text/normwordfolder.h +++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h @@ -35,8 +35,8 @@ public: * added together. */ static void Setup(uint32_t flags); - static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _lowerCase[c]; } - static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _foldCase[c]; } + static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _foldCase[c]; } + static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _lowerCase[c]; } static bool is_wordchar_ascii7bit(ucs4_t c) noexcept { return _isWord[c]; } static ucs4_t lowercase(ucs4_t c) { if (c < 767) -- cgit v1.2.3