diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-02-06 13:05:28 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-06 13:05:28 +0100 |
commit | 6ddd7ffa0f929ff528c0d81a54830693e62f07c7 (patch) | |
tree | f87cab146098b9469d80d2ad583eef8dcfcd7665 /searchlib | |
parent | fcca2419ded121767da85b563efb7a634533bae1 (diff) | |
parent | 1bb00080f59ad614e63dd32cd2de5a5255626384 (diff) |
Merge pull request #30185 from vespa-engine/balder/symmetric-query-processing-docsum-time
Balder/symmetric query processing docsum time
Diffstat (limited to 'searchlib')
10 files changed, 182 insertions, 158 deletions
diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h index 89996515a4a..5e3b1dffe3a 100644 --- a/searchlib/src/vespa/searchlib/parsequery/parse.h +++ b/searchlib/src/vespa/searchlib/parsequery/parse.h @@ -4,6 +4,7 @@ #include "item_creator.h" #include <vespa/searchlib/query/weight.h> +#include <vespa/searchlib/query/query_normalization.h> #include <vespa/vespalib/stllike/string.h> namespace search { @@ -89,19 +90,37 @@ public: }; /** Extra information on each item (creator id) coded in bit 3 of flags */ - static inline ItemCreator GetCreator(uint8_t flags) { return static_cast<ItemCreator>((flags >> 3) & 0x01); } + static inline ItemCreator GetCreator(uint8_t flags) { + return static_cast<ItemCreator>((flags >> 3) & 0x01); + } - static inline bool GetFeature(uint8_t type, uint8_t feature) - { return ((type & feature) != 0); } + static inline bool GetFeature(uint8_t type, uint8_t feature) { + return ((type & feature) != 0); + } - static inline bool GetFeature_Weight(uint8_t type) - { return GetFeature(type, IF_WEIGHT); } + static inline bool GetFeature_Weight(uint8_t type) { + return GetFeature(type, IF_WEIGHT); + } - static inline bool getFeature_UniqueId(uint8_t type) - { return GetFeature(type, IF_UNIQUEID); } - - static inline bool getFeature_Flags(uint8_t type) - { return GetFeature(type, IF_FLAGS); } + static inline bool getFeature_UniqueId(uint8_t type) { + return GetFeature(type, IF_UNIQUEID); + } + static inline bool getFeature_Flags(uint8_t type) { + return GetFeature(type, IF_FLAGS); + } + static TermType toTermType(ItemType itemType) noexcept { + switch (itemType) { + case ParseItem::ITEM_REGEXP: return TermType::REGEXP; + case ParseItem::ITEM_PREFIXTERM: return TermType::PREFIXTERM; + case ParseItem::ITEM_SUBSTRINGTERM: return TermType::SUBSTRINGTERM; + case ParseItem::ITEM_EXACTSTRINGTERM: return TermType::EXACTSTRINGTERM; + case ParseItem::ITEM_SUFFIXTERM: return TermType::SUFFIXTERM; + case ParseItem::ITEM_FUZZY: return TermType::FUZZYTERM; + case ParseItem::ITEM_GEO_LOCATION_TERM: return TermType::GEO_LOCATION; + case ParseItem::ITEM_NEAREST_NEIGHBOR: return TermType::NEAREST_NEIGHBOR; + default: return TermType::WORD; + } + } }; } // namespace search diff --git a/searchlib/src/vespa/searchlib/query/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/CMakeLists.txt index 29e9c02e6f2..1c47c910cb4 100644 --- a/searchlib/src/vespa/searchlib/query/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/query/CMakeLists.txt @@ -4,5 +4,6 @@ vespa_add_library(searchlib_query OBJECT query_term_simple.cpp query_term_ucs4.cpp query_term_decoder.cpp + query_normalization.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/query/query_normalization.cpp b/searchlib/src/vespa/searchlib/query/query_normalization.cpp new file mode 100644 index 00000000000..64e1e0ed496 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/query_normalization.cpp @@ -0,0 +1,98 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "query_normalization.h" +#include <vespa/fastlib/text/normwordfolder.h> +#include <ostream> + +namespace search { + +namespace { + +const char * +to_str(search::Normalizing norm) noexcept { + switch (norm) { + case search::Normalizing::NONE: + return "NONE"; + case search::Normalizing::LOWERCASE: + return "LOWERCASE"; + case search::Normalizing::LOWERCASE_AND_FOLD: + return "LOWERCASE_AND_FOLD"; + } + abort(); +} + +Normalizing +requireFold(TermType type, Normalizing normalizing) { + if (normalizing == Normalizing::NONE) return Normalizing::NONE; + if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; + if (type == TermType::EXACTSTRINGTERM) return Normalizing::LOWERCASE; + return ((type == TermType::WORD) || (type == TermType::SUBSTRINGTERM) || + (type == TermType::PREFIXTERM) || (type == TermType::SUFFIXTERM)) + ? Normalizing::LOWERCASE_AND_FOLD + : Normalizing::NONE; +} + +vespalib::string +fold(vespalib::stringref s) { + const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++)); + } else { + c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4); + if (repl != nullptr) { + size_t repllen = strlen(repl); + folded.append(repl, repllen); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + } + return folded; +} + +vespalib::string +lowercase(vespalib::stringref s) { + const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++))); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + return folded; +} + +} + +std::ostream & +operator<<(std::ostream &os, Normalizing n) { + os << to_str(n); + return os; +} + +vespalib::string +QueryNormalization::optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing) { + switch ( requireFold(type, normalizing)) { + case Normalizing::NONE: return s; + case Normalizing::LOWERCASE: return lowercase(s); + case Normalizing::LOWERCASE_AND_FOLD: return fold(s); + } + return s; +} + +} diff --git a/searchlib/src/vespa/searchlib/query/query_normalization.h b/searchlib/src/vespa/searchlib/query/query_normalization.h new file mode 100644 index 00000000000..6a8350d1791 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/query_normalization.h @@ -0,0 +1,42 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vespalib/stllike/string.h> +#include <iosfwd> + +namespace search { + +enum class Normalizing : uint8_t { + NONE, + LOWERCASE, + LOWERCASE_AND_FOLD +}; + +enum class TermType : uint8_t { + WORD = 0, + PREFIXTERM = 1, + SUBSTRINGTERM = 2, + EXACTSTRINGTERM = 3, + SUFFIXTERM = 4, + REGEXP = 5, + GEO_LOCATION = 6, + FUZZYTERM = 7, + NEAREST_NEIGHBOR = 8 +}; + +std::ostream &operator<<(std::ostream &, Normalizing); + +/** + * Resolves what kind of normalization that is needed for the query terms in context + * of the fields searched. It also provides a utility method for doing the normalization. + */ +class QueryNormalization { +public: + using Normalizing = search::Normalizing; + virtual ~QueryNormalization() = default; + virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0; + virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0; + static vespalib::string optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing); +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp index ab3bd512d1d..060cd5015b3 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp @@ -215,21 +215,24 @@ QueryTermSimple::getRange() const noexcept return getIntegerRange<int64_t>(); } -bool QueryTermSimple::getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept +bool +QueryTermSimple::getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept { lower = std::numeric_limits<int64_t>::min(); upper = std::numeric_limits<int64_t>::max(); return getAsNumericTerm(lower, upper, IntDecoder()); } -bool QueryTermSimple::getAsFloatTerm(double & lower, double & upper) const noexcept +bool +QueryTermSimple::getAsFloatTerm(double & lower, double & upper) const noexcept { lower = -std::numeric_limits<double>::infinity(); upper = std::numeric_limits<double>::infinity(); return getAsNumericTerm(lower, upper, FloatDecoder<double>()); } -bool QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcept +bool +QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcept { lower = -std::numeric_limits<float>::infinity(); upper = std::numeric_limits<float>::infinity(); @@ -238,12 +241,6 @@ bool QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcep QueryTermSimple::~QueryTermSimple() = default; -namespace { - - - -} - QueryTermSimple::QueryTermSimple(const string & term_, Type type) : _rangeLimit(0), _maxPerGroup(0), diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index 87bf7c26b80..a740afb0340 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once +#include "query_normalization.h" #include <vespa/vespalib/objects/objectvisitor.h> #include <vespa/vespalib/stllike/string.h> #include <vespa/vespalib/util/memory.h> @@ -15,17 +16,7 @@ public: using UP = std::unique_ptr<QueryTermSimple>; using string = vespalib::string; using stringref = vespalib::stringref; - enum class Type : uint8_t { - WORD = 0, - PREFIXTERM = 1, - SUBSTRINGTERM = 2, - EXACTSTRINGTERM = 3, - SUFFIXTERM = 4, - REGEXP = 5, - GEO_LOCATION = 6, - FUZZYTERM = 7, - NEAREST_NEIGHBOR = 8 - }; + using Type = TermType; template <typename N> struct RangeResult { diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 69fe77d3fd5..a0abdcd28fb 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -84,7 +84,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_GEO_LOCATION_TERM: // just keep the string representation here; parsed in vsm::GeoPosFieldSearcher qn = std::make_unique<QueryTerm>(factory.create(), queryRep.getTerm(), queryRep.getIndexName(), - QueryTerm::Type::GEO_LOCATION, Normalizing::NONE); + TermType::GEO_LOCATION, Normalizing::NONE); break; case ParseItem::ITEM_NEAREST_NEIGHBOR: qn = build_nearest_neighbor_query_node(factory, queryRep); @@ -111,30 +111,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor if (dynamic_cast<const SameElementQueryNode *>(parent) != nullptr) { index = parent->getIndex() + "." + index; } - using TermType = QueryTerm::Type; - TermType sTerm(TermType::WORD); - switch (type) { - case ParseItem::ITEM_REGEXP: - sTerm = TermType::REGEXP; - break; - case ParseItem::ITEM_PREFIXTERM: - sTerm = TermType::PREFIXTERM; - break; - case ParseItem::ITEM_SUBSTRINGTERM: - sTerm = TermType::SUBSTRINGTERM; - break; - case ParseItem::ITEM_EXACTSTRINGTERM: - sTerm = TermType::EXACTSTRINGTERM; - break; - case ParseItem::ITEM_SUFFIXTERM: - sTerm = TermType::SUFFIXTERM; - break; - case ParseItem::ITEM_FUZZY: - sTerm = TermType::FUZZYTERM; - break; - default: - break; - } + TermType sTerm = ParseItem::toTermType(type); QueryTerm::string ssTerm; if (type == ParseItem::ITEM_PURE_WEIGHTED_LONG) { char buf[24]; diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.cpp index af8ce7c9994..ccfd187441d 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.cpp @@ -4,22 +4,4 @@ namespace search::streaming { -namespace { - -const char* to_str(Normalizing norm) noexcept { - switch (norm) { - case Normalizing::NONE: return "NONE"; - case Normalizing::LOWERCASE: return "LOWERCASE"; - case Normalizing::LOWERCASE_AND_FOLD: return "LOWERCASE_AND_FOLD"; - } - abort(); -} - -} - -std::ostream& operator<<(std::ostream& os, Normalizing n) { - os << to_str(n); - return os; -} - } diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h index 83fb27794a3..4097250f67e 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h +++ b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h @@ -1,8 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once -#include <vespa/vespalib/stllike/string.h> -#include <iosfwd> +#include <vespa/searchlib/query/query_normalization.h> #include <memory> namespace search::streaming { @@ -19,14 +18,6 @@ public: virtual QueryNodeResultBase * clone() const = 0; }; -enum class Normalizing { - NONE, - LOWERCASE, - LOWERCASE_AND_FOLD -}; - -std::ostream& operator<<(std::ostream&, Normalizing); - class QueryNodeResultFactory { public: virtual ~QueryNodeResultFactory() = default; diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index e5e1473dd3c..dbaeaa5d895 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -1,12 +1,10 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "queryterm.h" -#include <vespa/fastlib/text/normwordfolder.h> #include <vespa/searchlib/fef/itermdata.h> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/vespalib/objects/visit.h> #include <algorithm> -#include <cmath> #include <limits> #include <vespa/log/log.h> @@ -68,81 +66,9 @@ QueryTerm::visitMembers(vespalib::ObjectVisitor & visitor) const visit(visitor, "uniqueid", _uniqueId); } -namespace { - -using Type = QueryTermSimple::Type; - -Normalizing -requireFold(Type type, Normalizing normalizing) { - if (normalizing == Normalizing::NONE) return Normalizing::NONE; - if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; - if (type == Type::EXACTSTRINGTERM) return Normalizing::LOWERCASE; - return ((type == Type::WORD) || (type == Type::SUBSTRINGTERM) || - (type == Type::PREFIXTERM) || (type == Type::SUFFIXTERM)) - ? Normalizing::LOWERCASE_AND_FOLD - : Normalizing::NONE; -} - -vespalib::string -fold(vespalib::stringref s) { - const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); - const unsigned char * end = curr + s.size(); - vespalib::string folded; - for (; curr < end;) { - uint32_t c_ucs4 = *curr; - if (c_ucs4 < 0x80) { - folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++)); - } else { - c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4); - if (repl != nullptr) { - size_t repllen = strlen(repl); - folded.append(repl, repllen); - } else { - c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4); - char tmp[6]; - const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); - folded.append(tmp, tmp_end - tmp); - } - } - } - return folded; -} - -vespalib::string -lowercase(vespalib::stringref s) { - const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); - const unsigned char * end = curr + s.size(); - vespalib::string folded; - for (; curr < end;) { - uint32_t c_ucs4 = *curr; - if (c_ucs4 < 0x80) { - folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++))); - } else { - c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); - char tmp[6]; - const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); - folded.append(tmp, tmp_end - tmp); - } - } - return folded; -} - -vespalib::string -optional_fold(vespalib::stringref s, Type type, Normalizing normalizing) { - switch ( requireFold(type, normalizing)) { - case Normalizing::NONE: return s; - case Normalizing::LOWERCASE: return lowercase(s); - case Normalizing::LOWERCASE_AND_FOLD: return fold(s); - } - return s; -} - -} - QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, stringref termS, const string & indexS, Type type, Normalizing normalizing) - : QueryTermUCS4(optional_fold(termS, type, normalizing), type), + : QueryTermUCS4(QueryNormalization::optional_fold(termS, type, normalizing), type), _index(indexS), _encoding(0x01), _result(org.release()), |