diff options
Diffstat (limited to 'searchlib')
7 files changed, 121 insertions, 131 deletions
diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h index 89996515a4a..5e3b1dffe3a 100644 --- a/searchlib/src/vespa/searchlib/parsequery/parse.h +++ b/searchlib/src/vespa/searchlib/parsequery/parse.h @@ -4,6 +4,7 @@ #include "item_creator.h" #include <vespa/searchlib/query/weight.h> +#include <vespa/searchlib/query/query_normalization.h> #include <vespa/vespalib/stllike/string.h> namespace search { @@ -89,19 +90,37 @@ public: }; /** Extra information on each item (creator id) coded in bit 3 of flags */ - static inline ItemCreator GetCreator(uint8_t flags) { return static_cast<ItemCreator>((flags >> 3) & 0x01); } + static inline ItemCreator GetCreator(uint8_t flags) { + return static_cast<ItemCreator>((flags >> 3) & 0x01); + } - static inline bool GetFeature(uint8_t type, uint8_t feature) - { return ((type & feature) != 0); } + static inline bool GetFeature(uint8_t type, uint8_t feature) { + return ((type & feature) != 0); + } - static inline bool GetFeature_Weight(uint8_t type) - { return GetFeature(type, IF_WEIGHT); } + static inline bool GetFeature_Weight(uint8_t type) { + return GetFeature(type, IF_WEIGHT); + } - static inline bool getFeature_UniqueId(uint8_t type) - { return GetFeature(type, IF_UNIQUEID); } - - static inline bool getFeature_Flags(uint8_t type) - { return GetFeature(type, IF_FLAGS); } + static inline bool getFeature_UniqueId(uint8_t type) { + return GetFeature(type, IF_UNIQUEID); + } + static inline bool getFeature_Flags(uint8_t type) { + return GetFeature(type, IF_FLAGS); + } + static TermType toTermType(ItemType itemType) noexcept { + switch (itemType) { + case ParseItem::ITEM_REGEXP: return TermType::REGEXP; + case ParseItem::ITEM_PREFIXTERM: return TermType::PREFIXTERM; + case ParseItem::ITEM_SUBSTRINGTERM: return TermType::SUBSTRINGTERM; + case ParseItem::ITEM_EXACTSTRINGTERM: return TermType::EXACTSTRINGTERM; + case ParseItem::ITEM_SUFFIXTERM: return TermType::SUFFIXTERM; + case ParseItem::ITEM_FUZZY: return TermType::FUZZYTERM; + case ParseItem::ITEM_GEO_LOCATION_TERM: return TermType::GEO_LOCATION; + case ParseItem::ITEM_NEAREST_NEIGHBOR: return TermType::NEAREST_NEIGHBOR; + default: return TermType::WORD; + } + } }; } // namespace search diff --git a/searchlib/src/vespa/searchlib/query/query_normalization.cpp b/searchlib/src/vespa/searchlib/query/query_normalization.cpp index e6a9d2202a9..64e1e0ed496 100644 --- a/searchlib/src/vespa/searchlib/query/query_normalization.cpp +++ b/searchlib/src/vespa/searchlib/query/query_normalization.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "query_normalization.h" +#include <vespa/fastlib/text/normwordfolder.h> #include <ostream> namespace search { @@ -20,6 +21,62 @@ to_str(search::Normalizing norm) noexcept { abort(); } +Normalizing +requireFold(TermType type, Normalizing normalizing) { + if (normalizing == Normalizing::NONE) return Normalizing::NONE; + if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; + if (type == TermType::EXACTSTRINGTERM) return Normalizing::LOWERCASE; + return ((type == TermType::WORD) || (type == TermType::SUBSTRINGTERM) || + (type == TermType::PREFIXTERM) || (type == TermType::SUFFIXTERM)) + ? Normalizing::LOWERCASE_AND_FOLD + : Normalizing::NONE; +} + +vespalib::string +fold(vespalib::stringref s) { + const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++)); + } else { + c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4); + if (repl != nullptr) { + size_t repllen = strlen(repl); + folded.append(repl, repllen); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + } + return folded; +} + +vespalib::string +lowercase(vespalib::stringref s) { + const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); + const unsigned char * end = curr + s.size(); + vespalib::string folded; + for (; curr < end;) { + uint32_t c_ucs4 = *curr; + if (c_ucs4 < 0x80) { + folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++))); + } else { + c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); + char tmp[6]; + const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); + folded.append(tmp, tmp_end - tmp); + } + } + return folded; +} + } std::ostream & @@ -28,4 +85,14 @@ operator<<(std::ostream &os, Normalizing n) { return os; } +vespalib::string +QueryNormalization::optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing) { + switch ( requireFold(type, normalizing)) { + case Normalizing::NONE: return s; + case Normalizing::LOWERCASE: return lowercase(s); + case Normalizing::LOWERCASE_AND_FOLD: return fold(s); + } + return s; +} + } diff --git a/searchlib/src/vespa/searchlib/query/query_normalization.h b/searchlib/src/vespa/searchlib/query/query_normalization.h index 004876536b4..cdfc3587aa4 100644 --- a/searchlib/src/vespa/searchlib/query/query_normalization.h +++ b/searchlib/src/vespa/searchlib/query/query_normalization.h @@ -6,12 +6,24 @@ namespace search { -enum class Normalizing { +enum class Normalizing : uint8_t { NONE, LOWERCASE, LOWERCASE_AND_FOLD }; +enum class TermType : uint8_t { + WORD = 0, + PREFIXTERM = 1, + SUBSTRINGTERM = 2, + EXACTSTRINGTERM = 3, + SUFFIXTERM = 4, + REGEXP = 5, + GEO_LOCATION = 6, + FUZZYTERM = 7, + NEAREST_NEIGHBOR = 8 +}; + std::ostream &operator<<(std::ostream &, Normalizing); class QueryNormalization { @@ -20,6 +32,7 @@ public: virtual ~QueryNormalization() = default; virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0; virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0; + static vespalib::string optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing); }; } diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp index ab3bd512d1d..060cd5015b3 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp @@ -215,21 +215,24 @@ QueryTermSimple::getRange() const noexcept return getIntegerRange<int64_t>(); } -bool QueryTermSimple::getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept +bool +QueryTermSimple::getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept { lower = std::numeric_limits<int64_t>::min(); upper = std::numeric_limits<int64_t>::max(); return getAsNumericTerm(lower, upper, IntDecoder()); } -bool QueryTermSimple::getAsFloatTerm(double & lower, double & upper) const noexcept +bool +QueryTermSimple::getAsFloatTerm(double & lower, double & upper) const noexcept { lower = -std::numeric_limits<double>::infinity(); upper = std::numeric_limits<double>::infinity(); return getAsNumericTerm(lower, upper, FloatDecoder<double>()); } -bool QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcept +bool +QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcept { lower = -std::numeric_limits<float>::infinity(); upper = std::numeric_limits<float>::infinity(); @@ -238,12 +241,6 @@ bool QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcep QueryTermSimple::~QueryTermSimple() = default; -namespace { - - - -} - QueryTermSimple::QueryTermSimple(const string & term_, Type type) : _rangeLimit(0), _maxPerGroup(0), diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index 87bf7c26b80..a740afb0340 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once +#include "query_normalization.h" #include <vespa/vespalib/objects/objectvisitor.h> #include <vespa/vespalib/stllike/string.h> #include <vespa/vespalib/util/memory.h> @@ -15,17 +16,7 @@ public: using UP = std::unique_ptr<QueryTermSimple>; using string = vespalib::string; using stringref = vespalib::stringref; - enum class Type : uint8_t { - WORD = 0, - PREFIXTERM = 1, - SUBSTRINGTERM = 2, - EXACTSTRINGTERM = 3, - SUFFIXTERM = 4, - REGEXP = 5, - GEO_LOCATION = 6, - FUZZYTERM = 7, - NEAREST_NEIGHBOR = 8 - }; + using Type = TermType; template <typename N> struct RangeResult { diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 69fe77d3fd5..a0abdcd28fb 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -84,7 +84,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_GEO_LOCATION_TERM: // just keep the string representation here; parsed in vsm::GeoPosFieldSearcher qn = std::make_unique<QueryTerm>(factory.create(), queryRep.getTerm(), queryRep.getIndexName(), - QueryTerm::Type::GEO_LOCATION, Normalizing::NONE); + TermType::GEO_LOCATION, Normalizing::NONE); break; case ParseItem::ITEM_NEAREST_NEIGHBOR: qn = build_nearest_neighbor_query_node(factory, queryRep); @@ -111,30 +111,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor if (dynamic_cast<const SameElementQueryNode *>(parent) != nullptr) { index = parent->getIndex() + "." + index; } - using TermType = QueryTerm::Type; - TermType sTerm(TermType::WORD); - switch (type) { - case ParseItem::ITEM_REGEXP: - sTerm = TermType::REGEXP; - break; - case ParseItem::ITEM_PREFIXTERM: - sTerm = TermType::PREFIXTERM; - break; - case ParseItem::ITEM_SUBSTRINGTERM: - sTerm = TermType::SUBSTRINGTERM; - break; - case ParseItem::ITEM_EXACTSTRINGTERM: - sTerm = TermType::EXACTSTRINGTERM; - break; - case ParseItem::ITEM_SUFFIXTERM: - sTerm = TermType::SUFFIXTERM; - break; - case ParseItem::ITEM_FUZZY: - sTerm = TermType::FUZZYTERM; - break; - default: - break; - } + TermType sTerm = ParseItem::toTermType(type); QueryTerm::string ssTerm; if (type == ParseItem::ITEM_PURE_WEIGHTED_LONG) { char buf[24]; diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index e5e1473dd3c..dbaeaa5d895 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -1,12 +1,10 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "queryterm.h" -#include <vespa/fastlib/text/normwordfolder.h> #include <vespa/searchlib/fef/itermdata.h> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/vespalib/objects/visit.h> #include <algorithm> -#include <cmath> #include <limits> #include <vespa/log/log.h> @@ -68,81 +66,9 @@ QueryTerm::visitMembers(vespalib::ObjectVisitor & visitor) const visit(visitor, "uniqueid", _uniqueId); } -namespace { - -using Type = QueryTermSimple::Type; - -Normalizing -requireFold(Type type, Normalizing normalizing) { - if (normalizing == Normalizing::NONE) return Normalizing::NONE; - if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; - if (type == Type::EXACTSTRINGTERM) return Normalizing::LOWERCASE; - return ((type == Type::WORD) || (type == Type::SUBSTRINGTERM) || - (type == Type::PREFIXTERM) || (type == Type::SUFFIXTERM)) - ? Normalizing::LOWERCASE_AND_FOLD - : Normalizing::NONE; -} - -vespalib::string -fold(vespalib::stringref s) { - const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); - const unsigned char * end = curr + s.size(); - vespalib::string folded; - for (; curr < end;) { - uint32_t c_ucs4 = *curr; - if (c_ucs4 < 0x80) { - folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++)); - } else { - c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4); - if (repl != nullptr) { - size_t repllen = strlen(repl); - folded.append(repl, repllen); - } else { - c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4); - char tmp[6]; - const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); - folded.append(tmp, tmp_end - tmp); - } - } - } - return folded; -} - -vespalib::string -lowercase(vespalib::stringref s) { - const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); - const unsigned char * end = curr + s.size(); - vespalib::string folded; - for (; curr < end;) { - uint32_t c_ucs4 = *curr; - if (c_ucs4 < 0x80) { - folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++))); - } else { - c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); - char tmp[6]; - const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); - folded.append(tmp, tmp_end - tmp); - } - } - return folded; -} - -vespalib::string -optional_fold(vespalib::stringref s, Type type, Normalizing normalizing) { - switch ( requireFold(type, normalizing)) { - case Normalizing::NONE: return s; - case Normalizing::LOWERCASE: return lowercase(s); - case Normalizing::LOWERCASE_AND_FOLD: return fold(s); - } - return s; -} - -} - QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, stringref termS, const string & indexS, Type type, Normalizing normalizing) - : QueryTermUCS4(optional_fold(termS, type, normalizing), type), + : QueryTermUCS4(QueryNormalization::optional_fold(termS, type, normalizing), type), _index(indexS), _encoding(0x01), _result(org.release()), |