summary refs log tree commit diff stats
path: root/searchlib
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2024-02-05 23:00:27 +0000
committer Henning Baldersheim <balder@yahoo-inc.com> 2024-02-05 23:00:27 +0000
commit d59d1cdb2b4872c2309cfad2e96012fdbdfc6ff9 (patch)
tree 64bf053dd750d9ae2c1ec5f9ce7500d0945f22ee /searchlib
parent c48eb091494ccb39d2edd0a1b50073f3c5dc4c2b (diff)
Wire QueryNormalization into JuniperQueryAdapter and use it there.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/vespa/searchlib/parsequery/parse.h 39
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_normalization.cpp 67
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_normalization.h 15
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_term_simple.cpp 15
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_term_simple.h 13
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/querynode.cpp 27
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp 76
7 files changed, 121 insertions, 131 deletions
diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h
index 89996515a4a..5e3b1dffe3a 100644
--- a/searchlib/src/vespa/searchlib/parsequery/parse.h
+++ b/searchlib/src/vespa/searchlib/parsequery/parse.h
@@ -4,6 +4,7 @@
#include "item_creator.h"
#include <vespa/searchlib/query/weight.h>
+#include <vespa/searchlib/query/query_normalization.h>
#include <vespa/vespalib/stllike/string.h>
namespace search {
@@ -89,19 +90,37 @@ public:
};
/** Extra information on each item (creator id) coded in bit 3 of flags */
- static inline ItemCreator GetCreator(uint8_t flags) { return static_cast<ItemCreator>((flags >> 3) & 0x01); }
+ static inline ItemCreator GetCreator(uint8_t flags) { // Decode the item creator id from the flags byte.
+ return static_cast<ItemCreator>((flags >> 3) & 0x01); // move bit 3 down to bit 0 and mask away all other bits
+ }
- static inline bool GetFeature(uint8_t type, uint8_t feature)
- { return ((type & feature) != 0); }
+ static inline bool GetFeature(uint8_t type, uint8_t feature) { // True when 'type' has at least one of the bits in 'feature' set.
+ return ((type & feature) != 0);
+ }
- static inline bool GetFeature_Weight(uint8_t type)
- { return GetFeature(type, IF_WEIGHT); }
+ static inline bool GetFeature_Weight(uint8_t type) { // True when the IF_WEIGHT bit mask matches 'type'.
+ return GetFeature(type, IF_WEIGHT);
+ }
- static inline bool getFeature_UniqueId(uint8_t type)
- { return GetFeature(type, IF_UNIQUEID); }
-
- static inline bool getFeature_Flags(uint8_t type)
- { return GetFeature(type, IF_FLAGS); }
+ static inline bool getFeature_UniqueId(uint8_t type) { // True when the IF_UNIQUEID bit mask matches 'type'.
+ return GetFeature(type, IF_UNIQUEID);
+ }
+ static inline bool getFeature_Flags(uint8_t type) { // True when the IF_FLAGS bit mask matches 'type'.
+ return GetFeature(type, IF_FLAGS);
+ }
+ static TermType toTermType(ItemType itemType) noexcept { // Map a ParseItem item type to the corresponding search::TermType.
+ switch (itemType) {
+ case ParseItem::ITEM_REGEXP: return TermType::REGEXP;
+ case ParseItem::ITEM_PREFIXTERM: return TermType::PREFIXTERM;
+ case ParseItem::ITEM_SUBSTRINGTERM: return TermType::SUBSTRINGTERM;
+ case ParseItem::ITEM_EXACTSTRINGTERM: return TermType::EXACTSTRINGTERM;
+ case ParseItem::ITEM_SUFFIXTERM: return TermType::SUFFIXTERM;
+ case ParseItem::ITEM_FUZZY: return TermType::FUZZYTERM;
+ case ParseItem::ITEM_GEO_LOCATION_TERM: return TermType::GEO_LOCATION;
+ case ParseItem::ITEM_NEAREST_NEIGHBOR: return TermType::NEAREST_NEIGHBOR;
+ default: return TermType::WORD; // every item type not listed above is treated as a plain word
+ }
+ }
};
} // namespace search
diff --git a/searchlib/src/vespa/searchlib/query/query_normalization.cpp b/searchlib/src/vespa/searchlib/query/query_normalization.cpp
index e6a9d2202a9..64e1e0ed496 100644
--- a/searchlib/src/vespa/searchlib/query/query_normalization.cpp
+++ b/searchlib/src/vespa/searchlib/query/query_normalization.cpp
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "query_normalization.h"
+#include <vespa/fastlib/text/normwordfolder.h>
#include <ostream>
namespace search {
@@ -20,6 +21,62 @@ to_str(search::Normalizing norm) noexcept {
abort();
}
+Normalizing // Choose the normalization that is actually applied to a term, from its type and the requested mode.
+requireFold(TermType type, Normalizing normalizing) {
+ if (normalizing == Normalizing::NONE) return Normalizing::NONE; // nothing requested: no term type is normalized
+ if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE; // lowercase-only is returned for every term type
+ if (type == TermType::EXACTSTRINGTERM) return Normalizing::LOWERCASE; // with the three declared modes this point means LOWERCASE_AND_FOLD was requested; exact-string terms are only lowercased
+ return ((type == TermType::WORD) || (type == TermType::SUBSTRINGTERM) || // only these four term types get lowercase + fold
+ (type == TermType::PREFIXTERM) || (type == TermType::SUFFIXTERM))
+ ? Normalizing::LOWERCASE_AND_FOLD
+ : Normalizing::NONE; // all remaining term types (e.g. REGEXP, FUZZYTERM, GEO_LOCATION, NEAREST_NEIGHBOR) are left untouched
+}
+
+vespalib::string // Build a new string from 's' by passing every character through Fast_NormalizeWordFolder's lowercase-and-fold routines.
+fold(vespalib::stringref s) {
+ const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); // read bytes as unsigned so the 0x80 comparison below sees raw byte values
+ const unsigned char * end = curr + s.size();
+ vespalib::string folded;
+ for (; curr < end;) { // no increment here: each branch below advances curr itself
+ uint32_t c_ucs4 = *curr;
+ if (c_ucs4 < 0x80) { // byte below 0x80: handled as a single-byte (ASCII) character
+ folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++)); // consumes exactly one input byte
+ } else {
+ c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr); // NOTE(review): presumably takes curr by reference and advances it past the decoded character - confirm in Fast_UnicodeUtil
+ const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4); // nullptr means there is no replacement string for this code point
+ if (repl != nullptr) {
+ size_t repllen = strlen(repl);
+ folded.append(repl, repllen); // the replacement string is appended as-is
+ } else {
+ c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4);
+ char tmp[6]; // scratch buffer receiving the UTF-8 encoding of one code point
+ const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); // return value is used as the end of the bytes written to tmp
+ folded.append(tmp, tmp_end - tmp);
+ }
+ }
+ }
+ return folded;
+}
+
+vespalib::string // Build a new string from 's' by passing every character through Fast_NormalizeWordFolder's lowercase routines (no folding).
+lowercase(vespalib::stringref s) {
+ const auto * curr = reinterpret_cast<const unsigned char *>(s.data()); // read bytes as unsigned so the 0x80 comparison below sees raw byte values
+ const unsigned char * end = curr + s.size();
+ vespalib::string folded; // holds the lowercased result despite the name
+ for (; curr < end;) { // no increment here: each branch below advances curr itself
+ uint32_t c_ucs4 = *curr;
+ if (c_ucs4 < 0x80) { // byte below 0x80: handled as a single-byte (ASCII) character
+ folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++))); // consumes exactly one input byte
+ } else {
+ c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr)); // NOTE(review): GetUTF8CharNonAscii presumably advances curr past the decoded character - confirm in Fast_UnicodeUtil
+ char tmp[6]; // scratch buffer receiving the UTF-8 encoding of one code point
+ const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4); // return value is used as the end of the bytes written to tmp
+ folded.append(tmp, tmp_end - tmp);
+ }
+ }
+ return folded;
+}
+
}
std::ostream &
@@ -28,4 +85,14 @@ operator<<(std::ostream &os, Normalizing n) {
return os;
}
+vespalib::string // Normalize 's' using the mode that requireFold() selects for this term type and requested normalization.
+QueryNormalization::optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing) {
+ switch ( requireFold(type, normalizing)) {
+ case Normalizing::NONE: return s; // input returned unchanged (converted to vespalib::string)
+ case Normalizing::LOWERCASE: return lowercase(s);
+ case Normalizing::LOWERCASE_AND_FOLD: return fold(s);
+ }
+ return s; // only reached if requireFold() yields a value outside the three declared enumerators; falls back to the unmodified input
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/query/query_normalization.h b/searchlib/src/vespa/searchlib/query/query_normalization.h
index 004876536b4..cdfc3587aa4 100644
--- a/searchlib/src/vespa/searchlib/query/query_normalization.h
+++ b/searchlib/src/vespa/searchlib/query/query_normalization.h
@@ -6,12 +6,24 @@
namespace search {
-enum class Normalizing {
+enum class Normalizing : uint8_t { // Normalization mode applied to query term text.
NONE, // leave the text unchanged
LOWERCASE, // lowercase only
LOWERCASE_AND_FOLD // lowercase and fold
};
+enum class TermType : uint8_t { // Kind of query term; same enumerators and values as the QueryTermSimple::Type enum removed in this commit, which is now an alias for this type.
+ WORD = 0,
+ PREFIXTERM = 1,
+ SUBSTRINGTERM = 2,
+ EXACTSTRINGTERM = 3,
+ SUFFIXTERM = 4,
+ REGEXP = 5,
+ GEO_LOCATION = 6,
+ FUZZYTERM = 7,
+ NEAREST_NEIGHBOR = 8
+};
+
std::ostream &operator<<(std::ostream &, Normalizing);
class QueryNormalization {
@@ -20,6 +32,7 @@ public:
virtual ~QueryNormalization() = default;
virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0;
virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0;
+ static vespalib::string optional_fold(vespalib::stringref s, TermType type, Normalizing normalizing); // Returns 's' normalized (unchanged, lowercased, or lowercased and folded) depending on 'type' and 'normalizing'.
};
}
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
index ab3bd512d1d..060cd5015b3 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
@@ -215,21 +215,24 @@ QueryTermSimple::getRange() const noexcept
return getIntegerRange<int64_t>();
}
-bool QueryTermSimple::getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept
+bool
+QueryTermSimple::getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept // Presets the bounds to the full int64_t range, then delegates to getAsNumericTerm() with an IntDecoder.
{
lower = std::numeric_limits<int64_t>::min(); // default lower bound before getAsNumericTerm() runs
upper = std::numeric_limits<int64_t>::max(); // default upper bound before getAsNumericTerm() runs
return getAsNumericTerm(lower, upper, IntDecoder()); // NOTE(review): presumably true when the term could be decoded as an integer value/range - confirm in getAsNumericTerm
}
-bool QueryTermSimple::getAsFloatTerm(double & lower, double & upper) const noexcept
+bool
+QueryTermSimple::getAsFloatTerm(double & lower, double & upper) const noexcept // Presets the bounds to -infinity/+infinity, then delegates to getAsNumericTerm() with a FloatDecoder<double>.
{
lower = -std::numeric_limits<double>::infinity(); // default lower bound before getAsNumericTerm() runs
upper = std::numeric_limits<double>::infinity(); // default upper bound before getAsNumericTerm() runs
return getAsNumericTerm(lower, upper, FloatDecoder<double>()); // NOTE(review): presumably true when the term could be decoded as a floating point value/range - confirm in getAsNumericTerm
}
-bool QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcept
+bool
+QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcept
{
lower = -std::numeric_limits<float>::infinity();
upper = std::numeric_limits<float>::infinity();
@@ -238,12 +241,6 @@ bool QueryTermSimple::getAsFloatTerm(float & lower, float & upper) const noexcep
QueryTermSimple::~QueryTermSimple() = default;
-namespace {
-
-
-
-}
-
QueryTermSimple::QueryTermSimple(const string & term_, Type type)
: _rangeLimit(0),
_maxPerGroup(0),
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h
index 87bf7c26b80..a740afb0340 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
+#include "query_normalization.h"
#include <vespa/vespalib/objects/objectvisitor.h>
#include <vespa/vespalib/stllike/string.h>
#include <vespa/vespalib/util/memory.h>
@@ -15,17 +16,7 @@ public:
using UP = std::unique_ptr<QueryTermSimple>;
using string = vespalib::string;
using stringref = vespalib::stringref;
- enum class Type : uint8_t {
- WORD = 0,
- PREFIXTERM = 1,
- SUBSTRINGTERM = 2,
- EXACTSTRINGTERM = 3,
- SUFFIXTERM = 4,
- REGEXP = 5,
- GEO_LOCATION = 6,
- FUZZYTERM = 7,
- NEAREST_NEIGHBOR = 8
- };
+ using Type = TermType; // QueryTermSimple::Type is an alias for TermType, declared in query_normalization.h
template <typename N>
struct RangeResult {
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 69fe77d3fd5..a0abdcd28fb 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -84,7 +84,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
case ParseItem::ITEM_GEO_LOCATION_TERM:
// just keep the string representation here; parsed in vsm::GeoPosFieldSearcher
qn = std::make_unique<QueryTerm>(factory.create(), queryRep.getTerm(), queryRep.getIndexName(),
- QueryTerm::Type::GEO_LOCATION, Normalizing::NONE);
+ TermType::GEO_LOCATION, Normalizing::NONE);
break;
case ParseItem::ITEM_NEAREST_NEIGHBOR:
qn = build_nearest_neighbor_query_node(factory, queryRep);
@@ -111,30 +111,7 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
if (dynamic_cast<const SameElementQueryNode *>(parent) != nullptr) {
index = parent->getIndex() + "." + index;
}
- using TermType = QueryTerm::Type;
- TermType sTerm(TermType::WORD);
- switch (type) {
- case ParseItem::ITEM_REGEXP:
- sTerm = TermType::REGEXP;
- break;
- case ParseItem::ITEM_PREFIXTERM:
- sTerm = TermType::PREFIXTERM;
- break;
- case ParseItem::ITEM_SUBSTRINGTERM:
- sTerm = TermType::SUBSTRINGTERM;
- break;
- case ParseItem::ITEM_EXACTSTRINGTERM:
- sTerm = TermType::EXACTSTRINGTERM;
- break;
- case ParseItem::ITEM_SUFFIXTERM:
- sTerm = TermType::SUFFIXTERM;
- break;
- case ParseItem::ITEM_FUZZY:
- sTerm = TermType::FUZZYTERM;
- break;
- default:
- break;
- }
+ TermType sTerm = ParseItem::toTermType(type);
QueryTerm::string ssTerm;
if (type == ParseItem::ITEM_PURE_WEIGHTED_LONG) {
char buf[24];
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index e5e1473dd3c..dbaeaa5d895 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -1,12 +1,10 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "queryterm.h"
-#include <vespa/fastlib/text/normwordfolder.h>
#include <vespa/searchlib/fef/itermdata.h>
#include <vespa/searchlib/fef/matchdata.h>
#include <vespa/vespalib/objects/visit.h>
#include <algorithm>
-#include <cmath>
#include <limits>
#include <vespa/log/log.h>
@@ -68,81 +66,9 @@ QueryTerm::visitMembers(vespalib::ObjectVisitor & visitor) const
visit(visitor, "uniqueid", _uniqueId);
}
-namespace {
-
-using Type = QueryTermSimple::Type;
-
-Normalizing
-requireFold(Type type, Normalizing normalizing) {
- if (normalizing == Normalizing::NONE) return Normalizing::NONE;
- if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE;
- if (type == Type::EXACTSTRINGTERM) return Normalizing::LOWERCASE;
- return ((type == Type::WORD) || (type == Type::SUBSTRINGTERM) ||
- (type == Type::PREFIXTERM) || (type == Type::SUFFIXTERM))
- ? Normalizing::LOWERCASE_AND_FOLD
- : Normalizing::NONE;
-}
-
-vespalib::string
-fold(vespalib::stringref s) {
- const auto * curr = reinterpret_cast<const unsigned char *>(s.data());
- const unsigned char * end = curr + s.size();
- vespalib::string folded;
- for (; curr < end;) {
- uint32_t c_ucs4 = *curr;
- if (c_ucs4 < 0x80) {
- folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++));
- } else {
- c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr);
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- folded.append(repl, repllen);
- } else {
- c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4);
- char tmp[6];
- const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4);
- folded.append(tmp, tmp_end - tmp);
- }
- }
- }
- return folded;
-}
-
-vespalib::string
-lowercase(vespalib::stringref s) {
- const auto * curr = reinterpret_cast<const unsigned char *>(s.data());
- const unsigned char * end = curr + s.size();
- vespalib::string folded;
- for (; curr < end;) {
- uint32_t c_ucs4 = *curr;
- if (c_ucs4 < 0x80) {
- folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++)));
- } else {
- c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr));
- char tmp[6];
- const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4);
- folded.append(tmp, tmp_end - tmp);
- }
- }
- return folded;
-}
-
-vespalib::string
-optional_fold(vespalib::stringref s, Type type, Normalizing normalizing) {
- switch ( requireFold(type, normalizing)) {
- case Normalizing::NONE: return s;
- case Normalizing::LOWERCASE: return lowercase(s);
- case Normalizing::LOWERCASE_AND_FOLD: return fold(s);
- }
- return s;
-}
-
-}
-
QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, stringref termS, const string & indexS,
Type type, Normalizing normalizing)
- : QueryTermUCS4(optional_fold(termS, type, normalizing), type),
+ : QueryTermUCS4(QueryNormalization::optional_fold(termS, type, normalizing), type),
_index(indexS),
_encoding(0x01),
_result(org.release()),