summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2024-01-06 21:16:28 +0100
committer GitHub <noreply@github.com> 2024-01-06 21:16:28 +0100
commit 50782cfd34e4e1f1094314dbfbae3da524a83d36 (patch)
tree c33578caf74293975d7d0a4b99c0623fa41871ea
parent 4bee8602c045365b32e3f6a2a8972ec6af879a18 (diff)
parent 192af4443cb572791c8f11520e8ebec4ee4e5a8e (diff)
Merge pull request #29713 from vespa-engine/balder/consider-nordic-variations-of-o-and-a
Consider the nordic variations over A and O as the other european var…
-rw-r--r-- searchlib/src/tests/query/streaming_query_test.cpp 5
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp 2
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/in_term.cpp 5
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/in_term.h 3
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp 17
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/multi_term.h 7
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp 2
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/querynode.cpp 39
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/querynode.h 5
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h 15
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp 104
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.h 10
-rw-r--r-- streamingvisitors/src/tests/searcher/searcher_test.cpp 19
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/querytermdata.h 8
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp 16
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/searchvisitor.h 1
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h 11
-rw-r--r-- vespalib/src/vespa/fastlib/text/normwordfolder.cpp 13
-rw-r--r-- vespalib/src/vespa/fastlib/text/normwordfolder.h 4
20 files changed, 205 insertions, 83 deletions
diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp
index f5370785167..c4ef2028123 100644
--- a/searchlib/src/tests/query/streaming_query_test.cpp
+++ b/searchlib/src/tests/query/streaming_query_test.cpp
@@ -27,6 +27,7 @@ void assertHit(const Hit & h, size_t expWordpos, size_t expContext, int32_t weig
EXPECT_EQ(h.weight(), weight);
}
+
TEST(StreamingQueryTest, test_query_language)
{
QueryNodeResultFactory factory;
@@ -297,7 +298,7 @@ class AllowRewrite : public QueryNodeResultFactory
{
public:
explicit AllowRewrite(vespalib::stringref index) noexcept : _allowedIndex(index) {}
- bool getRewriteFloatTerms(vespalib::stringref index) const noexcept override { return index == _allowedIndex; }
+ bool allow_float_terms_rewrite(vespalib::stringref index) const noexcept override { return index == _allowedIndex; }
private:
vespalib::string _allowedIndex;
};
@@ -905,7 +906,7 @@ TEST(StreamingQueryTest, test_in_term)
{
auto term_vector = std::make_unique<StringTermVector>(1);
term_vector->addTerm("7");
- search::streaming::InTerm term({}, "index", std::move(term_vector));
+ search::streaming::InTerm term({}, "index", std::move(term_vector), Normalizing::NONE);
SimpleTermData td;
td.addField(10);
td.addField(11);
diff --git a/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp
index d2c1ba872f5..9bb6d8c3342 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp
@@ -11,7 +11,7 @@ using search::fef::MatchData;
namespace search::streaming {
DotProductTerm::DotProductTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, uint32_t num_terms)
- : MultiTerm(std::move(result_base), index, Type::WORD, num_terms)
+ : MultiTerm(std::move(result_base), index, num_terms)
{
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp
index 36303d4e991..3e75f4a5114 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp
@@ -12,8 +12,9 @@ using search::query::TermVector;
namespace search::streaming {
-InTerm::InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, std::unique_ptr<TermVector> terms)
- : MultiTerm(std::move(result_base), index, Type::WORD, std::move(terms))
+InTerm::InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index,
+ std::unique_ptr<TermVector> terms, Normalizing normalize_mode)
+ : MultiTerm(std::move(result_base), index, std::move(terms), normalize_mode)
{
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.h b/searchlib/src/vespa/searchlib/query/streaming/in_term.h
index 7d03ed989c7..7b388b3f6e6 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/in_term.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.h
@@ -11,7 +11,8 @@ namespace search::streaming {
*/
class InTerm : public MultiTerm {
public:
- InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string& index, std::unique_ptr<query::TermVector> terms);
+ InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string& index,
+ std::unique_ptr<query::TermVector> terms, Normalizing normalize_mode);
~InTerm() override;
void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override;
};
diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp
index ad5857b8c41..dd34b9b7e73 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp
@@ -9,19 +9,20 @@ using search::query::TermVector;
namespace search::streaming {
-MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, uint32_t num_terms)
- : QueryTerm(std::move(result_base), "", index, type),
+MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, uint32_t num_terms)
+ : QueryTerm(std::move(result_base), "", index, Type::WORD, Normalizing::NONE),
_terms()
{
_terms.reserve(num_terms);
}
-MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, std::unique_ptr<TermVector> terms)
- : MultiTerm(std::move(result_base), index, type, terms->size())
+MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index,
+ std::unique_ptr<TermVector> terms, Normalizing normalizing)
+ : MultiTerm(std::move(result_base), index, terms->size())
{
auto num_terms = terms->size();
for (uint32_t i = 0; i < num_terms; ++i) {
- add_term(std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), terms->getAsString(i).first, "", QueryTermSimple::Type::WORD));
+ add_term(std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), terms->getAsString(i).first, "", Type::WORD, normalizing));
}
}
@@ -33,12 +34,6 @@ MultiTerm::add_term(std::unique_ptr<QueryTerm> term)
_terms.emplace_back(std::move(term));
}
-MultiTerm*
-MultiTerm::as_multi_term() noexcept
-{
- return this;
-}
-
void
MultiTerm::reset()
{
diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
index 4c3f1ea5b5a..3bb69e29693 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
@@ -24,11 +24,12 @@ class MultiTerm : public QueryTerm {
protected:
std::vector<std::unique_ptr<QueryTerm>> _terms;
public:
- MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, uint32_t num_terms);
- MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, std::unique_ptr<query::TermVector> terms);
+ MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, uint32_t num_terms);
+ MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index,
+ std::unique_ptr<query::TermVector> terms, Normalizing normalizing);
~MultiTerm() override;
void add_term(std::unique_ptr<QueryTerm> term);
- MultiTerm* as_multi_term() noexcept override;
+ MultiTerm* as_multi_term() noexcept override { return this; }
void reset() override;
bool evaluate() const override;
const HitList& evaluateHits(HitList& hl) const override;
diff --git a/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp b/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp
index f710218297d..1317d1c0651 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/nearest_neighbor_query_node.cpp
@@ -9,7 +9,7 @@ NearestNeighborQueryNode::NearestNeighborQueryNode(std::unique_ptr<QueryNodeResu
const string& query_tensor_name, const string& field_name,
uint32_t target_hits, double distance_threshold,
int32_t unique_id, search::query::Weight weight)
- : QueryTerm(std::move(resultBase), query_tensor_name, field_name, Type::NEAREST_NEIGHBOR),
+ : QueryTerm(std::move(resultBase), query_tensor_name, field_name, Type::NEAREST_NEIGHBOR, Normalizing::NONE),
_target_hits(target_hits),
_distance_threshold(distance_threshold),
_distance(),
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 9484999f45a..1e43c32a263 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -81,10 +81,8 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
break;
case ParseItem::ITEM_GEO_LOCATION_TERM:
// just keep the string representation here; parsed in vsm::GeoPosFieldSearcher
- qn = std::make_unique<QueryTerm>(factory.create(),
- queryRep.getTerm(),
- queryRep.getIndexName(),
- QueryTerm::Type::GEO_LOCATION);
+ qn = std::make_unique<QueryTerm>(factory.create(), queryRep.getTerm(), queryRep.getIndexName(),
+ QueryTerm::Type::GEO_LOCATION, Normalizing::NONE);
break;
case ParseItem::ITEM_NEAREST_NEIGHBOR:
qn = build_nearest_neighbor_query_node(factory, queryRep);
@@ -149,18 +147,19 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
// But it will do for now as only correct sddocname queries are sent down.
qn = std::make_unique<TrueNode>();
} else {
- auto qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm);
+ Normalizing normalize_mode = factory.normalizing_mode(ssIndex);
+ auto qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
qt->setWeight(queryRep.GetWeight());
qt->setUniqueId(queryRep.getUniqueId());
if (qt->isFuzzy()) {
qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance());
qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength());
}
- if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.getRewriteFloatTerms(ssIndex)) {
+ if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.allow_float_terms_rewrite(ssIndex)) {
auto phrase = std::make_unique<PhraseQueryNode>();
auto dotPos = ssTerm.find('.');
- phrase->addChild(std::make_unique<QueryTerm>(factory.create(), ssTerm.substr(0, dotPos), ssIndex, TermType::WORD));
- phrase->addChild(std::make_unique<QueryTerm>(factory.create(), ssTerm.substr(dotPos + 1), ssIndex, TermType::WORD));
+ phrase->addChild(std::make_unique<QueryTerm>(factory.create(), ssTerm.substr(0, dotPos), ssIndex, TermType::WORD, normalize_mode));
+ phrase->addChild(std::make_unique<QueryTerm>(factory.create(), ssTerm.substr(dotPos + 1), ssIndex, TermType::WORD, normalize_mode));
auto orqn = std::make_unique<EquivQueryNode>();
orqn->addChild(std::move(qt));
orqn->addChild(std::move(phrase));
@@ -183,8 +182,11 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
}
break;
case ParseItem::ITEM_STRING_IN:
+ qn = std::make_unique<InTerm>(factory.create(), queryRep.getIndexName(), queryRep.get_terms(),
+ factory.normalizing_mode(queryRep.getIndexName()));
+ break;
case ParseItem::ITEM_NUMERIC_IN:
- qn = std::make_unique<InTerm>(factory.create(), queryRep.getIndexName(), queryRep.get_terms());
+ qn = std::make_unique<InTerm>(factory.create(), queryRep.getIndexName(), queryRep.get_terms(), Normalizing::NONE);
break;
case ParseItem::ITEM_DOT_PRODUCT:
qn = build_dot_product_term(factory, queryRep);
@@ -210,17 +212,12 @@ QueryNode::build_nearest_neighbor_query_node(const QueryNodeResultFactory& facto
auto weight = query_rep.GetWeight();
uint32_t target_hits = query_rep.getTargetHits();
double distance_threshold = query_rep.getDistanceThreshold();
- return std::make_unique<NearestNeighborQueryNode>(factory.create(),
- query_tensor_name,
- field_name,
- target_hits,
- distance_threshold,
- unique_id,
- weight);
+ return std::make_unique<NearestNeighborQueryNode>(factory.create(), query_tensor_name, field_name,
+ target_hits, distance_threshold, unique_id, weight);
}
void
-QueryNode::populate_multi_term(MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep)
+QueryNode::populate_multi_term(Normalizing string_normalize_mode, MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep)
{
char buf[24];
vespalib::string subterm;
@@ -229,13 +226,15 @@ QueryNode::populate_multi_term(MultiTerm& mt, SimpleQueryStackDumpIterator& quer
std::unique_ptr<QueryTerm> term;
switch (queryRep.getType()) {
case ParseItem::ITEM_PURE_WEIGHTED_STRING:
- term = std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), queryRep.getTerm(), "", QueryTermSimple::Type::WORD);
+ term = std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), queryRep.getTerm(), "",
+ QueryTermSimple::Type::WORD, string_normalize_mode);
break;
case ParseItem::ITEM_PURE_WEIGHTED_LONG:
{
auto res = std::to_chars(buf, buf + sizeof(buf), queryRep.getIntergerTerm(), 10);
subterm.assign(buf, res.ptr - buf);
- term = std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), subterm, "", QueryTermSimple::Type::WORD);
+ term = std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), subterm, "",
+ QueryTermSimple::Type::WORD, Normalizing::NONE);
}
break;
default:
@@ -255,7 +254,7 @@ QueryNode::build_dot_product_term(const QueryNodeResultFactory& factory, SimpleQ
auto dp =std::make_unique<DotProductTerm>(factory.create(), queryRep.getIndexName(), queryRep.getArity());
dp->setWeight(queryRep.GetWeight());
dp->setUniqueId(queryRep.getUniqueId());
- populate_multi_term(*dp, queryRep);
+ populate_multi_term(factory.normalizing_mode(dp->index()), *dp, queryRep);
return dp;
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.h b/searchlib/src/vespa/searchlib/query/streaming/querynode.h
index bfc840e4603..576d614e58b 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.h
@@ -2,8 +2,7 @@
#pragma once
#include "hit.h"
-#include <vespa/vespalib/stllike/string.h>
-#include <memory>
+#include "querynoderesultbase.h"
namespace search { class SimpleQueryStackDumpIterator; }
@@ -30,7 +29,7 @@ using ConstQueryTermList = std::vector<const QueryTerm *>;
class QueryNode
{
static std::unique_ptr<QueryNode> build_nearest_neighbor_query_node(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep);
- static void populate_multi_term(MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep);
+ static void populate_multi_term(Normalizing string_normalize_mode, MultiTerm& mt, SimpleQueryStackDumpIterator& queryRep);
static std::unique_ptr<QueryNode> build_dot_product_term(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep);
static void skip_unknown(SimpleQueryStackDumpIterator& queryRep);
public:
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h
index d7704fb60e1..74f872ad187 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynoderesultbase.h
@@ -18,10 +18,23 @@ public:
virtual QueryNodeResultBase * clone() const = 0;
};
+enum class Normalizing {
+ NONE,
+ LOWERCASE,
+ LOWERCASE_AND_FOLD
+};
+
class QueryNodeResultFactory {
public:
virtual ~QueryNodeResultFactory() = default;
- virtual bool getRewriteFloatTerms(vespalib::stringref index) const noexcept { (void) index; return false; }
+ virtual bool allow_float_terms_rewrite(vespalib::stringref index) const noexcept {
+ (void) index;
+ return false;
+ }
+ virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept {
+ (void) index;
+ return Normalizing::NONE;
+ }
virtual std::unique_ptr<QueryNodeResultBase> create() const { return {}; }
};
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index fe6f73367d7..3950a179d67 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "queryterm.h"
+#include <vespa/fastlib/text/normwordfolder.h>
#include <vespa/vespalib/objects/visit.h>
#include <cmath>
@@ -34,7 +35,7 @@ CharInfo::CharInfo()
_charInfo[uint8_t('E')] = 0x05;
}
-CharInfo _G_charTable;
+CharInfo G_charTable;
}
@@ -54,20 +55,93 @@ QueryTerm::visitMembers(vespalib::ObjectVisitor & visitor) const
visit(visitor, "uniqueid", _uniqueId);
}
-QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, const string & termS, const string & indexS, Type type) :
- QueryTermUCS4(termS, type),
- _index(indexS),
- _encoding(0x01),
- _result(org.release()),
- _hitList(),
- _weight(100),
- _uniqueId(0),
- _fieldInfo()
+namespace {
+
+using Type = QueryTermSimple::Type;
+
+Normalizing
+requireFold(Type type, Normalizing normalizing) {
+ if (normalizing == Normalizing::NONE) return Normalizing::NONE;
+ if (normalizing == Normalizing::LOWERCASE) return Normalizing::LOWERCASE;
+ if (type == Type::EXACTSTRINGTERM) return Normalizing::LOWERCASE;
+ return ((type == Type::WORD) || (type == Type::SUBSTRINGTERM) ||
+ (type == Type::PREFIXTERM) || (type == Type::SUFFIXTERM))
+ ? Normalizing::LOWERCASE_AND_FOLD
+ : Normalizing::NONE;
+}
+
+vespalib::string
+fold(vespalib::stringref s) {
+ const auto * curr = reinterpret_cast<const unsigned char *>(s.data());
+ const unsigned char * end = curr + s.size();
+ vespalib::string folded;
+ for (; curr < end;) {
+ uint32_t c_ucs4 = *curr;
+ if (c_ucs4 < 0x80) {
+ folded.append(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(*curr++));
+ } else {
+ c_ucs4 = Fast_UnicodeUtil::GetUTF8CharNonAscii(curr);
+ const char *repl = Fast_NormalizeWordFolder::ReplacementString(c_ucs4);
+ if (repl != nullptr) {
+ size_t repllen = strlen(repl);
+ folded.append(repl, repllen);
+ } else {
+ c_ucs4 = Fast_NormalizeWordFolder::lowercase_and_fold(c_ucs4);
+ char tmp[6];
+ const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4);
+ folded.append(tmp, tmp_end - tmp);
+ }
+ }
+ }
+ return folded;
+}
+
+vespalib::string
+lowercase(vespalib::stringref s) {
+ const auto * curr = reinterpret_cast<const unsigned char *>(s.data());
+ const unsigned char * end = curr + s.size();
+ vespalib::string folded;
+ for (; curr < end;) {
+ uint32_t c_ucs4 = *curr;
+ if (c_ucs4 < 0x80) {
+ folded.append(static_cast<char>(Fast_NormalizeWordFolder::lowercase_ascii(*curr++)));
+ } else {
+ c_ucs4 = Fast_NormalizeWordFolder::lowercase(Fast_UnicodeUtil::GetUTF8CharNonAscii(curr));
+ char tmp[6];
+ const char * tmp_end = Fast_UnicodeUtil::utf8cput(tmp, c_ucs4);
+ folded.append(tmp, tmp_end - tmp);
+ }
+ }
+ return folded;
+}
+
+vespalib::string
+optional_fold(vespalib::stringref s, Type type, Normalizing normalizing) {
+ switch ( requireFold(type, normalizing)) {
+ case Normalizing::NONE: return s;
+ case Normalizing::LOWERCASE: return lowercase(s);
+ case Normalizing::LOWERCASE_AND_FOLD: return fold(s);
+ }
+ return s;
+}
+
+}
+
+QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, stringref termS, const string & indexS,
+ Type type, Normalizing normalizing)
+ : QueryTermUCS4(optional_fold(termS, type, normalizing), type),
+ _index(indexS),
+ _encoding(0x01),
+ _result(org.release()),
+ _hitList(),
+ _weight(100),
+ _uniqueId(0),
+ _fieldInfo()
{
- if (!termS.empty()) {
+ if (!empty()) {
uint8_t enc(0xff);
- for (char c : termS) {
- enc &= _G_charTable.get(c);
+ for (char c : getTermString()) {
+ enc &= G_charTable.get(c);
}
_encoding = EncodingBitMap(enc);
}
@@ -75,8 +149,8 @@ QueryTerm::QueryTerm(std::unique_ptr<QueryNodeResultBase> org, const string & te
void QueryTerm::getPhrases(QueryNodeRefList & tl) { (void) tl; }
void QueryTerm::getPhrases(ConstQueryNodeRefList & tl) const { (void) tl; }
-void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); }
-void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); }
+void QueryTerm::getLeaves(QueryTermList & tl) { tl.push_back(this); }
+void QueryTerm::getLeaves(ConstQueryTermList & tl) const { tl.push_back(this); }
bool QueryTerm::evaluate() const { return !_hitList.empty(); }
void QueryTerm::reset() { _hitList.clear(); }
const HitList & QueryTerm::evaluateHits(HitList &) const { return _hitList; }
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 2d1156a9c51..743998a630e 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -31,9 +31,6 @@ public:
bool isFloat() const { return _enc & Float; }
bool isBase10Integer() const { return _enc & Base10Integer; }
bool isAscii7Bit() const { return _enc & Ascii7Bit; }
- void setBase10Integer(bool v) { if (v) _enc |= Base10Integer; else _enc &= ~Base10Integer; }
- void setAscii7Bit(bool v) { if (v) _enc |= Ascii7Bit; else _enc &= ~Ascii7Bit; }
- void setFloat(bool v) { if (v) _enc |= Float; else _enc &= ~Float; }
private:
enum { Ascii7Bit=0x01, Base10Integer=0x02, Float=0x04 };
uint8_t _enc;
@@ -54,7 +51,12 @@ public:
uint32_t _hitCount;
uint32_t _fieldLength;
};
- QueryTerm(std::unique_ptr<QueryNodeResultBase> resultBase, const string & term, const string & index, Type type);
+ QueryTerm(std::unique_ptr<QueryNodeResultBase> resultBase, stringref term, const string & index, Type type)
+ : QueryTerm(std::move(resultBase), term, index, type, (type == Type::EXACTSTRINGTERM)
+ ? Normalizing::LOWERCASE
+ : Normalizing::LOWERCASE_AND_FOLD)
+ {}
+ QueryTerm(std::unique_ptr<QueryNodeResultBase> resultBase, stringref term, const string & index, Type type, Normalizing normalizing);
QueryTerm(const QueryTerm &) = delete;
QueryTerm & operator = (const QueryTerm &) = delete;
QueryTerm(QueryTerm &&) = delete;
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 1ce285c2103..83b84fffa11 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -21,6 +21,7 @@ using namespace document;
using search::streaming::HitList;
using search::streaming::QueryNodeResultFactory;
using search::streaming::QueryTerm;
+using search::streaming::Normalizing;
using search::streaming::QueryTermList;
using TermType = QueryTerm::Type;
using namespace vsm;
@@ -56,11 +57,11 @@ public:
class Query
{
private:
- void setupQuery(const StringList & terms) {
+ void setupQuery(const StringList & terms, Normalizing normalizing) {
for (const auto & term : terms) {
ParsedQueryTerm pqt = parseQueryTerm(term);
ParsedTerm pt = parseTerm(pqt.second);
- qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second));
+ qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second, normalizing));
}
for (const auto & i : qtv) {
qtl.push_back(i.get());
@@ -72,7 +73,9 @@ public:
QueryNodeResultFactory eqnr;
std::vector<QueryTerm::UP> qtv;
QueryTermList qtl;
- explicit Query(const StringList & terms);
+
+ explicit Query(const StringList & terms) : Query(terms, Normalizing::LOWERCASE_AND_FOLD) {}
+ Query(const StringList & terms, Normalizing normalizing);
~Query();
static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) {
size_t i = queryTerm.find(':');
@@ -94,8 +97,8 @@ public:
}
};
-Query::Query(const StringList & terms) : eqnr(), qtv(), qtl() {
- setupQuery(terms);
+Query::Query(const StringList & terms, Normalizing normalizing) : eqnr(), qtv(), qtl() {
+ setupQuery(terms, normalizing);
}
Query::~Query() = default;
@@ -286,8 +289,8 @@ bool
assertMatchTermSuffix(const std::string & term, const std::string & word)
{
QueryNodeResultFactory eqnr;
- QueryTerm qa(eqnr.create(), term, "index", TermType::WORD);
- QueryTerm qb(eqnr.create(), word, "index", TermType::WORD);
+ QueryTerm qa(eqnr.create(), term, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD);
+ QueryTerm qb(eqnr.create(), word, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD);
const ucs4_t * a;
size_t alen = qa.term(a);
const ucs4_t * b;
@@ -308,7 +311,7 @@ assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & f
std::vector<QueryTerm::UP>
performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv)
{
- Query q(query);
+ Query q(query, fs.exact() ? Normalizing::LOWERCASE : Normalizing::LOWERCASE_AND_FOLD);
// prepare field searcher
test::MockFieldSearcherEnv env;
diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
index 36176f70d1d..38d0e942fbc 100644
--- a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
+++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
@@ -22,17 +22,23 @@ public:
class SearchMethodInfo {
public:
+ using Normalizing = search::streaming::Normalizing;
virtual ~SearchMethodInfo() = default;
virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0;
+ virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0;
};
class QueryTermDataFactory final : public search::streaming::QueryNodeResultFactory {
public:
+ using Normalizing = search::streaming::Normalizing;
QueryTermDataFactory(const SearchMethodInfo * searchMethodInfo) noexcept : _searchMethodInfo(searchMethodInfo) {}
std::unique_ptr<search::streaming::QueryNodeResultBase> create() const override {
return std::make_unique<QueryTermData>();
}
- bool getRewriteFloatTerms(vespalib::stringref index ) const noexcept override {
+ Normalizing normalizing_mode(vespalib::stringref index) const noexcept override {
+ return _searchMethodInfo ? _searchMethodInfo->normalizing_mode(index) : Normalizing::LOWERCASE_AND_FOLD;
+ }
+ bool allow_float_terms_rewrite(vespalib::stringref index ) const noexcept override {
return _searchMethodInfo && _searchMethodInfo->is_text_matching(index);
}
private:
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
index 49604135afc..4161adaf21f 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
@@ -326,6 +326,22 @@ SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept {
return false;
}
+SearchMethodInfo::Normalizing
+SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept {
+ StringFieldIdTMap fieldIdMap;
+ _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap);
+ size_t num_exact = 0;
+ for (const auto & fieldId : fieldIdMap.map()) {
+ auto found = _fieldSearchSpecMap.specMap().find(fieldId.second);
+ if ((found != _fieldSearchSpecMap.specMap().end()) && found->second.searcher().exact()) {
+ num_exact++;
+ }
+ }
+ return ((num_exact == 0) || (num_exact != fieldIdMap.map().size()))
+ ? Normalizing::LOWERCASE_AND_FOLD
+ : Normalizing::LOWERCASE;
+}
+
void
SearchVisitor::init(const Parameters & params)
{
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
index 709564bcf02..ce40b5ba742 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
@@ -487,6 +487,7 @@ private:
void setupAttributeVector(const vsm::FieldPath &fieldPath);
bool is_text_matching(vespalib::stringref index) const noexcept override;
+ Normalizing normalizing_mode(vespalib::stringref index) const noexcept override;
};
class SearchVisitorFactory : public storage::VisitorFactory {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index 43443bd9cf4..e64c41f814f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -77,7 +77,7 @@ private:
void onStructStart(const Content & c) override;
public:
- explicit IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {}
+ explicit IteratorHandler(FieldSearcher & searcher) noexcept : _searcher(searcher) {}
};
friend class IteratorHandler; // to allow calls to onValue();
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
index 997bed74787..dd6f31581a0 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
@@ -12,13 +12,16 @@ namespace vsm
class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase
{
protected:
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8ExactStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+ UTF8ExactStringFieldSearcher(FieldIdT fId)
+ : UTF8StringFieldSearcherBase(fId)
+ {
+ setMatchType(EXACT);
+ }
};
}
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
index 8d3ccad9900..97b4b5aabb7 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp
@@ -5,7 +5,7 @@
#include <cstring>
bool Fast_NormalizeWordFolder::_isInitialized = false;
-std::mutex _initMutex;
+
bool Fast_NormalizeWordFolder::_doAccentRemoval = false;
bool Fast_NormalizeWordFolder::_doSharpSSubstitution = false;
bool Fast_NormalizeWordFolder::_doLigatureSubstitution = false;
@@ -19,12 +19,19 @@ ucs4_t Fast_NormalizeWordFolder::_lowerCaseHighAscii[256];
ucs4_t Fast_NormalizeWordFolder::_kanaMap[192];
ucs4_t Fast_NormalizeWordFolder::_halfwidth_fullwidthMap[240];
+namespace {
+
+std::mutex G_initMutex;
+Fast_NormalizeWordFolder G_forceWorldFolderInit;
+}
+
+
void
Fast_NormalizeWordFolder::Setup(uint32_t flags)
{
// Only allow setting these when not initialized or initializing...
{
- std::lock_guard<std::mutex> initGuard(_initMutex);
+ std::lock_guard<std::mutex> initGuard(G_initMutex);
_doAccentRemoval = (DO_ACCENT_REMOVAL & flags) != 0;
_doSharpSSubstitution = (DO_SHARP_S_SUBSTITUTION & flags) != 0;
_doLigatureSubstitution = (DO_LIGATURE_SUBSTITUTION & flags) != 0;
@@ -39,7 +46,7 @@ Fast_NormalizeWordFolder::Initialize()
{
unsigned int i;
if (!_isInitialized) {
- std::lock_guard<std::mutex> initGuard(_initMutex);
+ std::lock_guard<std::mutex> initGuard(G_initMutex);
if (!_isInitialized) {
for (i = 0; i < 128; i++)
diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h
index 121a83e260d..5a77fe73e01 100644
--- a/vespalib/src/vespa/fastlib/text/normwordfolder.h
+++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h
@@ -35,8 +35,8 @@ public:
* added together.
*/
static void Setup(uint32_t flags);
- static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _lowerCase[c]; }
- static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _foldCase[c]; }
+ static ucs4_t lowercase_and_fold_ascii(ucs4_t c) noexcept { return _foldCase[c]; }
+ static ucs4_t lowercase_ascii(ucs4_t c) noexcept { return _lowerCase[c]; }
static bool is_wordchar_ascii7bit(ucs4_t c) noexcept { return _isWord[c]; }
static ucs4_t lowercase(ucs4_t c) {
if (c < 767)