summary refs log tree commit diff stats
path: root/streamingvisitors
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2024-01-15 10:23:18 +0100
committer GitHub <noreply@github.com> 2024-01-15 10:23:18 +0100
commit 29a807d35ac5d9e76ea1b8d653bb25b0e4e2dc73 (patch)
tree d55fddad443566300bd4a7fdd3ef1118a8460700 /streamingvisitors
parent 48b1bae2a6cdf58a237aa7be59632a06aba86861 (diff)
parent 252fbeed13b8622fbc813620dc3b4e45abc6bbe2 (diff)
Merge branch 'master' into balder/sliced-parallell-or
Diffstat (limited to 'streamingvisitors')
-rw-r--r-- streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp 2
-rw-r--r-- streamingvisitors/src/tests/searcher/searcher_test.cpp 135
-rw-r--r-- streamingvisitors/src/tests/textutil/textutil_test.cpp 3
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/querytermdata.h 21
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp 152
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/searchvisitor.h 14
-rw-r--r-- streamingvisitors/src/vespa/vsm/config/vsmfields.def 1
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt 1
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp 206
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h 92
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h 8
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp 8
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h 5
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h 6
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp 6
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h 9
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h 5
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp 21
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h 54
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp 9
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h 13
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h 12
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp 14
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h 6
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp 166
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h 24
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h 7
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp 15
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h 9
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp 16
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h 12
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp 167
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h 46
38 files changed, 629 insertions, 654 deletions
diff --git a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
index 2d138d1d336..93e35e4c6d2 100644
--- a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
+++ b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
@@ -40,7 +40,7 @@ protected:
RankProcessorTest::RankProcessorTest()
: testing::Test(),
- _factory(),
+ _factory(nullptr),
_query(),
_query_wrapper()
{
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 4492dfac02b..7f89071868a 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -15,12 +15,15 @@
#include <vespa/vsm/searcher/utf8substringsearcher.h>
#include <vespa/vsm/searcher/utf8substringsnippetmodifier.h>
#include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h>
+#include <vespa/vsm/searcher/tokenizereader.h>
#include <vespa/vsm/vsm/snippetmodifier.h>
using namespace document;
using search::streaming::HitList;
using search::streaming::QueryNodeResultFactory;
using search::streaming::QueryTerm;
+using search::streaming::Normalizing;
+using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
using search::streaming::QueryTermList;
using TermType = QueryTerm::Type;
using namespace vsm;
@@ -47,7 +50,7 @@ class String
private:
const std::string & _str;
public:
- String(const std::string & str) : _str(str) {}
+ explicit String(const std::string & str) : _str(str) {}
bool operator==(const String & rhs) const {
return _str == rhs._str;
}
@@ -56,14 +59,14 @@ public:
class Query
{
private:
- void setupQuery(const StringList & terms) {
- for (size_t i = 0; i < terms.size(); ++i) {
- ParsedQueryTerm pqt = parseQueryTerm(terms[i]);
+ void setupQuery(const StringList & terms, Normalizing normalizing) {
+ for (const auto & term : terms) {
+ ParsedQueryTerm pqt = parseQueryTerm(term);
ParsedTerm pt = parseTerm(pqt.second);
- qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second));
+ qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second, normalizing));
}
- for (size_t i = 0; i < qtv.size(); ++i) {
- qtl.push_back(qtv[i].get());
+ for (const auto & i : qtv) {
+ qtl.push_back(i.get());
}
}
public:
@@ -72,14 +75,16 @@ public:
QueryNodeResultFactory eqnr;
std::vector<QueryTerm::UP> qtv;
QueryTermList qtl;
- Query(const StringList & terms);
+
+ explicit Query(const StringList & terms) : Query(terms, Normalizing::LOWERCASE_AND_FOLD) {}
+ Query(const StringList & terms, Normalizing normalizing);
~Query();
static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) {
size_t i = queryTerm.find(':');
if (i != std::string::npos) {
- return ParsedQueryTerm(queryTerm.substr(0, i), queryTerm.substr(i + 1));
+ return {queryTerm.substr(0, i), queryTerm.substr(i + 1)};
}
- return ParsedQueryTerm(std::string(), queryTerm);
+ return {std::string(), queryTerm};
}
static ParsedTerm parseTerm(const std::string & term) {
if (term[0] == '*' && term[term.size() - 1] == '*') {
@@ -94,8 +99,8 @@ public:
}
};
-Query::Query(const StringList & terms) : eqnr(), qtv(), qtl() {
- setupQuery(terms);
+Query::Query(const StringList & terms, Normalizing normalizing) : eqnr(), qtv(), qtl() {
+ setupQuery(terms, normalizing);
}
Query::~Query() = default;
@@ -111,7 +116,7 @@ struct SnippetModifierSetup
SnippetModifierSetup::SnippetModifierSetup(const StringList & terms)
: query(terms),
- searcher(new UTF8SubstringSnippetModifier()),
+ searcher(new UTF8SubstringSnippetModifier(0)),
env(),
modifier(searcher)
{
@@ -254,8 +259,8 @@ getFieldValue(const StringList & fv)
static ArrayDataType type(*DataType::STRING);
ArrayFieldValue afv(type);
- for (size_t i = 0; i < fv.size(); ++i) {
- afv.add(StringFieldValue(fv[i]));
+ for (const auto & v : fv) {
+ afv.add(StringFieldValue(v));
}
return afv;
}
@@ -265,8 +270,8 @@ getFieldValue(const LongList & fv)
{
static ArrayDataType type(*DataType::LONG);
ArrayFieldValue afv(type);
- for (size_t i = 0; i < fv.size(); ++i) {
- afv.add(LongFieldValue(fv[i]));
+ for (long v : fv) {
+ afv.add(LongFieldValue(v));
}
return afv;
}
@@ -276,8 +281,8 @@ getFieldValue(const FloatList & fv)
{
static ArrayDataType type(*DataType::FLOAT);
ArrayFieldValue afv(type);
- for (size_t i = 0; i < fv.size(); ++i) {
- afv.add(FloatFieldValue(fv[i]));
+ for (float v : fv) {
+ afv.add(FloatFieldValue(v));
}
return afv;
}
@@ -286,8 +291,8 @@ bool
assertMatchTermSuffix(const std::string & term, const std::string & word)
{
QueryNodeResultFactory eqnr;
- QueryTerm qa(eqnr.create(), term, "index", TermType::WORD);
- QueryTerm qb(eqnr.create(), word, "index", TermType::WORD);
+ QueryTerm qa(eqnr.create(), term, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD);
+ QueryTerm qb(eqnr.create(), word, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD);
const ucs4_t * a;
size_t alen = qa.term(a);
const ucs4_t * b;
@@ -299,8 +304,8 @@ void
assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & fv, const BoolList & exp)
{
HitsList hl;
- for (size_t i = 0; i < exp.size(); ++i) {
- hl.push_back(exp[i] ? Hits().add(0) : Hits());
+ for (bool v : exp) {
+ hl.push_back(v ? Hits().add(0) : Hits());
}
assertSearch(fs, query, fv, hl);
}
@@ -308,7 +313,7 @@ assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & f
std::vector<QueryTerm::UP>
performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv)
{
- Query q(query);
+ Query q(query, fs.normalize_mode());
// prepare field searcher
test::MockFieldSearcherEnv env;
@@ -316,7 +321,7 @@ performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & f
// setup document
SharedFieldPathMap sfim(new FieldPathMapT());
- sfim->push_back(FieldPath());
+ sfim->emplace_back();
StorageDocument doc(std::make_unique<document::Document>(), sfim, 1);
doc.setField(0, document::FieldValue::UP(fv.clone()));
@@ -342,7 +347,7 @@ assertSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv
bool
assertFieldInfo(FieldSearcher & fs, const StringList & query,
- const FieldValue & fv, const FieldInfoList & exp)
+ const FieldValue & fv, const FieldInfoList & exp)
{
auto qtv = performSearch(fs, query, fv);
if (!EXPECT_EQUAL(qtv.size(), exp.size())) return false;
@@ -358,7 +363,7 @@ assertFieldInfo(FieldSearcher & fs, const StringList & query,
void
assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp)
{
- UTF8SubstringSnippetModifier mod;
+ UTF8SubstringSnippetModifier mod(0);
performSearch(mod, query, StringFieldValue(fv));
EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size());
std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos());
@@ -369,7 +374,7 @@ assertSnippetModifier(const StringList & query, const std::string & fv, const st
void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, const std::string & exp)
{
FieldValue::UP mfv = setup.modifier.modify(fv);
- const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(*mfv.get());
+ const auto & lfv = static_cast<const document::LiteralFieldValueB &>(*mfv.get());
const std::string & actual = lfv.getValue();
EXPECT_EQUAL(actual.size(), exp.size());
EXPECT_EQUAL(actual, exp);
@@ -377,11 +382,11 @@ void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv,
void assertQueryTerms(const SnippetModifierManager & man, FieldIdT fId, const StringList & terms)
{
- if (terms.size() == 0) {
- ASSERT_TRUE(man.getModifiers().getModifier(fId) == NULL);
+ if (terms.empty()) {
+ ASSERT_TRUE(man.getModifiers().getModifier(fId) == nullptr);
return;
}
- ASSERT_TRUE(man.getModifiers().getModifier(fId) != NULL);
+ ASSERT_TRUE(man.getModifiers().getModifier(fId) != nullptr);
UTF8SubstringSnippetModifier * searcher =
(static_cast<SnippetModifier *>(man.getModifiers().getModifier(fId)))->getSearcher().get();
EXPECT_EQUAL(searcher->getQueryTerms().size(), terms.size());
@@ -437,11 +442,11 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits()));
assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
- fs.setMatchType(FieldSearcher::PREFIX);
+ fs.match_type(FieldSearcher::PREFIX);
assertString(fs, "oper", field, Hits().add(0).add(2));
assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits()));
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false;
{ // test handling of several underscores
@@ -466,7 +471,7 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
TEST("verify correct term parsing") {
ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index");
ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term");
- ASSERT_TRUE(Query::parseQueryTerm("term").first == "");
+ ASSERT_TRUE(Query::parseQueryTerm("term").first.empty());
ASSERT_TRUE(Query::parseQueryTerm("term").second == "term");
ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr");
ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM);
@@ -550,12 +555,12 @@ TEST("utf8 substring search with empty term")
TEST("utf8 suffix search") {
UTF8SuffixStringFieldSearcher fs(0);
std::string field = "operators and operator overloading";
- assertString(fs, "rsand", field, Hits());
- assertString(fs, "tor", field, Hits().add(2));
- assertString(fs, "tors", field, Hits().add(0));
+ TEST_DO(assertString(fs, "rsand", field, Hits()));
+ TEST_DO(assertString(fs, "tor", field, Hits().add(2)));
+ TEST_DO(assertString(fs, "tors", field, Hits().add(0)));
- assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()));
- assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
+ TEST_DO(assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())));
+ TEST_DO(assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))));
EXPECT_TRUE(testStringFieldInfo(fs));
}
@@ -587,22 +592,22 @@ TEST("utf8 flexible searcher"){
// prefix
assertString(fs, "vesp*", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::PREFIX);
+ fs.match_type(FieldSearcher::PREFIX);
assertString(fs, "vesp", "vespa", Hits().add(0));
// substring
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
assertString(fs, "*esp*", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::SUBSTRING);
+ fs.match_type(FieldSearcher::SUBSTRING);
assertString(fs, "esp", "vespa", Hits().add(0));
// suffix
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
assertString(fs, "*espa", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::SUFFIX);
+ fs.match_type(FieldSearcher::SUFFIX);
assertString(fs, "espa", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
EXPECT_TRUE(testStringFieldInfo(fs));
}
@@ -656,7 +661,7 @@ TEST("integer search")
TEST("floating point search")
{
- FloatFieldSearcher fs;
+ FloatFieldSearcher fs(0);
TEST_DO(assertFloat(fs, "10", 10, true));
TEST_DO(assertFloat(fs, "10.5", 10.5, true));
TEST_DO(assertFloat(fs, "-10.5", -10.5, true));
@@ -723,7 +728,7 @@ TEST("Snippet modifier search") {
"\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8");
{ // check that resizing works
- UTF8SubstringSnippetModifier mod;
+ UTF8SubstringSnippetModifier mod(0);
EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u);
EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u);
performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa"));
@@ -760,28 +765,32 @@ TEST("snippet modifier") {
}
}
-TEST("FieldSearchSpec constrution") {
+TEST("FieldSearchSpec construction") {
{
FieldSearchSpec f;
EXPECT_FALSE(f.valid());
EXPECT_EQUAL(0u, f.id());
EXPECT_EQUAL("", f.name());
EXPECT_EQUAL(0x100000u, f.maxLength());
+ EXPECT_EQUAL("", f.arg1());
+ EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode());
}
{
- FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789);
+ FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789);
EXPECT_TRUE(f.valid());
EXPECT_EQUAL(7u, f.id());
EXPECT_EQUAL("f0", f.name());
EXPECT_EQUAL(789u, f.maxLength());
EXPECT_EQUAL(789u, f.searcher().maxFieldLength());
+ EXPECT_EQUAL("substring", f.arg1());
+ EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode());
}
}
TEST("snippet modifier manager") {
FieldSearchSpecMapT specMap;
- specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000);
- specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000);
+ specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000);
+ specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000);
IndexFieldMapT indexMap;
indexMap["i0"].push_back(0);
indexMap["i1"].push_back(1);
@@ -822,13 +831,13 @@ TEST("snippet modifier manager") {
Query query(StringList().add("i2:foo").add("i2:*bar*"));
man.setup(query.qtl, specMap, indexMap, *env.field_paths, env.query_env);
{
- SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0));
+ auto * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0));
UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get();
EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u);
EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u);
}
{
- SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1));
+ auto * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1));
UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get();
EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u);
EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u);
@@ -863,4 +872,24 @@ TEST("counting of words") {
assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits()));
}
+vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization";
+
+void
+verifyNormalization(Normalizing normalizing, size_t expected_len, const char * expected) {
+ ucs4_t buf[256];
+ TokenizeReader reader(reinterpret_cast<const search::byte *>(NormalizationInput.c_str()), NormalizationInput.size(), buf);
+ while (reader.hasNext()) {
+ reader.normalize(reader.next(), normalizing);
+ }
+ size_t len = reader.complete();
+ EXPECT_EQUAL(expected_len, len);
+ EXPECT_EQUAL(0, Fast_UnicodeUtil::utf8cmp(expected, buf));
+}
+
+TEST("test normalizing") {
+ verifyNormalization(Normalizing::NONE, 52, NormalizationInput.c_str());
+ verifyNormalization(Normalizing::LOWERCASE, 52, "test that somehing happens with during nårmølization");
+ verifyNormalization(Normalizing::LOWERCASE_AND_FOLD, 54, "test that somehing happens with during naarmoelization");
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/streamingvisitors/src/tests/textutil/textutil_test.cpp b/streamingvisitors/src/tests/textutil/textutil_test.cpp
index b926444e4df..f7f340a2182 100644
--- a/streamingvisitors/src/tests/textutil/textutil_test.cpp
+++ b/streamingvisitors/src/tests/textutil/textutil_test.cpp
@@ -2,7 +2,6 @@
#include <vespa/vespalib/testkit/testapp.h>
#include <vespa/fastlib/text/normwordfolder.h>
-#include <vespa/searchlib/query/base.h>
#include <vespa/vsm/searcher/fold.h>
#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h>
#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
@@ -66,7 +65,7 @@ TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V &
const byte * srcbuf = reinterpret_cast<const byte *>(input);
auto dstbuf = std::make_unique<ucs4_t[]>(len + 1);
auto offsets = std::make_unique<size_t[]>(len + 1);
- UTF8StrChrFieldSearcher fs;
+ UTF8StrChrFieldSearcher fs(0);
BW bw(dstbuf.get(), offsets.get());
size_t dstlen = fs.skipSeparators(srcbuf, len, bw);
EXPECT_EQUAL(dstlen, expdstbuf.size());
diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
index 8c1c3771917..38d0e942fbc 100644
--- a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
+++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
@@ -17,15 +17,32 @@ private:
search::fef::SimpleTermData _termData;
public:
QueryTermData * clone() const override { return new QueryTermData(); }
- search::fef::SimpleTermData &getTermData() { return _termData; }
+ search::fef::SimpleTermData &getTermData() noexcept { return _termData; }
+};
+
+class SearchMethodInfo {
+public:
+ using Normalizing = search::streaming::Normalizing;
+ virtual ~SearchMethodInfo() = default;
+ virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0;
+ virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0;
};
class QueryTermDataFactory final : public search::streaming::QueryNodeResultFactory {
public:
+ using Normalizing = search::streaming::Normalizing;
+ QueryTermDataFactory(const SearchMethodInfo * searchMethodInfo) noexcept : _searchMethodInfo(searchMethodInfo) {}
std::unique_ptr<search::streaming::QueryNodeResultBase> create() const override {
return std::make_unique<QueryTermData>();
}
- bool getRewriteFloatTerms() const override { return true; }
+ Normalizing normalizing_mode(vespalib::stringref index) const noexcept override {
+ return _searchMethodInfo ? _searchMethodInfo->normalizing_mode(index) : Normalizing::LOWERCASE_AND_FOLD;
+ }
+ bool allow_float_terms_rewrite(vespalib::stringref index ) const noexcept override {
+ return _searchMethodInfo && _searchMethodInfo->is_text_matching(index);
+ }
+private:
+ const SearchMethodInfo * _searchMethodInfo;
};
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
index 4d31c71c0a0..cdd1a018d84 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
@@ -42,6 +42,7 @@ using search::aggregation::HitsAggregationResult;
using search::attribute::IAttributeVector;
using search::expression::ConfigureStaticParams;
using search::streaming::Query;
+using search::streaming::Normalizing;
using search::streaming::QueryTermList;
using storage::StorageComponent;
using storage::VisitorEnvironment;
@@ -91,7 +92,7 @@ ForceWordfolderInit::ForceWordfolderInit()
Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION);
}
-static ForceWordfolderInit _G_forceNormWordFolderInit;
+static ForceWordfolderInit G_forceNormWordFolderInit;
// Leftovers from FS4 protocol with limited use here.
enum queryflags {
@@ -238,14 +239,16 @@ SearchVisitor::SummaryGenerator::fillSummary(AttributeVector::DocId lid, const H
return {};
}
-void SearchVisitor::HitsResultPreparator::execute(vespalib::Identifiable & obj)
+void
+SearchVisitor::HitsResultPreparator::execute(vespalib::Identifiable & obj)
{
auto & hitsAggr(static_cast<HitsAggregationResult &>(obj));
hitsAggr.setSummaryGenerator(_summaryGenerator);
_numHitsAggregators++;
}
-bool SearchVisitor::HitsResultPreparator::check(const vespalib::Identifiable & obj) const
+bool
+SearchVisitor::HitsResultPreparator::check(const vespalib::Identifiable & obj) const
{
return obj.getClass().inherits(HitsAggregationResult::classId);
}
@@ -259,7 +262,8 @@ SearchVisitor::GroupingEntry::GroupingEntry(Grouping * grouping) :
SearchVisitor::GroupingEntry::~GroupingEntry() = default;
-void SearchVisitor::GroupingEntry::aggregate(const document::Document & doc, search::HitRank rank)
+void
+SearchVisitor::GroupingEntry::aggregate(const document::Document & doc, search::HitRank rank)
{
if (_count < _limit) {
_grouping->aggregate(doc, rank);
@@ -310,7 +314,58 @@ SearchVisitor::SearchVisitor(StorageComponent& component,
LOG(debug, "Created SearchVisitor");
}
-void SearchVisitor::init(const Parameters & params)
+bool
+SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept {
+ StringFieldIdTMap fieldIdMap;
+ _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap);
+ for (const auto & fieldId : fieldIdMap.map()) {
+ auto found = _fieldSearchSpecMap.specMap().find(fieldId.second);
+ if ((found != _fieldSearchSpecMap.specMap().end()) && found->second.uses_string_search_method()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+
+uint32_t
+count_normalize_lowercase(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) {
+ size_t count = 0;
+ for (const auto & fieldId : fieldIdMap.map()) {
+ auto found = specMap.find(fieldId.second);
+ if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::LOWERCASE) {
+ count++;
+ }
+ }
+ return count;
+}
+
+uint32_t
+count_normalize_none(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) {
+ size_t count = 0;
+ for (const auto & fieldId : fieldIdMap.map()) {
+ auto found = specMap.find(fieldId.second);
+ if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::NONE) {
+ count++;
+ }
+ }
+ return count;
+}
+
+}
+
+SearchMethodInfo::Normalizing
+SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept {
+ StringFieldIdTMap fieldIdMap;
+ _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap);
+ if (count_normalize_none(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::NONE;
+ if (count_normalize_lowercase(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::LOWERCASE;
+ return Normalizing::LOWERCASE_AND_FOLD;
+}
+
+void
+SearchVisitor::init(const Parameters & params)
{
VISITOR_TRACE(6, "About to lazily init VSM adapter");
_attrMan.add(_documentIdAttributeBacking);
@@ -397,7 +452,14 @@ void SearchVisitor::init(const Parameters & params)
if ( params.lookup("query", queryBlob) ) {
LOG(spam, "Received query blob of %zu bytes", queryBlob.size());
VISITOR_TRACE(9, vespalib::make_string("Setting up for query blob of %zu bytes", queryBlob.size()));
- QueryTermDataFactory addOnFactory;
+ // Create mapping from field name to field id, from field id to search spec,
+ // and from index name to list of field ids
+ _fieldSearchSpecMap.buildFromConfig(_env->get_vsm_fields_config());
+ auto additionalFields = registerAdditionalFields(_env->get_docsum_tools()->getFieldSpecs());
+ // Add extra elements to mapping from field name to field id
+ _fieldSearchSpecMap.buildFromConfig(additionalFields);
+
+ QueryTermDataFactory addOnFactory(this);
_query = Query(addOnFactory, vespalib::stringref(queryBlob.data(), queryBlob.size()));
_searchBuffer->reserve(0x10000);
@@ -408,19 +470,11 @@ void SearchVisitor::init(const Parameters & params)
LOG(warning, "Request without query stack count");
}
- std::vector<vespalib::string> additionalFields;
- registerAdditionalFields(_env->get_docsum_tools()->getFieldSpecs(), additionalFields);
-
- StringFieldIdTMap fieldsInQuery;
- setupFieldSearchers(additionalFields, fieldsInQuery);
-
-
+ StringFieldIdTMap fieldsInQuery = setupFieldSearchers();
setupScratchDocument(fieldsInQuery);
-
_syntheticFieldsController.setup(_fieldSearchSpecMap.nameIdMap(), fieldsInQuery);
setupAttributeVectors();
-
setupAttributeVectorsForSorting(_sortSpec);
_rankController.setRankManagerSnapshot(_env->get_rank_manager_snapshot());
@@ -436,7 +490,6 @@ void SearchVisitor::init(const Parameters & params)
// This depends on _fieldPathMap (from setupScratchDocument),
// and IQueryEnvironment (from setupRankProcessors).
prepare_field_searchers();
-
} else {
LOG(warning, "No query received");
}
@@ -529,10 +582,7 @@ SearchVisitor::PositionInserter::PositionInserter(AttributeVector & attribute, A
SearchVisitor::PositionInserter::~PositionInserter() = default;
void
-SearchVisitor::PositionInserter::onPrimitive(uint32_t, const Content & c)
-{
- (void) c;
-}
+SearchVisitor::PositionInserter::onPrimitive(uint32_t, const Content &) { }
void
SearchVisitor::PositionInserter::onStructStart(const Content & c)
@@ -605,7 +655,6 @@ SearchVisitor::RankController::setupRankProcessors(Query & query,
{
_rankSetup = &_rankManagerSnapshot->getRankSetup(_rankProfile);
_rankProcessor = std::make_unique<RankProcessor>(_rankManagerSnapshot, _rankProfile, query, location, _queryProperties, &attrMan);
- LOG(debug, "Initialize rank processor");
_rankProcessor->initForRanking(wantedHitCount);
// register attribute vectors needed for ranking
processAccessedAttributes(_rankProcessor->get_real_query_env(), true, attrMan, attributeFields);
@@ -637,8 +686,7 @@ SearchVisitor::RankController::rankMatchedDocument(uint32_t docId)
{
_rankProcessor->runRankProgram(docId);
LOG(debug, "Rank score for matched document %u: %f",
- docId,
- _rankProcessor->getRankScore());
+ docId, _rankProcessor->getRankScore());
if (_dumpFeatures) {
_dumpProcessor->runRankProgram(docId);
// we must transfer the score to this match data to make sure that the same hits
@@ -718,9 +766,8 @@ SearchVisitor::SyntheticFieldsController::setup(const StringFieldIdTMap & fieldR
}
void
-SearchVisitor::SyntheticFieldsController::onDocument(StorageDocument & document)
+SearchVisitor::SyntheticFieldsController::onDocument(StorageDocument &)
{
- (void) document;
}
void
@@ -730,10 +777,10 @@ SearchVisitor::SyntheticFieldsController::onDocumentMatch(StorageDocument & docu
document.setField(_documentIdFId, std::make_unique<document::StringFieldValue>(documentId));
}
-void
-SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec,
- std::vector<vespalib::string> & fieldList)
+std::vector<vespalib::string>
+SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec)
{
+ std::vector<vespalib::string> fieldList;
for (const vsm::DocsumTools::FieldSpec & spec : docsumSpec) {
fieldList.push_back(spec.getOutputName());
const std::vector<vespalib::string> & inputNames = spec.getInputNames();
@@ -748,25 +795,20 @@ SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::Fiel
fieldList.emplace_back("[docid]");
fieldList.emplace_back("[rank]");
fieldList.emplace_back("documentid");
+ return fieldList;
}
-void
-SearchVisitor::setupFieldSearchers(const std::vector<vespalib::string> & additionalFields,
- StringFieldIdTMap & fieldsInQuery)
+StringFieldIdTMap
+SearchVisitor::setupFieldSearchers()
{
- // Create mapping from field name to field id, from field id to search spec,
- // and from index name to list of field ids
- _fieldSearchSpecMap.buildFromConfig(_env->get_vsm_fields_config());
- // Add extra elements to mapping from field name to field id
- _fieldSearchSpecMap.buildFromConfig(additionalFields);
-
// Reconfig field searchers based on the query
_fieldSearchSpecMap.reconfigFromQuery(_query);
// Map field name to field id for all fields in the query
- _fieldSearchSpecMap.buildFieldsInQuery(_query, fieldsInQuery);
+ StringFieldIdTMap fieldsInQuery = _fieldSearchSpecMap.buildFieldsInQuery(_query);
// Connect field names in the query to field searchers
_fieldSearchSpecMap.buildSearcherMap(fieldsInQuery.map(), _fieldSearcherMap);
+ return fieldsInQuery;
}
void
@@ -947,8 +989,7 @@ class SingleDocumentStore : public vsm::IDocSumCache
{
public:
explicit SingleDocumentStore(const StorageDocument & doc) : _doc(doc) { }
- const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override {
- (void) docId;
+ const vsm::Document & getDocSum(const search::DocumentIdT &) const override {
return _doc;
}
private:
@@ -959,19 +1000,12 @@ bool
SearchVisitor::compatibleDocumentTypes(const document::DocumentType& typeA,
const document::DocumentType& typeB)
{
- if (&typeA == &typeB) {
- return true;
- } else {
- return (typeA.getName() == typeB.getName());
- }
+ return (&typeA == &typeB) || (typeA.getName() == typeB.getName());
}
void
-SearchVisitor::handleDocuments(const document::BucketId&,
- DocEntryList & entries,
- HitCounter& hitCounter)
+SearchVisitor::handleDocuments(const document::BucketId&, DocEntryList & entries, HitCounter& )
{
- (void) hitCounter;
if (!_init_called) {
init(_params);
}
@@ -1016,37 +1050,25 @@ SearchVisitor::handleDocument(StorageDocument & document)
RankProcessor & rp = *_rankController.getRankProcessor();
vespalib::string documentId(document.docDoc().getId().getScheme().toString());
LOG(debug, "Matched document with id '%s'", documentId.c_str());
-
document.setDocId(rp.getDocId());
-
fillAttributeVectors(documentId, document);
-
_rankController.rankMatchedDocument(rp.getDocId());
-
if (_shouldFillRankAttribute) {
_rankAttribute.add(rp.getRankScore());
}
-
if (_rankController.keepMatchedDocument()) {
-
bool amongTheBest = _rankController.collectMatchedDocument(!_sortList.empty(), *this, _tmpSortBuffer, &document);
-
_syntheticFieldsController.onDocumentMatch(document, documentId);
-
SingleDocumentStore single(document);
_summaryGenerator.setDocsumCache(single);
group(document.docDoc(), rp.getRankScore(), false);
-
if (amongTheBest) {
needToKeepDocument = true;
}
-
} else {
_hitsRejectedCount++;
LOG(debug, "Do not keep document with id '%s' because rank score (%f) <= rank score drop limit (%f)",
- documentId.c_str(),
- rp.getRankScore(),
- _rankController.getRankSetup()->getRankScoreDropLimit());
+ documentId.c_str(), rp.getRankScore(), _rankController.getRankSetup()->getRankScoreDropLimit());
}
} else {
LOG(debug, "Did not match document with id '%s'", document.docDoc().getId().getScheme().toString().c_str());
@@ -1145,7 +1167,8 @@ SearchVisitor::fillSortBuffer()
return pos;
}
-void SearchVisitor::completedBucket(const document::BucketId&, HitCounter&)
+void
+SearchVisitor::completedBucket(const document::BucketId&, HitCounter&)
{
LOG(debug, "Completed bucket");
}
@@ -1157,7 +1180,8 @@ SearchVisitor::generate_query_result(HitCounter& counter)
return std::move(_queryResult);
}
-void SearchVisitor::completedVisitingInternal(HitCounter& hitCounter)
+void
+SearchVisitor::completedVisitingInternal(HitCounter& hitCounter)
{
if (!_init_called) {
init(_params);
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
index ef7a41f23a5..ce40b5ba742 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
@@ -8,6 +8,7 @@
#include "rankmanager.h"
#include "rankprocessor.h"
#include "searchenvironment.h"
+#include "querytermdata.h"
#include <vespa/vsm/common/docsum.h>
#include <vespa/vsm/common/documenttypemapping.h>
#include <vespa/vsm/common/storagedocument.h>
@@ -42,7 +43,8 @@ class SearchEnvironmentSnapshot;
* @brief Visitor that applies a search query to visitor data and
* converts them to a QueryResultCommand.
**/
-class SearchVisitor : public storage::Visitor {
+class SearchVisitor : public storage::Visitor,
+ public SearchMethodInfo {
public:
SearchVisitor(storage::StorageComponent&, storage::VisitorEnvironment& vEnv,
const vdslib::Parameters & params);
@@ -253,19 +255,15 @@ private:
* @param docsumSpec config with the field names used by the docsum setup.
- * @param fieldList list of field names that are built.
+ * @return list of field names that are built.
**/
- static void registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec,
- std::vector<vespalib::string> & fieldList);
+ static std::vector<vespalib::string> registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec);
/**
* Setup the field searchers used when matching the query with the stream of documents.
* This includes setting up various mappings in FieldSearchSpecMap and building mapping
* for fields used by the query.
*
- * @param additionalFields list of additional field names used when setting up the mappings.
- * @param fieldsInQuery mapping from field name to field id that are built based on the query.
+ * @return mapping from field name to field id, built based on the query.
**/
- void setupFieldSearchers(const std::vector<vespalib::string> & additionalFields,
- vsm::StringFieldIdTMap & fieldsInQuery);
+ vsm::StringFieldIdTMap setupFieldSearchers();
/**
* Prepare the field searchers for the given query.
@@ -488,6 +486,8 @@ private:
vsm::StringFieldIdTMapT _fieldsUnion;
void setupAttributeVector(const vsm::FieldPath &fieldPath);
+ bool is_text_matching(vespalib::stringref index) const noexcept override;
+ Normalizing normalizing_mode(vespalib::stringref index) const noexcept override;
};
class SearchVisitorFactory : public storage::VisitorFactory {
diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
index 442a044d38f..dac732013d2 100644
--- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def
+++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
@@ -14,6 +14,7 @@ fieldspec[].name string
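-## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected.
+## The search method for a given field. Note: the same field in two different document types must have the same type; otherwise the result is unpredictable.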
fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8
fieldspec[].arg1 string default=""
+fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD
## Maximum number of chars to search per field.
fieldspec[].maxlength int default=1048576
diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
index 1a9238346b0..40aad418b22 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
@@ -17,6 +17,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT
intfieldsearcher.cpp
nearest_neighbor_field_searcher.cpp
strchrfieldsearcher.cpp
+ tokenizereader.cpp
utf8flexiblestringfieldsearcher.cpp
utf8strchrfieldsearcher.cpp
utf8stringfieldsearcherbase.cpp
diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
index c7e7d2e74bd..3708cca85fb 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
@@ -9,8 +9,8 @@ class BoolFieldSearcher : public FieldSearcher
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- BoolFieldSearcher(FieldIdT fId);
- ~BoolFieldSearcher();
+ explicit BoolFieldSearcher(FieldIdT fId);
+ ~BoolFieldSearcher() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
index c797e6751ee..5e06ae41a03 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -23,79 +23,54 @@ class force
force() { FieldSearcher::init(); }
};
-static force __forceInit;
+static force ForceInit;
byte FieldSearcher::_foldLowCase[256];
byte FieldSearcher::_wordChar[256];
-FieldSearcherBase::FieldSearcherBase() :
- _qtl(),
- _qtlFastBuffer(),
- _qtlFastSize(0),
- _qtlFast(nullptr)
+FieldSearcherBase::FieldSearcherBase() noexcept
+ : _qtl()
{
}
-FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) :
- _qtl(),
- _qtlFastBuffer(),
- _qtlFastSize(0),
- _qtlFast(nullptr)
+FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org)
+ : _qtl()
{
prepare(org._qtl);
}
-FieldSearcherBase::~FieldSearcherBase()
-{
-}
-
-FieldSearcherBase & FieldSearcherBase::operator = (const FieldSearcherBase & org)
-{
- if (this != &org) {
- prepare(org._qtl);
- }
- return *this;
-}
+FieldSearcherBase::~FieldSearcherBase() = default;
-void FieldSearcherBase::prepare(const QueryTermList & qtl)
+void
+FieldSearcherBase::prepare(const QueryTermList & qtl)
{
_qtl = qtl;
- _qtlFastBuffer.resize(sizeof(*_qtlFast)*(_qtl.size()+1), 0x13);
- _qtlFast = reinterpret_cast<v16qi *>(reinterpret_cast<unsigned long>(&_qtlFastBuffer[0]+15) & ~0xf);
- _qtlFastSize = 0;
- for (auto qt : _qtl) {
- memcpy(&_qtlFast[_qtlFastSize++], qt->getTerm(), std::min(size_t(16), qt->termLen()));
- }
}
-FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) :
- FieldSearcherBase(),
- _field(fId),
- _matchType(defaultPrefix ? PREFIX : REGULAR),
- _maxFieldLength(0x100000),
- _currentElementId(0),
- _currentElementWeight(1),
- _pureUsAsciiCount(0),
- _pureUsAsciiFieldCount(0),
- _anyUtf8Count(0),
- _anyUtf8FieldCount(0),
- _words(0),
- _badUtf8Count(0),
- _zeroCount(0)
+FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept
+ : FieldSearcherBase(),
+ _field(fId),
+ _matchType(defaultPrefix ? PREFIX : REGULAR),
+ _normalize_mode(Normalizing::LOWERCASE_AND_FOLD),
+ _maxFieldLength(0x100000),
+ _currentElementId(0),
+ _currentElementWeight(1),
+ _words(0),
+ _badUtf8Count(0)
{
- zeroStat();
}
FieldSearcher::~FieldSearcher() = default;
-bool FieldSearcher::search(const StorageDocument & doc)
+bool
+FieldSearcher::search(const StorageDocument & doc)
{
for (auto qt : _qtl) {
QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field());
fInfo.setHitOffset(qt->getHitList().size());
}
onSearch(doc);
- for(auto qt : _qtl) {
+ for (auto qt : _qtl) {
QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field());
fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset());
fInfo.setFieldLength(_words);
@@ -104,16 +79,16 @@ bool FieldSearcher::search(const StorageDocument & doc)
return true;
}
-void FieldSearcher::prepare(QueryTermList& qtl,
- const SharedSearcherBuf&,
- const vsm::FieldPathMapT&,
- search::fef::IQueryEnvironment&)
+void
+FieldSearcher::prepare(QueryTermList& qtl, const SharedSearcherBuf&,
+ const vsm::FieldPathMapT&, search::fef::IQueryEnvironment&)
{
FieldSearcherBase::prepare(qtl);
prepareFieldId();
}
-size_t FieldSearcher::countWords(const FieldRef & f)
+size_t
+FieldSearcher::countWords(const FieldRef & f)
{
size_t words = 0;
const char * n = f.data();
@@ -129,36 +104,16 @@ size_t FieldSearcher::countWords(const FieldRef & f)
return words;
}
-void FieldSearcher::prepareFieldId()
+void
+FieldSearcher::prepareFieldId()
{
for(auto qt : _qtl) {
qt->resizeFieldId(field());
}
}
-void FieldSearcher::addStat(const FieldSearcher & toAdd)
-{
- _pureUsAsciiCount += toAdd._pureUsAsciiCount;
- _pureUsAsciiFieldCount += toAdd._pureUsAsciiFieldCount;
- _anyUtf8Count += toAdd._anyUtf8Count;
- _anyUtf8FieldCount += toAdd._anyUtf8FieldCount;
- _badUtf8Count += toAdd._badUtf8Count;
- _zeroCount += toAdd._zeroCount;
- for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] += toAdd._utf8Count[i]; }
-}
-
-void FieldSearcher::zeroStat()
-{
- _pureUsAsciiCount = 0;
- _pureUsAsciiFieldCount = 0;
- _anyUtf8Count = 0;
- _anyUtf8FieldCount = 0;
- _badUtf8Count = 0;
- _zeroCount = 0;
- for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] = 0; }
-}
-
-void FieldSearcher::init()
+void
+FieldSearcher::init()
{
for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) {
_foldLowCase[i] = 0;
@@ -182,50 +137,59 @@ void FieldSearcher::init()
_wordChar[0xd7] = 0;
_wordChar[0xf7] = 0;
- if (1) /* _doAccentRemoval */ {
- _foldLowCase[0xc0] = 'a';
- _foldLowCase[0xc1] = 'a';
- _foldLowCase[0xc2] = 'a';
- _foldLowCase[0xc3] = 'a'; // A tilde
- _foldLowCase[0xc7] = 'c';
- _foldLowCase[0xc8] = 'e';
- _foldLowCase[0xc9] = 'e';
- _foldLowCase[0xca] = 'e';
- _foldLowCase[0xcb] = 'e';
- _foldLowCase[0xcc] = 'i'; // I grave
- _foldLowCase[0xcd] = 'i';
- _foldLowCase[0xce] = 'i';
- _foldLowCase[0xcf] = 'i';
- _foldLowCase[0xd3] = 'o';
- _foldLowCase[0xd4] = 'o';
- _foldLowCase[0xda] = 'u';
- _foldLowCase[0xdb] = 'u';
-
- _foldLowCase[0xe0] = 'a';
- _foldLowCase[0xe1] = 'a';
- _foldLowCase[0xe2] = 'a';
- _foldLowCase[0xe3] = 'a'; // a tilde
- _foldLowCase[0xe7] = 'c';
- _foldLowCase[0xe8] = 'e';
- _foldLowCase[0xe9] = 'e';
- _foldLowCase[0xea] = 'e';
- _foldLowCase[0xeb] = 'e';
- _foldLowCase[0xec] = 'i'; // i grave
- _foldLowCase[0xed] = 'i';
- _foldLowCase[0xee] = 'i';
- _foldLowCase[0xef] = 'i';
- _foldLowCase[0xf3] = 'o';
- _foldLowCase[0xf4] = 'o';
- _foldLowCase[0xfa] = 'u';
- _foldLowCase[0xfb] = 'u';
- }
+ _foldLowCase[0xc0] = 'a';
+ _foldLowCase[0xc1] = 'a';
+ _foldLowCase[0xc2] = 'a';
+ _foldLowCase[0xc3] = 'a';
+ _foldLowCase[0xc7] = 'c';
+ _foldLowCase[0xc8] = 'e';
+ _foldLowCase[0xc9] = 'e';
+ _foldLowCase[0xca] = 'e';
+ _foldLowCase[0xcb] = 'e';
+ _foldLowCase[0xcc] = 'i';
+ _foldLowCase[0xcd] = 'i';
+ _foldLowCase[0xce] = 'i';
+ _foldLowCase[0xcf] = 'i';
+ _foldLowCase[0xd1] = 'n';
+ _foldLowCase[0xd2] = 'o';
+ _foldLowCase[0xd3] = 'o';
+ _foldLowCase[0xd4] = 'o';
+ _foldLowCase[0xd5] = 'o';
+ _foldLowCase[0xd9] = 'u';
+ _foldLowCase[0xda] = 'u';
+ _foldLowCase[0xdb] = 'u';
+ _foldLowCase[0xdc] = 'u';
+ _foldLowCase[0xdd] = 'y';
+ _foldLowCase[0xe0] = 'a';
+ _foldLowCase[0xe1] = 'a';
+ _foldLowCase[0xe2] = 'a';
+ _foldLowCase[0xe3] = 'a';
+ _foldLowCase[0xe7] = 'c';
+ _foldLowCase[0xe8] = 'e';
+ _foldLowCase[0xe9] = 'e';
+ _foldLowCase[0xea] = 'e';
+ _foldLowCase[0xeb] = 'e';
+ _foldLowCase[0xec] = 'i';
+ _foldLowCase[0xed] = 'i';
+ _foldLowCase[0xee] = 'i';
+ _foldLowCase[0xef] = 'i';
+ _foldLowCase[0xf1] = 'n';
+ _foldLowCase[0xf2] = 'o';
+ _foldLowCase[0xf3] = 'o';
+ _foldLowCase[0xf4] = 'o';
+ _foldLowCase[0xf5] = 'o';
+ _foldLowCase[0xf9] = 'u';
+ _foldLowCase[0xfa] = 'u';
+ _foldLowCase[0xfb] = 'u';
+ _foldLowCase[0xfc] = 'u';
+ _foldLowCase[0xfd] = 'y';
+ _foldLowCase[0xff] = 'y';
}
-void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm,
- const SharedSearcherBuf& searcherBuf,
- Query& query,
- const vsm::FieldPathMapT& field_paths,
- search::fef::IQueryEnvironment& query_env)
+void
+FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf,
+ Query& query, const vsm::FieldPathMapT& field_paths,
+ search::fef::IQueryEnvironment& query_env)
{
QueryTermList qtl;
query.getLeaves(qtl);
@@ -269,7 +233,8 @@ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm,
LOG(debug, "Will search in %s", tmp.c_str());
}
-bool FieldSearcher::onSearch(const StorageDocument & doc)
+bool
+FieldSearcher::onSearch(const StorageDocument & doc)
{
bool retval(true);
size_t fNo(field());
@@ -296,10 +261,10 @@ FieldSearcher::IteratorHandler::onCollectionStart(const Content & c)
const document::FieldValue & fv = c.getValue();
LOG(spam, "onCollectionStart: field value '%s'", fv.toString().c_str());
if (fv.isA(document::FieldValue::Type::ARRAY)) {
- const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(fv);
+ const auto & afv = static_cast<const document::ArrayFieldValue &>(fv);
LOG(spam, "onCollectionStart: Array size = '%zu'", afv.size());
} else if (fv.isA(document::FieldValue::Type::WSET)) {
- const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv);
+ const auto & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv);
LOG(spam, "onCollectionStart: WeightedSet size = '%zu'", wsfv.size());
}
}
@@ -311,5 +276,4 @@ FieldSearcher::IteratorHandler::onStructStart(const Content & c)
_searcher.onStructValue(static_cast<const document::StructFieldValue &>(c.getValue()));
}
-
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index e79dacf827e..c5bca6f3899 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -14,77 +14,59 @@ namespace vsm {
using termcount_t = size_t;
using termsize_t = size_t;
-#if defined(COLLECT_CHAR_STAT)
- #define NEED_CHAR_STAT(a) { a; }
-#else
- #define NEED_CHAR_STAT(a)
-#endif
-
using ucs4_t = uint32_t;
using cmptype_t = ucs4_t;
using SearcherBuf = vespalib::Array<cmptype_t>;
using SharedSearcherBuf = std::shared_ptr<SearcherBuf>;
-using CharVector = std::vector<char>;
class FieldSearcherBase
{
protected:
- search::streaming::QueryTermList _qtl;
-private:
- CharVector _qtlFastBuffer;
-protected:
- FieldSearcherBase();
+ FieldSearcherBase() noexcept;
FieldSearcherBase(const FieldSearcherBase & org);
- virtual ~FieldSearcherBase(void);
- FieldSearcherBase & operator = (const FieldSearcherBase & org);
+ virtual ~FieldSearcherBase();
+ FieldSearcherBase & operator = (const FieldSearcherBase & org) = delete;
void prepare(const search::streaming::QueryTermList & qtl);
- size_t _qtlFastSize;
- search::v16qi *_qtlFast;
+protected:
+ search::streaming::QueryTermList _qtl;
};
class FieldSearcher : public FieldSearcherBase
{
public:
+ using Normalizing = search::streaming::Normalizing;
enum MatchType {
REGULAR,
PREFIX,
SUBSTRING,
SUFFIX,
- EXACT
+ EXACT,
};
- FieldSearcher(FieldIdT fId, bool defaultPrefix=false);
+ explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {}
+ FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept;
~FieldSearcher() override;
virtual std::unique_ptr<FieldSearcher> duplicate() const = 0;
bool search(const StorageDocument & doc);
- virtual void prepare(search::streaming::QueryTermList& qtl,
- const SharedSearcherBuf& buf,
- const vsm::FieldPathMapT& field_paths,
- search::fef::IQueryEnvironment& query_env);
-
- FieldIdT field() const { return _field; }
- void field(FieldIdT v) { _field = v; prepareFieldId(); }
- bool prefix() const { return _matchType == PREFIX; }
- bool substring() const { return _matchType == SUBSTRING; }
- bool suffix() const { return _matchType == SUFFIX; }
- bool exact() const { return _matchType == EXACT; }
- void setMatchType(MatchType mt) { _matchType = mt; }
+ virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf,
+ const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env);
+
+ FieldIdT field() const noexcept { return _field; }
+ bool prefix() const noexcept { return _matchType == PREFIX; }
+ bool substring() const noexcept { return _matchType == SUBSTRING; }
+ bool suffix() const noexcept { return _matchType == SUFFIX; }
+ bool exact() const noexcept { return _matchType == EXACT; }
+ Normalizing normalize_mode() const noexcept { return _normalize_mode; }
+ MatchType match_type() const noexcept { return _matchType; }
+ void match_type(MatchType mt) noexcept { _matchType = mt; }
+ void normalize_mode(Normalizing mode) noexcept { _normalize_mode = mode; }
+ void field(FieldIdT v) noexcept { _field = v; prepareFieldId(); }
static void init();
static search::byte fold(search::byte c) { return _foldLowCase[c]; }
static search::byte iswordchar(search::byte c) { return _wordChar[c]; }
static search::byte isspace(search::byte c) { return ! iswordchar(c); }
static size_t countWords(const FieldRef & f);
- unsigned pureUsAsciiCount() const { return _pureUsAsciiCount; }
- unsigned pureUsAsciiFieldCount() const { return _pureUsAsciiFieldCount; }
- unsigned anyUtf8Count() const { return _anyUtf8Count; }
- unsigned anyUtf8FieldCount() const { return _anyUtf8FieldCount; }
- unsigned badUtf8Count() const { return _badUtf8Count; }
- unsigned zeroCount() const { return _zeroCount; }
- unsigned utf8Count(size_t sz) const { return _utf8Count[1+sz]; }
- const unsigned * utf8Count() const { return _utf8Count; }
- int32_t getCurrentWeight() const { return _currentElementWeight; }
- void addStat(const FieldSearcher & toAdd);
- void zeroStat();
+ int32_t currentWeight() const { return _currentElementWeight; }
FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; }
size_t maxFieldLength() const { return _maxFieldLength; }
@@ -98,7 +80,7 @@ private:
void onStructStart(const Content & c) override;
public:
- IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {}
+ explicit IteratorHandler(FieldSearcher & searcher) noexcept : _searcher(searcher) {}
};
friend class IteratorHandler; // to allow calls to onValue();
@@ -110,33 +92,21 @@ private:
virtual void onStructValue(const document::StructFieldValue &) { }
FieldIdT _field;
MatchType _matchType;
+ Normalizing _normalize_mode;
unsigned _maxFieldLength;
uint32_t _currentElementId;
int32_t _currentElementWeight; // Contains the weight of the current item being evaluated.
- /// Number of bytes in blocks containing pure us-ascii
- unsigned _pureUsAsciiCount;
- /// Number of blocks containing pure us-ascii
- unsigned _pureUsAsciiFieldCount;
- /// Number of bytes in blocks containing any non us-ascii
- unsigned _anyUtf8Count;
- /// Number of blocks containing any non us-ascii
- unsigned _anyUtf8FieldCount;
protected:
/// Number of terms searched.
- unsigned _words;
+ unsigned _words;
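- /// Number of utf8 bytes by utf8 size.
+ /// Bad utf8 counter (presumably invalid utf8 sequences seen; TODO confirm against the code that increments it).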
- unsigned _utf8Count[6];
- unsigned _badUtf8Count;
- unsigned _zeroCount;
-protected:
- void addPureUsAsciiField(size_t sz) { _pureUsAsciiCount += sz; _pureUsAsciiFieldCount++;; }
- void addAnyUtf8Field(size_t sz) { _anyUtf8Count += sz; _anyUtf8FieldCount++; }
+ unsigned _badUtf8Count;
/**
* Adds a hit to the given query term.
* For each call to onValue() a batch of words are processed, and the position is local to this batch.
**/
void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const {
- qt.add(_words + pos, field(), _currentElementId, getCurrentWeight());
+ qt.add(_words + pos, field(), _currentElementId, _currentElementWeight);
}
public:
static search::byte _foldLowCase[256];
@@ -149,10 +119,8 @@ using FieldIdTSearcherMapT = std::vector<FieldSearcherContainer>;
class FieldIdTSearcherMap : public FieldIdTSearcherMapT
{
public:
- void prepare(const DocumentTypeIndexFieldMapT& difm,
- const SharedSearcherBuf& searcherBuf,
- search::streaming::Query& query,
- const vsm::FieldPathMapT& field_paths,
+ void prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf,
+ search::streaming::Query& query, const vsm::FieldPathMapT& field_paths,
search::fef::IQueryEnvironment& query_env);
};
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
index 7dd40348f47..8558522003f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
@@ -37,7 +37,7 @@ void FloatFieldSearcherT<T>::prepare(search::streaming::QueryTermList& qtl,
_floatTerm.clear();
FieldSearcher::prepare(qtl, buf, field_paths, query_env);
for (auto qt : qtl) {
- size_t sz(qt->termLen());
+ size_t sz(qt->termLen());
if (sz) {
auto range = qt->getRange<T>();
_floatTerm.emplace_back(range.low, range.high, range.valid);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
index 07b3f6e1c5f..85341472c26 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
@@ -9,8 +9,8 @@ template <typename T>
class FloatFieldSearcherT : public FieldSearcher
{
public:
- FloatFieldSearcherT(FieldIdT fId=0);
- ~FloatFieldSearcherT();
+ explicit FloatFieldSearcherT(FieldIdT fId);
+ ~FloatFieldSearcherT() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
@@ -42,14 +42,14 @@ class FloatFieldSearcher : public FloatFieldSearcherTF
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { }
+ explicit FloatFieldSearcher(FieldIdT fId) : FloatFieldSearcherTF(fId) { }
};
class DoubleFieldSearcher : public FloatFieldSearcherTD
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { }
+ // explicit, matching FloatFieldSearcher: a bare FieldIdT must never convert silently into a searcher.
+ explicit DoubleFieldSearcher(FieldIdT fId) : FloatFieldSearcherTD(fId) { }
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
index a2122f08995..c0b5117d6bf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
@@ -19,10 +19,6 @@ FUTF8StrChrFieldSearcher::duplicate() const
return std::make_unique<FUTF8StrChrFieldSearcher>(*this);
}
-FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher()
- : UTF8StrChrFieldSearcher(),
- _folded(4_Ki)
-{ }
FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId)
: UTF8StrChrFieldSearcher(fId),
_folded(4_Ki)
@@ -36,7 +32,7 @@ FUTF8StrChrFieldSearcher::ansiFold(const char * toFold, size_t sz, char * folded
for(size_t i=0; i < sz; i++) {
byte c = toFold[i];
if (c>=128) { retval = false; break; }
- folded[i] = FieldSearcher::_foldLowCase[c];
+ folded[i] = fold(c);
}
return retval;
}
@@ -209,7 +205,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
folded[f.size()+1] = 0x01;
memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values
return match(folded, f.size(), qt);
- NEED_CHAR_STAT(addPureUsAsciiField(f.size()));
} else {
return UTF8StrChrFieldSearcher::matchTerm(f, qt);
}
@@ -227,7 +222,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t min
folded[f.size()+1] = 0x01;
memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values
return match(folded, f.size(), mintsz, &_qtl[0], _qtl.size());
- NEED_CHAR_STAT(addPureUsAsciiField(f.size()));
} else {
return UTF8StrChrFieldSearcher::matchTerms(f, mintsz);
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
index 5d5ca3d6c3c..b8aa287070a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
@@ -9,15 +9,14 @@ class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- FUTF8StrChrFieldSearcher();
- FUTF8StrChrFieldSearcher(FieldIdT fId);
+ explicit FUTF8StrChrFieldSearcher(FieldIdT fId);
~FUTF8StrChrFieldSearcher() override;
static bool ansiFold(const char * toFold, size_t sz, char * folded);
static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart);
static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart);
private:
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- size_t matchTerms(const FieldRef&, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef&, size_t shortestTerm) override;
virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt);
size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize);
std::vector<char> _folded;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
index 741148fbca1..17c9f23fefb 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
@@ -8,8 +8,8 @@ namespace vsm {
class GeoPosFieldSearcher : public FieldSearcher {
public:
- GeoPosFieldSearcher(FieldIdT fId=0);
- ~GeoPosFieldSearcher();
+ // explicit: a bare FieldIdT must not convert implicitly into a searcher.
+ explicit GeoPosFieldSearcher(FieldIdT fId);
+ ~GeoPosFieldSearcher() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
@@ -21,7 +21,7 @@ protected:
using GeoLocation = search::common::GeoLocation;
class GeoPosInfo : public GeoLocation {
public:
- GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {}
+ explicit GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {}
bool cmp(const document::StructFieldValue & fv) const;
};
using GeoPosInfoListT = std::vector<GeoPosInfo>;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
index 47b83c1538d..9c63d31e3c3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
@@ -9,8 +9,8 @@ class IntFieldSearcher : public FieldSearcher
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- IntFieldSearcher(FieldIdT fId=0);
- ~IntFieldSearcher();
+ explicit IntFieldSearcher(FieldIdT fId);
+ ~IntFieldSearcher() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
index 76fedbd1166..816317bf86d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
@@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv)
}
DistanceMetric
-NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value)
+NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value)
{
// Valid string values must match the definition of DistanceMetric in
// config-model/src/main/java/com/yahoo/schema/document/Attribute.java
- auto v = value;
+ vespalib::string v = value;
std::transform(v.begin(), v.end(), v.begin(),
[](unsigned char c) { return std::tolower(c); });
try {
return DistanceMetricUtils::to_distance_metric(v);
} catch (vespalib::IllegalStateException&) {
- vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str());
+ vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", v.c_str());
return DistanceMetric::Euclidean;
}
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
index 5629b443c78..ecdc64d1336 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
@@ -11,10 +11,7 @@
#include <vespa/searchlib/tensor/tensor_ext_attribute.h>
namespace search::fef { class IQueryEnvironment; }
-
-namespace search::tensor {
-class TensorExtAttribute;
-}
+namespace search::tensor { class TensorExtAttribute; }
namespace vsm {
@@ -43,7 +40,7 @@ private:
public:
NearestNeighborFieldSearcher(FieldIdT fid,
search::attribute::DistanceMetric metric);
- ~NearestNeighborFieldSearcher();
+ ~NearestNeighborFieldSearcher() override;
std::unique_ptr<FieldSearcher> duplicate() const override;
void prepare(search::streaming::QueryTermList& qtl,
@@ -52,7 +49,7 @@ public:
search::fef::IQueryEnvironment& query_env) override;
void onValue(const document::FieldValue& fv) override;
- static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value);
+ static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value);
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
index 9ad76712092..19c723d060d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
@@ -8,8 +8,7 @@ namespace vsm {
class StrChrFieldSearcher : public FieldSearcher
{
public:
- StrChrFieldSearcher() : FieldSearcher(0) { }
- StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { }
+ explicit StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { }
void onValue(const document::FieldValue & fv) override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
@@ -19,7 +18,7 @@ private:
size_t shortestTerm() const;
bool matchDoc(const FieldRef & field);
virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0;
+ virtual size_t matchTerms(const FieldRef & f, size_t shortestTerm) = 0;
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
new file mode 100644
index 00000000000..d8a6091fe11
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
@@ -0,0 +1,21 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokenizereader.h"
+
+namespace vsm {
+
+void
+TokenizeReader::fold(ucs4_t c) {
+    // A character either has a replacement string (which may be empty) that is
+    // copied to the output, or it is lowercased and folded to one character.
+    const char *replacement = Fast_NormalizeWordFolder::ReplacementString(c);
+    if (replacement == nullptr) {
+        *_q++ = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+        return;
+    }
+    if (*replacement != '\0') {
+        _q = Fast_UnicodeUtil::ucs4copy(_q, replacement);
+    }
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
new file mode 100644
index 00000000000..f10c8910e82
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
@@ -0,0 +1,54 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/searchlib/query/streaming/querynoderesultbase.h>
+#include <vespa/searchlib/query/base.h>
+#include <vespa/fastlib/text/normwordfolder.h>
+
+namespace vsm {
+
+/**
+ * Handles tokenization of utf8 input with on the fly normalization into a caller supplied ucs4 buffer.
+ * It handles Normalizing::NONE, Normalizing::LOWERCASE, and Normalizing::LOWERCASE_AND_FOLD.
+ */
+class TokenizeReader {
+public:
+ using byte = search::byte;
+ using Normalizing = search::streaming::Normalizing;
+ TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept // p/len: utf8 input to read, q: output buffer for normalized characters (not bounds checked here)
+ : _p(p),
+ _p_end(p + len),
+ _q(q),
+ _q_start(q)
+ {}
+ ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } // decodes one character; presumably advances _p (hasNext() relies on that) - confirm in unicodeutil.h
+ void normalize(ucs4_t c, Normalizing normalize_mode) { // appends c to the output buffer after applying normalize_mode
+ switch (normalize_mode) {
+ case Normalizing::LOWERCASE:
+ c = Fast_NormalizeWordFolder::lowercase(c);
+ [[fallthrough]];
+ case Normalizing::NONE:
+ *_q++ = c;
+ break;
+ case Normalizing::LOWERCASE_AND_FOLD:
+ fold(c); // may append zero, one or several characters
+ break;
+ }
+ }
+ bool hasNext() const noexcept { return _p < _p_end; } // true while unread input bytes remain
+ const byte * p() const noexcept { return _p; } // current read position
+ size_t complete() noexcept { // ends the current token and returns its length in ucs4 characters
+ *_q = 0; // zero terminate the token
+ size_t token_len = _q - _q_start;
+ _q = _q_start; // rewind so that the next token overwrites this one
+ return token_len;
+ }
+private:
+ void fold(ucs4_t c); // LOWERCASE_AND_FOLD handling, defined in tokenizereader.cpp
+ const byte *_p; // next input byte to read
+ const byte *_p_end; // one past the last input byte
+ ucs4_t *_q; // next output position to write
+ ucs4_t *_q_start; // start of the output buffer
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
index 724efb54331..70cef08428a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
@@ -7,6 +7,13 @@ using search::streaming::QueryTermList;
namespace vsm {
+UTF8ExactStringFieldSearcher::UTF8ExactStringFieldSearcher(FieldIdT fId)
+ : UTF8StringFieldSearcherBase(fId)
+{
+ match_type(EXACT); // an exact searcher starts out with exact matching
+ normalize_mode(Normalizing::LOWERCASE); // initial mode only; the FieldSearchSpec constructor calls normalize_mode() again after creating the searcher
+}
+
std::unique_ptr<FieldSearcher>
UTF8ExactStringFieldSearcher::duplicate() const
{
@@ -14,7 +21,7 @@ UTF8ExactStringFieldSearcher::duplicate() const
}
size_t
-UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
{
(void) mintsz;
for (auto qt : _qtl) {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
index 997bed74787..9f590156a96 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
@@ -1,10 +1,9 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+#include "utf8stringfieldsearcherbase.h"
-namespace vsm
-{
+namespace vsm {
/**
* This class does suffix utf8 searches.
@@ -12,14 +11,12 @@ namespace vsm
class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase
{
protected:
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8ExactStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+ explicit UTF8ExactStringFieldSearcher(FieldIdT fId);
};
}
-
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
index 655b068e152..78f491198ad 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -58,10 +58,6 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
}
}
-UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() :
- UTF8StringFieldSearcherBase()
-{ }
-
UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) :
UTF8StringFieldSearcherBase(fId)
{ }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
index 5eee6a8862a..bb1b55dffe4 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -1,10 +1,9 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+#include "utf8stringfieldsearcherbase.h"
-namespace vsm
-{
+namespace vsm {
/**
* This class does utf8 searches based on the query term type.
@@ -17,18 +16,17 @@ private:
* Tries to match the given query term against the content of the given field reference.
* Search strategy is choosen based on the query term type.
**/
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
/**
* Tries to match each query term in the underlying query against the content of the given field reference.
* Search strategy is choosen based on the query term type.
**/
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8FlexibleStringFieldSearcher();
- UTF8FlexibleStringFieldSearcher(FieldIdT fId);
+ explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId);
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index 2488d198b03..37dc4ffb99c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -1,5 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8strchrfieldsearcher.h"
+#include "tokenizereader.h"
using search::streaming::QueryTerm;
using search::streaming::QueryTermList;
@@ -14,21 +15,19 @@ UTF8StrChrFieldSearcher::duplicate() const
}
size_t
-UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
{
(void) mintsz;
termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- const byte * e = n + f.size();
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t fl = reader.complete();
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
@@ -42,7 +41,6 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
}
words++;
}
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
return words;
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
index cfe546bc6f6..663ee3a1a62 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
@@ -13,12 +13,10 @@ class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
-
+ explicit UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
protected:
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 4daea693e95..5036e9bedb1 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,7 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8stringfieldsearcherbase.h"
-#include <vespa/fastlib/text/normwordfolder.h>
+#include "tokenizereader.h"
#include <cassert>
using search::streaming::QueryTerm;
@@ -10,115 +10,36 @@ using search::byte;
namespace vsm {
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
- if (maxSz > 0) {
- maxSz--;
- }
- ucs4_t c(*p);
- ucs4_t *q(dstbuf);
- const byte * end(p+maxSz);
-
- // Skip non-word characters between words
- for (; p < end; ) {
- if (c < 128) {
- if (!c) { break; }
- p++;
- if (__builtin_expect(Fast_NormalizeWordFolder::_isWord[c], false)) {
- *q++ = Fast_NormalizeWordFolder::_foldCase[c];
- c = 0;
- } else {
- c = *p;
- }
- } else {
- const byte * oldP(p);
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (Fast_UnicodeUtil::IsWordChar(c)) {
- _utf8Count[p-oldP-1]++;
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != NULL) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::ToFold(c);
- *q++ = c;
- }
- break;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- } else {
- _utf8Count[p-oldP-1]++;
- }
- c = *p;
- }
- }
- }
-
- c = *p; // Next char
- for (; p < end;) {
- if (c < 128) { // Common case, ASCII
- if (!c) { break; }
- p++;
- if (__builtin_expect(!Fast_NormalizeWordFolder::_isWord[c], false)) {
- c = 0;
- } else {
- *q++ = Fast_NormalizeWordFolder::_foldCase[c];
- c = *p;
- }
- } else {
- const byte * oldP(p);
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
- _utf8Count[p-oldP-1]++;
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != NULL) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::ToFold(c);
- *q++ = c;
- }
-
- c = *p;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- } else {
- _utf8Count[p-oldP-1]++;
- }
- break;
- }
+template<typename Reader> // NOTE(review): defined in this .cpp but also called from utf8strchrfieldsearcher.cpp and utf8suffixstringfieldsearcher.cpp; those calls appear to depend on tokenize<TokenizeReader> being instantiated out-of-line here - consider an explicit instantiation
+void
+UTF8StringFieldSearcherBase::tokenize(Reader & reader) { // reads one word: skips non-word characters, then normalizes word characters until a non-word character or end of input
+ ucs4_t c(0);
+ Normalizing norm_mode = normalize_mode();
+ while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); // skip leading non-word characters
+
+ if (Fast_UnicodeUtil::IsWordChar(c)) { // not taken when the input ended before a word character was read (assumes IsWordChar(0) is false)
+ reader.normalize(c, norm_mode);
+ while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { // the terminating non-word character is consumed but not emitted
+ reader.normalize(c, norm_mode);
}
}
- *q = 0;
- tokenlen = q - dstbuf;
- return p;
}
size_t
UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
{
termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- // __builtin_prefetch(n, 0, 0);
const cmptype_t * term;
termsize_t tsz = qt.term(term);
- const byte * e = n + f.size();
if ( f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
+ cmptype_t * fn = _buf->data();
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t fl = reader.complete();
if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
const cmptype_t *tt=term, *et=term+tsz;
for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -128,33 +49,35 @@ UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt
}
words++;
}
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
return words;
}
size_t
UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt)
{
- const byte * n = reinterpret_cast<const byte *> (f.data());
const cmptype_t * term;
termsize_t tsz = qt.term(term);
const cmptype_t * eterm = term+tsz;
- const byte * e = n + f.size();
+ if ( f.size() >= _buf->size()) {
+ _buf->reserve(f.size() + 1);
+ }
+ cmptype_t * fn = _buf->data(); // scratch buffer receiving the normalized form of one input character at a time
if (tsz <= f.size()) {
bool equal(true);
- for (; equal && (n < e) && (term < eterm); term++) {
- if (*term < 0x80) {
- equal = (*term == Fast_NormalizeWordFolder::_foldCase[*n++]);
- } else {
- cmptype_t c = Fast_NormalizeWordFolder::ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
- equal = (*term == c);
+ Normalizing norm_mode = normalize_mode();
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while (equal && reader.hasNext() && (term < eterm)) {
+ reader.normalize(reader.next(), norm_mode);
+ size_t len = reader.complete(); // characters produced for this input character: folding may yield zero, one or several
+ for (size_t i(0); equal && (i < len); i++) { // stop at the first mismatch so a later equal character cannot hide it
+ equal = ((term + i) < eterm) && (term[i] == fn[i]); // an expansion running past the end of the query term is a mismatch
}
+ if (equal) { term += len; } // all len characters matched inside the term, so term never moves past eterm
}
- if (equal && (term == eterm) && (qt.isPrefix() || (n == e))) {
+ if (equal && (term == eterm) && (qt.isPrefix() || ! reader.hasNext())) { // whole term matched; non-prefix terms also require the field to be fully consumed
addHit(qt,0);
}
}
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
return 1;
}
@@ -188,7 +111,6 @@ UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm &
}
}
}
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
return words + 1; // we must also count the last word
}
@@ -196,22 +118,17 @@ size_t
UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
{
termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
const cmptype_t * term;
termsize_t tsz = qt.term(term);
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
+ cmptype_t * dstbuf = _buf->data();
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t tokenlen = reader.complete();
if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
addHit(qt, words);
}
@@ -220,11 +137,6 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
return words;
}
-UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() :
- StrChrFieldSearcher()
-{
-}
-
UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) :
StrChrFieldSearcher(fId)
{
@@ -280,12 +192,12 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
if (c < 128) {
p++;
if (!isSeparatorCharacter(c)) {
- dstbuf.onCharacter(Fast_NormalizeWordFolder::_foldCase[c], (oldP - b));
+ dstbuf.onCharacter(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c), (oldP - b));
}
} else {
c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != NULL) {
+ if (repl != nullptr) {
size_t repllen = strlen(repl);
if (repllen > 0) {
ucs4_t * buf = dstbuf.getBuf();
@@ -300,13 +212,11 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
}
}
} else {
- c = Fast_NormalizeWordFolder::ToFold(c);
+ c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
dstbuf.onCharacter(c, (oldP - b));
}
if (c == Fast_UnicodeUtil::_BadUTF8Char) {
_badUtf8Count++;
- } else {
- _utf8Count[p-oldP-1]++;
}
}
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 38aac508f4f..b196f2795a4 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -28,15 +28,15 @@ public:
ucs4_t * _cbuf;
public:
- BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { }
- BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { }
+ explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { }
+ BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { }
void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; }
void onOffset(size_t) { }
void incBuf(size_t inc) { _cbuf += inc; }
ucs4_t * getBuf() { return _cbuf; }
- bool valid() { return true; }
- size_t size() { return (_cbuf - _bbuf); }
- bool hasOffsets() { return false; }
+ bool valid() const noexcept { return true; }
+ size_t size() const noexcept { return (_cbuf - _bbuf); }
+ bool hasOffsets() const noexcept { return false; }
};
/**
@@ -50,17 +50,18 @@ public:
size_t * _coff;
public:
- OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
+ explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; }
void onOffset(size_t of) { *_coff++ = of; }
- bool valid() { return (size() == (size_t)(_coff - _boff)); }
- bool hasOffsets() { return true; }
+ bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); }
+ bool hasOffsets() const noexcept { return true; }
};
protected:
SharedSearcherBuf _buf;
- const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen);
+ template<typename Reader>
+ void tokenize(Reader & reader);
/**
* Matches the given query term against the words in the given field reference
@@ -103,9 +104,8 @@ protected:
size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt);
public:
- UTF8StringFieldSearcherBase();
- UTF8StringFieldSearcherBase(FieldIdT fId);
- ~UTF8StringFieldSearcherBase();
+ explicit UTF8StringFieldSearcherBase(FieldIdT fId);
+ ~UTF8StringFieldSearcherBase() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
index 88091c6ab4e..fcc2893a71d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
@@ -1,6 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vsm/searcher/utf8substringsearcher.h>
+#include "utf8substringsearcher.h"
#include <vespa/fastlib/text/unicodeutil.h>
using search::byte;
@@ -45,8 +45,6 @@ UTF8SubStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn); fn++ );
}
}
-
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
return words + 1; // we must also count the last word
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
index b1455d5c5f6..cee35993ce7 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
@@ -1,7 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
-#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h>
+#include "utf8strchrfieldsearcher.h"
namespace vsm {
@@ -12,11 +12,10 @@ class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+ explicit UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
protected:
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
index 8403e69658f..6d8a399cd33 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
@@ -110,20 +110,11 @@ UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char *
_modified->put(_unitSep);
}
-UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() :
- UTF8StringFieldSearcherBase(),
- _modified(new CharBuffer(32)),
- _offsets(new std::vector<size_t>(32)),
- _readPtr(NULL),
- _unitSep(juniper::separators::unit_separator)
-{
-}
-
UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) :
UTF8StringFieldSearcherBase(fId),
_modified(new CharBuffer(32)),
_offsets(new std::vector<size_t>(32)),
- _readPtr(NULL),
+ _readPtr(nullptr),
_unitSep(juniper::separators::unit_separator)
{
}
@@ -134,12 +125,12 @@ UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId,
UTF8StringFieldSearcherBase(fId),
_modified(modBuf),
_offsets(offBuf),
- _readPtr(NULL),
+ _readPtr(nullptr),
_unitSep(juniper::separators::unit_separator)
{
}
-UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {}
+UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default;
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
index ebb806de61c..99e6c29961f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
@@ -23,8 +23,8 @@ private:
const char * _readPtr; // buffer to read from (field reference)
char _unitSep; // the unit separator character to use
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
/**
* Copies n bytes from the field reference to the modified buffer and updates the read pointer.
@@ -51,9 +51,8 @@ public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8SubstringSnippetModifier();
- UTF8SubstringSnippetModifier(FieldIdT fId);
- ~UTF8SubstringSnippetModifier();
+ explicit UTF8SubstringSnippetModifier(FieldIdT fId);
+ ~UTF8SubstringSnippetModifier() override;
/**
* Creates a new instance.
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index e28ce114225..8bbacf168cf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -1,5 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8suffixstringfieldsearcher.h"
+#include "tokenizereader.h"
using search::byte;
using search::streaming::QueryTerm;
@@ -14,24 +15,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const
}
size_t
-UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
{
(void) mintsz;
termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t tokenlen = reader.complete();
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
index 556f61a714f..dc3bc214b49 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
@@ -1,10 +1,9 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+#include "utf8stringfieldsearcherbase.h"
-namespace vsm
-{
+namespace vsm {
/**
* This class does suffix utf8 searches.
@@ -12,13 +11,12 @@ namespace vsm
class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase
{
protected:
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+ explicit UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
};
}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index e33408a2e26..715c19a0bb7 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -28,30 +28,30 @@ namespace vsm {
namespace {
-void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) {
+void
+setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { // maps the textual match argument to a FieldSearcher match type; any other value leaves the searcher untouched
if (arg1 == "prefix") {
- searcher->setMatchType(FieldSearcher::PREFIX);
+ searcher->match_type(FieldSearcher::PREFIX);
} else if (arg1 == "substring") {
- searcher->setMatchType(FieldSearcher::SUBSTRING);
+ searcher->match_type(FieldSearcher::SUBSTRING);
} else if (arg1 == "suffix") {
- searcher->setMatchType(FieldSearcher::SUFFIX);
- } else if (arg1 == "exact") {
- searcher->setMatchType(FieldSearcher::EXACT);
- } else if (arg1 == "word") {
- searcher->setMatchType(FieldSearcher::EXACT);
+ searcher->match_type(FieldSearcher::SUFFIX);
+ } else if ((arg1 == "exact") || (arg1 == "word")) { // "word" is handled exactly like "exact"
+ searcher->match_type(FieldSearcher::EXACT);
}
}
}
-FieldSearchSpec::FieldSearchSpec() :
- _id(0),
- _name(),
- _maxLength(0x100000),
- _searcher(),
- _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
- _arg1(),
- _reconfigured(false)
+FieldSearchSpec::FieldSearchSpec()
+ : _id(0),
+ _name(),
+ _maxLength(0x100000), // 1048576; the unit (bytes or characters) is not visible here - confirm
+ _searcher(), // no searcher is created for a default constructed spec
+ _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
+ _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), // a default constructed spec lowercases and folds
+ _arg1(),
+ _reconfigured(false)
{
}
FieldSearchSpec::~FieldSearchSpec() = default;
@@ -59,15 +59,15 @@ FieldSearchSpec::~FieldSearchSpec() = default;
FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default;
FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default;
-FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname,
- VsmfieldsConfig::Fieldspec::Searchmethod searchDef,
- const vespalib::string & arg1, size_t maxLength_) :
+FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef,
+ Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) :
_id(fid),
_name(fname),
- _maxLength(maxLength_),
+ _maxLength(maxLength_in),
_searcher(),
_searchMethod(searchDef),
- _arg1(arg1),
+ _normalize_mode(normalize_mode),
+ _arg1(arg1_in),
_reconfigured(false)
{
switch(searchDef) {
@@ -78,13 +78,11 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
case VsmfieldsConfig::Fieldspec::Searchmethod::NONE:
case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8:
case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8:
- if (arg1 == "substring") {
+ if (_arg1 == "substring") {
_searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid);
- } else if (arg1 == "suffix") {
+ } else if (_arg1 == "suffix") {
_searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid);
- } else if (arg1 == "exact") {
- _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
- } else if (arg1 == "word") {
+ } else if ((_arg1 == "exact") || (_arg1 == "word")) {
_searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
} else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) {
_searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
@@ -111,13 +109,14 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
_searcher = std::make_unique<GeoPosFieldSearcher>(fid);
break;
case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR:
- auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1);
+ auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1);
_searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm);
break;
}
if (_searcher) {
- setMatchType(_searcher, arg1);
+ setMatchType(_searcher, _arg1);
_searcher->maxFieldLength(maxLength());
+ _searcher->normalize_mode(_normalize_mode);
}
}
@@ -150,7 +149,8 @@ FieldSearchSpec::reconfig(const QueryTerm & term)
}
}
-vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f)
+vespalib::asciistream &
+operator <<(vespalib::asciistream & os, const FieldSearchSpec & f)
{
os << f._id << ' ' << f._name << ' ';
if ( ! f._searcher) {
@@ -164,62 +164,67 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default;
FieldSearchSpecMap::~FieldSearchSpecMap() = default;
namespace {
- const std::string _G_empty("");
- const std::string _G_value(".value");
- const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}");
- const std::regex _G_map2("\\{\".*\"\\}");
- const std::regex _G_array("\\[[0-9]+\\]");
+ const std::string G_empty;
+ const std::string G_value(".value");
+ const std::regex G_map1("\\{[a-zA-Z0-9]+\\}");
+ const std::regex G_map2("\\{\".*\"\\}");
+ const std::regex G_array("\\[[0-9]+\\]");
}
-vespalib::string FieldSearchSpecMap::stripNonFields(const vespalib::string & rawIndex)
+vespalib::string
+FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex)
{
if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) {
- std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value);
- index = std::regex_replace(index, _G_map2, _G_value);
- index = std::regex_replace(index, _G_array, _G_empty);
+ std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value);
+ index = std::regex_replace(index, G_map2, G_value);
+ index = std::regex_replace(index, G_array, G_empty);
return index;
}
return rawIndex;
}
-bool FieldSearchSpecMap::buildFieldsInQuery(const Query & query, StringFieldIdTMap & fieldsInQuery) const
+void
+FieldSearchSpecMap::addFieldsFromIndex(vespalib::stringref rawIndex, StringFieldIdTMap & fieldIdMap) const {
+ for (const auto & dtm : documentTypeMap()) {
+ const IndexFieldMapT & fim = dtm.second;
+ vespalib::string index(stripNonFields(rawIndex));
+ auto fIt = fim.find(index);
+ if (fIt != fim.end()) {
+ for(FieldIdT fid : fIt->second) {
+ const FieldSearchSpec & spec = specMap().find(fid)->second;
+ LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.data(), index.c_str());
+ if ((rawIndex != index) && (spec.name().find(index) == 0)) {
+ vespalib::string modIndex(rawIndex);
+ modIndex.append(spec.name().substr(index.size()));
+ fieldIdMap.add(modIndex, spec.id());
+ } else {
+ fieldIdMap.add(spec.name(),spec.id());
+ }
+ }
+ } else {
+ LOG(warning, "No valid indexes registered for index %s", rawIndex.data());
+ }
+ }
+}
+
+StringFieldIdTMap
+FieldSearchSpecMap::buildFieldsInQuery(const Query & query) const
{
- bool retval(true);
+ StringFieldIdTMap fieldsInQuery;
ConstQueryTermList qtl;
query.getLeaves(qtl);
for (const auto & term : qtl) {
- for (const auto & dtm : documentTypeMap()) {
- const IndexFieldMapT & fim = dtm.second;
- vespalib::string rawIndex(term->index());
- vespalib::string index(stripNonFields(rawIndex));
- auto fIt = fim.find(index);
- if (fIt != fim.end()) {
- for(FieldIdT fid : fIt->second) {
- const FieldSearchSpec & spec = specMap().find(fid)->second;
- LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.c_str(), index.c_str());
- if ((rawIndex != index) && (spec.name().find(index) == 0)) {
- vespalib::string modIndex(rawIndex);
- modIndex.append(spec.name().substr(index.size()));
- fieldsInQuery.add(modIndex, spec.id());
- } else {
- fieldsInQuery.add(spec.name(),spec.id());
- }
- }
- } else {
- LOG(warning, "No valid indexes registered for index %s", term->index().c_str());
- retval = false;
- }
- }
+ addFieldsFromIndex(term->index(), fieldsInQuery);
}
- return retval;
+ return fieldsInQuery;
}
-void FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded)
+void
+FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded)
{
- for(size_t i(0), m(otherFieldsNeeded.size()); i < m; i++) {
- LOG(debug, "otherFieldsNeeded[%zd] = '%s'", i, otherFieldsNeeded[i].c_str());
- _nameIdMap.add(otherFieldsNeeded[i]);
+ for (const auto & i : otherFieldsNeeded) {
+ _nameIdMap.add(i);
}
}
@@ -251,16 +256,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch
return ifm;
}
+search::streaming::Normalizing
+normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) {
+ switch (normalize_mode) {
+ case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE;
+ case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE;
+ case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD;
+ }
+ return search::streaming::Normalizing::LOWERCASE_AND_FOLD;
+}
+
}
-bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
+void
+FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
{
- bool retval(true);
LOG(spam, "Parsing %zd fields", conf->fieldspec.size());
for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) {
LOG(spam, "Parsing %s", cfs.name.c_str());
FieldIdT fieldId = specMap().size();
- FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength);
+ FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength);
_specMap[fieldId] = std::move(fss);
_nameIdMap.add(cfs.name, fieldId);
LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str());
@@ -275,7 +290,6 @@ bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
}
_documentTypeMap[di.name] = indexMapp;
}
- return retval;
}
void
@@ -297,12 +311,14 @@ FieldSearchSpecMap::reconfigFromQuery(const Query & query)
}
}
-bool lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b)
+bool
+lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b)
{
return a->field() < b->field();
}
-void FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap)
+void
+FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) const
{
fieldSearcherMap.clear();
for (const auto & entry : fieldsInQuery) {
@@ -328,10 +344,11 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const
if (!itr->second.uses_nearest_neighbor_search_method()) {
return dm;
}
- return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1());
+ return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1());
}
-vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df)
+vespalib::asciistream &
+operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df)
{
os << "DocumentTypeMap = \n";
for (const auto & dtm : df.documentTypeMap()) {
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
index b0154a82dae..7ba9799991e 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
@@ -10,20 +10,29 @@ namespace vsm {
class FieldSearchSpec
{
public:
+ using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
+ using Normalizing = search::streaming::Normalizing;
FieldSearchSpec();
- FieldSearchSpec(const FieldIdT & id, const vespalib::string & name,
- VsmfieldsConfig::Fieldspec::Searchmethod searchMethod,
- const vespalib::string & arg1, size_t maxLength);
+ FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod,
+ Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength);
~FieldSearchSpec();
FieldSearchSpec(FieldSearchSpec&& rhs) noexcept;
FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept;
- const FieldSearcher & searcher() const { return *_searcher; }
- const vespalib::string & name() const { return _name; }
- FieldIdT id() const { return _id; }
- bool valid() const { return static_cast<bool>(_searcher); }
- size_t maxLength() const { return _maxLength; }
- bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; }
- const vespalib::string& get_arg1() const noexcept { return _arg1; }
+ const FieldSearcher & searcher() const noexcept { return *_searcher; }
+ const vespalib::string & name() const noexcept { return _name; }
+ FieldIdT id() const noexcept { return _id; }
+ bool valid() const noexcept { return static_cast<bool>(_searcher); }
+ size_t maxLength() const noexcept { return _maxLength; }
+ Normalizing normalize_mode() const noexcept { return _normalize_mode; }
+ const vespalib::string& arg1() const noexcept { return _arg1; }
+ bool uses_nearest_neighbor_search_method() const noexcept {
+ return _searchMethod == Searchmethod::NEAREST_NEIGHBOR;
+ }
+ bool uses_string_search_method() const noexcept {
+ return (_searchMethod == Searchmethod::UTF8) ||
+ (_searchMethod == Searchmethod::AUTOUTF8) ||
+ (_searchMethod == Searchmethod::SSE2UTF8);
+ }
/**
* Reconfigures the field searcher based on information in the given query term.
@@ -37,7 +46,8 @@ private:
vespalib::string _name;
size_t _maxLength;
FieldSearcherContainer _searcher;
- VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod;
+ Searchmethod _searchMethod;
+ Normalizing _normalize_mode;
vespalib::string _arg1;
bool _reconfigured;
};
@@ -55,7 +65,7 @@ public:
* and a mapping from field name to field id. It then iterates over all document types and index names
* and creates a mapping from index name to list of field ids for each document type.
**/
- bool buildFromConfig(const VsmfieldsHandle & conf);
+ void buildFromConfig(const VsmfieldsHandle & conf);
/**
* Iterates over the given field name vector adding extra elements to the mapping from field name to field id.
@@ -71,17 +81,13 @@ public:
* Adds a [field name, field id] entry to the given mapping for each field name used in the given query.
* This is achieved by mapping from query term index name -> list of field ids -> [field name, field id] pairs.
**/
- bool buildFieldsInQuery(const search::streaming::Query & query, StringFieldIdTMap & fieldsInQuery) const;
-
- /**
- * Adds a [field name, field id] entry to the given mapping for each field name in the given vector.
- **/
- void buildFieldsInQuery(const std::vector<vespalib::string> & otherFieldsNeeded, StringFieldIdTMap & fieldsInQuery) const;
+ StringFieldIdTMap buildFieldsInQuery(const search::streaming::Query & query) const;
+ void addFieldsFromIndex(vespalib::stringref index, StringFieldIdTMap & fieldIdMap) const;
/**
* Adds a FieldSearcher object to the given field searcher map for each field name in the other map.
**/
- void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap);
+ void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) const;
const FieldSearchSpecMapT & specMap() const { return _specMap; }
//const IndexFieldMapT & indexMap() const { return _documentTypeMap.begin()->second; }
@@ -89,7 +95,7 @@ public:
const StringFieldIdTMap & nameIdMap() const { return _nameIdMap; }
friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & f);
- static vespalib::string stripNonFields(const vespalib::string & rawIndex);
+ static vespalib::string stripNonFields(vespalib::stringref rawIndex);
search::attribute::DistanceMetric get_distance_metric(const vespalib::string& name) const;
private: