diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-15 10:23:18 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-15 10:23:18 +0100 |
commit | 29a807d35ac5d9e76ea1b8d653bb25b0e4e2dc73 (patch) | |
tree | d55fddad443566300bd4a7fdd3ef1118a8460700 /streamingvisitors | |
parent | 48b1bae2a6cdf58a237aa7be59632a06aba86861 (diff) | |
parent | 252fbeed13b8622fbc813620dc3b4e45abc6bbe2 (diff) |
Merge branch 'master' into balder/sliced-parallell-or
Diffstat (limited to 'streamingvisitors')
38 files changed, 629 insertions, 654 deletions
diff --git a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp index 2d138d1d336..93e35e4c6d2 100644 --- a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp +++ b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp @@ -40,7 +40,7 @@ protected: RankProcessorTest::RankProcessorTest() : testing::Test(), - _factory(), + _factory(nullptr), _query(), _query_wrapper() { diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 4492dfac02b..7f89071868a 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -15,12 +15,15 @@ #include <vespa/vsm/searcher/utf8substringsearcher.h> #include <vespa/vsm/searcher/utf8substringsnippetmodifier.h> #include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h> +#include <vespa/vsm/searcher/tokenizereader.h> #include <vespa/vsm/vsm/snippetmodifier.h> using namespace document; using search::streaming::HitList; using search::streaming::QueryNodeResultFactory; using search::streaming::QueryTerm; +using search::streaming::Normalizing; +using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; using search::streaming::QueryTermList; using TermType = QueryTerm::Type; using namespace vsm; @@ -47,7 +50,7 @@ class String private: const std::string & _str; public: - String(const std::string & str) : _str(str) {} + explicit String(const std::string & str) : _str(str) {} bool operator==(const String & rhs) const { return _str == rhs._str; } @@ -56,14 +59,14 @@ public: class Query { private: - void setupQuery(const StringList & terms) { - for (size_t i = 0; i < terms.size(); ++i) { - ParsedQueryTerm pqt = parseQueryTerm(terms[i]); + void setupQuery(const StringList & terms, Normalizing normalizing) { + for (const auto & term : terms) { + ParsedQueryTerm pqt = parseQueryTerm(term); ParsedTerm pt = parseTerm(pqt.second); - qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second)); + qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second, normalizing)); } - for (size_t i = 0; i < qtv.size(); ++i) { - qtl.push_back(qtv[i].get()); + for (const auto & i : qtv) { + qtl.push_back(i.get()); } } public: @@ -72,14 +75,16 @@ public: QueryNodeResultFactory eqnr; std::vector<QueryTerm::UP> qtv; QueryTermList qtl; - Query(const StringList & terms); + + explicit Query(const StringList & terms) : Query(terms, Normalizing::LOWERCASE_AND_FOLD) {} + Query(const StringList & terms, Normalizing normalizing); ~Query(); static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) { size_t i = queryTerm.find(':'); if (i != std::string::npos) { - return ParsedQueryTerm(queryTerm.substr(0, i), queryTerm.substr(i + 1)); + return {queryTerm.substr(0, i), queryTerm.substr(i + 1)}; } - return ParsedQueryTerm(std::string(), queryTerm); + return {std::string(), queryTerm}; } static ParsedTerm parseTerm(const std::string & term) { if (term[0] == '*' && term[term.size() - 1] == '*') { @@ -94,8 +99,8 @@ public: } }; -Query::Query(const StringList & terms) : eqnr(), qtv(), qtl() { - setupQuery(terms); +Query::Query(const StringList & terms, Normalizing normalizing) : eqnr(), qtv(), qtl() { + setupQuery(terms, normalizing); } Query::~Query() = default; @@ -111,7 +116,7 @@ struct SnippetModifierSetup SnippetModifierSetup::SnippetModifierSetup(const StringList & terms) : query(terms), - searcher(new UTF8SubstringSnippetModifier()), + searcher(new UTF8SubstringSnippetModifier(0)), env(), modifier(searcher) { @@ -254,8 +259,8 @@ getFieldValue(const StringList & fv) static ArrayDataType type(*DataType::STRING); ArrayFieldValue afv(type); - for (size_t i = 0; i < fv.size(); ++i) { - afv.add(StringFieldValue(fv[i])); + for (const auto & v : fv) { + afv.add(StringFieldValue(v)); } return afv; } @@ -265,8 +270,8 @@ getFieldValue(const LongList & fv) { static ArrayDataType type(*DataType::LONG); ArrayFieldValue afv(type); - for (size_t i = 0; i < fv.size(); ++i) { - afv.add(LongFieldValue(fv[i])); + for (long v : fv) { + afv.add(LongFieldValue(v)); } return afv; } @@ -276,8 +281,8 @@ getFieldValue(const FloatList & fv) { static ArrayDataType type(*DataType::FLOAT); ArrayFieldValue afv(type); - for (size_t i = 0; i < fv.size(); ++i) { - afv.add(FloatFieldValue(fv[i])); + for (float v : fv) { + afv.add(FloatFieldValue(v)); } return afv; } @@ -286,8 +291,8 @@ bool assertMatchTermSuffix(const std::string & term, const std::string & word) { QueryNodeResultFactory eqnr; - QueryTerm qa(eqnr.create(), term, "index", TermType::WORD); - QueryTerm qb(eqnr.create(), word, "index", TermType::WORD); + QueryTerm qa(eqnr.create(), term, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD); + QueryTerm qb(eqnr.create(), word, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD); const ucs4_t * a; size_t alen = qa.term(a); const ucs4_t * b; @@ -299,8 +304,8 @@ void assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & fv, const BoolList & exp) { HitsList hl; - for (size_t i = 0; i < exp.size(); ++i) { - hl.push_back(exp[i] ? Hits().add(0) : Hits()); + for (bool v : exp) { + hl.push_back(v ? Hits().add(0) : Hits()); } assertSearch(fs, query, fv, hl); } @@ -308,7 +313,7 @@ assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & f std::vector<QueryTerm::UP> performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv) { - Query q(query); + Query q(query, fs.normalize_mode()); // prepare field searcher test::MockFieldSearcherEnv env; @@ -316,7 +321,7 @@ performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & f // setup document SharedFieldPathMap sfim(new FieldPathMapT()); - sfim->push_back(FieldPath()); + sfim->emplace_back(); StorageDocument doc(std::make_unique<document::Document>(), sfim, 1); doc.setField(0, document::FieldValue::UP(fv.clone())); @@ -342,7 +347,7 @@ assertSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv bool assertFieldInfo(FieldSearcher & fs, const StringList & query, - const FieldValue & fv, const FieldInfoList & exp) + const FieldValue & fv, const FieldInfoList & exp) { auto qtv = performSearch(fs, query, fv); if (!EXPECT_EQUAL(qtv.size(), exp.size())) return false; @@ -358,7 +363,7 @@ assertFieldInfo(FieldSearcher & fs, const StringList & query, void assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp) { - UTF8SubstringSnippetModifier mod; + UTF8SubstringSnippetModifier mod(0); performSearch(mod, query, StringFieldValue(fv)); EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size()); std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos()); @@ -369,7 +374,7 @@ assertSnippetModifier(const StringList & query, const std::string & fv, const st void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, const std::string & exp) { FieldValue::UP mfv = setup.modifier.modify(fv); - const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(*mfv.get()); + const auto & lfv = static_cast<const document::LiteralFieldValueB &>(*mfv.get()); const std::string & actual = lfv.getValue(); EXPECT_EQUAL(actual.size(), exp.size()); EXPECT_EQUAL(actual, exp); @@ -377,11 +382,11 @@ void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, void assertQueryTerms(const SnippetModifierManager & man, FieldIdT fId, const StringList & terms) { - if (terms.size() == 0) { - ASSERT_TRUE(man.getModifiers().getModifier(fId) == NULL); + if (terms.empty()) { + ASSERT_TRUE(man.getModifiers().getModifier(fId) == nullptr); return; } - ASSERT_TRUE(man.getModifiers().getModifier(fId) != NULL); + ASSERT_TRUE(man.getModifiers().getModifier(fId) != nullptr); UTF8SubstringSnippetModifier * searcher = (static_cast<SnippetModifier *>(man.getModifiers().getModifier(fId)))->getSearcher().get(); EXPECT_EQUAL(searcher->getQueryTerms().size(), terms.size()); @@ -437,11 +442,11 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits())); assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "oper", field, Hits().add(0).add(2)); assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits())); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false; { // test handling of several underscores @@ -466,7 +471,7 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) TEST("verify correct term parsing") { ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index"); ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term"); - ASSERT_TRUE(Query::parseQueryTerm("term").first == ""); + ASSERT_TRUE(Query::parseQueryTerm("term").first.empty()); ASSERT_TRUE(Query::parseQueryTerm("term").second == "term"); ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr"); ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM); @@ -550,12 +555,12 @@ TEST("utf8 substring search with empty term") TEST("utf8 suffix search") { UTF8SuffixStringFieldSearcher fs(0); std::string field = "operators and operator overloading"; - assertString(fs, "rsand", field, Hits()); - assertString(fs, "tor", field, Hits().add(2)); - assertString(fs, "tors", field, Hits().add(0)); + TEST_DO(assertString(fs, "rsand", field, Hits())); + TEST_DO(assertString(fs, "tor", field, Hits().add(2))); + TEST_DO(assertString(fs, "tors", field, Hits().add(0))); - assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())); - assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); + TEST_DO(assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()))); + TEST_DO(assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)))); EXPECT_TRUE(testStringFieldInfo(fs)); } @@ -587,22 +592,22 @@ TEST("utf8 flexible searcher"){ // prefix assertString(fs, "vesp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "vesp", "vespa", Hits().add(0)); // substring - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*esp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUBSTRING); + fs.match_type(FieldSearcher::SUBSTRING); assertString(fs, "esp", "vespa", Hits().add(0)); // suffix - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUFFIX); + fs.match_type(FieldSearcher::SUFFIX); assertString(fs, "espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); EXPECT_TRUE(testStringFieldInfo(fs)); } @@ -656,7 +661,7 @@ TEST("integer search") TEST("floating point search") { - FloatFieldSearcher fs; + FloatFieldSearcher fs(0); TEST_DO(assertFloat(fs, "10", 10, true)); TEST_DO(assertFloat(fs, "10.5", 10.5, true)); TEST_DO(assertFloat(fs, "-10.5", -10.5, true)); @@ -723,7 +728,7 @@ TEST("Snippet modifier search") { "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8"); { // check that resizing works - UTF8SubstringSnippetModifier mod; + UTF8SubstringSnippetModifier mod(0); EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u); EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u); performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa")); @@ -760,28 +765,32 @@ TEST("snippet modifier") { } } -TEST("FieldSearchSpec constrution") { +TEST("FieldSearchSpec construction") { { FieldSearchSpec f; EXPECT_FALSE(f.valid()); EXPECT_EQUAL(0u, f.id()); EXPECT_EQUAL("", f.name()); EXPECT_EQUAL(0x100000u, f.maxLength()); + EXPECT_EQUAL("", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode()); } { - FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789); + FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789); EXPECT_TRUE(f.valid()); EXPECT_EQUAL(7u, f.id()); EXPECT_EQUAL("f0", f.name()); EXPECT_EQUAL(789u, f.maxLength()); EXPECT_EQUAL(789u, f.searcher().maxFieldLength()); + EXPECT_EQUAL("substring", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode()); } } TEST("snippet modifier manager") { FieldSearchSpecMapT specMap; - specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000); - specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000); + specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000); + specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000); IndexFieldMapT indexMap; indexMap["i0"].push_back(0); indexMap["i1"].push_back(1); @@ -822,13 +831,13 @@ TEST("snippet modifier manager") { Query query(StringList().add("i2:foo").add("i2:*bar*")); man.setup(query.qtl, specMap, indexMap, *env.field_paths, env.query_env); { - SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0)); + auto * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0)); UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get(); EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u); EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u); } { - SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1)); + auto * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1)); UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get(); EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u); EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u); @@ -863,4 +872,24 @@ TEST("counting of words") { assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits())); } +vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization"; + +void +verifyNormalization(Normalizing normalizing, size_t expected_len, const char * expected) { + ucs4_t buf[256]; + TokenizeReader reader(reinterpret_cast<const search::byte *>(NormalizationInput.c_str()), NormalizationInput.size(), buf); + while (reader.hasNext()) { + reader.normalize(reader.next(), normalizing); + } + size_t len = reader.complete(); + EXPECT_EQUAL(expected_len, len); + EXPECT_EQUAL(0, Fast_UnicodeUtil::utf8cmp(expected, buf)); +} + +TEST("test normalizing") { + verifyNormalization(Normalizing::NONE, 52, NormalizationInput.c_str()); + verifyNormalization(Normalizing::LOWERCASE, 52, "test that somehing happens with during nårmølization"); + verifyNormalization(Normalizing::LOWERCASE_AND_FOLD, 54, "test that somehing happens with during naarmoelization"); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/streamingvisitors/src/tests/textutil/textutil_test.cpp b/streamingvisitors/src/tests/textutil/textutil_test.cpp index b926444e4df..f7f340a2182 100644 --- a/streamingvisitors/src/tests/textutil/textutil_test.cpp +++ b/streamingvisitors/src/tests/textutil/textutil_test.cpp @@ -2,7 +2,6 @@ #include <vespa/vespalib/testkit/testapp.h> #include <vespa/fastlib/text/normwordfolder.h> -#include <vespa/searchlib/query/base.h> #include <vespa/vsm/searcher/fold.h> #include <vespa/vsm/searcher/futf8strchrfieldsearcher.h> #include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> @@ -66,7 +65,7 @@ TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V & const byte * srcbuf = reinterpret_cast<const byte *>(input); auto dstbuf = std::make_unique<ucs4_t[]>(len + 1); auto offsets = std::make_unique<size_t[]>(len + 1); - UTF8StrChrFieldSearcher fs; + UTF8StrChrFieldSearcher fs(0); BW bw(dstbuf.get(), offsets.get()); size_t dstlen = fs.skipSeparators(srcbuf, len, bw); EXPECT_EQUAL(dstlen, expdstbuf.size()); diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h index 8c1c3771917..38d0e942fbc 100644 --- a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h +++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h @@ -17,15 +17,32 @@ private: search::fef::SimpleTermData _termData; public: QueryTermData * clone() const override { return new QueryTermData(); } - search::fef::SimpleTermData &getTermData() { return _termData; } + search::fef::SimpleTermData &getTermData() noexcept { return _termData; } +}; + +class SearchMethodInfo { +public: + using Normalizing = search::streaming::Normalizing; + virtual ~SearchMethodInfo() = default; + virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0; + virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0; }; class QueryTermDataFactory final : public search::streaming::QueryNodeResultFactory { public: + using Normalizing = search::streaming::Normalizing; + QueryTermDataFactory(const SearchMethodInfo * searchMethodInfo) noexcept : _searchMethodInfo(searchMethodInfo) {} std::unique_ptr<search::streaming::QueryNodeResultBase> create() const override { return std::make_unique<QueryTermData>(); } - bool getRewriteFloatTerms() const override { return true; } + Normalizing normalizing_mode(vespalib::stringref index) const noexcept override { + return _searchMethodInfo ? _searchMethodInfo->normalizing_mode(index) : Normalizing::LOWERCASE_AND_FOLD; + } + bool allow_float_terms_rewrite(vespalib::stringref index ) const noexcept override { + return _searchMethodInfo && _searchMethodInfo->is_text_matching(index); + } +private: + const SearchMethodInfo * _searchMethodInfo; }; diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp index 4d31c71c0a0..cdd1a018d84 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -42,6 +42,7 @@ using search::aggregation::HitsAggregationResult; using search::attribute::IAttributeVector; using search::expression::ConfigureStaticParams; using search::streaming::Query; +using search::streaming::Normalizing; using search::streaming::QueryTermList; using storage::StorageComponent; using storage::VisitorEnvironment; @@ -91,7 +92,7 @@ ForceWordfolderInit::ForceWordfolderInit() Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION); } -static ForceWordfolderInit _G_forceNormWordFolderInit; +static ForceWordfolderInit G_forceNormWordFolderInit; // Leftovers from FS4 protocol with limited use here. enum queryflags { @@ -238,14 +239,16 @@ SearchVisitor::SummaryGenerator::fillSummary(AttributeVector::DocId lid, const H return {}; } -void SearchVisitor::HitsResultPreparator::execute(vespalib::Identifiable & obj) +void +SearchVisitor::HitsResultPreparator::execute(vespalib::Identifiable & obj) { auto & hitsAggr(static_cast<HitsAggregationResult &>(obj)); hitsAggr.setSummaryGenerator(_summaryGenerator); _numHitsAggregators++; } -bool SearchVisitor::HitsResultPreparator::check(const vespalib::Identifiable & obj) const +bool +SearchVisitor::HitsResultPreparator::check(const vespalib::Identifiable & obj) const { return obj.getClass().inherits(HitsAggregationResult::classId); } @@ -259,7 +262,8 @@ SearchVisitor::GroupingEntry::GroupingEntry(Grouping * grouping) : SearchVisitor::GroupingEntry::~GroupingEntry() = default; -void SearchVisitor::GroupingEntry::aggregate(const document::Document & doc, search::HitRank rank) +void +SearchVisitor::GroupingEntry::aggregate(const document::Document & doc, search::HitRank rank) { if (_count < _limit) { _grouping->aggregate(doc, rank); @@ -310,7 +314,58 @@ SearchVisitor::SearchVisitor(StorageComponent& component, LOG(debug, "Created SearchVisitor"); } -void SearchVisitor::init(const Parameters & params) +bool +SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept { + StringFieldIdTMap fieldIdMap; + _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap); + for (const auto & fieldId : fieldIdMap.map()) { + auto found = _fieldSearchSpecMap.specMap().find(fieldId.second); + if ((found != _fieldSearchSpecMap.specMap().end()) && found->second.uses_string_search_method()) { + return true; + } + } + return false; +} + +namespace { + +uint32_t +count_normalize_lowercase(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { + size_t count = 0; + for (const auto & fieldId : fieldIdMap.map()) { + auto found = specMap.find(fieldId.second); + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::LOWERCASE) { + count++; + } + } + return count; +} + +uint32_t +count_normalize_none(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { + size_t count = 0; + for (const auto & fieldId : fieldIdMap.map()) { + auto found = specMap.find(fieldId.second); + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::NONE) { + count++; + } + } + return count; +} + +} + +SearchMethodInfo::Normalizing +SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept { + StringFieldIdTMap fieldIdMap; + _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap); + if (count_normalize_none(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::NONE; + if (count_normalize_lowercase(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::LOWERCASE; + return Normalizing::LOWERCASE_AND_FOLD; +} + +void +SearchVisitor::init(const Parameters & params) { VISITOR_TRACE(6, "About to lazily init VSM adapter"); _attrMan.add(_documentIdAttributeBacking); @@ -397,7 +452,14 @@ void SearchVisitor::init(const Parameters & params) if ( params.lookup("query", queryBlob) ) { LOG(spam, "Received query blob of %zu bytes", queryBlob.size()); VISITOR_TRACE(9, vespalib::make_string("Setting up for query blob of %zu bytes", queryBlob.size())); - QueryTermDataFactory addOnFactory; + // Create mapping from field name to field id, from field id to search spec, + // and from index name to list of field ids + _fieldSearchSpecMap.buildFromConfig(_env->get_vsm_fields_config()); + auto additionalFields = registerAdditionalFields(_env->get_docsum_tools()->getFieldSpecs()); + // Add extra elements to mapping from field name to field id + _fieldSearchSpecMap.buildFromConfig(additionalFields); + + QueryTermDataFactory addOnFactory(this); _query = Query(addOnFactory, vespalib::stringref(queryBlob.data(), queryBlob.size())); _searchBuffer->reserve(0x10000); @@ -408,19 +470,11 @@ void SearchVisitor::init(const Parameters & params) LOG(warning, "Request without query stack count"); } - std::vector<vespalib::string> additionalFields; - registerAdditionalFields(_env->get_docsum_tools()->getFieldSpecs(), additionalFields); - - StringFieldIdTMap fieldsInQuery; - setupFieldSearchers(additionalFields, fieldsInQuery); - - + StringFieldIdTMap fieldsInQuery = setupFieldSearchers(); setupScratchDocument(fieldsInQuery); - _syntheticFieldsController.setup(_fieldSearchSpecMap.nameIdMap(), fieldsInQuery); setupAttributeVectors(); - setupAttributeVectorsForSorting(_sortSpec); _rankController.setRankManagerSnapshot(_env->get_rank_manager_snapshot()); @@ -436,7 +490,6 @@ void SearchVisitor::init(const Parameters & params) // This depends on _fieldPathMap (from setupScratchDocument), // and IQueryEnvironment (from setupRankProcessors). prepare_field_searchers(); - } else { LOG(warning, "No query received"); } @@ -529,10 +582,7 @@ SearchVisitor::PositionInserter::PositionInserter(AttributeVector & attribute, A SearchVisitor::PositionInserter::~PositionInserter() = default; void -SearchVisitor::PositionInserter::onPrimitive(uint32_t, const Content & c) -{ - (void) c; -} +SearchVisitor::PositionInserter::onPrimitive(uint32_t, const Content &) { } void SearchVisitor::PositionInserter::onStructStart(const Content & c) @@ -605,7 +655,6 @@ SearchVisitor::RankController::setupRankProcessors(Query & query, { _rankSetup = &_rankManagerSnapshot->getRankSetup(_rankProfile); _rankProcessor = std::make_unique<RankProcessor>(_rankManagerSnapshot, _rankProfile, query, location, _queryProperties, &attrMan); - LOG(debug, "Initialize rank processor"); _rankProcessor->initForRanking(wantedHitCount); // register attribute vectors needed for ranking processAccessedAttributes(_rankProcessor->get_real_query_env(), true, attrMan, attributeFields); @@ -637,8 +686,7 @@ SearchVisitor::RankController::rankMatchedDocument(uint32_t docId) { _rankProcessor->runRankProgram(docId); LOG(debug, "Rank score for matched document %u: %f", - docId, - _rankProcessor->getRankScore()); + docId, _rankProcessor->getRankScore()); if (_dumpFeatures) { _dumpProcessor->runRankProgram(docId); // we must transfer the score to this match data to make sure that the same hits @@ -718,9 +766,8 @@ SearchVisitor::SyntheticFieldsController::setup(const StringFieldIdTMap & fieldR } void -SearchVisitor::SyntheticFieldsController::onDocument(StorageDocument & document) +SearchVisitor::SyntheticFieldsController::onDocument(StorageDocument &) { - (void) document; } void @@ -730,10 +777,10 @@ SearchVisitor::SyntheticFieldsController::onDocumentMatch(StorageDocument & docu document.setField(_documentIdFId, std::make_unique<document::StringFieldValue>(documentId)); } -void -SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec, - std::vector<vespalib::string> & fieldList) +std::vector<vespalib::string> +SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec) { + std::vector<vespalib::string> fieldList; for (const vsm::DocsumTools::FieldSpec & spec : docsumSpec) { fieldList.push_back(spec.getOutputName()); const std::vector<vespalib::string> & inputNames = spec.getInputNames(); @@ -748,25 +795,20 @@ SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::Fiel fieldList.emplace_back("[docid]"); fieldList.emplace_back("[rank]"); fieldList.emplace_back("documentid"); + return fieldList; } -void -SearchVisitor::setupFieldSearchers(const std::vector<vespalib::string> & additionalFields, - StringFieldIdTMap & fieldsInQuery) +StringFieldIdTMap +SearchVisitor::setupFieldSearchers() { - // Create mapping from field name to field id, from field id to search spec, - // and from index name to list of field ids - _fieldSearchSpecMap.buildFromConfig(_env->get_vsm_fields_config()); - // Add extra elements to mapping from field name to field id - _fieldSearchSpecMap.buildFromConfig(additionalFields); - // Reconfig field searchers based on the query _fieldSearchSpecMap.reconfigFromQuery(_query); // Map field name to field id for all fields in the query - _fieldSearchSpecMap.buildFieldsInQuery(_query, fieldsInQuery); + StringFieldIdTMap fieldsInQuery = _fieldSearchSpecMap.buildFieldsInQuery(_query); // Connect field names in the query to field searchers _fieldSearchSpecMap.buildSearcherMap(fieldsInQuery.map(), _fieldSearcherMap); + return fieldsInQuery; } void @@ -947,8 +989,7 @@ class SingleDocumentStore : public vsm::IDocSumCache { public: explicit SingleDocumentStore(const StorageDocument & doc) : _doc(doc) { } - const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override { - (void) docId; + const vsm::Document & getDocSum(const search::DocumentIdT &) const override { return _doc; } private: @@ -959,19 +1000,12 @@ bool SearchVisitor::compatibleDocumentTypes(const document::DocumentType& typeA, const document::DocumentType& typeB) { - if (&typeA == &typeB) { - return true; - } else { - return (typeA.getName() == typeB.getName()); - } + return (&typeA == &typeB) || (typeA.getName() == typeB.getName()); } void -SearchVisitor::handleDocuments(const document::BucketId&, - DocEntryList & entries, - HitCounter& hitCounter) +SearchVisitor::handleDocuments(const document::BucketId&, DocEntryList & entries, HitCounter& ) { - (void) hitCounter; if (!_init_called) { init(_params); } @@ -1016,37 +1050,25 @@ SearchVisitor::handleDocument(StorageDocument & document) RankProcessor & rp = *_rankController.getRankProcessor(); vespalib::string documentId(document.docDoc().getId().getScheme().toString()); LOG(debug, "Matched document with id '%s'", documentId.c_str()); - document.setDocId(rp.getDocId()); - fillAttributeVectors(documentId, document); - _rankController.rankMatchedDocument(rp.getDocId()); - if (_shouldFillRankAttribute) { _rankAttribute.add(rp.getRankScore()); } - if (_rankController.keepMatchedDocument()) { - bool amongTheBest = _rankController.collectMatchedDocument(!_sortList.empty(), *this, _tmpSortBuffer, &document); - _syntheticFieldsController.onDocumentMatch(document, documentId); - SingleDocumentStore single(document); _summaryGenerator.setDocsumCache(single); group(document.docDoc(), rp.getRankScore(), false); - if (amongTheBest) { needToKeepDocument = true; } - } else { _hitsRejectedCount++; LOG(debug, "Do not keep document with id '%s' because rank score (%f) <= rank score drop limit (%f)", - documentId.c_str(), - rp.getRankScore(), - _rankController.getRankSetup()->getRankScoreDropLimit()); + documentId.c_str(), rp.getRankScore(), _rankController.getRankSetup()->getRankScoreDropLimit()); } } else { LOG(debug, "Did not match document with id '%s'", document.docDoc().getId().getScheme().toString().c_str()); @@ -1145,7 +1167,8 @@ SearchVisitor::fillSortBuffer() return pos; } -void SearchVisitor::completedBucket(const document::BucketId&, HitCounter&) +void +SearchVisitor::completedBucket(const document::BucketId&, HitCounter&) { LOG(debug, "Completed bucket"); } @@ -1157,7 +1180,8 @@ SearchVisitor::generate_query_result(HitCounter& counter) return std::move(_queryResult); } -void SearchVisitor::completedVisitingInternal(HitCounter& hitCounter) +void +SearchVisitor::completedVisitingInternal(HitCounter& hitCounter) { if (!_init_called) { init(_params); diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h index ef7a41f23a5..ce40b5ba742 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h @@ -8,6 +8,7 @@ #include "rankmanager.h" #include "rankprocessor.h" #include "searchenvironment.h" +#include "querytermdata.h" #include <vespa/vsm/common/docsum.h> #include <vespa/vsm/common/documenttypemapping.h> #include <vespa/vsm/common/storagedocument.h> @@ -42,7 +43,8 @@ class SearchEnvironmentSnapshot; * @brief Visitor that applies a search query to visitor data and * converts them to a QueryResultCommand. **/ -class SearchVisitor : public storage::Visitor { +class SearchVisitor : public storage::Visitor, + public SearchMethodInfo { public: SearchVisitor(storage::StorageComponent&, storage::VisitorEnvironment& vEnv, const vdslib::Parameters & params); @@ -253,19 +255,15 @@ private: * @param docsumSpec config with the field names used by the docsum setup. * @param fieldList list of field names that are built. **/ - static void registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec, - std::vector<vespalib::string> & fieldList); + static std::vector<vespalib::string> registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec); /** * Setup the field searchers used when matching the query with the stream of documents. * This includes setting up various mappings in FieldSearchSpecMap and building mapping * for fields used by the query. * - * @param additionalFields list of additional field names used when setting up the mappings. - * @param fieldsInQuery mapping from field name to field id that are built based on the query. **/ - void setupFieldSearchers(const std::vector<vespalib::string> & additionalFields, - vsm::StringFieldIdTMap & fieldsInQuery); + vsm::StringFieldIdTMap setupFieldSearchers(); /** * Prepare the field searchers for the given query. @@ -488,6 +486,8 @@ private: vsm::StringFieldIdTMapT _fieldsUnion; void setupAttributeVector(const vsm::FieldPath &fieldPath); + bool is_text_matching(vespalib::stringref index) const noexcept override; + Normalizing normalizing_mode(vespalib::stringref index) const noexcept override; }; class SearchVisitorFactory : public storage::VisitorFactory { diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def index 442a044d38f..dac732013d2 100644 --- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def +++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def @@ -14,6 +14,7 @@ fieldspec[].name string ## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected. fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8 fieldspec[].arg1 string default="" +fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD ## Maximum number of chars to search per field. fieldspec[].maxlength int default=1048576 diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt index 1a9238346b0..40aad418b22 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt +++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt @@ -17,6 +17,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT intfieldsearcher.cpp nearest_neighbor_field_searcher.cpp strchrfieldsearcher.cpp + tokenizereader.cpp utf8flexiblestringfieldsearcher.cpp utf8strchrfieldsearcher.cpp utf8stringfieldsearcherbase.cpp diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h index c7e7d2e74bd..3708cca85fb 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h @@ -9,8 +9,8 @@ class BoolFieldSearcher : public FieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - BoolFieldSearcher(FieldIdT fId); - ~BoolFieldSearcher(); + explicit BoolFieldSearcher(FieldIdT fId); + ~BoolFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index c797e6751ee..5e06ae41a03 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -23,79 +23,54 @@ class force force() { FieldSearcher::init(); } }; -static force __forceInit; +static force ForceInit; byte FieldSearcher::_foldLowCase[256]; byte FieldSearcher::_wordChar[256]; -FieldSearcherBase::FieldSearcherBase() : - _qtl(), - _qtlFastBuffer(), - _qtlFastSize(0), - _qtlFast(nullptr) +FieldSearcherBase::FieldSearcherBase() noexcept + : _qtl() { } -FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) : - _qtl(), - _qtlFastBuffer(), - _qtlFastSize(0), - _qtlFast(nullptr) +FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) + : _qtl() { prepare(org._qtl); } -FieldSearcherBase::~FieldSearcherBase() -{ -} - -FieldSearcherBase & FieldSearcherBase::operator = (const FieldSearcherBase & org) -{ - if (this != &org) { - prepare(org._qtl); - } - return *this; -} +FieldSearcherBase::~FieldSearcherBase() = default; -void FieldSearcherBase::prepare(const QueryTermList & qtl) +void +FieldSearcherBase::prepare(const QueryTermList & qtl) { _qtl = qtl; - _qtlFastBuffer.resize(sizeof(*_qtlFast)*(_qtl.size()+1), 0x13); - _qtlFast = reinterpret_cast<v16qi *>(reinterpret_cast<unsigned long>(&_qtlFastBuffer[0]+15) & ~0xf); - _qtlFastSize = 0; - for (auto qt : _qtl) { - memcpy(&_qtlFast[_qtlFastSize++], qt->getTerm(), std::min(size_t(16), qt->termLen())); - } } -FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) : - FieldSearcherBase(), - _field(fId), - _matchType(defaultPrefix ? PREFIX : REGULAR), - _maxFieldLength(0x100000), - _currentElementId(0), - _currentElementWeight(1), - _pureUsAsciiCount(0), - _pureUsAsciiFieldCount(0), - _anyUtf8Count(0), - _anyUtf8FieldCount(0), - _words(0), - _badUtf8Count(0), - _zeroCount(0) +FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept + : FieldSearcherBase(), + _field(fId), + _matchType(defaultPrefix ? PREFIX : REGULAR), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), + _maxFieldLength(0x100000), + _currentElementId(0), + _currentElementWeight(1), + _words(0), + _badUtf8Count(0) { - zeroStat(); } FieldSearcher::~FieldSearcher() = default; -bool FieldSearcher::search(const StorageDocument & doc) +bool +FieldSearcher::search(const StorageDocument & doc) { for (auto qt : _qtl) { QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field()); fInfo.setHitOffset(qt->getHitList().size()); } onSearch(doc); - for(auto qt : _qtl) { + for (auto qt : _qtl) { QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field()); fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset()); fInfo.setFieldLength(_words); @@ -104,16 +79,16 @@ bool FieldSearcher::search(const StorageDocument & doc) return true; } -void FieldSearcher::prepare(QueryTermList& qtl, - const SharedSearcherBuf&, - const vsm::FieldPathMapT&, - search::fef::IQueryEnvironment&) +void +FieldSearcher::prepare(QueryTermList& qtl, const SharedSearcherBuf&, + const vsm::FieldPathMapT&, search::fef::IQueryEnvironment&) { FieldSearcherBase::prepare(qtl); prepareFieldId(); } -size_t FieldSearcher::countWords(const FieldRef & f) +size_t +FieldSearcher::countWords(const FieldRef & f) { size_t words = 0; const char * n = f.data(); @@ -129,36 +104,16 @@ size_t FieldSearcher::countWords(const FieldRef & f) return words; } -void FieldSearcher::prepareFieldId() +void +FieldSearcher::prepareFieldId() { for(auto qt : _qtl) { qt->resizeFieldId(field()); } } -void FieldSearcher::addStat(const FieldSearcher & toAdd) -{ - _pureUsAsciiCount += toAdd._pureUsAsciiCount; - _pureUsAsciiFieldCount += toAdd._pureUsAsciiFieldCount; - _anyUtf8Count += toAdd._anyUtf8Count; - _anyUtf8FieldCount += toAdd._anyUtf8FieldCount; - _badUtf8Count += toAdd._badUtf8Count; - _zeroCount += toAdd._zeroCount; - for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] += toAdd._utf8Count[i]; } -} - -void FieldSearcher::zeroStat() -{ - _pureUsAsciiCount = 0; - _pureUsAsciiFieldCount = 0; - _anyUtf8Count = 0; - _anyUtf8FieldCount = 0; - _badUtf8Count = 0; - _zeroCount = 0; - for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] = 0; } -} - -void FieldSearcher::init() +void +FieldSearcher::init() { for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) { _foldLowCase[i] = 0; @@ -182,50 +137,59 @@ void FieldSearcher::init() _wordChar[0xd7] = 0; _wordChar[0xf7] = 0; - if (1) /* _doAccentRemoval */ { - _foldLowCase[0xc0] = 'a'; - _foldLowCase[0xc1] = 'a'; - _foldLowCase[0xc2] = 'a'; - _foldLowCase[0xc3] = 'a'; // A tilde - _foldLowCase[0xc7] = 'c'; - _foldLowCase[0xc8] = 'e'; - _foldLowCase[0xc9] = 'e'; - _foldLowCase[0xca] = 'e'; - _foldLowCase[0xcb] = 'e'; - _foldLowCase[0xcc] = 'i'; // I grave - _foldLowCase[0xcd] = 'i'; - _foldLowCase[0xce] = 'i'; - _foldLowCase[0xcf] = 'i'; - _foldLowCase[0xd3] = 'o'; - _foldLowCase[0xd4] = 'o'; - _foldLowCase[0xda] = 'u'; - _foldLowCase[0xdb] = 'u'; - - _foldLowCase[0xe0] = 'a'; - _foldLowCase[0xe1] = 'a'; - _foldLowCase[0xe2] = 'a'; - _foldLowCase[0xe3] = 'a'; // a tilde - _foldLowCase[0xe7] = 'c'; - _foldLowCase[0xe8] = 'e'; - _foldLowCase[0xe9] = 'e'; - _foldLowCase[0xea] = 'e'; - _foldLowCase[0xeb] = 'e'; - _foldLowCase[0xec] = 'i'; // i grave - _foldLowCase[0xed] = 'i'; - _foldLowCase[0xee] = 'i'; - _foldLowCase[0xef] = 'i'; - _foldLowCase[0xf3] = 'o'; - _foldLowCase[0xf4] = 'o'; - _foldLowCase[0xfa] = 'u'; - _foldLowCase[0xfb] = 'u'; - } + _foldLowCase[0xc0] = 'a'; + _foldLowCase[0xc1] = 'a'; + _foldLowCase[0xc2] = 'a'; + _foldLowCase[0xc3] = 'a'; + _foldLowCase[0xc7] = 'c'; + _foldLowCase[0xc8] = 'e'; + _foldLowCase[0xc9] = 'e'; + _foldLowCase[0xca] = 'e'; + _foldLowCase[0xcb] = 'e'; + _foldLowCase[0xcc] = 'i'; + _foldLowCase[0xcd] = 'i'; + _foldLowCase[0xce] = 'i'; + _foldLowCase[0xcf] = 'i'; + _foldLowCase[0xd1] = 'n'; + _foldLowCase[0xd2] = 'o'; + _foldLowCase[0xd3] = 'o'; + _foldLowCase[0xd4] = 'o'; + _foldLowCase[0xd5] = 'o'; + _foldLowCase[0xd9] = 'u'; + _foldLowCase[0xda] = 'u'; + _foldLowCase[0xdb] = 'u'; + _foldLowCase[0xdc] = 'u'; + _foldLowCase[0xdd] = 'y'; + _foldLowCase[0xe0] = 'a'; + _foldLowCase[0xe1] = 'a'; + _foldLowCase[0xe2] = 'a'; + _foldLowCase[0xe3] = 'a'; + _foldLowCase[0xe7] = 'c'; + _foldLowCase[0xe8] = 'e'; + _foldLowCase[0xe9] = 'e'; + _foldLowCase[0xea] = 'e'; + _foldLowCase[0xeb] = 'e'; + _foldLowCase[0xec] = 'i'; + _foldLowCase[0xed] = 'i'; + _foldLowCase[0xee] = 'i'; + _foldLowCase[0xef] = 'i'; + _foldLowCase[0xf1] = 'n'; + _foldLowCase[0xf2] = 'o'; + _foldLowCase[0xf3] = 'o'; + _foldLowCase[0xf4] = 'o'; + _foldLowCase[0xf5] = 'o'; + _foldLowCase[0xf9] = 'u'; + _foldLowCase[0xfa] = 'u'; + _foldLowCase[0xfb] = 'u'; + _foldLowCase[0xfc] = 'u'; + _foldLowCase[0xfd] = 'y'; + _foldLowCase[0xff] = 'y'; } -void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, - const SharedSearcherBuf& searcherBuf, - Query& query, - const vsm::FieldPathMapT& field_paths, - search::fef::IQueryEnvironment& query_env) +void +FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf, + Query& query, const vsm::FieldPathMapT& field_paths, + search::fef::IQueryEnvironment& query_env) { QueryTermList qtl; query.getLeaves(qtl); @@ -269,7 +233,8 @@ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, LOG(debug, "Will search in %s", tmp.c_str()); } -bool FieldSearcher::onSearch(const StorageDocument & doc) +bool +FieldSearcher::onSearch(const StorageDocument & doc) { bool retval(true); size_t fNo(field()); @@ -296,10 +261,10 @@ FieldSearcher::IteratorHandler::onCollectionStart(const Content & c) const document::FieldValue & fv = c.getValue(); LOG(spam, "onCollectionStart: field value '%s'", fv.toString().c_str()); if (fv.isA(document::FieldValue::Type::ARRAY)) { - const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(fv); + const auto & afv = static_cast<const document::ArrayFieldValue &>(fv); LOG(spam, "onCollectionStart: Array size = '%zu'", afv.size()); } else if (fv.isA(document::FieldValue::Type::WSET)) { - const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv); + const auto & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv); LOG(spam, "onCollectionStart: WeightedSet size = '%zu'", wsfv.size()); } } @@ -311,5 +276,4 @@ FieldSearcher::IteratorHandler::onStructStart(const Content & c) _searcher.onStructValue(static_cast<const document::StructFieldValue &>(c.getValue())); } - } diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index e79dacf827e..c5bca6f3899 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -14,77 +14,59 @@ namespace vsm { using termcount_t = size_t; using termsize_t = size_t; -#if defined(COLLECT_CHAR_STAT) - #define NEED_CHAR_STAT(a) { a; } -#else - #define NEED_CHAR_STAT(a) -#endif - using ucs4_t = uint32_t; using cmptype_t = ucs4_t; using SearcherBuf = vespalib::Array<cmptype_t>; using SharedSearcherBuf = std::shared_ptr<SearcherBuf>; -using CharVector = std::vector<char>; class FieldSearcherBase { protected: - search::streaming::QueryTermList _qtl; -private: - CharVector _qtlFastBuffer; -protected: - FieldSearcherBase(); + FieldSearcherBase() noexcept; FieldSearcherBase(const FieldSearcherBase & org); - virtual ~FieldSearcherBase(void); - FieldSearcherBase & operator = (const FieldSearcherBase & org); + virtual ~FieldSearcherBase(); + FieldSearcherBase & operator = (const FieldSearcherBase & org) = delete; void prepare(const search::streaming::QueryTermList & qtl); - size_t _qtlFastSize; - search::v16qi *_qtlFast; +protected: + search::streaming::QueryTermList _qtl; }; class FieldSearcher : public FieldSearcherBase { public: + using Normalizing = search::streaming::Normalizing; enum MatchType { REGULAR, PREFIX, SUBSTRING, SUFFIX, - EXACT + EXACT, }; - FieldSearcher(FieldIdT fId, bool defaultPrefix=false); + explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {} + FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept; ~FieldSearcher() override; virtual std::unique_ptr<FieldSearcher> duplicate() const = 0; bool search(const StorageDocument & doc); - virtual void prepare(search::streaming::QueryTermList& qtl, - const SharedSearcherBuf& buf, - const vsm::FieldPathMapT& field_paths, - search::fef::IQueryEnvironment& query_env); - - FieldIdT field() const { return _field; } - void field(FieldIdT v) { _field = v; prepareFieldId(); } - bool prefix() const { return _matchType == PREFIX; } - bool substring() const { return _matchType == SUBSTRING; } - bool suffix() const { return _matchType == SUFFIX; } - bool exact() const { return _matchType == EXACT; } - void setMatchType(MatchType mt) { _matchType = mt; } + virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, + const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env); + + FieldIdT field() const noexcept { return _field; } + bool prefix() const noexcept { return _matchType == PREFIX; } + bool substring() const noexcept { return _matchType == SUBSTRING; } + bool suffix() const noexcept { return _matchType == SUFFIX; } + bool exact() const noexcept { return _matchType == EXACT; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + MatchType match_type() const noexcept { return _matchType; } + void match_type(MatchType mt) noexcept { _matchType = mt; } + void normalize_mode(Normalizing mode) noexcept { _normalize_mode = mode; } + void field(FieldIdT v) noexcept { _field = v; prepareFieldId(); } static void init(); static search::byte fold(search::byte c) { return _foldLowCase[c]; } static search::byte iswordchar(search::byte c) { return _wordChar[c]; } static search::byte isspace(search::byte c) { return ! iswordchar(c); } static size_t countWords(const FieldRef & f); - unsigned pureUsAsciiCount() const { return _pureUsAsciiCount; } - unsigned pureUsAsciiFieldCount() const { return _pureUsAsciiFieldCount; } - unsigned anyUtf8Count() const { return _anyUtf8Count; } - unsigned anyUtf8FieldCount() const { return _anyUtf8FieldCount; } - unsigned badUtf8Count() const { return _badUtf8Count; } - unsigned zeroCount() const { return _zeroCount; } - unsigned utf8Count(size_t sz) const { return _utf8Count[1+sz]; } - const unsigned * utf8Count() const { return _utf8Count; } - int32_t getCurrentWeight() const { return _currentElementWeight; } - void addStat(const FieldSearcher & toAdd); - void zeroStat(); + int32_t currentWeight() const { return _currentElementWeight; } FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } size_t maxFieldLength() const { return _maxFieldLength; } @@ -98,7 +80,7 @@ private: void onStructStart(const Content & c) override; public: - IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {} + explicit IteratorHandler(FieldSearcher & searcher) noexcept : _searcher(searcher) {} }; friend class IteratorHandler; // to allow calls to onValue(); @@ -110,33 +92,21 @@ private: virtual void onStructValue(const document::StructFieldValue &) { } FieldIdT _field; MatchType _matchType; + Normalizing _normalize_mode; unsigned _maxFieldLength; uint32_t _currentElementId; int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. - /// Number of bytes in blocks containing pure us-ascii - unsigned _pureUsAsciiCount; - /// Number of blocks containing pure us-ascii - unsigned _pureUsAsciiFieldCount; - /// Number of bytes in blocks containing any non us-ascii - unsigned _anyUtf8Count; - /// Number of blocks containing any non us-ascii - unsigned _anyUtf8FieldCount; protected: /// Number of terms searched. - unsigned _words; + unsigned _words; /// Number of utf8 bytes by utf8 size. - unsigned _utf8Count[6]; - unsigned _badUtf8Count; - unsigned _zeroCount; -protected: - void addPureUsAsciiField(size_t sz) { _pureUsAsciiCount += sz; _pureUsAsciiFieldCount++;; } - void addAnyUtf8Field(size_t sz) { _anyUtf8Count += sz; _anyUtf8FieldCount++; } + unsigned _badUtf8Count; /** * Adds a hit to the given query term. * For each call to onValue() a batch of words are processed, and the position is local to this batch. **/ void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const { - qt.add(_words + pos, field(), _currentElementId, getCurrentWeight()); + qt.add(_words + pos, field(), _currentElementId, _currentElementWeight); } public: static search::byte _foldLowCase[256]; @@ -149,10 +119,8 @@ using FieldIdTSearcherMapT = std::vector<FieldSearcherContainer>; class FieldIdTSearcherMap : public FieldIdTSearcherMapT { public: - void prepare(const DocumentTypeIndexFieldMapT& difm, - const SharedSearcherBuf& searcherBuf, - search::streaming::Query& query, - const vsm::FieldPathMapT& field_paths, + void prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf, + search::streaming::Query& query, const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env); }; diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp index 7dd40348f47..8558522003f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp @@ -37,7 +37,7 @@ void FloatFieldSearcherT<T>::prepare(search::streaming::QueryTermList& qtl, _floatTerm.clear(); FieldSearcher::prepare(qtl, buf, field_paths, query_env); for (auto qt : qtl) { - size_t sz(qt->termLen()); + size_t sz(qt->termLen()); if (sz) { auto range = qt->getRange<T>(); _floatTerm.emplace_back(range.low, range.high, range.valid); diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h index 07b3f6e1c5f..85341472c26 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h @@ -9,8 +9,8 @@ template <typename T> class FloatFieldSearcherT : public FieldSearcher { public: - FloatFieldSearcherT(FieldIdT fId=0); - ~FloatFieldSearcherT(); + explicit FloatFieldSearcherT(FieldIdT fId); + ~FloatFieldSearcherT() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, @@ -42,14 +42,14 @@ class FloatFieldSearcher : public FloatFieldSearcherTF { public: std::unique_ptr<FieldSearcher> duplicate() const override; - FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { } + explicit FloatFieldSearcher(FieldIdT fId) : FloatFieldSearcherTF(fId) { } }; class DoubleFieldSearcher : public FloatFieldSearcherTD { public: std::unique_ptr<FieldSearcher> duplicate() const override; - DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { } + DoubleFieldSearcher(FieldIdT fId) : FloatFieldSearcherTD(fId) { } }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp index a2122f08995..c0b5117d6bf 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp @@ -19,10 +19,6 @@ FUTF8StrChrFieldSearcher::duplicate() const return std::make_unique<FUTF8StrChrFieldSearcher>(*this); } -FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher() - : UTF8StrChrFieldSearcher(), - _folded(4_Ki) -{ } FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StrChrFieldSearcher(fId), _folded(4_Ki) @@ -36,7 +32,7 @@ FUTF8StrChrFieldSearcher::ansiFold(const char * toFold, size_t sz, char * folded for(size_t i=0; i < sz; i++) { byte c = toFold[i]; if (c>=128) { retval = false; break; } - folded[i] = FieldSearcher::_foldLowCase[c]; + folded[i] = fold(c); } return retval; } @@ -209,7 +205,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) folded[f.size()+1] = 0x01; memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values return match(folded, f.size(), qt); - NEED_CHAR_STAT(addPureUsAsciiField(f.size())); } else { return UTF8StrChrFieldSearcher::matchTerm(f, qt); } @@ -227,7 +222,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t min folded[f.size()+1] = 0x01; memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values return match(folded, f.size(), mintsz, &_qtl[0], _qtl.size()); - NEED_CHAR_STAT(addPureUsAsciiField(f.size())); } else { return UTF8StrChrFieldSearcher::matchTerms(f, mintsz); } diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h index 5d5ca3d6c3c..b8aa287070a 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h @@ -9,15 +9,14 @@ class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - FUTF8StrChrFieldSearcher(); - FUTF8StrChrFieldSearcher(FieldIdT fId); + explicit FUTF8StrChrFieldSearcher(FieldIdT fId); ~FUTF8StrChrFieldSearcher() override; static bool ansiFold(const char * toFold, size_t sz, char * folded); static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart); static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart); private: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef&, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef&, size_t shortestTerm) override; virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt); size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize); std::vector<char> _folded; diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h index 741148fbca1..17c9f23fefb 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h @@ -8,8 +8,8 @@ namespace vsm { class GeoPosFieldSearcher : public FieldSearcher { public: - GeoPosFieldSearcher(FieldIdT fId=0); - ~GeoPosFieldSearcher(); + GeoPosFieldSearcher(FieldIdT fId); + ~GeoPosFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, @@ -21,7 +21,7 @@ protected: using GeoLocation = search::common::GeoLocation; class GeoPosInfo : public GeoLocation { public: - GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} + explicit GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} bool cmp(const document::StructFieldValue & fv) const; }; using GeoPosInfoListT = std::vector<GeoPosInfo>; diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h index 47b83c1538d..9c63d31e3c3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h @@ -9,8 +9,8 @@ class IntFieldSearcher : public FieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - IntFieldSearcher(FieldIdT fId=0); - ~IntFieldSearcher(); + explicit IntFieldSearcher(FieldIdT fId); + ~IntFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp index 76fedbd1166..816317bf86d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp @@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv) } DistanceMetric -NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value) +NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value) { // Valid string values must match the definition of DistanceMetric in // config-model/src/main/java/com/yahoo/schema/document/Attribute.java - auto v = value; + vespalib::string v = value; std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c) { return std::tolower(c); }); try { return DistanceMetricUtils::to_distance_metric(v); } catch (vespalib::IllegalStateException&) { - vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str()); + vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", v.c_str()); return DistanceMetric::Euclidean; } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h index 5629b443c78..ecdc64d1336 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h @@ -11,10 +11,7 @@ #include <vespa/searchlib/tensor/tensor_ext_attribute.h> namespace search::fef { class IQueryEnvironment; } - -namespace search::tensor { -class TensorExtAttribute; -} +namespace search::tensor { class TensorExtAttribute; } namespace vsm { @@ -43,7 +40,7 @@ private: public: NearestNeighborFieldSearcher(FieldIdT fid, search::attribute::DistanceMetric metric); - ~NearestNeighborFieldSearcher(); + ~NearestNeighborFieldSearcher() override; std::unique_ptr<FieldSearcher> duplicate() const override; void prepare(search::streaming::QueryTermList& qtl, @@ -52,7 +49,7 @@ public: search::fef::IQueryEnvironment& query_env) override; void onValue(const document::FieldValue& fv) override; - static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value); + static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value); }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h index 9ad76712092..19c723d060d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h @@ -8,8 +8,7 @@ namespace vsm { class StrChrFieldSearcher : public FieldSearcher { public: - StrChrFieldSearcher() : FieldSearcher(0) { } - StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } + explicit StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } void onValue(const document::FieldValue & fv) override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, @@ -19,7 +18,7 @@ private: size_t shortestTerm() const; bool matchDoc(const FieldRef & field); virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0; + virtual size_t matchTerms(const FieldRef & f, size_t shortestTerm) = 0; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp new file mode 100644 index 00000000000..d8a6091fe11 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp @@ -0,0 +1,21 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokenizereader.h" + +namespace vsm { + +void +TokenizeReader::fold(ucs4_t c) { + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); + if (repl != nullptr) { + size_t repllen = strlen(repl); + if (repllen > 0) { + _q = Fast_UnicodeUtil::ucs4copy(_q,repl); + } + } else { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h new file mode 100644 index 00000000000..f10c8910e82 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h @@ -0,0 +1,54 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/searchlib/query/streaming/querynoderesultbase.h> +#include <vespa/searchlib/query/base.h> +#include <vespa/fastlib/text/normwordfolder.h> + +namespace vsm { + +/** + * Handles tokenization of utf8 input with on the fly normalization. + * It handles Normalizing::NONE, Normalizing::LOWERCASE, and Normalizing::LOWERCASE_AND_FOLD + */ +class TokenizeReader { +public: + using byte = search::byte; + using Normalizing = search::streaming::Normalizing; + TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept + : _p(p), + _p_end(p + len), + _q(q), + _q_start(q) + {} + ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } + void normalize(ucs4_t c, Normalizing normalize_mode) { + switch (normalize_mode) { + case Normalizing::LOWERCASE: + c = Fast_NormalizeWordFolder::lowercase(c); + [[fallthrough]]; + case Normalizing::NONE: + *_q++ = c; + break; + case Normalizing::LOWERCASE_AND_FOLD: + fold(c); + break; + } + } + bool hasNext() const noexcept { return _p < _p_end; } + const byte * p() const noexcept { return _p; } + size_t complete() noexcept { + *_q = 0; + size_t token_len = _q - _q_start; + _q = _q_start; + return token_len; + } +private: + void fold(ucs4_t c); + const byte *_p; + const byte *_p_end; + ucs4_t *_q; + ucs4_t *_q_start; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp index 724efb54331..70cef08428a 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp @@ -7,6 +7,13 @@ using search::streaming::QueryTermList; namespace vsm { +UTF8ExactStringFieldSearcher::UTF8ExactStringFieldSearcher(FieldIdT fId) + : UTF8StringFieldSearcherBase(fId) +{ + match_type(EXACT); + normalize_mode(Normalizing::LOWERCASE); +} + std::unique_ptr<FieldSearcher> UTF8ExactStringFieldSearcher::duplicate() const { @@ -14,7 +21,7 @@ UTF8ExactStringFieldSearcher::duplicate() const } size_t -UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; for (auto qt : _qtl) { diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h index 997bed74787..9f590156a96 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h @@ -1,10 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> +#include "utf8stringfieldsearcherbase.h" -namespace vsm -{ +namespace vsm { /** * This class does suffix utf8 searches. @@ -12,14 +11,12 @@ namespace vsm class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase { protected: - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8ExactStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + explicit UTF8ExactStringFieldSearcher(FieldIdT fId); }; } - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp index 655b068e152..78f491198ad 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp @@ -58,10 +58,6 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) } } -UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() : - UTF8StringFieldSearcherBase() -{ } - UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h index 5eee6a8862a..bb1b55dffe4 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h @@ -1,10 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> +#include "utf8stringfieldsearcherbase.h" -namespace vsm -{ +namespace vsm { /** * This class does utf8 searches based on the query term type. @@ -17,18 +16,17 @@ private: * Tries to match the given query term against the content of the given field reference. * Search strategy is choosen based on the query term type. **/ - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; /** * Tries to match each query term in the underlying query against the content of the given field reference. * Search strategy is choosen based on the query term type. **/ - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8FlexibleStringFieldSearcher(); - UTF8FlexibleStringFieldSearcher(FieldIdT fId); + explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId); }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index 2488d198b03..37dc4ffb99c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8strchrfieldsearcher.h" +#include "tokenizereader.h" using search::streaming::QueryTerm; using search::streaming::QueryTermList; @@ -14,21 +15,19 @@ UTF8StrChrFieldSearcher::duplicate() const } size_t -UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - const byte * e = n + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); @@ -42,7 +41,6 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) } words++; } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h index cfe546bc6f6..663ee3a1a62 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h @@ -13,12 +13,10 @@ class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase { public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } - + explicit UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } protected: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index 4daea693e95..5036e9bedb1 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -1,7 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8stringfieldsearcherbase.h" -#include <vespa/fastlib/text/normwordfolder.h> +#include "tokenizereader.h" #include <cassert> using search::streaming::QueryTerm; @@ -10,115 +10,36 @@ using search::byte; namespace vsm { -const byte * -UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen) -{ - if (maxSz > 0) { - maxSz--; - } - ucs4_t c(*p); - ucs4_t *q(dstbuf); - const byte * end(p+maxSz); - - // Skip non-word characters between words - for (; p < end; ) { - if (c < 128) { - if (!c) { break; } - p++; - if (__builtin_expect(Fast_NormalizeWordFolder::_isWord[c], false)) { - *q++ = Fast_NormalizeWordFolder::_foldCase[c]; - c = 0; - } else { - c = *p; - } - } else { - const byte * oldP(p); - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (Fast_UnicodeUtil::IsWordChar(c)) { - _utf8Count[p-oldP-1]++; - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != NULL) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::ToFold(c); - *q++ = c; - } - break; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; - } - c = *p; - } - } - } - - c = *p; // Next char - for (; p < end;) { - if (c < 128) { // Common case, ASCII - if (!c) { break; } - p++; - if (__builtin_expect(!Fast_NormalizeWordFolder::_isWord[c], false)) { - c = 0; - } else { - *q++ = Fast_NormalizeWordFolder::_foldCase[c]; - c = *p; - } - } else { - const byte * oldP(p); - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { - _utf8Count[p-oldP-1]++; - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != NULL) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::ToFold(c); - *q++ = c; - } - - c = *p; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; - } - break; - } +template<typename Reader> +void +UTF8StringFieldSearcherBase::tokenize(Reader & reader) { + ucs4_t c(0); + Normalizing norm_mode = normalize_mode(); + while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); + + if (Fast_UnicodeUtil::IsWordChar(c)) { + reader.normalize(c, norm_mode); + while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { + reader.normalize(c, norm_mode); } } - *q = 0; - tokenlen = q - dstbuf; - return p; } size_t UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) { termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - // __builtin_prefetch(n, 0, 0); const cmptype_t * term; termsize_t tsz = qt.term(term); - const byte * e = n + f.size(); if ( f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); + cmptype_t * fn = _buf->data(); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { const cmptype_t *tt=term, *et=term+tsz; for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); @@ -128,33 +49,35 @@ UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt } words++; } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words; } size_t UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt) { - const byte * n = reinterpret_cast<const byte *> (f.data()); const cmptype_t * term; termsize_t tsz = qt.term(term); const cmptype_t * eterm = term+tsz; - const byte * e = n + f.size(); + if ( f.size() >= _buf->size()) { + _buf->reserve(f.size() + 1); + } + cmptype_t * fn = _buf->data(); if (tsz <= f.size()) { bool equal(true); - for (; equal && (n < e) && (term < eterm); term++) { - if (*term < 0x80) { - equal = (*term == Fast_NormalizeWordFolder::_foldCase[*n++]); - } else { - cmptype_t c = Fast_NormalizeWordFolder::ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n)); - equal = (*term == c); + Normalizing norm_mode = normalize_mode(); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while (equal && reader.hasNext() && (term < eterm)) { + reader.normalize(reader.next(), norm_mode); + size_t len = reader.complete(); + for (size_t i(0); i < len; i++) { + equal = (term[i] == fn[i]); } + term += len; } - if (equal && (term == eterm) && (qt.isPrefix() || (n == e))) { + if (equal && (term == eterm) && (qt.isPrefix() || ! reader.hasNext())) { addHit(qt,0); } } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return 1; } @@ -188,7 +111,6 @@ UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm & } } } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words + 1; // we must also count the last word } @@ -196,22 +118,17 @@ size_t UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) { termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); const cmptype_t * term; termsize_t tsz = qt.term(term); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; + cmptype_t * dstbuf = _buf->data(); - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { addHit(qt, words); } @@ -220,11 +137,6 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) return words; } -UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() : - StrChrFieldSearcher() -{ -} - UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) : StrChrFieldSearcher(fId) { @@ -280,12 +192,12 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T if (c < 128) { p++; if (!isSeparatorCharacter(c)) { - dstbuf.onCharacter(Fast_NormalizeWordFolder::_foldCase[c], (oldP - b)); + dstbuf.onCharacter(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c), (oldP - b)); } } else { c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != NULL) { + if (repl != nullptr) { size_t repllen = strlen(repl); if (repllen > 0) { ucs4_t * buf = dstbuf.getBuf(); @@ -300,13 +212,11 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T } } } else { - c = Fast_NormalizeWordFolder::ToFold(c); + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); dstbuf.onCharacter(c, (oldP - b)); } if (c == Fast_UnicodeUtil::_BadUTF8Char) { _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; } } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index 38aac508f4f..b196f2795a4 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -28,15 +28,15 @@ public: ucs4_t * _cbuf; public: - BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { } - BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { } + explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { } + BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { } void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; } void onOffset(size_t) { } void incBuf(size_t inc) { _cbuf += inc; } ucs4_t * getBuf() { return _cbuf; } - bool valid() { return true; } - size_t size() { return (_cbuf - _bbuf); } - bool hasOffsets() { return false; } + bool valid() const noexcept { return true; } + size_t size() const noexcept { return (_cbuf - _bbuf); } + bool hasOffsets() const noexcept { return false; } }; /** @@ -50,17 +50,18 @@ public: size_t * _coff; public: - OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} + explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } void onOffset(size_t of) { *_coff++ = of; } - bool valid() { return (size() == (size_t)(_coff - _boff)); } - bool hasOffsets() { return true; } + bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); } + bool hasOffsets() const noexcept { return true; } }; protected: SharedSearcherBuf _buf; - const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen); + template<typename Reader> + void tokenize(Reader & reader); /** * Matches the given query term against the words in the given field reference @@ -103,9 +104,8 @@ protected: size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt); public: - UTF8StringFieldSearcherBase(); - UTF8StringFieldSearcherBase(FieldIdT fId); - ~UTF8StringFieldSearcherBase(); + explicit UTF8StringFieldSearcherBase(FieldIdT fId); + ~UTF8StringFieldSearcherBase() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp index 88091c6ab4e..fcc2893a71d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp @@ -1,6 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vsm/searcher/utf8substringsearcher.h> +#include "utf8substringsearcher.h" #include <vespa/fastlib/text/unicodeutil.h> using search::byte; @@ -45,8 +45,6 @@ UTF8SubStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn); fn++ ); } } - - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words + 1; // we must also count the last word } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h index b1455d5c5f6..cee35993ce7 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h @@ -1,7 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once -#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h> +#include "utf8strchrfieldsearcher.h" namespace vsm { @@ -12,11 +12,10 @@ class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase { public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + explicit UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } protected: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp index 8403e69658f..6d8a399cd33 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp @@ -110,20 +110,11 @@ UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char * _modified->put(_unitSep); } -UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() : - UTF8StringFieldSearcherBase(), - _modified(new CharBuffer(32)), - _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), - _unitSep(juniper::separators::unit_separator) -{ -} - UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) : UTF8StringFieldSearcherBase(fId), _modified(new CharBuffer(32)), _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), + _readPtr(nullptr), _unitSep(juniper::separators::unit_separator) { } @@ -134,12 +125,12 @@ UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId, UTF8StringFieldSearcherBase(fId), _modified(modBuf), _offsets(offBuf), - _readPtr(NULL), + _readPtr(nullptr), _unitSep(juniper::separators::unit_separator) { } -UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {} +UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h index ebb806de61c..99e6c29961f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h @@ -23,8 +23,8 @@ private: const char * _readPtr; // buffer to read from (field reference) char _unitSep; // the unit separator character to use - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; /** * Copies n bytes from the field reference to the modified buffer and updates the read pointer. @@ -51,9 +51,8 @@ public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SubstringSnippetModifier(); - UTF8SubstringSnippetModifier(FieldIdT fId); - ~UTF8SubstringSnippetModifier(); + explicit UTF8SubstringSnippetModifier(FieldIdT fId); + ~UTF8SubstringSnippetModifier() override; /** * Creates a new instance. diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp index e28ce114225..8bbacf168cf 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8suffixstringfieldsearcher.h" +#include "tokenizereader.h" using search::byte; using search::streaming::QueryTerm; @@ -14,24 +15,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const } size_t -UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h index 556f61a714f..dc3bc214b49 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h @@ -1,10 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> +#include "utf8stringfieldsearcherbase.h" -namespace vsm -{ +namespace vsm { /** * This class does suffix utf8 searches. @@ -12,13 +11,12 @@ namespace vsm class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase { protected: - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + explicit UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } }; } diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index e33408a2e26..715c19a0bb7 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -28,30 +28,30 @@ namespace vsm { namespace { -void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { +void +setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { if (arg1 == "prefix") { - searcher->setMatchType(FieldSearcher::PREFIX); + searcher->match_type(FieldSearcher::PREFIX); } else if (arg1 == "substring") { - searcher->setMatchType(FieldSearcher::SUBSTRING); + searcher->match_type(FieldSearcher::SUBSTRING); } else if (arg1 == "suffix") { - searcher->setMatchType(FieldSearcher::SUFFIX); - } else if (arg1 == "exact") { - searcher->setMatchType(FieldSearcher::EXACT); - } else if (arg1 == "word") { - searcher->setMatchType(FieldSearcher::EXACT); + searcher->match_type(FieldSearcher::SUFFIX); + } else if ((arg1 == "exact") || (arg1 == "word")) { + searcher->match_type(FieldSearcher::EXACT); } } } -FieldSearchSpec::FieldSearchSpec() : - _id(0), - _name(), - _maxLength(0x100000), - _searcher(), - _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE), - _arg1(), - _reconfigured(false) +FieldSearchSpec::FieldSearchSpec() + : _id(0), + _name(), + _maxLength(0x100000), + _searcher(), + _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), + _arg1(), + _reconfigured(false) { } FieldSearchSpec::~FieldSearchSpec() = default; @@ -59,15 +59,15 @@ FieldSearchSpec::~FieldSearchSpec() = default; FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default; FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default; -FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, - VsmfieldsConfig::Fieldspec::Searchmethod searchDef, - const vespalib::string & arg1, size_t maxLength_) : +FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef, + Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) : _id(fid), _name(fname), - _maxLength(maxLength_), + _maxLength(maxLength_in), _searcher(), _searchMethod(searchDef), - _arg1(arg1), + _normalize_mode(normalize_mode), + _arg1(arg1_in), _reconfigured(false) { switch(searchDef) { @@ -78,13 +78,11 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: - if (arg1 == "substring") { + if (_arg1 == "substring") { _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid); - } else if (arg1 == "suffix") { + } else if (_arg1 == "suffix") { _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid); - } else if (arg1 == "exact") { - _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (arg1 == "word") { + } else if ((_arg1 == "exact") || (_arg1 == "word")) { _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) { _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); @@ -111,13 +109,14 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & _searcher = std::make_unique<GeoPosFieldSearcher>(fid); break; case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR: - auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1); + auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1); _searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm); break; } if (_searcher) { - setMatchType(_searcher, arg1); + setMatchType(_searcher, _arg1); _searcher->maxFieldLength(maxLength()); + _searcher->normalize_mode(_normalize_mode); } } @@ -150,7 +149,8 @@ FieldSearchSpec::reconfig(const QueryTerm & term) } } -vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f) +vespalib::asciistream & +operator <<(vespalib::asciistream & os, const FieldSearchSpec & f) { os << f._id << ' ' << f._name << ' '; if ( ! f._searcher) { @@ -164,62 +164,67 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default; FieldSearchSpecMap::~FieldSearchSpecMap() = default; namespace { - const std::string _G_empty(""); - const std::string _G_value(".value"); - const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}"); - const std::regex _G_map2("\\{\".*\"\\}"); - const std::regex _G_array("\\[[0-9]+\\]"); + const std::string G_empty; + const std::string G_value(".value"); + const std::regex G_map1("\\{[a-zA-Z0-9]+\\}"); + const std::regex G_map2("\\{\".*\"\\}"); + const std::regex G_array("\\[[0-9]+\\]"); } -vespalib::string FieldSearchSpecMap::stripNonFields(const vespalib::string & rawIndex) +vespalib::string +FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex) { if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) { - std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value); - index = std::regex_replace(index, _G_map2, _G_value); - index = std::regex_replace(index, _G_array, _G_empty); + std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value); + index = std::regex_replace(index, G_map2, G_value); + index = std::regex_replace(index, G_array, G_empty); return index; } return rawIndex; } -bool FieldSearchSpecMap::buildFieldsInQuery(const Query & query, StringFieldIdTMap & fieldsInQuery) const +void +FieldSearchSpecMap::addFieldsFromIndex(vespalib::stringref rawIndex, StringFieldIdTMap & fieldIdMap) const { + for (const auto & dtm : documentTypeMap()) { + const IndexFieldMapT & fim = dtm.second; + vespalib::string index(stripNonFields(rawIndex)); + auto fIt = fim.find(index); + if (fIt != fim.end()) { + for(FieldIdT fid : fIt->second) { + const FieldSearchSpec & spec = specMap().find(fid)->second; + LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.data(), index.c_str()); + if ((rawIndex != index) && (spec.name().find(index) == 0)) { + vespalib::string modIndex(rawIndex); + modIndex.append(spec.name().substr(index.size())); + fieldIdMap.add(modIndex, spec.id()); + } else { + fieldIdMap.add(spec.name(),spec.id()); + } + } + } else { + LOG(warning, "No valid indexes registered for index %s", rawIndex.data()); + } + } +} + +StringFieldIdTMap +FieldSearchSpecMap::buildFieldsInQuery(const Query & query) const { - bool retval(true); + StringFieldIdTMap fieldsInQuery; ConstQueryTermList qtl; query.getLeaves(qtl); for (const auto & term : qtl) { - for (const auto & dtm : documentTypeMap()) { - const IndexFieldMapT & fim = dtm.second; - vespalib::string rawIndex(term->index()); - vespalib::string index(stripNonFields(rawIndex)); - auto fIt = fim.find(index); - if (fIt != fim.end()) { - for(FieldIdT fid : fIt->second) { - const FieldSearchSpec & spec = specMap().find(fid)->second; - LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.c_str(), index.c_str()); - if ((rawIndex != index) && (spec.name().find(index) == 0)) { - vespalib::string modIndex(rawIndex); - modIndex.append(spec.name().substr(index.size())); - fieldsInQuery.add(modIndex, spec.id()); - } else { - fieldsInQuery.add(spec.name(),spec.id()); - } - } - } else { - LOG(warning, "No valid indexes registered for index %s", term->index().c_str()); - retval = false; - } - } + addFieldsFromIndex(term->index(), fieldsInQuery); } - return retval; + return fieldsInQuery; } -void FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded) +void +FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded) { - for(size_t i(0), m(otherFieldsNeeded.size()); i < m; i++) { - LOG(debug, "otherFieldsNeeded[%zd] = '%s'", i, otherFieldsNeeded[i].c_str()); - _nameIdMap.add(otherFieldsNeeded[i]); + for (const auto & i : otherFieldsNeeded) { + _nameIdMap.add(i); } } @@ -251,16 +256,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch return ifm; } +search::streaming::Normalizing +normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { + switch (normalize_mode) { + case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD; + } + return search::streaming::Normalizing::LOWERCASE_AND_FOLD; +} + } -bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) +void +FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) { - bool retval(true); LOG(spam, "Parsing %zd fields", conf->fieldspec.size()); for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) { LOG(spam, "Parsing %s", cfs.name.c_str()); FieldIdT fieldId = specMap().size(); - FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength); + FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength); _specMap[fieldId] = std::move(fss); _nameIdMap.add(cfs.name, fieldId); LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str()); @@ -275,7 +290,6 @@ bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) } _documentTypeMap[di.name] = indexMapp; } - return retval; } void @@ -297,12 +311,14 @@ FieldSearchSpecMap::reconfigFromQuery(const Query & query) } } -bool lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b) +bool +lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b) { return a->field() < b->field(); } -void FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) +void +FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) const { fieldSearcherMap.clear(); for (const auto & entry : fieldsInQuery) { @@ -328,10 +344,11 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const if (!itr->second.uses_nearest_neighbor_search_method()) { return dm; } - return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1()); + return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1()); } -vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df) +vespalib::asciistream & +operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df) { os << "DocumentTypeMap = \n"; for (const auto & dtm : df.documentTypeMap()) { diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h index b0154a82dae..7ba9799991e 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h @@ -10,20 +10,29 @@ namespace vsm { class FieldSearchSpec { public: + using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; + using Normalizing = search::streaming::Normalizing; FieldSearchSpec(); - FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, - VsmfieldsConfig::Fieldspec::Searchmethod searchMethod, - const vespalib::string & arg1, size_t maxLength); + FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod, + Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength); ~FieldSearchSpec(); FieldSearchSpec(FieldSearchSpec&& rhs) noexcept; FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept; - const FieldSearcher & searcher() const { return *_searcher; } - const vespalib::string & name() const { return _name; } - FieldIdT id() const { return _id; } - bool valid() const { return static_cast<bool>(_searcher); } - size_t maxLength() const { return _maxLength; } - bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; } - const vespalib::string& get_arg1() const noexcept { return _arg1; } + const FieldSearcher & searcher() const noexcept { return *_searcher; } + const vespalib::string & name() const noexcept { return _name; } + FieldIdT id() const noexcept { return _id; } + bool valid() const noexcept { return static_cast<bool>(_searcher); } + size_t maxLength() const noexcept { return _maxLength; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + const vespalib::string& arg1() const noexcept { return _arg1; } + bool uses_nearest_neighbor_search_method() const noexcept { + return _searchMethod == Searchmethod::NEAREST_NEIGHBOR; + } + bool uses_string_search_method() const noexcept { + return (_searchMethod == Searchmethod::UTF8) || + (_searchMethod == Searchmethod::AUTOUTF8) || + (_searchMethod == Searchmethod::SSE2UTF8); + } /** * Reconfigures the field searcher based on information in the given query term. @@ -37,7 +46,8 @@ private: vespalib::string _name; size_t _maxLength; FieldSearcherContainer _searcher; - VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod; + Searchmethod _searchMethod; + Normalizing _normalize_mode; vespalib::string _arg1; bool _reconfigured; }; @@ -55,7 +65,7 @@ public: * and a mapping from field name to field id. It then iterates over all document types and index names * and creates a mapping from index name to list of field ids for each document type. **/ - bool buildFromConfig(const VsmfieldsHandle & conf); + void buildFromConfig(const VsmfieldsHandle & conf); /** * Iterates over the given field name vector adding extra elements to the mapping from field name to field id. @@ -71,17 +81,13 @@ public: * Adds a [field name, field id] entry to the given mapping for each field name used in the given query. * This is achieved by mapping from query term index name -> list of field ids -> [field name, field id] pairs. **/ - bool buildFieldsInQuery(const search::streaming::Query & query, StringFieldIdTMap & fieldsInQuery) const; - - /** - * Adds a [field name, field id] entry to the given mapping for each field name in the given vector. - **/ - void buildFieldsInQuery(const std::vector<vespalib::string> & otherFieldsNeeded, StringFieldIdTMap & fieldsInQuery) const; + StringFieldIdTMap buildFieldsInQuery(const search::streaming::Query & query) const; + void addFieldsFromIndex(vespalib::stringref index, StringFieldIdTMap & fieldIdMap) const; /** * Adds a FieldSearcher object to the given field searcher map for each field name in the other map. **/ - void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap); + void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) const; const FieldSearchSpecMapT & specMap() const { return _specMap; } //const IndexFieldMapT & indexMap() const { return _documentTypeMap.begin()->second; } @@ -89,7 +95,7 @@ public: const StringFieldIdTMap & nameIdMap() const { return _nameIdMap; } friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & f); - static vespalib::string stripNonFields(const vespalib::string & rawIndex); + static vespalib::string stripNonFields(vespalib::stringref rawIndex); search::attribute::DistanceMetric get_distance_metric(const vespalib::string& name) const; private: |