diff options
| | | |
|---|---|---|
| author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-10 10:59:27 +0000 |
| committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-01-10 10:59:27 +0000 |
| commit | 5bdad953f6d91cb26139ef6506c3748531dc708a (patch) | |
| tree | 3e268a4f3e98ee62a9ed15e3ab3ffe0b38c9579d /streamingvisitors | |
| parent | 3f7017773ce147a2d65a9835acdfd682dfafd54a (diff) | |
Use the normalize_mode config.
Diffstat (limited to 'streamingvisitors')
8 files changed, 45 insertions, 59 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index a691d7671f9..74d8fdc4bf3 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -441,11 +441,11 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits())); assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "oper", field, Hits().add(0).add(2)); assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits())); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false; { // test handling of several underscores @@ -554,12 +554,12 @@ TEST("utf8 substring search with empty term") TEST("utf8 suffix search") { UTF8SuffixStringFieldSearcher fs(0); std::string field = "operators and operator overloading"; - assertString(fs, "rsand", field, Hits()); - assertString(fs, "tor", field, Hits().add(2)); - assertString(fs, "tors", field, Hits().add(0)); + TEST_DO(assertString(fs, "rsand", field, Hits())); + TEST_DO(assertString(fs, "tor", field, Hits().add(2))); + TEST_DO(assertString(fs, "tors", field, Hits().add(0))); - assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())); - assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); + TEST_DO(assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()))); + TEST_DO(assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)))); EXPECT_TRUE(testStringFieldInfo(fs)); } @@ -591,22 
+591,22 @@ TEST("utf8 flexible searcher"){ // prefix assertString(fs, "vesp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "vesp", "vespa", Hits().add(0)); // substring - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*esp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUBSTRING); + fs.match_type(FieldSearcher::SUBSTRING); assertString(fs, "esp", "vespa", Hits().add(0)); // suffix - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUFFIX); + fs.match_type(FieldSearcher::SUFFIX); assertString(fs, "espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); EXPECT_TRUE(testStringFieldInfo(fs)); } diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp index 88556778481..cdd1a018d84 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -42,6 +42,7 @@ using search::aggregation::HitsAggregationResult; using search::attribute::IAttributeVector; using search::expression::ConfigureStaticParams; using search::streaming::Query; +using search::streaming::Normalizing; using search::streaming::QueryTermList; using storage::StorageComponent; using storage::VisitorEnvironment; @@ -329,11 +330,11 @@ SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept { namespace { uint32_t -count_exact(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { +count_normalize_lowercase(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { size_t count = 0; for (const auto & fieldId : fieldIdMap.map()) { auto found = specMap.find(fieldId.second); - 
if ((found != specMap.end()) && found->second.searcher().exact()) { + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::LOWERCASE) { count++; } } @@ -341,11 +342,11 @@ count_exact(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & } uint32_t -count_cased(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { +count_normalize_none(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { size_t count = 0; for (const auto & fieldId : fieldIdMap.map()) { auto found = specMap.find(fieldId.second); - if ((found != specMap.end()) && found->second.searcher().cased()) { + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::NONE) { count++; } } @@ -358,8 +359,8 @@ SearchMethodInfo::Normalizing SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept { StringFieldIdTMap fieldIdMap; _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap); - if (count_cased(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::NONE; - if (count_exact(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::LOWERCASE; + if (count_normalize_none(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::NONE; + if (count_normalize_lowercase(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::LOWERCASE; return Normalizing::LOWERCASE_AND_FOLD; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index b9e1fe8f83c..5e06ae41a03 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -51,6 +51,7 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept : FieldSearcherBase(), _field(fId), _matchType(defaultPrefix ? 
PREFIX : REGULAR), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), _maxFieldLength(0x100000), _currentElementId(0), _currentElementWeight(1), @@ -69,7 +70,7 @@ FieldSearcher::search(const StorageDocument & doc) fInfo.setHitOffset(qt->getHitList().size()); } onSearch(doc); - for(auto qt : _qtl) { + for (auto qt : _qtl) { QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field()); fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset()); fInfo.setFieldLength(_words); diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index 75ace16328b..c5bca6f3899 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -34,13 +34,13 @@ protected: class FieldSearcher : public FieldSearcherBase { public: + using Normalizing = search::streaming::Normalizing; enum MatchType { REGULAR, PREFIX, SUBSTRING, SUFFIX, EXACT, - CASED }; explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {} @@ -51,21 +51,22 @@ public: virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env); - FieldIdT field() const { return _field; } - void field(FieldIdT v) { _field = v; prepareFieldId(); } - bool prefix() const { return _matchType == PREFIX; } - bool substring() const { return _matchType == SUBSTRING; } - bool suffix() const { return _matchType == SUFFIX; } - bool exact() const { return _matchType == EXACT; } - bool cased() const { return _matchType == CASED; } - void setMatchType(MatchType mt) { _matchType = mt; } - MatchType match_type() const noexcept { return _matchType; } + FieldIdT field() const noexcept { return _field; } + bool prefix() const noexcept { return _matchType == PREFIX; } + bool substring() const noexcept { return _matchType == SUBSTRING; } + bool suffix() const noexcept { return 
_matchType == SUFFIX; } + bool exact() const noexcept { return _matchType == EXACT; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + MatchType match_type() const noexcept { return _matchType; } + void match_type(MatchType mt) noexcept { _matchType = mt; } + void normalize_mode(Normalizing mode) noexcept { _normalize_mode = mode; } + void field(FieldIdT v) noexcept { _field = v; prepareFieldId(); } static void init(); static search::byte fold(search::byte c) { return _foldLowCase[c]; } static search::byte iswordchar(search::byte c) { return _wordChar[c]; } static search::byte isspace(search::byte c) { return ! iswordchar(c); } static size_t countWords(const FieldRef & f); - int32_t getCurrentWeight() const { return _currentElementWeight; } + int32_t currentWeight() const { return _currentElementWeight; } FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } size_t maxFieldLength() const { return _maxFieldLength; } @@ -91,6 +92,7 @@ private: virtual void onStructValue(const document::StructFieldValue &) { } FieldIdT _field; MatchType _matchType; + Normalizing _normalize_mode; unsigned _maxFieldLength; uint32_t _currentElementId; int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. @@ -104,7 +106,7 @@ protected: * For each call to onValue() a batch of words are processed, and the position is local to this batch. 
**/ void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const { - qt.add(_words + pos, field(), _currentElementId, getCurrentWeight()); + qt.add(_words + pos, field(), _currentElementId, _currentElementWeight); } public: static search::byte _foldLowCase[256]; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h index a01a9cd088d..aaf8b940dc8 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h @@ -20,7 +20,7 @@ public: explicit UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { - setMatchType(EXACT); + match_type(EXACT); } }; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index ed76fb79f4e..115cddce619 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -62,7 +62,6 @@ protected: SharedSearcherBuf _buf; using byte = search::byte; - using Normalizing = search::streaming::Normalizing; class TokenizeReader { public: @@ -121,15 +120,6 @@ protected: template<typename Reader> void tokenize(Reader & reader); - Normalizing normalize_mode() const noexcept { - switch (match_type()) { - case EXACT: return Normalizing::LOWERCASE; - case CASED: return Normalizing::NONE; - default: return Normalizing::LOWERCASE_AND_FOLD; - } - return Normalizing::LOWERCASE_AND_FOLD; - } - /** * Matches the given query term against the words in the given field reference * using exact or prefix match strategy. 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h index c20710e63ab..dc3bc214b49 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h @@ -3,8 +3,7 @@ #include "utf8stringfieldsearcherbase.h" -namespace vsm -{ +namespace vsm { /** * This class does suffix utf8 searches. diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 22934ba74d2..715c19a0bb7 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -31,17 +31,13 @@ namespace { void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { if (arg1 == "prefix") { - searcher->setMatchType(FieldSearcher::PREFIX); + searcher->match_type(FieldSearcher::PREFIX); } else if (arg1 == "substring") { - searcher->setMatchType(FieldSearcher::SUBSTRING); + searcher->match_type(FieldSearcher::SUBSTRING); } else if (arg1 == "suffix") { - searcher->setMatchType(FieldSearcher::SUFFIX); - } else if (arg1 == "exact") { - searcher->setMatchType(FieldSearcher::EXACT); - } else if (arg1 == "word") { - searcher->setMatchType(FieldSearcher::EXACT); - } else if (arg1 == "cased") { - searcher->setMatchType(FieldSearcher::CASED); + searcher->match_type(FieldSearcher::SUFFIX); + } else if ((arg1 == "exact") || (arg1 == "word")) { + searcher->match_type(FieldSearcher::EXACT); } } @@ -86,12 +82,8 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid); } else if (_arg1 == "suffix") { _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid); - } else if (_arg1 == "exact") { + } else if ((_arg1 == "exact") || (_arg1 == "word")) { _searcher = 
std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (_arg1 == "word") { - _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (_arg1 == "cased") { - _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) { _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); } else { @@ -124,6 +116,7 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & if (_searcher) { setMatchType(_searcher, _arg1); _searcher->maxFieldLength(maxLength()); + _searcher->normalize_mode(_normalize_mode); } } |