diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2021-04-22 11:20:10 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-22 11:20:10 +0200 |
commit | d78232f1ef8ba360e9694034e4bdda90a49a024d (patch) | |
tree | 9313e012e9885cec98188891abcba1700fb0623e /searchlib | |
parent | a71839400a12823ebd7abe90c19dbe03587118be (diff) | |
parent | d5add2f4c82db7088212e550f7c768317f009d53 (diff) |
Merge pull request #17540 from vespa-engine/balder/implment-case-sensitive-search-for-non-fast-search-attributes
Support both case-sensitive and case-insensitive search in non-fast-search attributes
Diffstat (limited to 'searchlib')
6 files changed, 172 insertions, 40 deletions
diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp index aaae2772687..ec711b4a456 100644 --- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp +++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp @@ -386,6 +386,7 @@ testSingleValue(Attribute & svsa, Config &cfg) TEST("testSingleValue") { EXPECT_EQUAL(24u, sizeof(AttributeVector::SearchContext)); + EXPECT_EQUAL(24u, sizeof(StringSearchHelper)); EXPECT_EQUAL(56u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext)); { Config cfg(BasicType::STRING, CollectionType::SINGLE); @@ -408,4 +409,83 @@ TEST("testSingleValue") } } +TEST("test uncased match") { + QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::WORD); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("axyz")); + EXPECT_FALSE(helper.isMatch("xyza")); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); +} + +TEST("test uncased prefix match") { + QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::PREFIXTERM); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_TRUE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("axyz")); + EXPECT_TRUE(helper.isMatch("xyza")); + EXPECT_TRUE(helper.isMatch("xYza")); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); +} + +TEST("test cased match") { + QueryTermUCS4 xyz("XyZ", QueryTermSimple::Type::WORD); + StringSearchHelper helper(xyz, true); + EXPECT_TRUE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("aXyZ")); + EXPECT_FALSE(helper.isMatch("XyZa")); + EXPECT_FALSE(helper.isMatch("xyz")); + EXPECT_FALSE(helper.isMatch("Xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); 
+} + +TEST("test cased prefix match") { + QueryTermUCS4 xyz("XyZ", QueryTermSimple::Type::PREFIXTERM); + StringSearchHelper helper(xyz, true); + EXPECT_TRUE(helper.isCased()); + EXPECT_TRUE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("aXyZ")); + EXPECT_TRUE(helper.isMatch("XyZa")); + EXPECT_FALSE(helper.isMatch("xyZa")); + EXPECT_FALSE(helper.isMatch("xyz")); + EXPECT_FALSE(helper.isMatch("Xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); +} + +TEST("test uncased regex match") { + QueryTermUCS4 xyz("x[yY]+Z", QueryTermSimple::Type::REGEXP); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_TRUE(helper.isRegex()); + EXPECT_TRUE(helper.isMatch("axyZ")); + EXPECT_TRUE(helper.isMatch("xyZa")); + EXPECT_TRUE(helper.isMatch("xyZ")); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_FALSE(helper.isMatch("xyaZ")); +} + +TEST("test cased regex match") { + QueryTermUCS4 xyz("x[Y]+Z", QueryTermSimple::Type::REGEXP); + StringSearchHelper helper(xyz, true); + EXPECT_TRUE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_TRUE(helper.isRegex()); + EXPECT_TRUE(helper.isMatch("axYZ")); + EXPECT_TRUE(helper.isMatch("xYZa")); + EXPECT_FALSE(helper.isMatch("xyZ")); + EXPECT_TRUE(helper.isMatch("xYZ")); + EXPECT_FALSE(helper.isMatch("xYz")); + EXPECT_FALSE(helper.isMatch("xaYZ")); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp index ced5ccfdc85..6387edc588b 100644 --- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp +++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp @@ -3,13 +3,11 @@ #include "configconverter.h" using namespace vespa::config::search; -using namespace search; +namespace search::attribute { namespace { -using search::attribute::CollectionType; -using search::attribute::BasicType; using 
vespalib::eval::ValueType; using vespalib::eval::CellType; @@ -81,9 +79,18 @@ convert_dictionary(const AttributesConfig::Attribute::Dictionary & dictionary) { return DictionaryConfig(convert(dictionary.type), convert(dictionary.match)); } +Config::Match +convertMatch(AttributesConfig::Attribute::Match match_cfg) { + switch (match_cfg) { + case AttributesConfig::Attribute::Match::CASED: + return Config::Match::CASED; + case AttributesConfig::Attribute::Match::UNCASED: + return Config::Match::UNCASED; + } + assert(false); } -namespace search::attribute { +} Config ConfigConverter::convert(const AttributesConfig::Attribute & cfg) @@ -106,6 +113,7 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg) predicateParams.setDensePostingListThreshold(cfg.densepostinglistthreshold); retval.setPredicateParams(predicateParams); retval.set_dictionary_config(convert_dictionary(cfg.dictionary)); + retval.set_match(convertMatch(cfg.match)); using CfgDm = AttributesConfig::Attribute::Distancemetric; DistanceMetric dm(DistanceMetric::Euclidean); switch (cfg.distancemetric) { diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp index a308fc06af0..4be86652541 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp @@ -15,6 +15,57 @@ LOG_SETUP(".searchlib.attribute.stringbase"); namespace search { +StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) + : _regex(), + _term(), + _termLen(), + _isPrefix(term.isPrefix()), + _isRegex(term.isRegex()), + _isCased(cased) +{ + if (isRegex()) { + if (isCased()) { + _regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None); + } else { + _regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase); + } + } else if (isCased()) { + _term._char = term.getTerm(); + _termLen = term.getTermLen(); + } else { + 
term.term(_term._ucs4); + } +} + +StringSearchHelper::~StringSearchHelper() +{ + if (isRegex()) { + + } +} + +bool +StringSearchHelper::isMatch(const char *src) const { + if (__builtin_expect(isRegex(), false)) { + return getRegex().valid() ? getRegex().partial_match(std::string_view(src)) : false; + } + if (__builtin_expect(isCased(), false)) { + int res = strncmp(_term._char, src, _termLen); + return (res == 0) && (src[_termLen] == 0 || isPrefix()); + } + vespalib::Utf8ReaderForZTS u8reader(src); + uint32_t j = 0; + uint32_t val; + for (;; ++j) { + val = u8reader.getChar(); + val = vespalib::LowerCase::convert(val); + if (_term._ucs4[j] == 0 || _term._ucs4[j] != val) { + break; + } + } + return (_term._ucs4[j] == 0 && (val == 0 || isPrefix())); +} + IMPLEMENT_IDENTIFIABLE_ABSTRACT(StringAttribute, AttributeVector); using attribute::LoadedEnumAttribute; @@ -225,16 +276,8 @@ StringAttribute::StringSearchContext::StringSearchContext(QueryTermSimple::UP qT const StringAttribute & toBeSearched) : SearchContext(toBeSearched), _queryTerm(static_cast<QueryTermUCS4 *>(qTerm.release())), - _termUCS4(nullptr), - _regex(), - _isPrefix(_queryTerm->isPrefix()), - _isRegex(_queryTerm->isRegex()) + _helper(*_queryTerm, toBeSearched.getConfig().get_match() == Config::Match::CASED) { - if (isRegex()) { - _regex = vespalib::Regex::from_pattern(_queryTerm->getTerm(), vespalib::Regex::Options::IgnoreCase); - } else { - _queryTerm->term(_termUCS4); - } } StringAttribute::StringSearchContext::~StringSearchContext() = default; diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h index b8fef783d58..f051fb07e71 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.h +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h @@ -16,6 +16,27 @@ namespace search { +class StringSearchHelper { +public: + StringSearchHelper(QueryTermUCS4 & qTerm, bool cased); + ~StringSearchHelper(); + bool isMatch(const char *src) 
const; + bool isPrefix() const { return _isPrefix; } + bool isRegex() const { return _isRegex; } + bool isCased() const { return _isCased; } + const vespalib::Regex & getRegex() const { return _regex; } +private: + vespalib::Regex _regex; + union { + const ucs4_t *_ucs4; + const char *_char; + } _term; + uint32_t _termLen; + bool _isPrefix; + bool _isRegex; + bool _isCased; +}; + class ReaderBase; class StringAttribute : public AttributeVector @@ -97,24 +118,13 @@ protected: ~StringSearchContext() override; protected: bool valid() const override; - const QueryTermUCS4 * queryTerm() const override; - bool isMatch(const char *src) const { - if (__builtin_expect(isRegex(), false)) { - return _regex.valid() ? _regex.partial_match(std::string_view(src)) : false; - } - vespalib::Utf8ReaderForZTS u8reader(src); - uint32_t j = 0; - uint32_t val; - for (;; ++j) { - val = u8reader.getChar(); - val = vespalib::LowerCase::convert(val); - if (_termUCS4[j] == 0 || _termUCS4[j] != val) { - break; - } - } - return (_termUCS4[j] == 0 && (val == 0 || isPrefix())); - } + bool isMatch(const char *src) const { return _helper.isMatch(src); } + bool isPrefix() const { return _helper.isPrefix(); } + bool isRegex() const { return _helper.isRegex(); } + bool isCased() const { return _helper.isCased(); } + const vespalib::Regex & getRegex() const { return _helper.getRegex(); } + class CollectHitCount { public: CollectHitCount() : _hitCount(0) { } @@ -151,16 +161,9 @@ protected: } return -1; } - - bool isPrefix() const { return _isPrefix; } - bool isRegex() const { return _isRegex; } - const vespalib::Regex & getRegex() const { return _regex; } private: std::unique_ptr<QueryTermUCS4> _queryTerm; - const ucs4_t *_termUCS4; - vespalib::Regex _regex; - bool _isPrefix; - bool _isRegex; + StringSearchHelper _helper; }; }; diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index 93b19212926..d0dfbee010f 100644 --- 
a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -63,7 +63,6 @@ public: virtual void visitMembers(vespalib::ObjectVisitor &visitor) const; vespalib::string getClassName() const; bool isValid() const { return _valid; } -protected: const string & getTermString() const { return _term; } private: bool getRangeInternal(int64_t & low, int64_t & high) const; diff --git a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h index 00ac59d729e..b8735eb30f0 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h +++ b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h @@ -12,7 +12,6 @@ namespace search { */ class QueryTermUCS4 : public QueryTermSimple { public: - typedef std::unique_ptr<QueryTermUCS4> UP; QueryTermUCS4(const QueryTermUCS4 &) = delete; QueryTermUCS4 & operator = (const QueryTermUCS4 &) = delete; QueryTermUCS4(QueryTermUCS4 &&) = delete; |