diff options
8 files changed, 179 insertions, 40 deletions
diff --git a/searchcommon/src/vespa/searchcommon/attribute/config.cpp b/searchcommon/src/vespa/searchcommon/attribute/config.cpp index 065e9c14de9..c62d7ef0ea1 100644 --- a/searchcommon/src/vespa/searchcommon/attribute/config.cpp +++ b/searchcommon/src/vespa/searchcommon/attribute/config.cpp @@ -14,6 +14,7 @@ Config::Config() noexcept : _isFilter(false), _fastAccess(false), _mutable(false), + _match(Match::UNCASED), _dictionary(), _growStrategy(), _compactionStrategy(), @@ -34,6 +35,7 @@ Config::Config(BasicType bt, CollectionType ct, bool fastSearch_, bool huge_) no _isFilter(false), _fastAccess(false), _mutable(false), + _match(Match::UNCASED), _dictionary(), _growStrategy(), _compactionStrategy(), @@ -62,6 +64,7 @@ Config::operator==(const Config &b) const _isFilter == b._isFilter && _fastAccess == b._fastAccess && _mutable == b._mutable && + _match == b._match && _dictionary == b._dictionary && _growStrategy == b._growStrategy && _compactionStrategy == b._compactionStrategy && diff --git a/searchcommon/src/vespa/searchcommon/attribute/config.h b/searchcommon/src/vespa/searchcommon/attribute/config.h index c1b30303606..fdf3a00ac99 100644 --- a/searchcommon/src/vespa/searchcommon/attribute/config.h +++ b/searchcommon/src/vespa/searchcommon/attribute/config.h @@ -22,6 +22,7 @@ namespace search::attribute { */ class Config { public: + enum class Match { CASED, UNCASED }; Config() noexcept; Config(BasicType bt) noexcept : Config(bt, CollectionType::SINGLE) { } Config(BasicType bt, CollectionType ct) noexcept : Config(bt, ct, false) { } @@ -68,6 +69,7 @@ public: const GrowStrategy & getGrowStrategy() const { return _growStrategy; } const CompactionStrategy &getCompactionStrategy() const { return _compactionStrategy; } const DictionaryConfig & get_dictionary_config() const { return _dictionary; } + Match get_match() const { return _match; } Config & setHuge(bool v) { _huge = v; return *this;} Config & setFastSearch(bool v) { _fastSearch = v; return *this; } Config & setPredicateParams(const PredicateParams &v) { _predicateParams = v; return *this; } @@ -121,6 +123,7 @@ public: return *this; } Config & set_dictionary_config(const DictionaryConfig & cfg) { _dictionary = cfg; return *this; } + Config & set_match(Match match) { _match = match; return *this; } bool operator!=(const Config &b) const { return !(operator==(b)); } bool operator==(const Config &b) const; @@ -134,6 +137,7 @@ private: bool _isFilter; bool _fastAccess; bool _mutable; + Match _match; DictionaryConfig _dictionary; GrowStrategy _growStrategy; CompactionStrategy _compactionStrategy; diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp index aaae2772687..ec711b4a456 100644 --- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp +++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp @@ -386,6 +386,7 @@ testSingleValue(Attribute & svsa, Config &cfg) TEST("testSingleValue") { EXPECT_EQUAL(24u, sizeof(AttributeVector::SearchContext)); + EXPECT_EQUAL(24u, sizeof(StringSearchHelper)); EXPECT_EQUAL(56u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext)); { Config cfg(BasicType::STRING, CollectionType::SINGLE); @@ -408,4 +409,83 @@ TEST("testSingleValue") } } +TEST("test uncased match") { + QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::WORD); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("axyz")); + EXPECT_FALSE(helper.isMatch("xyza")); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); +} + +TEST("test uncased prefix match") { + QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::PREFIXTERM); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_TRUE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("axyz")); + EXPECT_TRUE(helper.isMatch("xyza")); + EXPECT_TRUE(helper.isMatch("xYza")); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); +} + +TEST("test cased match") { + QueryTermUCS4 xyz("XyZ", QueryTermSimple::Type::WORD); + StringSearchHelper helper(xyz, true); + EXPECT_TRUE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("aXyZ")); + EXPECT_FALSE(helper.isMatch("XyZa")); + EXPECT_FALSE(helper.isMatch("xyz")); + EXPECT_FALSE(helper.isMatch("Xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); +} + +TEST("test cased prefix match") { + QueryTermUCS4 xyz("XyZ", QueryTermSimple::Type::PREFIXTERM); + StringSearchHelper helper(xyz, true); + EXPECT_TRUE(helper.isCased()); + EXPECT_TRUE(helper.isPrefix()); + EXPECT_FALSE(helper.isRegex()); + EXPECT_FALSE(helper.isMatch("aXyZ")); + EXPECT_TRUE(helper.isMatch("XyZa")); + EXPECT_FALSE(helper.isMatch("xyZa")); + EXPECT_FALSE(helper.isMatch("xyz")); + EXPECT_FALSE(helper.isMatch("Xyz")); + EXPECT_TRUE(helper.isMatch("XyZ")); +} + +TEST("test uncased regex match") { + QueryTermUCS4 xyz("x[yY]+Z", QueryTermSimple::Type::REGEXP); + StringSearchHelper helper(xyz, false); + EXPECT_FALSE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_TRUE(helper.isRegex()); + EXPECT_TRUE(helper.isMatch("axyZ")); + EXPECT_TRUE(helper.isMatch("xyZa")); + EXPECT_TRUE(helper.isMatch("xyZ")); + EXPECT_TRUE(helper.isMatch("xyz")); + EXPECT_FALSE(helper.isMatch("xyaZ")); +} + +TEST("test cased regex match") { + QueryTermUCS4 xyz("x[Y]+Z", QueryTermSimple::Type::REGEXP); + StringSearchHelper helper(xyz, true); + EXPECT_TRUE(helper.isCased()); + EXPECT_FALSE(helper.isPrefix()); + EXPECT_TRUE(helper.isRegex()); + EXPECT_TRUE(helper.isMatch("axYZ")); + EXPECT_TRUE(helper.isMatch("xYZa")); + EXPECT_FALSE(helper.isMatch("xyZ")); + EXPECT_TRUE(helper.isMatch("xYZ")); + EXPECT_FALSE(helper.isMatch("xYz")); + EXPECT_FALSE(helper.isMatch("xaYZ")); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp index ced5ccfdc85..6387edc588b 100644 --- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp +++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp @@ -3,13 +3,11 @@ #include "configconverter.h" using namespace vespa::config::search; -using namespace search; +namespace search::attribute { namespace { -using search::attribute::CollectionType; -using search::attribute::BasicType; using vespalib::eval::ValueType; using vespalib::eval::CellType; @@ -81,9 +79,18 @@ convert_dictionary(const AttributesConfig::Attribute::Dictionary & dictionary) { return DictionaryConfig(convert(dictionary.type), convert(dictionary.match)); } +Config::Match +convertMatch(AttributesConfig::Attribute::Match match_cfg) { + switch (match_cfg) { + case AttributesConfig::Attribute::Match::CASED: + return Config::Match::CASED; + case AttributesConfig::Attribute::Match::UNCASED: + return Config::Match::UNCASED; + } + assert(false); } -namespace search::attribute { +} Config ConfigConverter::convert(const AttributesConfig::Attribute & cfg) @@ -106,6 +113,7 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg) predicateParams.setDensePostingListThreshold(cfg.densepostinglistthreshold); retval.setPredicateParams(predicateParams); retval.set_dictionary_config(convert_dictionary(cfg.dictionary)); + retval.set_match(convertMatch(cfg.match)); using CfgDm = AttributesConfig::Attribute::Distancemetric; DistanceMetric dm(DistanceMetric::Euclidean); switch (cfg.distancemetric) { diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp index a308fc06af0..4be86652541 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp @@ -15,6 +15,57 @@ LOG_SETUP(".searchlib.attribute.stringbase"); namespace search { +StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) + : _regex(), + _term(), + _termLen(), + _isPrefix(term.isPrefix()), + _isRegex(term.isRegex()), + _isCased(cased) +{ + if (isRegex()) { + if (isCased()) { + _regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None); + } else { + _regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase); + } + } else if (isCased()) { + _term._char = term.getTerm(); + _termLen = term.getTermLen(); + } else { + term.term(_term._ucs4); + } +} + +StringSearchHelper::~StringSearchHelper() +{ + if (isRegex()) { + + } +} + +bool +StringSearchHelper::isMatch(const char *src) const { + if (__builtin_expect(isRegex(), false)) { + return getRegex().valid() ? getRegex().partial_match(std::string_view(src)) : false; + } + if (__builtin_expect(isCased(), false)) { + int res = strncmp(_term._char, src, _termLen); + return (res == 0) && (src[_termLen] == 0 || isPrefix()); + } + vespalib::Utf8ReaderForZTS u8reader(src); + uint32_t j = 0; + uint32_t val; + for (;; ++j) { + val = u8reader.getChar(); + val = vespalib::LowerCase::convert(val); + if (_term._ucs4[j] == 0 || _term._ucs4[j] != val) { + break; + } + } + return (_term._ucs4[j] == 0 && (val == 0 || isPrefix())); +} + IMPLEMENT_IDENTIFIABLE_ABSTRACT(StringAttribute, AttributeVector); using attribute::LoadedEnumAttribute; @@ -225,16 +276,8 @@ StringAttribute::StringSearchContext::StringSearchContext(QueryTermSimple::UP qT const StringAttribute & toBeSearched) : SearchContext(toBeSearched), _queryTerm(static_cast<QueryTermUCS4 *>(qTerm.release())), - _termUCS4(nullptr), - _regex(), - _isPrefix(_queryTerm->isPrefix()), - _isRegex(_queryTerm->isRegex()) + _helper(*_queryTerm, toBeSearched.getConfig().get_match() == Config::Match::CASED) { - if (isRegex()) { - _regex = vespalib::Regex::from_pattern(_queryTerm->getTerm(), vespalib::Regex::Options::IgnoreCase); - } else { - _queryTerm->term(_termUCS4); - } } StringAttribute::StringSearchContext::~StringSearchContext() = default; diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h index b8fef783d58..f051fb07e71 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.h +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h @@ -16,6 +16,27 @@ namespace search { +class StringSearchHelper { +public: + StringSearchHelper(QueryTermUCS4 & qTerm, bool cased); + ~StringSearchHelper(); + bool isMatch(const char *src) const; + bool isPrefix() const { return _isPrefix; } + bool isRegex() const { return _isRegex; } + bool isCased() const { return _isCased; } + const vespalib::Regex & getRegex() const { return _regex; } +private: + vespalib::Regex _regex; + union { + const ucs4_t *_ucs4; + const char *_char; + } _term; + uint32_t _termLen; + bool _isPrefix; + bool _isRegex; + bool _isCased; +}; + class ReaderBase; class StringAttribute : public AttributeVector @@ -97,24 +118,13 @@ protected: ~StringSearchContext() override; protected: bool valid() const override; - const QueryTermUCS4 * queryTerm() const override; - bool isMatch(const char *src) const { - if (__builtin_expect(isRegex(), false)) { - return _regex.valid() ? _regex.partial_match(std::string_view(src)) : false; - } - vespalib::Utf8ReaderForZTS u8reader(src); - uint32_t j = 0; - uint32_t val; - for (;; ++j) { - val = u8reader.getChar(); - val = vespalib::LowerCase::convert(val); - if (_termUCS4[j] == 0 || _termUCS4[j] != val) { - break; - } - } - return (_termUCS4[j] == 0 && (val == 0 || isPrefix())); - } + bool isMatch(const char *src) const { return _helper.isMatch(src); } + bool isPrefix() const { return _helper.isPrefix(); } + bool isRegex() const { return _helper.isRegex(); } + bool isCased() const { return _helper.isCased(); } + const vespalib::Regex & getRegex() const { return _helper.getRegex(); } + class CollectHitCount { public: CollectHitCount() : _hitCount(0) { } @@ -151,16 +161,9 @@ protected: } return -1; } - - bool isPrefix() const { return _isPrefix; } - bool isRegex() const { return _isRegex; } - const vespalib::Regex & getRegex() const { return _regex; } private: std::unique_ptr<QueryTermUCS4> _queryTerm; - const ucs4_t *_termUCS4; - vespalib::Regex _regex; - bool _isPrefix; - bool _isRegex; + StringSearchHelper _helper; }; }; diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index 93b19212926..d0dfbee010f 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -63,7 +63,6 @@ public: virtual void visitMembers(vespalib::ObjectVisitor &visitor) const; vespalib::string getClassName() const; bool isValid() const { return _valid; } -protected: const string & getTermString() const { return _term; } private: bool getRangeInternal(int64_t & low, int64_t & high) const; diff --git a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h index 00ac59d729e..b8735eb30f0 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h +++ b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h @@ -12,7 +12,6 @@ namespace search { */ class QueryTermUCS4 : public QueryTermSimple { public: - typedef std::unique_ptr<QueryTermUCS4> UP; QueryTermUCS4(const QueryTermUCS4 &) = delete; QueryTermUCS4 & operator = (const QueryTermUCS4 &) = delete; QueryTermUCS4(QueryTermUCS4 &&) = delete; |