summary | refs | log | tree | commit | diff | stats
path: root/searchlib
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2021-04-22 11:20:10 +0200
committer GitHub <noreply@github.com> 2021-04-22 11:20:10 +0200
commit d78232f1ef8ba360e9694034e4bdda90a49a024d (patch)
tree 9313e012e9885cec98188891abcba1700fb0623e /searchlib
parent a71839400a12823ebd7abe90c19dbe03587118be (diff)
parent d5add2f4c82db7088212e550f7c768317f009d53 (diff)
Merge pull request #17540 from vespa-engine/balder/implment-case-sensitive-search-for-non-fast-search-attributes
Support both case sensitive, and case-insensitive search in non-fast-…
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp | 80
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/configconverter.cpp | 16
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/stringbase.cpp | 61
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/stringbase.h | 53
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_term_simple.h | 1
-rw-r--r-- searchlib/src/vespa/searchlib/query/query_term_ucs4.h | 1
6 files changed, 172 insertions, 40 deletions
diff --git a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
index aaae2772687..ec711b4a456 100644
--- a/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
+++ b/searchlib/src/tests/attribute/stringattribute/stringattribute_test.cpp
@@ -386,6 +386,7 @@ testSingleValue(Attribute & svsa, Config &cfg)
TEST("testSingleValue")
{
EXPECT_EQUAL(24u, sizeof(AttributeVector::SearchContext));
+ EXPECT_EQUAL(24u, sizeof(StringSearchHelper));
EXPECT_EQUAL(56u, sizeof(SingleValueStringAttribute::StringSingleImplSearchContext));
{
Config cfg(BasicType::STRING, CollectionType::SINGLE);
@@ -408,4 +409,83 @@ TEST("testSingleValue")
}
}
+TEST("test uncased match") {
+ QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::WORD);
+ StringSearchHelper helper(xyz, false);
+ EXPECT_FALSE(helper.isCased());
+ EXPECT_FALSE(helper.isPrefix());
+ EXPECT_FALSE(helper.isRegex());
+ EXPECT_FALSE(helper.isMatch("axyz"));
+ EXPECT_FALSE(helper.isMatch("xyza"));
+ EXPECT_TRUE(helper.isMatch("xyz"));
+ EXPECT_TRUE(helper.isMatch("XyZ"));
+}
+
+TEST("test uncased prefix match") {
+ QueryTermUCS4 xyz("xyz", QueryTermSimple::Type::PREFIXTERM);
+ StringSearchHelper helper(xyz, false);
+ EXPECT_FALSE(helper.isCased());
+ EXPECT_TRUE(helper.isPrefix());
+ EXPECT_FALSE(helper.isRegex());
+ EXPECT_FALSE(helper.isMatch("axyz"));
+ EXPECT_TRUE(helper.isMatch("xyza"));
+ EXPECT_TRUE(helper.isMatch("xYza"));
+ EXPECT_TRUE(helper.isMatch("xyz"));
+ EXPECT_TRUE(helper.isMatch("XyZ"));
+}
+
+TEST("test cased match") {
+ QueryTermUCS4 xyz("XyZ", QueryTermSimple::Type::WORD);
+ StringSearchHelper helper(xyz, true);
+ EXPECT_TRUE(helper.isCased());
+ EXPECT_FALSE(helper.isPrefix());
+ EXPECT_FALSE(helper.isRegex());
+ EXPECT_FALSE(helper.isMatch("aXyZ"));
+ EXPECT_FALSE(helper.isMatch("XyZa"));
+ EXPECT_FALSE(helper.isMatch("xyz"));
+ EXPECT_FALSE(helper.isMatch("Xyz"));
+ EXPECT_TRUE(helper.isMatch("XyZ"));
+}
+
+TEST("test cased prefix match") {
+ QueryTermUCS4 xyz("XyZ", QueryTermSimple::Type::PREFIXTERM);
+ StringSearchHelper helper(xyz, true);
+ EXPECT_TRUE(helper.isCased());
+ EXPECT_TRUE(helper.isPrefix());
+ EXPECT_FALSE(helper.isRegex());
+ EXPECT_FALSE(helper.isMatch("aXyZ"));
+ EXPECT_TRUE(helper.isMatch("XyZa"));
+ EXPECT_FALSE(helper.isMatch("xyZa"));
+ EXPECT_FALSE(helper.isMatch("xyz"));
+ EXPECT_FALSE(helper.isMatch("Xyz"));
+ EXPECT_TRUE(helper.isMatch("XyZ"));
+}
+
+TEST("test uncased regex match") {
+ QueryTermUCS4 xyz("x[yY]+Z", QueryTermSimple::Type::REGEXP);
+ StringSearchHelper helper(xyz, false);
+ EXPECT_FALSE(helper.isCased());
+ EXPECT_FALSE(helper.isPrefix());
+ EXPECT_TRUE(helper.isRegex());
+ EXPECT_TRUE(helper.isMatch("axyZ"));
+ EXPECT_TRUE(helper.isMatch("xyZa"));
+ EXPECT_TRUE(helper.isMatch("xyZ"));
+ EXPECT_TRUE(helper.isMatch("xyz"));
+ EXPECT_FALSE(helper.isMatch("xyaZ"));
+}
+
+TEST("test cased regex match") {
+ QueryTermUCS4 xyz("x[Y]+Z", QueryTermSimple::Type::REGEXP);
+ StringSearchHelper helper(xyz, true);
+ EXPECT_TRUE(helper.isCased());
+ EXPECT_FALSE(helper.isPrefix());
+ EXPECT_TRUE(helper.isRegex());
+ EXPECT_TRUE(helper.isMatch("axYZ"));
+ EXPECT_TRUE(helper.isMatch("xYZa"));
+ EXPECT_FALSE(helper.isMatch("xyZ"));
+ EXPECT_TRUE(helper.isMatch("xYZ"));
+ EXPECT_FALSE(helper.isMatch("xYz"));
+ EXPECT_FALSE(helper.isMatch("xaYZ"));
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
index ced5ccfdc85..6387edc588b 100644
--- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
@@ -3,13 +3,11 @@
#include "configconverter.h"
using namespace vespa::config::search;
-using namespace search;
+namespace search::attribute {
namespace {
-using search::attribute::CollectionType;
-using search::attribute::BasicType;
using vespalib::eval::ValueType;
using vespalib::eval::CellType;
@@ -81,9 +79,18 @@ convert_dictionary(const AttributesConfig::Attribute::Dictionary & dictionary) {
return DictionaryConfig(convert(dictionary.type), convert(dictionary.match));
}
+Config::Match
+convertMatch(AttributesConfig::Attribute::Match match_cfg) {
+ switch (match_cfg) {
+ case AttributesConfig::Attribute::Match::CASED:
+ return Config::Match::CASED;
+ case AttributesConfig::Attribute::Match::UNCASED:
+ return Config::Match::UNCASED;
+ }
+ assert(false);
}
-namespace search::attribute {
+}
Config
ConfigConverter::convert(const AttributesConfig::Attribute & cfg)
@@ -106,6 +113,7 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg)
predicateParams.setDensePostingListThreshold(cfg.densepostinglistthreshold);
retval.setPredicateParams(predicateParams);
retval.set_dictionary_config(convert_dictionary(cfg.dictionary));
+ retval.set_match(convertMatch(cfg.match));
using CfgDm = AttributesConfig::Attribute::Distancemetric;
DistanceMetric dm(DistanceMetric::Euclidean);
switch (cfg.distancemetric) {
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
index a308fc06af0..4be86652541 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
@@ -15,6 +15,57 @@ LOG_SETUP(".searchlib.attribute.stringbase");
namespace search {
+StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased)
+ : _regex(),
+ _term(),
+ _termLen(),
+ _isPrefix(term.isPrefix()),
+ _isRegex(term.isRegex()),
+ _isCased(cased)
+{
+ if (isRegex()) {
+ if (isCased()) {
+ _regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None);
+ } else {
+ _regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase);
+ }
+ } else if (isCased()) {
+ _term._char = term.getTerm();
+ _termLen = term.getTermLen();
+ } else {
+ term.term(_term._ucs4);
+ }
+}
+
+StringSearchHelper::~StringSearchHelper()
+{
+ if (isRegex()) {
+
+ }
+}
+
+bool
+StringSearchHelper::isMatch(const char *src) const {
+ if (__builtin_expect(isRegex(), false)) {
+ return getRegex().valid() ? getRegex().partial_match(std::string_view(src)) : false;
+ }
+ if (__builtin_expect(isCased(), false)) {
+ int res = strncmp(_term._char, src, _termLen);
+ return (res == 0) && (src[_termLen] == 0 || isPrefix());
+ }
+ vespalib::Utf8ReaderForZTS u8reader(src);
+ uint32_t j = 0;
+ uint32_t val;
+ for (;; ++j) {
+ val = u8reader.getChar();
+ val = vespalib::LowerCase::convert(val);
+ if (_term._ucs4[j] == 0 || _term._ucs4[j] != val) {
+ break;
+ }
+ }
+ return (_term._ucs4[j] == 0 && (val == 0 || isPrefix()));
+}
+
IMPLEMENT_IDENTIFIABLE_ABSTRACT(StringAttribute, AttributeVector);
using attribute::LoadedEnumAttribute;
@@ -225,16 +276,8 @@ StringAttribute::StringSearchContext::StringSearchContext(QueryTermSimple::UP qT
const StringAttribute & toBeSearched) :
SearchContext(toBeSearched),
_queryTerm(static_cast<QueryTermUCS4 *>(qTerm.release())),
- _termUCS4(nullptr),
- _regex(),
- _isPrefix(_queryTerm->isPrefix()),
- _isRegex(_queryTerm->isRegex())
+ _helper(*_queryTerm, toBeSearched.getConfig().get_match() == Config::Match::CASED)
{
- if (isRegex()) {
- _regex = vespalib::Regex::from_pattern(_queryTerm->getTerm(), vespalib::Regex::Options::IgnoreCase);
- } else {
- _queryTerm->term(_termUCS4);
- }
}
StringAttribute::StringSearchContext::~StringSearchContext() = default;
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h
index b8fef783d58..f051fb07e71 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.h
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h
@@ -16,6 +16,27 @@
namespace search {
+class StringSearchHelper {
+public:
+ StringSearchHelper(QueryTermUCS4 & qTerm, bool cased);
+ ~StringSearchHelper();
+ bool isMatch(const char *src) const;
+ bool isPrefix() const { return _isPrefix; }
+ bool isRegex() const { return _isRegex; }
+ bool isCased() const { return _isCased; }
+ const vespalib::Regex & getRegex() const { return _regex; }
+private:
+ vespalib::Regex _regex;
+ union {
+ const ucs4_t *_ucs4;
+ const char *_char;
+ } _term;
+ uint32_t _termLen;
+ bool _isPrefix;
+ bool _isRegex;
+ bool _isCased;
+};
+
class ReaderBase;
class StringAttribute : public AttributeVector
@@ -97,24 +118,13 @@ protected:
~StringSearchContext() override;
protected:
bool valid() const override;
-
const QueryTermUCS4 * queryTerm() const override;
- bool isMatch(const char *src) const {
- if (__builtin_expect(isRegex(), false)) {
- return _regex.valid() ? _regex.partial_match(std::string_view(src)) : false;
- }
- vespalib::Utf8ReaderForZTS u8reader(src);
- uint32_t j = 0;
- uint32_t val;
- for (;; ++j) {
- val = u8reader.getChar();
- val = vespalib::LowerCase::convert(val);
- if (_termUCS4[j] == 0 || _termUCS4[j] != val) {
- break;
- }
- }
- return (_termUCS4[j] == 0 && (val == 0 || isPrefix()));
- }
+ bool isMatch(const char *src) const { return _helper.isMatch(src); }
+ bool isPrefix() const { return _helper.isPrefix(); }
+ bool isRegex() const { return _helper.isRegex(); }
+ bool isCased() const { return _helper.isCased(); }
+ const vespalib::Regex & getRegex() const { return _helper.getRegex(); }
+
class CollectHitCount {
public:
CollectHitCount() : _hitCount(0) { }
@@ -151,16 +161,9 @@ protected:
}
return -1;
}
-
- bool isPrefix() const { return _isPrefix; }
- bool isRegex() const { return _isRegex; }
- const vespalib::Regex & getRegex() const { return _regex; }
private:
std::unique_ptr<QueryTermUCS4> _queryTerm;
- const ucs4_t *_termUCS4;
- vespalib::Regex _regex;
- bool _isPrefix;
- bool _isRegex;
+ StringSearchHelper _helper;
};
};
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h
index 93b19212926..d0dfbee010f 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h
@@ -63,7 +63,6 @@ public:
virtual void visitMembers(vespalib::ObjectVisitor &visitor) const;
vespalib::string getClassName() const;
bool isValid() const { return _valid; }
-protected:
const string & getTermString() const { return _term; }
private:
bool getRangeInternal(int64_t & low, int64_t & high) const;
diff --git a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h
index 00ac59d729e..b8735eb30f0 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_ucs4.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_ucs4.h
@@ -12,7 +12,6 @@ namespace search {
*/
class QueryTermUCS4 : public QueryTermSimple {
public:
- typedef std::unique_ptr<QueryTermUCS4> UP;
QueryTermUCS4(const QueryTermUCS4 &) = delete;
QueryTermUCS4 & operator = (const QueryTermUCS4 &) = delete;
QueryTermUCS4(QueryTermUCS4 &&) = delete;