diff options
12 files changed, 180 insertions, 186 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 64952dbe5b5..a691d7671f9 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -22,6 +22,7 @@ using search::streaming::HitList; using search::streaming::QueryNodeResultFactory; using search::streaming::QueryTerm; using search::streaming::Normalizing; +using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; using search::streaming::QueryTermList; using TermType = QueryTerm::Type; using namespace vsm; @@ -763,28 +764,32 @@ TEST("snippet modifier") { } } -TEST("FieldSearchSpec constrution") { +TEST("FieldSearchSpec construction") { { FieldSearchSpec f; EXPECT_FALSE(f.valid()); EXPECT_EQUAL(0u, f.id()); EXPECT_EQUAL("", f.name()); EXPECT_EQUAL(0x100000u, f.maxLength()); + EXPECT_EQUAL("", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode()); } { - FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789); + FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789); EXPECT_TRUE(f.valid()); EXPECT_EQUAL(7u, f.id()); EXPECT_EQUAL("f0", f.name()); EXPECT_EQUAL(789u, f.maxLength()); EXPECT_EQUAL(789u, f.searcher().maxFieldLength()); + EXPECT_EQUAL("substring", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode()); } } TEST("snippet modifier manager") { FieldSearchSpecMapT specMap; - specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000); - specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000); + specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000); + specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000); IndexFieldMapT indexMap; indexMap["i0"].push_back(0); indexMap["i1"].push_back(1); diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def index 442a044d38f..dac732013d2 100644 --- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def +++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def @@ -14,6 +14,7 @@ fieldspec[].name string ## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected. fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8 fieldspec[].arg1 string default="" +fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD ## Maximum number of chars to search per field. fieldspec[].maxlength int default=1048576 diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index 55d80413b8c..b9e1fe8f83c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -55,10 +55,8 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept _currentElementId(0), _currentElementWeight(1), _words(0), - _badUtf8Count(0), - _zeroCount(0) + _badUtf8Count(0) { - zeroStat(); } FieldSearcher::~FieldSearcher() = default; @@ -114,13 +112,6 @@ FieldSearcher::prepareFieldId() } void -FieldSearcher::zeroStat() -{ - _badUtf8Count = 0; - _zeroCount = 0; -} - -void FieldSearcher::init() { for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) { diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index 663592ed6d3..75ace16328b 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -59,13 +59,13 @@ public: bool exact() const { return _matchType == EXACT; } bool cased() const { return _matchType == CASED; } void setMatchType(MatchType mt) { _matchType = mt; } + MatchType match_type() const noexcept { return _matchType; } static void init(); static search::byte fold(search::byte c) { return _foldLowCase[c]; } static search::byte iswordchar(search::byte c) { return _wordChar[c]; } static search::byte isspace(search::byte c) { return ! iswordchar(c); } static size_t countWords(const FieldRef & f); int32_t getCurrentWeight() const { return _currentElementWeight; } - void zeroStat(); FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } size_t maxFieldLength() const { return _maxFieldLength; } @@ -96,11 +96,9 @@ private: int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. protected: /// Number of terms searched. - unsigned _words; + unsigned _words; /// Number of utf8 bytes by utf8 size. - unsigned _badUtf8Count; - unsigned _zeroCount; -protected: + unsigned _badUtf8Count; /** * Adds a hit to the given query term. * For each call to onValue() a batch of words are processed, and the position is local to this batch. diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp index 76fedbd1166..816317bf86d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp @@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv) } DistanceMetric -NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value) +NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value) { // Valid string values must match the definition of DistanceMetric in // config-model/src/main/java/com/yahoo/schema/document/Attribute.java - auto v = value; + vespalib::string v = value; std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c) { return std::tolower(c); }); try { return DistanceMetricUtils::to_distance_metric(v); } catch (vespalib::IllegalStateException&) { - vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str()); + vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", v.c_str()); return DistanceMetric::Euclidean; } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h index d08c2fbbc83..ecdc64d1336 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h @@ -11,10 +11,7 @@ #include <vespa/searchlib/tensor/tensor_ext_attribute.h> namespace search::fef { class IQueryEnvironment; } - -namespace search::tensor { -class TensorExtAttribute; -} +namespace search::tensor { class TensorExtAttribute; } namespace vsm { @@ -52,7 +49,7 @@ public: search::fef::IQueryEnvironment& query_env) override; void onValue(const document::FieldValue& fv) override; - static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value); + static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value); }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index 00828bcc7b1..fa1fc83728c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -18,17 +18,15 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - const byte * e = n + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index 1148083b042..ce63f55ea63 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -1,7 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8stringfieldsearcherbase.h" -#include <vespa/fastlib/text/normwordfolder.h> #include <cassert> using search::streaming::QueryTerm; @@ -10,107 +9,36 @@ using search::byte; namespace vsm { -const byte * -UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen) -{ - if (maxSz > 0) { - maxSz--; - } - ucs4_t c(*p); - ucs4_t *q(dstbuf); - const byte * end(p+maxSz); - - // Skip non-word characters between words - for (; p < end; ) { - if (c < 128) { - if (!c) { break; } - p++; - if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = 0; - } else { - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (Fast_UnicodeUtil::IsWordChar(c)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - break; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - c = *p; - } - } - } - - c = *p; // Next char - for (; p < end;) { - if (c < 128) { // Common case, ASCII - if (!c) { break; } - p++; - if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - c = 0; - } else { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - - c = *p; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - break; - } +template<typename Reader> +void +UTF8StringFieldSearcherBase::tokenize(Reader & reader) { + ucs4_t c(0); + Normalizing norm_mode = normalize_mode(); + while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); + + if (Fast_UnicodeUtil::IsWordChar(c)) { + reader.normalize(c, norm_mode); + while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { + reader.normalize(c, norm_mode); } } - *q = 0; - tokenlen = q - dstbuf; - return p; } size_t UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) { termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - // __builtin_prefetch(n, 0, 0); const cmptype_t * term; termsize_t tsz = qt.term(term); - const byte * e = n + f.size(); if ( f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); + cmptype_t * fn = _buf->data(); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { const cmptype_t *tt=term, *et=term+tsz; for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); @@ -185,22 +113,17 @@ size_t UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) { termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); const cmptype_t * term; termsize_t tsz = qt.term(term); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; + cmptype_t * dstbuf = _buf->data(); - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { addHit(qt, words); } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index 1362b3c4f1d..ed76fb79f4e 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -2,6 +2,7 @@ #pragma once #include "strchrfieldsearcher.h" +#include <vespa/fastlib/text/normwordfolder.h> namespace vsm { @@ -34,9 +35,9 @@ public: void onOffset(size_t) { } void incBuf(size_t inc) { _cbuf += inc; } ucs4_t * getBuf() { return _cbuf; } - bool valid() { return true; } - size_t size() { return (_cbuf - _bbuf); } - bool hasOffsets() { return false; } + bool valid() const noexcept { return true; } + size_t size() const noexcept { return (_cbuf - _bbuf); } + bool hasOffsets() const noexcept { return false; } }; /** @@ -53,14 +54,81 @@ public: explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } void onOffset(size_t of) { *_coff++ = of; } - bool valid() { return (size() == (size_t)(_coff - _boff)); } - bool hasOffsets() { return true; } + bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); } + bool hasOffsets() const noexcept { return true; } }; protected: SharedSearcherBuf _buf; - const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen); + using byte = search::byte; + using Normalizing = search::streaming::Normalizing; + + class TokenizeReader { + public: + TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept + : _p(p), + _p_end(p + len), + _q(q), + _q_start(q) + {} + ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } + void normalize(ucs4_t c, Normalizing normalize_mode) { + switch (normalize_mode) { + case Normalizing::LOWERCASE: + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + [[fallthrough]]; + case Normalizing::NONE: + *_q++ = c; + break; + case Normalizing::LOWERCASE_AND_FOLD: + fold(c); + break; + } + } + bool hasNext() const noexcept { return _p < _p_end; } + const byte * p() const noexcept { return _p; } + size_t complete() noexcept { + *_q = 0; + size_t token_len = _q - _q_start; + _q = _q_start; + return token_len; + } + private: + void fold(ucs4_t c) { + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); + if (repl != nullptr) { + size_t repllen = strlen(repl); + if (repllen > 0) { + _q = Fast_UnicodeUtil::ucs4copy(_q,repl); + } + } else { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } + } + void lowercase(ucs4_t c) { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } + const byte *_p; + const byte *_p_end; + ucs4_t *_q; + ucs4_t *_q_start; + }; + + + template<typename Reader> + void tokenize(Reader & reader); + + Normalizing normalize_mode() const noexcept { + switch (match_type()) { + case EXACT: return Normalizing::LOWERCASE; + case CASED: return Normalizing::NONE; + default: return Normalizing::LOWERCASE_AND_FOLD; + } + return Normalizing::LOWERCASE_AND_FOLD; + } /** * Matches the given query term against the words in the given field reference diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp index e28ce114225..4318d5fe1a3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -14,24 +14,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const } size_t -UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 4b0efd58a56..22934ba74d2 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -40,6 +40,8 @@ setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { searcher->setMatchType(FieldSearcher::EXACT); } else if (arg1 == "word") { searcher->setMatchType(FieldSearcher::EXACT); + } else if (arg1 == "cased") { + searcher->setMatchType(FieldSearcher::CASED); } } @@ -51,6 +53,7 @@ FieldSearchSpec::FieldSearchSpec() _maxLength(0x100000), _searcher(), _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), _arg1(), _reconfigured(false) { @@ -60,15 +63,15 @@ FieldSearchSpec::~FieldSearchSpec() = default; FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default; FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default; -FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, - VsmfieldsConfig::Fieldspec::Searchmethod searchDef, - const vespalib::string & arg1, size_t maxLength_) : +FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef, + Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) : _id(fid), _name(fname), - _maxLength(maxLength_), + _maxLength(maxLength_in), _searcher(), _searchMethod(searchDef), - _arg1(arg1), + _normalize_mode(normalize_mode), + _arg1(arg1_in), _reconfigured(false) { switch(searchDef) { @@ -79,14 +82,16 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: - if (arg1 == "substring") { + if (_arg1 == "substring") { _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid); - } else if (arg1 == "suffix") { + } else if (_arg1 == "suffix") { _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid); - } else if (arg1 == "exact") { + } else if (_arg1 == "exact") { _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (arg1 == "word") { + } else if (_arg1 == "word") { _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); + } else if (_arg1 == "cased") { + _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) { _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); } else { @@ -112,12 +117,12 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & _searcher = std::make_unique<GeoPosFieldSearcher>(fid); break; case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR: - auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1); + auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1); _searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm); break; } if (_searcher) { - setMatchType(_searcher, arg1); + setMatchType(_searcher, _arg1); _searcher->maxFieldLength(maxLength()); } } @@ -166,20 +171,20 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default; FieldSearchSpecMap::~FieldSearchSpecMap() = default; namespace { - const std::string _G_empty(""); - const std::string _G_value(".value"); - const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}"); - const std::regex _G_map2("\\{\".*\"\\}"); - const std::regex _G_array("\\[[0-9]+\\]"); + const std::string G_empty; + const std::string G_value(".value"); + const std::regex G_map1("\\{[a-zA-Z0-9]+\\}"); + const std::regex G_map2("\\{\".*\"\\}"); + const std::regex G_array("\\[[0-9]+\\]"); } vespalib::string FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex) { if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) { - std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value); - index = std::regex_replace(index, _G_map2, _G_value); - index = std::regex_replace(index, _G_array, _G_empty); + std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value); + index = std::regex_replace(index, G_map2, G_value); + index = std::regex_replace(index, G_array, G_empty); return index; } return rawIndex; @@ -258,17 +263,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch return ifm; } +search::streaming::Normalizing +normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { + switch (normalize_mode) { + case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD; + } + return search::streaming::Normalizing::LOWERCASE_AND_FOLD; } -bool +} + +void FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) { - bool retval(true); LOG(spam, "Parsing %zd fields", conf->fieldspec.size()); for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) { LOG(spam, "Parsing %s", cfs.name.c_str()); FieldIdT fieldId = specMap().size(); - FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength); + FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength); _specMap[fieldId] = std::move(fss); _nameIdMap.add(cfs.name, fieldId); LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str()); @@ -283,7 +297,6 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) } _documentTypeMap[di.name] = indexMapp; } - return retval; } void @@ -338,7 +351,7 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const if (!itr->second.uses_nearest_neighbor_search_method()) { return dm; } - return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1()); + return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1()); } vespalib::asciistream & diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h index 43bb5b04481..7ba9799991e 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h @@ -10,25 +10,29 @@ namespace vsm { class FieldSearchSpec { public: + using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; + using Normalizing = search::streaming::Normalizing; FieldSearchSpec(); - FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, - VsmfieldsConfig::Fieldspec::Searchmethod searchMethod, - const vespalib::string & arg1, size_t maxLength); + FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod, + Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength); ~FieldSearchSpec(); FieldSearchSpec(FieldSearchSpec&& rhs) noexcept; FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept; - const FieldSearcher & searcher() const { return *_searcher; } - const vespalib::string & name() const { return _name; } - FieldIdT id() const { return _id; } - bool valid() const { return static_cast<bool>(_searcher); } - size_t maxLength() const { return _maxLength; } - bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; } + const FieldSearcher & searcher() const noexcept { return *_searcher; } + const vespalib::string & name() const noexcept { return _name; } + FieldIdT id() const noexcept { return _id; } + bool valid() const noexcept { return static_cast<bool>(_searcher); } + size_t maxLength() const noexcept { return _maxLength; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + const vespalib::string& arg1() const noexcept { return _arg1; } + bool uses_nearest_neighbor_search_method() const noexcept { + return _searchMethod == Searchmethod::NEAREST_NEIGHBOR; + } bool uses_string_search_method() const noexcept { - return (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) || - (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8) || - (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8); + return (_searchMethod == Searchmethod::UTF8) || + (_searchMethod == Searchmethod::AUTOUTF8) || + (_searchMethod == Searchmethod::SSE2UTF8); } - const vespalib::string& get_arg1() const noexcept { return _arg1; } /** * Reconfigures the field searcher based on information in the given query term. @@ -42,7 +46,8 @@ private: vespalib::string _name; size_t _maxLength; FieldSearcherContainer _searcher; - VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod; + Searchmethod _searchMethod; + Normalizing _normalize_mode; vespalib::string _arg1; bool _reconfigured; }; @@ -60,7 +65,7 @@ public: * and a mapping from field name to field id. It then iterates over all document types and index names * and creates a mapping from index name to list of field ids for each document type. **/ - bool buildFromConfig(const VsmfieldsHandle & conf); + void buildFromConfig(const VsmfieldsHandle & conf); /** * Iterates over the given field name vector adding extra elements to the mapping from field name to field id. |