diff options
Diffstat (limited to 'streamingvisitors')
30 files changed, 281 insertions, 310 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 83b84fffa11..74d8fdc4bf3 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -22,6 +22,7 @@ using search::streaming::HitList; using search::streaming::QueryNodeResultFactory; using search::streaming::QueryTerm; using search::streaming::Normalizing; +using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; using search::streaming::QueryTermList; using TermType = QueryTerm::Type; using namespace vsm; @@ -114,7 +115,7 @@ struct SnippetModifierSetup SnippetModifierSetup::SnippetModifierSetup(const StringList & terms) : query(terms), - searcher(new UTF8SubstringSnippetModifier()), + searcher(new UTF8SubstringSnippetModifier(0)), env(), modifier(searcher) { @@ -361,7 +362,7 @@ assertFieldInfo(FieldSearcher & fs, const StringList & query, void assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp) { - UTF8SubstringSnippetModifier mod; + UTF8SubstringSnippetModifier mod(0); performSearch(mod, query, StringFieldValue(fv)); EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size()); std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos()); @@ -440,11 +441,11 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits())); assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "oper", field, Hits().add(0).add(2)); assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits())); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false; { // test handling of several underscores @@ -553,12 +554,12 @@ TEST("utf8 substring search with empty term") TEST("utf8 suffix search") { UTF8SuffixStringFieldSearcher fs(0); std::string field = "operators and operator overloading"; - assertString(fs, "rsand", field, Hits()); - assertString(fs, "tor", field, Hits().add(2)); - assertString(fs, "tors", field, Hits().add(0)); + TEST_DO(assertString(fs, "rsand", field, Hits())); + TEST_DO(assertString(fs, "tor", field, Hits().add(2))); + TEST_DO(assertString(fs, "tors", field, Hits().add(0))); - assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())); - assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); + TEST_DO(assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()))); + TEST_DO(assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)))); EXPECT_TRUE(testStringFieldInfo(fs)); } @@ -590,22 +591,22 @@ TEST("utf8 flexible searcher"){ // prefix assertString(fs, "vesp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "vesp", "vespa", Hits().add(0)); // substring - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*esp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUBSTRING); + fs.match_type(FieldSearcher::SUBSTRING); assertString(fs, "esp", "vespa", Hits().add(0)); // suffix - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUFFIX); + fs.match_type(FieldSearcher::SUFFIX); assertString(fs, "espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); EXPECT_TRUE(testStringFieldInfo(fs)); } @@ -659,7 +660,7 @@ TEST("integer search") TEST("floating point search") { - FloatFieldSearcher fs; + FloatFieldSearcher fs(0); TEST_DO(assertFloat(fs, "10", 10, true)); TEST_DO(assertFloat(fs, "10.5", 10.5, true)); TEST_DO(assertFloat(fs, "-10.5", -10.5, true)); @@ -726,7 +727,7 @@ TEST("Snippet modifier search") { "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8"); { // check that resizing works - UTF8SubstringSnippetModifier mod; + UTF8SubstringSnippetModifier mod(0); EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u); EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u); performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa")); @@ -763,28 +764,32 @@ TEST("snippet modifier") { } } -TEST("FieldSearchSpec constrution") { +TEST("FieldSearchSpec construction") { { FieldSearchSpec f; EXPECT_FALSE(f.valid()); EXPECT_EQUAL(0u, f.id()); EXPECT_EQUAL("", f.name()); EXPECT_EQUAL(0x100000u, f.maxLength()); + EXPECT_EQUAL("", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode()); } { - FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789); + FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789); EXPECT_TRUE(f.valid()); EXPECT_EQUAL(7u, f.id()); EXPECT_EQUAL("f0", f.name()); EXPECT_EQUAL(789u, f.maxLength()); EXPECT_EQUAL(789u, f.searcher().maxFieldLength()); + EXPECT_EQUAL("substring", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode()); } } TEST("snippet modifier manager") { FieldSearchSpecMapT specMap; - specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000); - specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000); + specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000); + specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000); IndexFieldMapT indexMap; indexMap["i0"].push_back(0); indexMap["i1"].push_back(1); diff --git a/streamingvisitors/src/tests/textutil/textutil_test.cpp b/streamingvisitors/src/tests/textutil/textutil_test.cpp index b926444e4df..f7f340a2182 100644 --- a/streamingvisitors/src/tests/textutil/textutil_test.cpp +++ b/streamingvisitors/src/tests/textutil/textutil_test.cpp @@ -2,7 +2,6 @@ #include <vespa/vespalib/testkit/testapp.h> #include <vespa/fastlib/text/normwordfolder.h> -#include <vespa/searchlib/query/base.h> #include <vespa/vsm/searcher/fold.h> #include <vespa/vsm/searcher/futf8strchrfieldsearcher.h> #include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> @@ -66,7 +65,7 @@ TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V & const byte * srcbuf = reinterpret_cast<const byte *>(input); auto dstbuf = std::make_unique<ucs4_t[]>(len + 1); auto offsets = std::make_unique<size_t[]>(len + 1); - UTF8StrChrFieldSearcher fs; + UTF8StrChrFieldSearcher fs(0); BW bw(dstbuf.get(), offsets.get()); size_t dstlen = fs.skipSeparators(srcbuf, len, bw); EXPECT_EQUAL(dstlen, expdstbuf.size()); diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp index 4161adaf21f..cdd1a018d84 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -42,6 +42,7 @@ using search::aggregation::HitsAggregationResult; using search::attribute::IAttributeVector; using search::expression::ConfigureStaticParams; using search::streaming::Query; +using search::streaming::Normalizing; using search::streaming::QueryTermList; using storage::StorageComponent; using storage::VisitorEnvironment; @@ -326,20 +327,41 @@ SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept { return false; } +namespace { + +uint32_t +count_normalize_lowercase(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { + size_t count = 0; + for (const auto & fieldId : fieldIdMap.map()) { + auto found = specMap.find(fieldId.second); + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::LOWERCASE) { + count++; + } + } + return count; +} + +uint32_t +count_normalize_none(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { + size_t count = 0; + for (const auto & fieldId : fieldIdMap.map()) { + auto found = specMap.find(fieldId.second); + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::NONE) { + count++; + } + } + return count; +} + +} + SearchMethodInfo::Normalizing SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept { StringFieldIdTMap fieldIdMap; _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap); - size_t num_exact = 0; - for (const auto & fieldId : fieldIdMap.map()) { - auto found = _fieldSearchSpecMap.specMap().find(fieldId.second); - if ((found != _fieldSearchSpecMap.specMap().end()) && found->second.searcher().exact()) { - num_exact++; - } - } - return ((num_exact == 0) || (num_exact != fieldIdMap.map().size())) - ? Normalizing::LOWERCASE_AND_FOLD - : Normalizing::LOWERCASE; + if (count_normalize_none(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::NONE; + if (count_normalize_lowercase(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::LOWERCASE; + return Normalizing::LOWERCASE_AND_FOLD; } void diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def index 442a044d38f..dac732013d2 100644 --- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def +++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def @@ -14,6 +14,7 @@ fieldspec[].name string ## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected. fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8 fieldspec[].arg1 string default="" +fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD ## Maximum number of chars to search per field. fieldspec[].maxlength int default=1048576 diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h index c7e7d2e74bd..3708cca85fb 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h @@ -9,8 +9,8 @@ class BoolFieldSearcher : public FieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - BoolFieldSearcher(FieldIdT fId); - ~BoolFieldSearcher(); + explicit BoolFieldSearcher(FieldIdT fId); + ~BoolFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index 55d80413b8c..5e06ae41a03 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -51,14 +51,13 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept : FieldSearcherBase(), _field(fId), _matchType(defaultPrefix ? PREFIX : REGULAR), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), _maxFieldLength(0x100000), _currentElementId(0), _currentElementWeight(1), _words(0), - _badUtf8Count(0), - _zeroCount(0) + _badUtf8Count(0) { - zeroStat(); } FieldSearcher::~FieldSearcher() = default; @@ -71,7 +70,7 @@ FieldSearcher::search(const StorageDocument & doc) fInfo.setHitOffset(qt->getHitList().size()); } onSearch(doc); - for(auto qt : _qtl) { + for (auto qt : _qtl) { QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field()); fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset()); fInfo.setFieldLength(_words); @@ -114,13 +113,6 @@ FieldSearcher::prepareFieldId() } void -FieldSearcher::zeroStat() -{ - _badUtf8Count = 0; - _zeroCount = 0; -} - -void FieldSearcher::init() { for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) { diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index e64c41f814f..c5bca6f3899 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -34,12 +34,13 @@ protected: class FieldSearcher : public FieldSearcherBase { public: + using Normalizing = search::streaming::Normalizing; enum MatchType { REGULAR, PREFIX, SUBSTRING, SUFFIX, - EXACT + EXACT, }; explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {} @@ -50,20 +51,22 @@ public: virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env); - FieldIdT field() const { return _field; } - void field(FieldIdT v) { _field = v; prepareFieldId(); } - bool prefix() const { return _matchType == PREFIX; } - bool substring() const { return _matchType == SUBSTRING; } - bool suffix() const { return _matchType == SUFFIX; } - bool exact() const { return _matchType == EXACT; } - void setMatchType(MatchType mt) { _matchType = mt; } + FieldIdT field() const noexcept { return _field; } + bool prefix() const noexcept { return _matchType == PREFIX; } + bool substring() const noexcept { return _matchType == SUBSTRING; } + bool suffix() const noexcept { return _matchType == SUFFIX; } + bool exact() const noexcept { return _matchType == EXACT; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + MatchType match_type() const noexcept { return _matchType; } + void match_type(MatchType mt) noexcept { _matchType = mt; } + void normalize_mode(Normalizing mode) noexcept { _normalize_mode = mode; } + void field(FieldIdT v) noexcept { _field = v; prepareFieldId(); } static void init(); static search::byte fold(search::byte c) { return _foldLowCase[c]; } static search::byte iswordchar(search::byte c) { return _wordChar[c]; } static search::byte isspace(search::byte c) { return ! iswordchar(c); } static size_t countWords(const FieldRef & f); - int32_t getCurrentWeight() const { return _currentElementWeight; } - void zeroStat(); + int32_t currentWeight() const { return _currentElementWeight; } FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } size_t maxFieldLength() const { return _maxFieldLength; } @@ -89,22 +92,21 @@ private: virtual void onStructValue(const document::StructFieldValue &) { } FieldIdT _field; MatchType _matchType; + Normalizing _normalize_mode; unsigned _maxFieldLength; uint32_t _currentElementId; int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. protected: /// Number of terms searched. - unsigned _words; + unsigned _words; /// Number of utf8 bytes by utf8 size. - unsigned _badUtf8Count; - unsigned _zeroCount; -protected: + unsigned _badUtf8Count; /** * Adds a hit to the given query term. * For each call to onValue() a batch of words are processed, and the position is local to this batch. **/ void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const { - qt.add(_words + pos, field(), _currentElementId, getCurrentWeight()); + qt.add(_words + pos, field(), _currentElementId, _currentElementWeight); } public: static search::byte _foldLowCase[256]; diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h index 07b3f6e1c5f..85341472c26 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h @@ -9,8 +9,8 @@ template <typename T> class FloatFieldSearcherT : public FieldSearcher { public: - FloatFieldSearcherT(FieldIdT fId=0); - ~FloatFieldSearcherT(); + explicit FloatFieldSearcherT(FieldIdT fId); + ~FloatFieldSearcherT() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, @@ -42,14 +42,14 @@ class FloatFieldSearcher : public FloatFieldSearcherTF { public: std::unique_ptr<FieldSearcher> duplicate() const override; - FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { } + explicit FloatFieldSearcher(FieldIdT fId) : FloatFieldSearcherTF(fId) { } }; class DoubleFieldSearcher : public FloatFieldSearcherTD { public: std::unique_ptr<FieldSearcher> duplicate() const override; - DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { } + DoubleFieldSearcher(FieldIdT fId) : FloatFieldSearcherTD(fId) { } }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp index d7d73899e53..c0b5117d6bf 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp @@ -19,10 +19,6 @@ FUTF8StrChrFieldSearcher::duplicate() const return std::make_unique<FUTF8StrChrFieldSearcher>(*this); } -FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher() - : UTF8StrChrFieldSearcher(), - _folded(4_Ki) -{ } FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StrChrFieldSearcher(fId), _folded(4_Ki) diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h index 5d5ca3d6c3c..b8aa287070a 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h @@ -9,15 +9,14 @@ class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - FUTF8StrChrFieldSearcher(); - FUTF8StrChrFieldSearcher(FieldIdT fId); + explicit FUTF8StrChrFieldSearcher(FieldIdT fId); ~FUTF8StrChrFieldSearcher() override; static bool ansiFold(const char * toFold, size_t sz, char * folded); static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart); static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart); private: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef&, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef&, size_t shortestTerm) override; virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt); size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize); std::vector<char> _folded; diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h index 741148fbca1..17c9f23fefb 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h @@ -8,8 +8,8 @@ namespace vsm { class GeoPosFieldSearcher : public FieldSearcher { public: - GeoPosFieldSearcher(FieldIdT fId=0); - ~GeoPosFieldSearcher(); + GeoPosFieldSearcher(FieldIdT fId); + ~GeoPosFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, @@ -21,7 +21,7 @@ protected: using GeoLocation = search::common::GeoLocation; class GeoPosInfo : public GeoLocation { public: - GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} + explicit GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} bool cmp(const document::StructFieldValue & fv) const; }; using GeoPosInfoListT = std::vector<GeoPosInfo>; diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h index 47b83c1538d..9c63d31e3c3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h @@ -9,8 +9,8 @@ class IntFieldSearcher : public FieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - IntFieldSearcher(FieldIdT fId=0); - ~IntFieldSearcher(); + explicit IntFieldSearcher(FieldIdT fId); + ~IntFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp index 76fedbd1166..816317bf86d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp @@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv) } DistanceMetric -NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value) +NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value) { // Valid string values must match the definition of DistanceMetric in // config-model/src/main/java/com/yahoo/schema/document/Attribute.java - auto v = value; + vespalib::string v = value; std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c) { return std::tolower(c); }); try { return DistanceMetricUtils::to_distance_metric(v); } catch (vespalib::IllegalStateException&) { - vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str()); + vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", v.c_str()); return DistanceMetric::Euclidean; } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h index 5629b443c78..ecdc64d1336 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h @@ -11,10 +11,7 @@ #include <vespa/searchlib/tensor/tensor_ext_attribute.h> namespace search::fef { class IQueryEnvironment; } - -namespace search::tensor { -class TensorExtAttribute; -} +namespace search::tensor { class TensorExtAttribute; } namespace vsm { @@ -43,7 +40,7 @@ private: public: NearestNeighborFieldSearcher(FieldIdT fid, search::attribute::DistanceMetric metric); - ~NearestNeighborFieldSearcher(); + ~NearestNeighborFieldSearcher() override; std::unique_ptr<FieldSearcher> duplicate() const override; void prepare(search::streaming::QueryTermList& qtl, @@ -52,7 +49,7 @@ public: search::fef::IQueryEnvironment& query_env) override; void onValue(const document::FieldValue& fv) override; - static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value); + static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value); }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h index 9ad76712092..19c723d060d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h @@ -8,8 +8,7 @@ namespace vsm { class StrChrFieldSearcher : public FieldSearcher { public: - StrChrFieldSearcher() : FieldSearcher(0) { } - StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } + explicit StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } void onValue(const document::FieldValue & fv) override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, @@ -19,7 +18,7 @@ private: size_t shortestTerm() const; bool matchDoc(const FieldRef & field); virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0; + virtual size_t matchTerms(const FieldRef & f, size_t shortestTerm) = 0; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h index dd6f31581a0..aaf8b940dc8 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h @@ -17,10 +17,10 @@ protected: public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8ExactStringFieldSearcher(FieldIdT fId) + explicit UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { - setMatchType(EXACT); + match_type(EXACT); } }; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp index 655b068e152..78f491198ad 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp @@ -58,10 +58,6 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) } } -UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() : - UTF8StringFieldSearcherBase() -{ } - UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h index 5eee6a8862a..04fbee96d36 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h @@ -17,18 +17,17 @@ private: * Tries to match the given query term against the content of the given field reference. * Search strategy is choosen based on the query term type. **/ - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; /** * Tries to match each query term in the underlying query against the content of the given field reference. * Search strategy is choosen based on the query term type. **/ - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8FlexibleStringFieldSearcher(); - UTF8FlexibleStringFieldSearcher(FieldIdT fId); + explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId); }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index 651d1dcad9f..fa1fc83728c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -14,21 +14,19 @@ UTF8StrChrFieldSearcher::duplicate() const } size_t -UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - const byte * e = n + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h index cfe546bc6f6..663ee3a1a62 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h @@ -13,12 +13,10 @@ class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase { public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } - + explicit UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } protected: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index ebdf69d0b30..ce63f55ea63 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -1,7 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8stringfieldsearcherbase.h" -#include <vespa/fastlib/text/normwordfolder.h> #include <cassert> using search::streaming::QueryTerm; @@ -10,107 +9,36 @@ using search::byte; namespace vsm { -const byte * -UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen) -{ - if (maxSz > 0) { - maxSz--; - } - ucs4_t c(*p); - ucs4_t *q(dstbuf); - const byte * end(p+maxSz); - - // Skip non-word characters between words - for (; p < end; ) { - if (c < 128) { - if (!c) { break; } - p++; - if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = 0; - } else { - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (Fast_UnicodeUtil::IsWordChar(c)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - break; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - c = *p; - } - } - } - - c = *p; // Next char - for (; p < end;) { - if (c < 128) { // Common case, ASCII - if (!c) { break; } - p++; - if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - c = 0; - } else { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - - c = *p; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - break; - } +template<typename Reader> +void +UTF8StringFieldSearcherBase::tokenize(Reader & reader) { + ucs4_t c(0); + Normalizing norm_mode = normalize_mode(); + while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); + + if (Fast_UnicodeUtil::IsWordChar(c)) { + reader.normalize(c, norm_mode); + while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { + reader.normalize(c, norm_mode); } } - *q = 0; - tokenlen = q - dstbuf; - return p; } size_t UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) { termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - // __builtin_prefetch(n, 0, 0); const cmptype_t * term; termsize_t tsz = qt.term(term); - const byte * e = n + f.size(); if ( f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); + cmptype_t * fn = _buf->data(); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { const cmptype_t *tt=term, *et=term+tsz; for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); @@ -185,22 +113,17 @@ size_t UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) { termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); const cmptype_t * term; termsize_t tsz = qt.term(term); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; + cmptype_t * dstbuf = _buf->data(); - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { addHit(qt, words); } @@ -209,11 +132,6 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) return words; } -UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() : - StrChrFieldSearcher() -{ -} - UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) : StrChrFieldSearcher(fId) { diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index 38aac508f4f..115cddce619 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -2,6 +2,7 @@ #pragma once #include "strchrfieldsearcher.h" +#include <vespa/fastlib/text/normwordfolder.h> namespace vsm { @@ -28,15 +29,15 @@ public: ucs4_t * _cbuf; public: - BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { } - BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { } + explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { } + BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { } void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; } void onOffset(size_t) { } void incBuf(size_t inc) { _cbuf += inc; } ucs4_t * getBuf() { return _cbuf; } - bool valid() { return true; } - size_t size() { return (_cbuf - _bbuf); } - bool hasOffsets() { return false; } + bool valid() const noexcept { return true; } + size_t size() const noexcept { return (_cbuf - _bbuf); } + bool hasOffsets() const noexcept { return false; } }; /** @@ -50,17 +51,74 @@ public: size_t * _coff; public: - OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} + explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } void onOffset(size_t of) { *_coff++ = of; } - bool valid() { return (size() == (size_t)(_coff - _boff)); } - bool hasOffsets() { return true; } + bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); } + bool hasOffsets() const noexcept { return true; } }; protected: SharedSearcherBuf _buf; - const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen); + using byte = search::byte; + + class TokenizeReader { + public: + TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept + : _p(p), + _p_end(p + len), + _q(q), + _q_start(q) + {} + ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } + void normalize(ucs4_t c, Normalizing normalize_mode) { + switch (normalize_mode) { + case Normalizing::LOWERCASE: + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + [[fallthrough]]; + case Normalizing::NONE: + *_q++ = c; + break; + case Normalizing::LOWERCASE_AND_FOLD: + fold(c); + break; + } + } + bool hasNext() const noexcept { return _p < _p_end; } + const byte * p() const noexcept { return _p; } + size_t complete() noexcept { + *_q = 0; + size_t token_len = _q - _q_start; + _q = _q_start; + return token_len; + } + private: + void fold(ucs4_t c) { + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); + if (repl != nullptr) { + size_t repllen = strlen(repl); + if (repllen > 0) { + _q = Fast_UnicodeUtil::ucs4copy(_q,repl); + } + } else { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } + } + void lowercase(ucs4_t c) { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } + const byte *_p; + const byte *_p_end; + ucs4_t *_q; + ucs4_t *_q_start; + }; + + + template<typename Reader> + void tokenize(Reader & reader); /** * Matches the given query term against the words in the given field reference @@ -103,9 +161,8 @@ protected: size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt); public: - UTF8StringFieldSearcherBase(); - UTF8StringFieldSearcherBase(FieldIdT fId); - ~UTF8StringFieldSearcherBase(); + explicit UTF8StringFieldSearcherBase(FieldIdT fId); + ~UTF8StringFieldSearcherBase() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp index 25ef9ae7618..fcc2893a71d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp @@ -1,6 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vsm/searcher/utf8substringsearcher.h> +#include "utf8substringsearcher.h" #include <vespa/fastlib/text/unicodeutil.h> using search::byte; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h index b1455d5c5f6..22ecf9c41fa 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h @@ -12,11 +12,10 @@ class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase { public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + explicit UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } protected: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp index 8403e69658f..6d8a399cd33 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp @@ -110,20 +110,11 @@ UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char * _modified->put(_unitSep); } -UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() : - UTF8StringFieldSearcherBase(), - _modified(new CharBuffer(32)), - _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), - _unitSep(juniper::separators::unit_separator) -{ -} - UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) : UTF8StringFieldSearcherBase(fId), _modified(new CharBuffer(32)), _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), + _readPtr(nullptr), _unitSep(juniper::separators::unit_separator) { } @@ -134,12 +125,12 @@ UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId, UTF8StringFieldSearcherBase(fId), _modified(modBuf), _offsets(offBuf), - _readPtr(NULL), + _readPtr(nullptr), _unitSep(juniper::separators::unit_separator) { } -UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {} +UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h index ebb806de61c..99e6c29961f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h @@ -23,8 +23,8 @@ private: const char * _readPtr; // buffer to read from (field reference) char _unitSep; // the unit separator character to use - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; /** * Copies n bytes from the field reference to the modified buffer and updates the read pointer. @@ -51,9 +51,8 @@ public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SubstringSnippetModifier(); - UTF8SubstringSnippetModifier(FieldIdT fId); - ~UTF8SubstringSnippetModifier(); + explicit UTF8SubstringSnippetModifier(FieldIdT fId); + ~UTF8SubstringSnippetModifier() override; /** * Creates a new instance. diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp index e28ce114225..4318d5fe1a3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -14,24 +14,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const } size_t -UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h index 556f61a714f..dc3bc214b49 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h @@ -1,10 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> +#include "utf8stringfieldsearcherbase.h" -namespace vsm -{ +namespace vsm { /** * This class does suffix utf8 searches. @@ -12,13 +11,12 @@ namespace vsm class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase { protected: - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + explicit UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } }; } diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 4b0efd58a56..715c19a0bb7 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -31,15 +31,13 @@ namespace { void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { if (arg1 == "prefix") { - searcher->setMatchType(FieldSearcher::PREFIX); + searcher->match_type(FieldSearcher::PREFIX); } else if (arg1 == "substring") { - searcher->setMatchType(FieldSearcher::SUBSTRING); + searcher->match_type(FieldSearcher::SUBSTRING); } else if (arg1 == "suffix") { - searcher->setMatchType(FieldSearcher::SUFFIX); - } else if (arg1 == "exact") { - searcher->setMatchType(FieldSearcher::EXACT); - } else if (arg1 == "word") { - searcher->setMatchType(FieldSearcher::EXACT); + searcher->match_type(FieldSearcher::SUFFIX); + } else if ((arg1 == "exact") || (arg1 == "word")) { + searcher->match_type(FieldSearcher::EXACT); } } @@ -51,6 +49,7 @@ FieldSearchSpec::FieldSearchSpec() _maxLength(0x100000), _searcher(), _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), _arg1(), _reconfigured(false) { @@ -60,15 +59,15 @@ FieldSearchSpec::~FieldSearchSpec() = default; FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default; FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default; -FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, - VsmfieldsConfig::Fieldspec::Searchmethod searchDef, - const vespalib::string & arg1, size_t maxLength_) : +FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef, + Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) : _id(fid), _name(fname), - _maxLength(maxLength_), + _maxLength(maxLength_in), _searcher(), _searchMethod(searchDef), - _arg1(arg1), + _normalize_mode(normalize_mode), + _arg1(arg1_in), _reconfigured(false) { switch(searchDef) { @@ -79,13 +78,11 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: - if (arg1 == "substring") { + if (_arg1 == "substring") { _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid); - } else if (arg1 == "suffix") { + } else if (_arg1 == "suffix") { _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid); - } else if (arg1 == "exact") { - _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (arg1 == "word") { + } else if ((_arg1 == "exact") || (_arg1 == "word")) { _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) { _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); @@ -112,13 +109,14 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & _searcher = std::make_unique<GeoPosFieldSearcher>(fid); break; case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR: - auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1); + auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1); _searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm); break; } if (_searcher) { - setMatchType(_searcher, arg1); + setMatchType(_searcher, _arg1); _searcher->maxFieldLength(maxLength()); + _searcher->normalize_mode(_normalize_mode); } } @@ -166,20 +164,20 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default; FieldSearchSpecMap::~FieldSearchSpecMap() = default; namespace { - const std::string _G_empty(""); - const std::string _G_value(".value"); - const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}"); - const std::regex _G_map2("\\{\".*\"\\}"); - const std::regex _G_array("\\[[0-9]+\\]"); + const std::string G_empty; + const std::string G_value(".value"); + const std::regex G_map1("\\{[a-zA-Z0-9]+\\}"); + const std::regex G_map2("\\{\".*\"\\}"); + const std::regex G_array("\\[[0-9]+\\]"); } vespalib::string FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex) { if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) { - std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value); - index = std::regex_replace(index, _G_map2, _G_value); - index = std::regex_replace(index, _G_array, _G_empty); + std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value); + index = std::regex_replace(index, G_map2, G_value); + index = std::regex_replace(index, G_array, G_empty); return index; } return rawIndex; @@ -258,17 +256,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch return ifm; } +search::streaming::Normalizing +normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { + switch (normalize_mode) { + case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD; + } + return search::streaming::Normalizing::LOWERCASE_AND_FOLD; } -bool +} + +void FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) { - bool retval(true); LOG(spam, "Parsing %zd fields", conf->fieldspec.size()); for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) { LOG(spam, "Parsing %s", cfs.name.c_str()); FieldIdT fieldId = specMap().size(); - FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength); + FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength); _specMap[fieldId] = std::move(fss); _nameIdMap.add(cfs.name, fieldId); LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str()); @@ -283,7 +290,6 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) } _documentTypeMap[di.name] = indexMapp; } - return retval; } void @@ -338,7 +344,7 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const if (!itr->second.uses_nearest_neighbor_search_method()) { return dm; } - return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1()); + return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1()); } vespalib::asciistream & diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h index 43bb5b04481..7ba9799991e 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h @@ -10,25 +10,29 @@ namespace vsm { class FieldSearchSpec { public: + using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; + using Normalizing = search::streaming::Normalizing; FieldSearchSpec(); - FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, - VsmfieldsConfig::Fieldspec::Searchmethod searchMethod, - const vespalib::string & arg1, size_t maxLength); + FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod, + Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength); ~FieldSearchSpec(); FieldSearchSpec(FieldSearchSpec&& rhs) noexcept; FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept; - const FieldSearcher & searcher() const { return *_searcher; } - const vespalib::string & name() const { return _name; } - FieldIdT id() const { return _id; } - bool valid() const { return static_cast<bool>(_searcher); } - size_t maxLength() const { return _maxLength; } - bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; } + const FieldSearcher & searcher() const noexcept { return *_searcher; } + const vespalib::string & name() const noexcept { return _name; } + FieldIdT id() const noexcept { return _id; } + bool valid() const noexcept { return static_cast<bool>(_searcher); } + size_t maxLength() const noexcept { return _maxLength; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + const vespalib::string& arg1() const noexcept { return _arg1; } + bool uses_nearest_neighbor_search_method() const noexcept { + return _searchMethod == Searchmethod::NEAREST_NEIGHBOR; + } bool uses_string_search_method() const noexcept { - return (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) || - (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8) || - (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8); + return (_searchMethod == Searchmethod::UTF8) || + (_searchMethod == Searchmethod::AUTOUTF8) || + (_searchMethod == Searchmethod::SSE2UTF8); } - const vespalib::string& get_arg1() const noexcept { return _arg1; } /** * Reconfigures the field searcher based on information in the given query term. @@ -42,7 +46,8 @@ private: vespalib::string _name; size_t _maxLength; FieldSearcherContainer _searcher; - VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod; + Searchmethod _searchMethod; + Normalizing _normalize_mode; vespalib::string _arg1; bool _reconfigured; }; @@ -60,7 +65,7 @@ public: * and a mapping from field name to field id. It then iterates over all document types and index names * and creates a mapping from index name to list of field ids for each document type. **/ - bool buildFromConfig(const VsmfieldsHandle & conf); + void buildFromConfig(const VsmfieldsHandle & conf); /** * Iterates over the given field name vector adding extra elements to the mapping from field name to field id. |