From 0f9915ebdfb2931ebb3c06ac55b537f42477256b Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 19 Dec 2023 17:02:19 +0000 Subject: - Modernize code - Unify some conversion tables. --- .../vespa/searchlib/query/streaming/queryterm.cpp | 2 +- .../src/tests/searcher/searcher_test.cpp | 48 +++---- .../src/vespa/vsm/searcher/fieldsearcher.cpp | 152 ++++++++++----------- .../src/vespa/vsm/searcher/fieldsearcher.h | 35 +---- .../vsm/searcher/futf8strchrfieldsearcher.cpp | 4 +- .../vespa/vsm/searcher/utf8strchrfieldsearcher.cpp | 1 - .../vsm/searcher/utf8stringfieldsearcherbase.cpp | 13 -- .../vespa/vsm/searcher/utf8substringsearcher.cpp | 2 - vespalib/src/vespa/fastlib/text/normwordfolder.cpp | 1 - vespalib/src/vespa/fastlib/text/normwordfolder.h | 12 +- 10 files changed, 103 insertions(+), 167 deletions(-) diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index a658ff5f3d6..fe6f73367d7 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -9,7 +9,7 @@ namespace { class CharInfo { public: CharInfo(); - uint8_t get(uint8_t c) const { return _charInfo[c]; } + uint8_t get(uint8_t c) const noexcept { return _charInfo[c]; } private: uint8_t _charInfo[256]; }; diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 4492dfac02b..1ce285c2103 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -47,7 +47,7 @@ class String private: const std::string & _str; public: - String(const std::string & str) : _str(str) {} + explicit String(const std::string & str) : _str(str) {} bool operator==(const String & rhs) const { return _str == rhs._str; } @@ -57,13 +57,13 @@ class Query { private: void setupQuery(const StringList & terms) { - for (size_t i = 0; i < terms.size(); ++i) { - ParsedQueryTerm pqt = parseQueryTerm(terms[i]); + for (const auto & term : terms) { + ParsedQueryTerm pqt = parseQueryTerm(term); ParsedTerm pt = parseTerm(pqt.second); qtv.push_back(std::make_unique(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second)); } - for (size_t i = 0; i < qtv.size(); ++i) { - qtl.push_back(qtv[i].get()); + for (const auto & i : qtv) { + qtl.push_back(i.get()); } } public: @@ -72,14 +72,14 @@ public: QueryNodeResultFactory eqnr; std::vector qtv; QueryTermList qtl; - Query(const StringList & terms); + explicit Query(const StringList & terms); ~Query(); static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) { size_t i = queryTerm.find(':'); if (i != std::string::npos) { - return ParsedQueryTerm(queryTerm.substr(0, i), queryTerm.substr(i + 1)); + return {queryTerm.substr(0, i), queryTerm.substr(i + 1)}; } - return ParsedQueryTerm(std::string(), queryTerm); + return {std::string(), queryTerm}; } static ParsedTerm parseTerm(const std::string & term) { if (term[0] == '*' && term[term.size() - 1] == '*') { @@ -254,8 +254,8 @@ getFieldValue(const StringList & fv) static ArrayDataType type(*DataType::STRING); ArrayFieldValue afv(type); - for (size_t i = 0; i < fv.size(); ++i) { - afv.add(StringFieldValue(fv[i])); + for (const auto & v : fv) { + afv.add(StringFieldValue(v)); } return afv; } @@ -265,8 +265,8 @@ getFieldValue(const LongList & fv) { static ArrayDataType type(*DataType::LONG); ArrayFieldValue afv(type); - for (size_t i = 0; i < fv.size(); ++i) { - afv.add(LongFieldValue(fv[i])); + for (long v : fv) { + afv.add(LongFieldValue(v)); } return afv; } @@ -276,8 +276,8 @@ getFieldValue(const FloatList & fv) { static ArrayDataType type(*DataType::FLOAT); ArrayFieldValue afv(type); - for (size_t i = 0; i < fv.size(); ++i) { - afv.add(FloatFieldValue(fv[i])); + for (float v : fv) { + afv.add(FloatFieldValue(v)); } return afv; } @@ -299,8 +299,8 @@ void assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & fv, const BoolList & exp) { HitsList hl; - for (size_t i = 0; i < exp.size(); ++i) { - hl.push_back(exp[i] ? Hits().add(0) : Hits()); + for (bool v : exp) { + hl.push_back(v ? Hits().add(0) : Hits()); } assertSearch(fs, query, fv, hl); } @@ -316,7 +316,7 @@ performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & f // setup document SharedFieldPathMap sfim(new FieldPathMapT()); - sfim->push_back(FieldPath()); + sfim->emplace_back(); StorageDocument doc(std::make_unique(), sfim, 1); doc.setField(0, document::FieldValue::UP(fv.clone())); @@ -369,7 +369,7 @@ assertSnippetModifier(const StringList & query, const std::string & fv, const st void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, const std::string & exp) { FieldValue::UP mfv = setup.modifier.modify(fv); - const document::LiteralFieldValueB & lfv = static_cast(*mfv.get()); + const auto & lfv = static_cast(*mfv.get()); const std::string & actual = lfv.getValue(); EXPECT_EQUAL(actual.size(), exp.size()); EXPECT_EQUAL(actual, exp); @@ -377,11 +377,11 @@ void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, void assertQueryTerms(const SnippetModifierManager & man, FieldIdT fId, const StringList & terms) { - if (terms.size() == 0) { - ASSERT_TRUE(man.getModifiers().getModifier(fId) == NULL); + if (terms.empty()) { + ASSERT_TRUE(man.getModifiers().getModifier(fId) == nullptr); return; } - ASSERT_TRUE(man.getModifiers().getModifier(fId) != NULL); + ASSERT_TRUE(man.getModifiers().getModifier(fId) != nullptr); UTF8SubstringSnippetModifier * searcher = (static_cast(man.getModifiers().getModifier(fId)))->getSearcher().get(); EXPECT_EQUAL(searcher->getQueryTerms().size(), terms.size()); @@ -466,7 +466,7 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) TEST("verify correct term parsing") { ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index"); ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term"); - ASSERT_TRUE(Query::parseQueryTerm("term").first == ""); + ASSERT_TRUE(Query::parseQueryTerm("term").first.empty()); ASSERT_TRUE(Query::parseQueryTerm("term").second == "term"); ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr"); ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM); @@ -822,13 +822,13 @@ TEST("snippet modifier manager") { Query query(StringList().add("i2:foo").add("i2:*bar*")); man.setup(query.qtl, specMap, indexMap, *env.field_paths, env.query_env); { - SnippetModifier * sm = static_cast(man.getModifiers().getModifier(0)); + auto * sm = static_cast(man.getModifiers().getModifier(0)); UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get(); EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u); EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u); } { - SnippetModifier * sm = static_cast(man.getModifiers().getModifier(1)); + auto * sm = static_cast(man.getModifiers().getModifier(1)); UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get(); EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u); EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u); diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index c797e6751ee..851606634cc 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -28,26 +28,24 @@ static force __forceInit; byte FieldSearcher::_foldLowCase[256]; byte FieldSearcher::_wordChar[256]; -FieldSearcherBase::FieldSearcherBase() : - _qtl(), - _qtlFastBuffer(), - _qtlFastSize(0), - _qtlFast(nullptr) +FieldSearcherBase::FieldSearcherBase() noexcept + : _qtl(), + _qtlFastBuffer(), + _qtlFastSize(0), + _qtlFast(nullptr) { } -FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) : - _qtl(), - _qtlFastBuffer(), - _qtlFastSize(0), - _qtlFast(nullptr) +FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) + : _qtl(), + _qtlFastBuffer(), + _qtlFastSize(0), + _qtlFast(nullptr) { prepare(org._qtl); } -FieldSearcherBase::~FieldSearcherBase() -{ -} +FieldSearcherBase::~FieldSearcherBase() = default; FieldSearcherBase & FieldSearcherBase::operator = (const FieldSearcherBase & org) { @@ -68,20 +66,16 @@ void FieldSearcherBase::prepare(const QueryTermList & qtl) } } -FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) : - FieldSearcherBase(), - _field(fId), - _matchType(defaultPrefix ? PREFIX : REGULAR), - _maxFieldLength(0x100000), - _currentElementId(0), - _currentElementWeight(1), - _pureUsAsciiCount(0), - _pureUsAsciiFieldCount(0), - _anyUtf8Count(0), - _anyUtf8FieldCount(0), - _words(0), - _badUtf8Count(0), - _zeroCount(0) +FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept + : FieldSearcherBase(), + _field(fId), + _matchType(defaultPrefix ? PREFIX : REGULAR), + _maxFieldLength(0x100000), + _currentElementId(0), + _currentElementWeight(1), + _words(0), + _badUtf8Count(0), + _zeroCount(0) { zeroStat(); } @@ -136,26 +130,10 @@ void FieldSearcher::prepareFieldId() } } -void FieldSearcher::addStat(const FieldSearcher & toAdd) -{ - _pureUsAsciiCount += toAdd._pureUsAsciiCount; - _pureUsAsciiFieldCount += toAdd._pureUsAsciiFieldCount; - _anyUtf8Count += toAdd._anyUtf8Count; - _anyUtf8FieldCount += toAdd._anyUtf8FieldCount; - _badUtf8Count += toAdd._badUtf8Count; - _zeroCount += toAdd._zeroCount; - for (size_t i=0; i(fv); + const auto & afv = static_cast(fv); LOG(spam, "onCollectionStart: Array size = '%zu'", afv.size()); } else if (fv.isA(document::FieldValue::Type::WSET)) { - const document::WeightedSetFieldValue & wsfv = static_cast(fv); + const auto & wsfv = static_cast(fv); LOG(spam, "onCollectionStart: WeightedSet size = '%zu'", wsfv.size()); } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index e79dacf827e..c231a96711c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -14,12 +14,6 @@ namespace vsm { using termcount_t = size_t; using termsize_t = size_t; -#if defined(COLLECT_CHAR_STAT) - #define NEED_CHAR_STAT(a) { a; } -#else - #define NEED_CHAR_STAT(a) -#endif - using ucs4_t = uint32_t; using cmptype_t = ucs4_t; using SearcherBuf = vespalib::Array; @@ -33,9 +27,9 @@ protected: private: CharVector _qtlFastBuffer; protected: - FieldSearcherBase(); + FieldSearcherBase() noexcept; FieldSearcherBase(const FieldSearcherBase & org); - virtual ~FieldSearcherBase(void); + virtual ~FieldSearcherBase(); FieldSearcherBase & operator = (const FieldSearcherBase & org); void prepare(const search::streaming::QueryTermList & qtl); size_t _qtlFastSize; @@ -53,7 +47,8 @@ public: EXACT }; - FieldSearcher(FieldIdT fId, bool defaultPrefix=false); + explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {} + FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept; ~FieldSearcher() override; virtual std::unique_ptr duplicate() const = 0; bool search(const StorageDocument & doc); @@ -74,16 +69,7 @@ public: static search::byte iswordchar(search::byte c) { return _wordChar[c]; } static search::byte isspace(search::byte c) { return ! iswordchar(c); } static size_t countWords(const FieldRef & f); - unsigned pureUsAsciiCount() const { return _pureUsAsciiCount; } - unsigned pureUsAsciiFieldCount() const { return _pureUsAsciiFieldCount; } - unsigned anyUtf8Count() const { return _anyUtf8Count; } - unsigned anyUtf8FieldCount() const { return _anyUtf8FieldCount; } - unsigned badUtf8Count() const { return _badUtf8Count; } - unsigned zeroCount() const { return _zeroCount; } - unsigned utf8Count(size_t sz) const { return _utf8Count[1+sz]; } - const unsigned * utf8Count() const { return _utf8Count; } int32_t getCurrentWeight() const { return _currentElementWeight; } - void addStat(const FieldSearcher & toAdd); void zeroStat(); FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } size_t maxFieldLength() const { return _maxFieldLength; } @@ -98,7 +84,7 @@ private: void onStructStart(const Content & c) override; public: - IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {} + explicit IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {} }; friend class IteratorHandler; // to allow calls to onValue(); @@ -113,24 +99,13 @@ private: unsigned _maxFieldLength; uint32_t _currentElementId; int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. - /// Number of bytes in blocks containing pure us-ascii - unsigned _pureUsAsciiCount; - /// Number of blocks containing pure us-ascii - unsigned _pureUsAsciiFieldCount; - /// Number of bytes in blocks containing any non us-ascii - unsigned _anyUtf8Count; - /// Number of blocks containing any non us-ascii - unsigned _anyUtf8FieldCount; protected: /// Number of terms searched. unsigned _words; /// Number of utf8 bytes by utf8 size. - unsigned _utf8Count[6]; unsigned _badUtf8Count; unsigned _zeroCount; protected: - void addPureUsAsciiField(size_t sz) { _pureUsAsciiCount += sz; _pureUsAsciiFieldCount++;; } - void addAnyUtf8Field(size_t sz) { _anyUtf8Count += sz; _anyUtf8FieldCount++; } /** * Adds a hit to the given query term. * For each call to onValue() a batch of words are processed, and the position is local to this batch. diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp index a2122f08995..d7d73899e53 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp @@ -36,7 +36,7 @@ FUTF8StrChrFieldSearcher::ansiFold(const char * toFold, size_t sz, char * folded for(size_t i=0; i < sz; i++) { byte c = toFold[i]; if (c>=128) { retval = false; break; } - folded[i] = FieldSearcher::_foldLowCase[c]; + folded[i] = fold(c); } return retval; } @@ -209,7 +209,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) folded[f.size()+1] = 0x01; memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values return match(folded, f.size(), qt); - NEED_CHAR_STAT(addPureUsAsciiField(f.size())); } else { return UTF8StrChrFieldSearcher::matchTerm(f, qt); } @@ -227,7 +226,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t min folded[f.size()+1] = 0x01; memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values return match(folded, f.size(), mintsz, &_qtl[0], _qtl.size()); - NEED_CHAR_STAT(addPureUsAsciiField(f.size())); } else { return UTF8StrChrFieldSearcher::matchTerms(f, mintsz); } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index 2488d198b03..651d1dcad9f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -42,7 +42,6 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) } words++; } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index c31102ec0ab..ebdf69d0b30 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -32,10 +32,8 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * c = *p; } } else { - const byte * oldP(p); c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); if (Fast_UnicodeUtil::IsWordChar(c)) { - _utf8Count[p-oldP-1]++; const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); if (repl != nullptr) { size_t repllen = strlen(repl); @@ -50,8 +48,6 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * } else { if (c == Fast_UnicodeUtil::_BadUTF8Char) { _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; } c = *p; } @@ -70,10 +66,8 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * c = *p; } } else { - const byte * oldP(p); c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { - _utf8Count[p-oldP-1]++; const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); if (repl != nullptr) { size_t repllen = strlen(repl); @@ -89,8 +83,6 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * } else { if (c == Fast_UnicodeUtil::_BadUTF8Char) { _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; } break; } @@ -128,7 +120,6 @@ UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt } words++; } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words; } @@ -154,7 +145,6 @@ UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt) addHit(qt,0); } } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return 1; } @@ -188,7 +178,6 @@ UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm & } } } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words + 1; // we must also count the last word } @@ -305,8 +294,6 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T } if (c == Fast_UnicodeUtil::_BadUTF8Char) { _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; } } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp index 88091c6ab4e..25ef9ae7618 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp @@ -45,8 +45,6 @@ UTF8SubStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn); fn++ ); } } - - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); return words + 1; // we must also count the last word } diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp index f9dbf202fcb..8d3ccad9900 100644 --- a/vespalib/src/vespa/fastlib/text/normwordfolder.cpp +++ b/vespalib/src/vespa/fastlib/text/normwordfolder.cpp @@ -74,7 +74,6 @@ Fast_NormalizeWordFolder::Initialize() _foldCase[0xda] = 'u'; _foldCase[0xdb] = 'u'; _foldCase[0xdd] = 'y'; - _foldCase[0xe0] = 'a'; _foldCase[0xe1] = 'a'; _foldCase[0xe2] = 'a'; diff --git a/vespalib/src/vespa/fastlib/text/normwordfolder.h b/vespalib/src/vespa/fastlib/text/normwordfolder.h index c596b0fd2b4..121a83e260d 100644 --- a/vespalib/src/vespa/fastlib/text/normwordfolder.h +++ b/vespalib/src/vespa/fastlib/text/normwordfolder.h @@ -104,20 +104,16 @@ public: switch(testchar) { case 0xc4: case 0xe4: // A/a with diaeresis + case 0xc6: + case 0xe6: // Letter/ligature AE/ae return "ae"; case 0xc5: case 0xe5: // A/a with ring return "aa"; - case 0xc6: - case 0xe6: // Letter/ligature AE/ae - return "ae"; - case 0xd6: case 0xf6: // O/o with diaeresis - return "oe"; - case 0xd8: case 0xf8: // O/o with stroke return "oe"; @@ -133,10 +129,6 @@ public: case 0xde: case 0xfe: // norse "thorn" return "th"; - - default: - return nullptr; - } } -- cgit v1.2.3