diff options
author | Tor Egge <Tor.Egge@online.no> | 2024-01-25 11:39:36 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2024-01-25 11:39:36 +0100 |
commit | d198b7b73e376bcb349b159a77e57dbb2a54f19e (patch) | |
tree | 7ba1e5430f562e1515856a6e6c82daeba905443d /streamingvisitors | |
parent | 45900206e7b773c804e803497dd5a7058f33f9c4 (diff) |
Track element length in streaming mode.
Diffstat (limited to 'streamingvisitors')
8 files changed, 49 insertions, 10 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index eb233db9632..d1778c2ce8d 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -1110,6 +1110,21 @@ TEST("counting of words") { assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits())); } +TEST("element lengths") +{ + UTF8StrChrFieldSearcher fs(0); + auto field = StringList().add("a").add("b a c").add("d a"); + auto query = StringList().add("a"); + auto qtv = performSearch(fs, query, getFieldValue(field)); + EXPECT_EQUAL(1u, qtv.size()); + auto& qt = *qtv[0]; + auto& hl = qt.getHitList(); + EXPECT_EQUAL(3u, hl.size()); + EXPECT_EQUAL(1u, hl[0].element_length()); + EXPECT_EQUAL(3u, hl[1].element_length()); + EXPECT_EQUAL(2u, hl[2].element_length()); +} + vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization"; void diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp index d0cfa4d9956..aa25b0e75d3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp @@ -53,7 +53,7 @@ void BoolFieldSearcher::onValue(const document::FieldValue & fv) addHit(*_qtl[j], 0); } } - ++_words; + set_element_length(1); } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index 5e06ae41a03..c75ab7fccd3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -5,6 +5,7 @@ #include <vespa/document/fieldvalue/weightedsetfieldvalue.h> #include <vespa/searchlib/query/streaming/multi_term.h> #include <vespa/vespalib/stllike/hash_set.h> +#include <cassert> #include <vespa/log/log.h> LOG_SETUP(".vsm.searcher.fieldsearcher"); @@ -55,6 +56,7 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept _maxFieldLength(0x100000), _currentElementId(0), _currentElementWeight(1), + _element_length_fixups(), _words(0), _badUtf8Count(0) { @@ -70,6 +72,7 @@ FieldSearcher::search(const StorageDocument & doc) fInfo.setHitOffset(qt->getHitList().size()); } onSearch(doc); + assert(_element_length_fixups.empty()); for (auto qt : _qtl) { QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field()); fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset()); @@ -276,4 +279,16 @@ FieldSearcher::IteratorHandler::onStructStart(const Content & c) _searcher.onStructValue(static_cast<const document::StructFieldValue &>(c.getValue())); } +void +FieldSearcher::set_element_length(uint32_t element_length) +{ + _words += element_length; + if (!_element_length_fixups.empty()) { + for (auto& fixup : _element_length_fixups) { + fixup.first->set_element_length(fixup.second, element_length); + } + _element_length_fixups.clear(); + } +} + } diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index e339e4bdf5a..4a9844d8af6 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -6,6 +6,7 @@ #include <vespa/vsm/common/document.h> #include <vespa/vsm/common/storagedocument.h> #include <vespa/vespalib/util/array.h> +#include <utility> namespace search::fef { class IQueryEnvironment; } @@ -96,6 +97,7 @@ private: unsigned _maxFieldLength; uint32_t _currentElementId; int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. + std::vector<std::pair<search::streaming::QueryTerm*, uint32_t>> _element_length_fixups; protected: /// Number of terms searched. unsigned _words; @@ -105,9 +107,10 @@ protected: * Adds a hit to the given query term. * For each call to onValue() a batch of words are processed, and the position is local to this batch. **/ - void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const { - qt.add(field(), _currentElementId, _currentElementWeight, _words + pos); + void addHit(search::streaming::QueryTerm & qt, uint32_t pos) { + _element_length_fixups.emplace_back(&qt, qt.add(field(), _currentElementId, _currentElementWeight, _words + pos)); } + void set_element_length(uint32_t element_length); public: static search::byte _foldLowCase[256]; static search::byte _wordChar[256]; diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp index 8558522003f..70e5bb4b82c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp @@ -55,7 +55,7 @@ void FloatFieldSearcherT<T>::onValue(const document::FieldValue & fv) addHit(*_qtl[j], 0); } } - ++_words; + set_element_length(1); } template<typename T> diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp index 5ecc9a5a06e..bbeb3be986f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp @@ -58,7 +58,7 @@ void GeoPosFieldSearcher::onStructValue(const document::StructFieldValue & fv) { addHit(*_qtl[j], 0); } } - ++_words; + set_element_length(1); } bool GeoPosFieldSearcher::GeoPosInfo::cmp(const document::StructFieldValue & sfv) const { diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp index e73c7f5c1a7..3984254274f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp @@ -43,7 +43,7 @@ void IntFieldSearcher::onValue(const document::FieldValue & fv) addHit(*_qtl[j], 0); } } - ++_words; + set_element_length(1); } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp index ba52444101d..673cf11b2cf 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp @@ -25,22 +25,28 @@ void StrChrFieldSearcher::onValue(const document::FieldValue & fv) bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef) { + size_t element_length = 0; + bool need_count_words = false; if (_qtl.size() > 1) { size_t mintsz = shortestTerm(); if (fieldRef.size() >= mintsz) { - _words += matchTerms(fieldRef, mintsz); + element_length = matchTerms(fieldRef, mintsz); } else { - _words += countWords(fieldRef); + need_count_words = true; } } else { for (auto qt : _qtl) { if (fieldRef.size() >= qt->termLen() || qt->isRegex() || qt->isFuzzy()) { - _words += matchTerm(fieldRef, *qt); + element_length = std::max(element_length, matchTerm(fieldRef, *qt)); } else { - _words += countWords(fieldRef); + need_count_words = true; } } } + if (need_count_words) { + element_length = std::max(element_length, countWords(fieldRef)); + } + set_element_length(element_length); return true; } |