diff options
author | Tor Egge <tegge@vespa.ai> | 2024-02-05 14:50:16 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-05 14:50:16 +0100 |
commit | c57119796a4df33d9a7554508f5e85de5e934be7 (patch) | |
tree | 3dbe60dbc5e8ec1892dcbfe977120d5dc85ee2e9 /searchlib | |
parent | 9f5950a9d126db1f36abdbec4247100116abbca2 (diff) | |
parent | c490ded9e1d40b68b2b167896d754459e5d9d7de (diff) |
Merge pull request #30179 from vespa-engine/toregge/add-unpack-match-data-member-function-to-search-streaming-queryterm
Add unpack_match_data member function to search::streaming::QueryTerm.
Diffstat (limited to 'searchlib')
7 files changed, 112 insertions, 43 deletions
diff --git a/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp b/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp index ff4734a3846..2459fe2f01c 100644 --- a/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp +++ b/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp @@ -74,22 +74,13 @@ TEST(PhraseQueryNodeTest, test_phrase_evaluate) ASSERT_EQ(3u, hits.size()); EXPECT_EQ(0u, hits[0].field_id()); EXPECT_EQ(0u, hits[0].element_id()); - EXPECT_EQ(2u, hits[0].position()); + EXPECT_EQ(0u, hits[0].position()); EXPECT_EQ(1u, hits[1].field_id()); EXPECT_EQ(0u, hits[1].element_id()); - EXPECT_EQ(6u, hits[1].position()); + EXPECT_EQ(4u, hits[1].position()); EXPECT_EQ(3u, hits[2].field_id()); EXPECT_EQ(0u, hits[2].element_id()); - EXPECT_EQ(2u, hits[2].position()); - ASSERT_EQ(4u, p->getFieldInfoSize()); - EXPECT_EQ(0u, p->getFieldInfo(0).getHitOffset()); - EXPECT_EQ(1u, p->getFieldInfo(0).getHitCount()); - EXPECT_EQ(1u, p->getFieldInfo(1).getHitOffset()); - EXPECT_EQ(1u, p->getFieldInfo(1).getHitCount()); - EXPECT_EQ(0u, p->getFieldInfo(2).getHitOffset()); // invalid, but will never be used - EXPECT_EQ(0u, p->getFieldInfo(2).getHitCount()); - EXPECT_EQ(2u, p->getFieldInfo(3).getHitOffset()); - EXPECT_EQ(1u, p->getFieldInfo(3).getHitCount()); + EXPECT_EQ(0u, hits[2].position()); EXPECT_TRUE(p->evaluate()); } diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h index ad9d15f719a..b2cdb422cc3 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h +++ b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h @@ -23,7 +23,6 @@ public: ~HitIteratorPack(); FieldElement& get_field_element_ref() noexcept { return _field_element; } HitIterator& front() noexcept { return _iterators.front(); } - HitIterator& back() noexcept { return _iterators.back(); } iterator begin() noexcept { return _iterators.begin(); } iterator end() noexcept { return _iterators.end(); } bool all_valid() const noexcept; diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h index 6f795c31356..9bf5f8de6b2 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h +++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h @@ -32,7 +32,6 @@ public: MultiTerm* as_multi_term() noexcept override { return this; } void reset() override; bool evaluate() const override; - virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) = 0; const std::vector<std::unique_ptr<QueryTerm>>& get_terms() const noexcept { return _terms; } }; diff --git a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp index 0020089ef62..95781b58019 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp @@ -7,8 +7,7 @@ namespace search::streaming { PhraseQueryNode::PhraseQueryNode(std::unique_ptr<QueryNodeResultBase> result_base, const string& index, uint32_t num_terms) - : MultiTerm(std::move(result_base), index, num_terms), - _fieldInfo(32) + : MultiTerm(std::move(result_base), index, num_terms) { } @@ -65,13 +64,11 @@ const HitList & PhraseQueryNode::evaluateHits(HitList & hl) const { hl.clear(); - _fieldInfo.clear(); auto& terms = get_terms(); HitIteratorPack itr_pack(terms); if (!itr_pack.all_valid()) { return hl; } - auto& last_child = dynamic_cast<const QueryTerm&>(*terms.back()); while (itr_pack.seek_to_matching_field_element()) { uint32_t first_position = itr_pack.front()->position(); bool retry_element = true; @@ -92,10 +89,8 @@ PhraseQueryNode::evaluateHits(HitList & hl) const ++position_offset; } if (match) { - auto h = *itr_pack.back(); + auto h = *itr_pack.front(); hl.push_back(h); - auto& fi = last_child.getFieldInfo(h.field_id()); - updateFieldInfo(h.field_id(), hl.size() - 1, fi.getFieldLength()); if (!itr_pack.front().step_in_field_element(itr_pack.get_field_element_ref())) { retry_element = false; } @@ -106,25 +101,9 @@ PhraseQueryNode::evaluateHits(HitList & hl) const } void -PhraseQueryNode::updateFieldInfo(size_t fid, size_t offset, size_t fieldLength) const -{ - if (fid >= _fieldInfo.size()) { - _fieldInfo.resize(fid + 1); - // only set hit offset and field length the first time - QueryTerm::FieldInfo & fi = _fieldInfo[fid]; - fi.setHitOffset(offset); - fi.setFieldLength(fieldLength); - } - QueryTerm::FieldInfo & fi = _fieldInfo[fid]; - fi.setHitCount(fi.getHitCount() + 1); -} - -void PhraseQueryNode::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) { - (void) docid; - (void) td; - (void) match_data; + unpack_match_data_helper(docid, td, match_data, *get_terms().front()); } } diff --git a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h index 594eab3deba..763cb1d6b8f 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h +++ b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h @@ -19,16 +19,11 @@ public: const HitList & evaluateHits(HitList & hl) const override; void getPhrases(QueryNodeRefList & tl) override; void getPhrases(ConstQueryNodeRefList & tl) const override; - const QueryTerm::FieldInfo & getFieldInfo(size_t fid) const { return _fieldInfo[fid]; } - size_t getFieldInfoSize() const { return _fieldInfo.size(); } void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override; void getLeaves(QueryTermList & tl) override; void getLeaves(ConstQueryTermList & tl) const override; size_t width() const override; MultiTerm* as_multi_term() noexcept override; -private: - mutable std::vector<QueryTerm::FieldInfo> _fieldInfo; - void updateFieldInfo(size_t fid, size_t offset, size_t fieldLength) const; }; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index b7e619cfe4c..e5e1473dd3c 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -2,8 +2,21 @@ #include "queryterm.h" #include <vespa/fastlib/text/normwordfolder.h> +#include <vespa/searchlib/fef/itermdata.h> +#include <vespa/searchlib/fef/matchdata.h> #include <vespa/vespalib/objects/visit.h> +#include <algorithm> #include <cmath> +#include <limits> + +#include <vespa/log/log.h> +LOG_SETUP(".searchlib.query.streaming.queryterm"); + +using search::fef::ITermData; +using search::fef::ITermFieldData; +using search::fef::MatchData; +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataPosition; namespace { @@ -176,6 +189,91 @@ QueryTerm::set_element_length(uint32_t hitlist_idx, uint32_t element_length) _hitList[hitlist_idx].set_element_length(element_length); } +namespace { + +uint16_t +cap_16_bits(uint32_t value) +{ + return std::min(value, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); +} + +uint32_t +extract_field_length(const QueryTerm& term, uint32_t field_id) +{ + return (field_id < term.getFieldInfoSize()) ? term.getFieldInfo(field_id).getFieldLength() : search::fef::FieldPositionsIterator::UNKNOWN_LENGTH; +} + +void +set_interleaved_features(TermFieldMatchData& tmd, uint32_t field_length, uint32_t num_occs) +{ + tmd.setFieldLength(cap_16_bits(field_length)); + tmd.setNumOccs(cap_16_bits(num_occs)); +} + +} + +void +QueryTerm::unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const +{ + HitList list; + const HitList & hitList = evaluateHits(list); + + if (!hitList.empty()) { // only unpack if we have a hit + LOG(debug, "Unpack match data for query term '%s:%s'", + index().c_str(), getTerm()); + + uint32_t lastFieldId = -1; + TermFieldMatchData *tmd = nullptr; + uint32_t num_occs = 0; + + // optimize for hitlist giving all hits for a single field in one chunk + for (const Hit & hit : hitList) { + uint32_t fieldId = hit.field_id(); + if (fieldId != lastFieldId) { + if (tmd != nullptr) { + if (tmd->needs_interleaved_features()) { + set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs); + } + // reset to notfound/unknown values + tmd = nullptr; + } + num_occs = 0; + + // setup for new field that had a hit + const ITermFieldData *tfd = td.lookupField(fieldId); + if (tfd != nullptr) { + tmd = match_data.resolveTermField(tfd->getHandle()); + tmd->setFieldId(fieldId); + // reset field match data, but only once per docId + if (tmd->getDocId() != docid) { + tmd->reset(docid); + } + } + lastFieldId = fieldId; + } + ++num_occs; + if (tmd != nullptr) { + TermFieldMatchDataPosition pos(hit.element_id(), hit.position(), + hit.element_weight(), hit.element_length()); + tmd->appendPosition(pos); + LOG(debug, "Append elemId(%u),position(%u), weight(%d), tfmd.weight(%d)", + pos.getElementId(), pos.getPosition(), pos.getElementWeight(), tmd->getWeight()); + } + } + if (tmd != nullptr) { + if (tmd->needs_interleaved_features()) { + set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs); + } + } + } +} + +void +QueryTerm::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) +{ + unpack_match_data_helper(docid, td, match_data, *this); +} + NearestNeighborQueryNode* QueryTerm::as_nearest_neighbor_query_node() noexcept { diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 108cc1c148d..de043237cff 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -9,6 +9,12 @@ #include <vespa/vespalib/objects/objectvisitor.h> #include <vespa/vespalib/stllike/string.h> +namespace search::fef { + +class ITermData; +class MatchData; + +} namespace search::streaming { class FuzzyTerm; @@ -98,7 +104,9 @@ public: virtual MultiTerm* as_multi_term() noexcept; virtual RegexpTerm* as_regexp_term() noexcept; virtual FuzzyTerm* as_fuzzy_term() noexcept; + virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data); protected: + void unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const; using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>; string _index; EncodingBitMap _encoding; |