about | summary | refs | log | tree | commit | diff | stats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Egge <tegge@vespa.ai> 2024-02-05 14:50:16 +0100
committer GitHub <noreply@github.com> 2024-02-05 14:50:16 +0100
commit c57119796a4df33d9a7554508f5e85de5e934be7 (patch)
tree 3dbe60dbc5e8ec1892dcbfe977120d5dc85ee2e9 /searchlib
parent 9f5950a9d126db1f36abdbec4247100116abbca2 (diff)
parent c490ded9e1d40b68b2b167896d754459e5d9d7de (diff)
Merge pull request #30179 from vespa-engine/toregge/add-unpack-match-data-member-function-to-search-streaming-queryterm
Add unpack_match_data member function to search::streaming::QueryTerm.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/tests/query/streaming/phrase_query_node_test.cpp 15
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h 1
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/multi_term.h 1
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp 27
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h 5
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp 98
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.h 8
7 files changed, 112 insertions, 43 deletions
diff --git a/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp b/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp
index ff4734a3846..2459fe2f01c 100644
--- a/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp
+++ b/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp
@@ -74,22 +74,13 @@ TEST(PhraseQueryNodeTest, test_phrase_evaluate)
ASSERT_EQ(3u, hits.size());
EXPECT_EQ(0u, hits[0].field_id());
EXPECT_EQ(0u, hits[0].element_id());
- EXPECT_EQ(2u, hits[0].position());
+ EXPECT_EQ(0u, hits[0].position());
EXPECT_EQ(1u, hits[1].field_id());
EXPECT_EQ(0u, hits[1].element_id());
- EXPECT_EQ(6u, hits[1].position());
+ EXPECT_EQ(4u, hits[1].position());
EXPECT_EQ(3u, hits[2].field_id());
EXPECT_EQ(0u, hits[2].element_id());
- EXPECT_EQ(2u, hits[2].position());
- ASSERT_EQ(4u, p->getFieldInfoSize());
- EXPECT_EQ(0u, p->getFieldInfo(0).getHitOffset());
- EXPECT_EQ(1u, p->getFieldInfo(0).getHitCount());
- EXPECT_EQ(1u, p->getFieldInfo(1).getHitOffset());
- EXPECT_EQ(1u, p->getFieldInfo(1).getHitCount());
- EXPECT_EQ(0u, p->getFieldInfo(2).getHitOffset()); // invalid, but will never be used
- EXPECT_EQ(0u, p->getFieldInfo(2).getHitCount());
- EXPECT_EQ(2u, p->getFieldInfo(3).getHitOffset());
- EXPECT_EQ(1u, p->getFieldInfo(3).getHitCount());
+ EXPECT_EQ(0u, hits[2].position());
EXPECT_TRUE(p->evaluate());
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h
index ad9d15f719a..b2cdb422cc3 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h
@@ -23,7 +23,6 @@ public:
~HitIteratorPack();
FieldElement& get_field_element_ref() noexcept { return _field_element; }
HitIterator& front() noexcept { return _iterators.front(); }
- HitIterator& back() noexcept { return _iterators.back(); }
iterator begin() noexcept { return _iterators.begin(); }
iterator end() noexcept { return _iterators.end(); }
bool all_valid() const noexcept;
diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
index 6f795c31356..9bf5f8de6b2 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
@@ -32,7 +32,6 @@ public:
MultiTerm* as_multi_term() noexcept override { return this; }
void reset() override;
bool evaluate() const override;
- virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) = 0;
const std::vector<std::unique_ptr<QueryTerm>>& get_terms() const noexcept { return _terms; }
};
diff --git a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp
index 0020089ef62..95781b58019 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp
@@ -7,8 +7,7 @@
namespace search::streaming {
PhraseQueryNode::PhraseQueryNode(std::unique_ptr<QueryNodeResultBase> result_base, const string& index, uint32_t num_terms)
- : MultiTerm(std::move(result_base), index, num_terms),
- _fieldInfo(32)
+ : MultiTerm(std::move(result_base), index, num_terms)
{
}
@@ -65,13 +64,11 @@ const HitList &
PhraseQueryNode::evaluateHits(HitList & hl) const
{
hl.clear();
- _fieldInfo.clear();
auto& terms = get_terms();
HitIteratorPack itr_pack(terms);
if (!itr_pack.all_valid()) {
return hl;
}
- auto& last_child = dynamic_cast<const QueryTerm&>(*terms.back());
while (itr_pack.seek_to_matching_field_element()) {
uint32_t first_position = itr_pack.front()->position();
bool retry_element = true;
@@ -92,10 +89,8 @@ PhraseQueryNode::evaluateHits(HitList & hl) const
++position_offset;
}
if (match) {
- auto h = *itr_pack.back();
+ auto h = *itr_pack.front();
hl.push_back(h);
- auto& fi = last_child.getFieldInfo(h.field_id());
- updateFieldInfo(h.field_id(), hl.size() - 1, fi.getFieldLength());
if (!itr_pack.front().step_in_field_element(itr_pack.get_field_element_ref())) {
retry_element = false;
}
@@ -106,25 +101,9 @@ PhraseQueryNode::evaluateHits(HitList & hl) const
}
void
-PhraseQueryNode::updateFieldInfo(size_t fid, size_t offset, size_t fieldLength) const
-{
- if (fid >= _fieldInfo.size()) {
- _fieldInfo.resize(fid + 1);
- // only set hit offset and field length the first time
- QueryTerm::FieldInfo & fi = _fieldInfo[fid];
- fi.setHitOffset(offset);
- fi.setFieldLength(fieldLength);
- }
- QueryTerm::FieldInfo & fi = _fieldInfo[fid];
- fi.setHitCount(fi.getHitCount() + 1);
-}
-
-void
PhraseQueryNode::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data)
{
- (void) docid;
- (void) td;
- (void) match_data;
+ unpack_match_data_helper(docid, td, match_data, *get_terms().front());
}
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h
index 594eab3deba..763cb1d6b8f 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h
@@ -19,16 +19,11 @@ public:
const HitList & evaluateHits(HitList & hl) const override;
void getPhrases(QueryNodeRefList & tl) override;
void getPhrases(ConstQueryNodeRefList & tl) const override;
- const QueryTerm::FieldInfo & getFieldInfo(size_t fid) const { return _fieldInfo[fid]; }
- size_t getFieldInfoSize() const { return _fieldInfo.size(); }
void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override;
void getLeaves(QueryTermList & tl) override;
void getLeaves(ConstQueryTermList & tl) const override;
size_t width() const override;
MultiTerm* as_multi_term() noexcept override;
-private:
- mutable std::vector<QueryTerm::FieldInfo> _fieldInfo;
- void updateFieldInfo(size_t fid, size_t offset, size_t fieldLength) const;
};
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index b7e619cfe4c..e5e1473dd3c 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -2,8 +2,21 @@
#include "queryterm.h"
#include <vespa/fastlib/text/normwordfolder.h>
+#include <vespa/searchlib/fef/itermdata.h>
+#include <vespa/searchlib/fef/matchdata.h>
#include <vespa/vespalib/objects/visit.h>
+#include <algorithm>
#include <cmath>
+#include <limits>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".searchlib.query.streaming.queryterm");
+
+using search::fef::ITermData;
+using search::fef::ITermFieldData;
+using search::fef::MatchData;
+using search::fef::TermFieldMatchData;
+using search::fef::TermFieldMatchDataPosition;
namespace {
@@ -176,6 +189,91 @@ QueryTerm::set_element_length(uint32_t hitlist_idx, uint32_t element_length)
_hitList[hitlist_idx].set_element_length(element_length);
}
+namespace {
+
+uint16_t
+cap_16_bits(uint32_t value)
+{
+ return std::min(value, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()));
+}
+
+uint32_t
+extract_field_length(const QueryTerm& term, uint32_t field_id)
+{
+ return (field_id < term.getFieldInfoSize()) ? term.getFieldInfo(field_id).getFieldLength() : search::fef::FieldPositionsIterator::UNKNOWN_LENGTH;
+}
+
+void
+set_interleaved_features(TermFieldMatchData& tmd, uint32_t field_length, uint32_t num_occs)
+{
+ tmd.setFieldLength(cap_16_bits(field_length));
+ tmd.setNumOccs(cap_16_bits(num_occs));
+}
+
+}
+
+void
+QueryTerm::unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const
+{
+ HitList list;
+ const HitList & hitList = evaluateHits(list);
+
+ if (!hitList.empty()) { // only unpack if we have a hit
+ LOG(debug, "Unpack match data for query term '%s:%s'",
+ index().c_str(), getTerm());
+
+ uint32_t lastFieldId = -1;
+ TermFieldMatchData *tmd = nullptr;
+ uint32_t num_occs = 0;
+
+ // optimize for hitlist giving all hits for a single field in one chunk
+ for (const Hit & hit : hitList) {
+ uint32_t fieldId = hit.field_id();
+ if (fieldId != lastFieldId) {
+ if (tmd != nullptr) {
+ if (tmd->needs_interleaved_features()) {
+ set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs);
+ }
+ // reset to notfound/unknown values
+ tmd = nullptr;
+ }
+ num_occs = 0;
+
+ // setup for new field that had a hit
+ const ITermFieldData *tfd = td.lookupField(fieldId);
+ if (tfd != nullptr) {
+ tmd = match_data.resolveTermField(tfd->getHandle());
+ tmd->setFieldId(fieldId);
+ // reset field match data, but only once per docId
+ if (tmd->getDocId() != docid) {
+ tmd->reset(docid);
+ }
+ }
+ lastFieldId = fieldId;
+ }
+ ++num_occs;
+ if (tmd != nullptr) {
+ TermFieldMatchDataPosition pos(hit.element_id(), hit.position(),
+ hit.element_weight(), hit.element_length());
+ tmd->appendPosition(pos);
+ LOG(debug, "Append elemId(%u),position(%u), weight(%d), tfmd.weight(%d)",
+ pos.getElementId(), pos.getPosition(), pos.getElementWeight(), tmd->getWeight());
+ }
+ }
+ if (tmd != nullptr) {
+ if (tmd->needs_interleaved_features()) {
+ set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs);
+ }
+ }
+ }
+}
+
+void
+QueryTerm::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data)
+{
+ unpack_match_data_helper(docid, td, match_data, *this);
+}
+
NearestNeighborQueryNode*
QueryTerm::as_nearest_neighbor_query_node() noexcept
{
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 108cc1c148d..de043237cff 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -9,6 +9,12 @@
#include <vespa/vespalib/objects/objectvisitor.h>
#include <vespa/vespalib/stllike/string.h>
+namespace search::fef {
+
+class ITermData;
+class MatchData;
+
+}
namespace search::streaming {
class FuzzyTerm;
@@ -98,7 +104,9 @@ public:
virtual MultiTerm* as_multi_term() noexcept;
virtual RegexpTerm* as_regexp_term() noexcept;
virtual FuzzyTerm* as_fuzzy_term() noexcept;
+ virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data);
protected:
+ void unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const;
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;
EncodingBitMap _encoding;