about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2024-02-05 13:42:12 +0100
committer Tor Egge <Tor.Egge@online.no> 2024-02-05 13:42:12 +0100
commit c490ded9e1d40b68b2b167896d754459e5d9d7de (patch)
tree f5e888483539d4a4298177c8d2e06f261878a6cb
parent d45399e3ab1d07781f71473e4a8fe2b67b197941 (diff)
Add unpack_match_data member function to search::streaming::QueryTerm.
-rw-r--r-- searchlib/src/tests/query/streaming/phrase_query_node_test.cpp 15
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h 1
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/multi_term.h 1
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp 27
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h 5
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp 98
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.h 8
-rw-r--r-- streamingvisitors/src/tests/querywrapper/querywrapper_test.cpp 6
-rw-r--r-- streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp 4
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/querywrapper.h 1
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp 71
11 files changed, 118 insertions, 119 deletions
diff --git a/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp b/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp
index ff4734a3846..2459fe2f01c 100644
--- a/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp
+++ b/searchlib/src/tests/query/streaming/phrase_query_node_test.cpp
@@ -74,22 +74,13 @@ TEST(PhraseQueryNodeTest, test_phrase_evaluate)
ASSERT_EQ(3u, hits.size());
EXPECT_EQ(0u, hits[0].field_id());
EXPECT_EQ(0u, hits[0].element_id());
- EXPECT_EQ(2u, hits[0].position());
+ EXPECT_EQ(0u, hits[0].position());
EXPECT_EQ(1u, hits[1].field_id());
EXPECT_EQ(0u, hits[1].element_id());
- EXPECT_EQ(6u, hits[1].position());
+ EXPECT_EQ(4u, hits[1].position());
EXPECT_EQ(3u, hits[2].field_id());
EXPECT_EQ(0u, hits[2].element_id());
- EXPECT_EQ(2u, hits[2].position());
- ASSERT_EQ(4u, p->getFieldInfoSize());
- EXPECT_EQ(0u, p->getFieldInfo(0).getHitOffset());
- EXPECT_EQ(1u, p->getFieldInfo(0).getHitCount());
- EXPECT_EQ(1u, p->getFieldInfo(1).getHitOffset());
- EXPECT_EQ(1u, p->getFieldInfo(1).getHitCount());
- EXPECT_EQ(0u, p->getFieldInfo(2).getHitOffset()); // invalid, but will never be used
- EXPECT_EQ(0u, p->getFieldInfo(2).getHitCount());
- EXPECT_EQ(2u, p->getFieldInfo(3).getHitOffset());
- EXPECT_EQ(1u, p->getFieldInfo(3).getHitCount());
+ EXPECT_EQ(0u, hits[2].position());
EXPECT_TRUE(p->evaluate());
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h
index ad9d15f719a..b2cdb422cc3 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h
@@ -23,7 +23,6 @@ public:
~HitIteratorPack();
FieldElement& get_field_element_ref() noexcept { return _field_element; }
HitIterator& front() noexcept { return _iterators.front(); }
- HitIterator& back() noexcept { return _iterators.back(); }
iterator begin() noexcept { return _iterators.begin(); }
iterator end() noexcept { return _iterators.end(); }
bool all_valid() const noexcept;
diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
index 6f795c31356..9bf5f8de6b2 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
@@ -32,7 +32,6 @@ public:
MultiTerm* as_multi_term() noexcept override { return this; }
void reset() override;
bool evaluate() const override;
- virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) = 0;
const std::vector<std::unique_ptr<QueryTerm>>& get_terms() const noexcept { return _terms; }
};
diff --git a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp
index 0020089ef62..95781b58019 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp
@@ -7,8 +7,7 @@
namespace search::streaming {
PhraseQueryNode::PhraseQueryNode(std::unique_ptr<QueryNodeResultBase> result_base, const string& index, uint32_t num_terms)
- : MultiTerm(std::move(result_base), index, num_terms),
- _fieldInfo(32)
+ : MultiTerm(std::move(result_base), index, num_terms)
{
}
@@ -65,13 +64,11 @@ const HitList &
PhraseQueryNode::evaluateHits(HitList & hl) const
{
hl.clear();
- _fieldInfo.clear();
auto& terms = get_terms();
HitIteratorPack itr_pack(terms);
if (!itr_pack.all_valid()) {
return hl;
}
- auto& last_child = dynamic_cast<const QueryTerm&>(*terms.back());
while (itr_pack.seek_to_matching_field_element()) {
uint32_t first_position = itr_pack.front()->position();
bool retry_element = true;
@@ -92,10 +89,8 @@ PhraseQueryNode::evaluateHits(HitList & hl) const
++position_offset;
}
if (match) {
- auto h = *itr_pack.back();
+ auto h = *itr_pack.front();
hl.push_back(h);
- auto& fi = last_child.getFieldInfo(h.field_id());
- updateFieldInfo(h.field_id(), hl.size() - 1, fi.getFieldLength());
if (!itr_pack.front().step_in_field_element(itr_pack.get_field_element_ref())) {
retry_element = false;
}
@@ -106,25 +101,9 @@ PhraseQueryNode::evaluateHits(HitList & hl) const
}
void
-PhraseQueryNode::updateFieldInfo(size_t fid, size_t offset, size_t fieldLength) const
-{
- if (fid >= _fieldInfo.size()) {
- _fieldInfo.resize(fid + 1);
- // only set hit offset and field length the first time
- QueryTerm::FieldInfo & fi = _fieldInfo[fid];
- fi.setHitOffset(offset);
- fi.setFieldLength(fieldLength);
- }
- QueryTerm::FieldInfo & fi = _fieldInfo[fid];
- fi.setHitCount(fi.getHitCount() + 1);
-}
-
-void
PhraseQueryNode::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data)
{
- (void) docid;
- (void) td;
- (void) match_data;
+ unpack_match_data_helper(docid, td, match_data, *get_terms().front());
}
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h
index 594eab3deba..763cb1d6b8f 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.h
@@ -19,16 +19,11 @@ public:
const HitList & evaluateHits(HitList & hl) const override;
void getPhrases(QueryNodeRefList & tl) override;
void getPhrases(ConstQueryNodeRefList & tl) const override;
- const QueryTerm::FieldInfo & getFieldInfo(size_t fid) const { return _fieldInfo[fid]; }
- size_t getFieldInfoSize() const { return _fieldInfo.size(); }
void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override;
void getLeaves(QueryTermList & tl) override;
void getLeaves(ConstQueryTermList & tl) const override;
size_t width() const override;
MultiTerm* as_multi_term() noexcept override;
-private:
- mutable std::vector<QueryTerm::FieldInfo> _fieldInfo;
- void updateFieldInfo(size_t fid, size_t offset, size_t fieldLength) const;
};
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index b7e619cfe4c..e5e1473dd3c 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -2,8 +2,21 @@
#include "queryterm.h"
#include <vespa/fastlib/text/normwordfolder.h>
+#include <vespa/searchlib/fef/itermdata.h>
+#include <vespa/searchlib/fef/matchdata.h>
#include <vespa/vespalib/objects/visit.h>
+#include <algorithm>
#include <cmath>
+#include <limits>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".searchlib.query.streaming.queryterm");
+
+using search::fef::ITermData;
+using search::fef::ITermFieldData;
+using search::fef::MatchData;
+using search::fef::TermFieldMatchData;
+using search::fef::TermFieldMatchDataPosition;
namespace {
@@ -176,6 +189,91 @@ QueryTerm::set_element_length(uint32_t hitlist_idx, uint32_t element_length)
_hitList[hitlist_idx].set_element_length(element_length);
}
+namespace {
+
+uint16_t
+cap_16_bits(uint32_t value)
+{
+ return std::min(value, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()));
+}
+
+uint32_t
+extract_field_length(const QueryTerm& term, uint32_t field_id)
+{
+ return (field_id < term.getFieldInfoSize()) ? term.getFieldInfo(field_id).getFieldLength() : search::fef::FieldPositionsIterator::UNKNOWN_LENGTH;
+}
+
+void
+set_interleaved_features(TermFieldMatchData& tmd, uint32_t field_length, uint32_t num_occs)
+{
+ tmd.setFieldLength(cap_16_bits(field_length));
+ tmd.setNumOccs(cap_16_bits(num_occs));
+}
+
+}
+
+void
+QueryTerm::unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const
+{
+ HitList list;
+ const HitList & hitList = evaluateHits(list);
+
+ if (!hitList.empty()) { // only unpack if we have a hit
+ LOG(debug, "Unpack match data for query term '%s:%s'",
+ index().c_str(), getTerm());
+
+ uint32_t lastFieldId = -1;
+ TermFieldMatchData *tmd = nullptr;
+ uint32_t num_occs = 0;
+
+ // optimize for hitlist giving all hits for a single field in one chunk
+ for (const Hit & hit : hitList) {
+ uint32_t fieldId = hit.field_id();
+ if (fieldId != lastFieldId) {
+ if (tmd != nullptr) {
+ if (tmd->needs_interleaved_features()) {
+ set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs);
+ }
+ // reset to notfound/unknown values
+ tmd = nullptr;
+ }
+ num_occs = 0;
+
+ // setup for new field that had a hit
+ const ITermFieldData *tfd = td.lookupField(fieldId);
+ if (tfd != nullptr) {
+ tmd = match_data.resolveTermField(tfd->getHandle());
+ tmd->setFieldId(fieldId);
+ // reset field match data, but only once per docId
+ if (tmd->getDocId() != docid) {
+ tmd->reset(docid);
+ }
+ }
+ lastFieldId = fieldId;
+ }
+ ++num_occs;
+ if (tmd != nullptr) {
+ TermFieldMatchDataPosition pos(hit.element_id(), hit.position(),
+ hit.element_weight(), hit.element_length());
+ tmd->appendPosition(pos);
+ LOG(debug, "Append elemId(%u),position(%u), weight(%d), tfmd.weight(%d)",
+ pos.getElementId(), pos.getPosition(), pos.getElementWeight(), tmd->getWeight());
+ }
+ }
+ if (tmd != nullptr) {
+ if (tmd->needs_interleaved_features()) {
+ set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs);
+ }
+ }
+ }
+}
+
+void
+QueryTerm::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data)
+{
+ unpack_match_data_helper(docid, td, match_data, *this);
+}
+
NearestNeighborQueryNode*
QueryTerm::as_nearest_neighbor_query_node() noexcept
{
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 108cc1c148d..de043237cff 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -9,6 +9,12 @@
#include <vespa/vespalib/objects/objectvisitor.h>
#include <vespa/vespalib/stllike/string.h>
+namespace search::fef {
+
+class ITermData;
+class MatchData;
+
+}
namespace search::streaming {
class FuzzyTerm;
@@ -98,7 +104,9 @@ public:
virtual MultiTerm* as_multi_term() noexcept;
virtual RegexpTerm* as_regexp_term() noexcept;
virtual FuzzyTerm* as_fuzzy_term() noexcept;
+ virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data);
protected:
+ void unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const;
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;
EncodingBitMap _encoding;
diff --git a/streamingvisitors/src/tests/querywrapper/querywrapper_test.cpp b/streamingvisitors/src/tests/querywrapper/querywrapper_test.cpp
index dc3dbfca7ca..70b863e540b 100644
--- a/streamingvisitors/src/tests/querywrapper/querywrapper_test.cpp
+++ b/streamingvisitors/src/tests/querywrapper/querywrapper_test.cpp
@@ -84,12 +84,6 @@ QueryWrapperTest::testQueryWrapper()
EXPECT_TRUE( tl[2].isPhraseTerm());
EXPECT_TRUE( tl[3].isPhraseTerm());
EXPECT_TRUE(!tl[4].isPhraseTerm());
-
- EXPECT_EQUAL(tl[0].getPosAdjust(), 0u);
- EXPECT_EQUAL(tl[1].getPosAdjust(), 2u);
- EXPECT_EQUAL(tl[2].getPosAdjust(), 2u);
- EXPECT_EQUAL(tl[3].getPosAdjust(), 2u);
- EXPECT_EQUAL(tl[4].getPosAdjust(), 0u);
}
}
diff --git a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
index c9518b29884..0abff37d622 100644
--- a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
+++ b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
@@ -85,8 +85,8 @@ RankProcessorTest::test_unpack_match_data_for_term_node(bool interleaved_feature
RankProcessor::unpack_match_data(1, *md, *_query_wrapper);
EXPECT_EQ(invalid_id, tfmd->getDocId());
node->add(field_id, 0, 1, 0);
+ node->add(field_id, 0, 1, 1);
auto& field_info = node->getFieldInfo(field_id);
- field_info.setHitCount(mock_num_occs);
field_info.setFieldLength(mock_field_length);
RankProcessor::unpack_match_data(2, *md, *_query_wrapper);
EXPECT_EQ(2, tfmd->getDocId());
@@ -97,7 +97,7 @@ RankProcessorTest::test_unpack_match_data_for_term_node(bool interleaved_feature
EXPECT_EQ(0, tfmd->getNumOccs());
EXPECT_EQ(0, tfmd->getFieldLength());
}
- EXPECT_EQ(1, tfmd->size());
+ EXPECT_EQ(2, tfmd->size());
node->reset();
RankProcessor::unpack_match_data(3, *md, *_query_wrapper);
EXPECT_EQ(2, tfmd->getDocId());
diff --git a/streamingvisitors/src/vespa/searchvisitor/querywrapper.h b/streamingvisitors/src/vespa/searchvisitor/querywrapper.h
index b24f695196e..420ff215833 100644
--- a/streamingvisitors/src/vespa/searchvisitor/querywrapper.h
+++ b/streamingvisitors/src/vespa/searchvisitor/querywrapper.h
@@ -47,7 +47,6 @@ public:
size_t getIndex() const { return _index; }
bool isPhraseTerm() const { return _parent != nullptr; }
bool isFirstPhraseTerm() const { return isPhraseTerm() && getIndex() == 0; }
- size_t getPosAdjust() const { return _parent != nullptr ? _parent->width() - 1 : 0; }
bool isGeoPosTerm() const { return (_term != nullptr) && _term->isGeoLoc(); }
};
diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp
index 96e8ca89a04..09699f79427 100644
--- a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp
@@ -53,11 +53,6 @@ getFeature(const RankProgram &rankProgram) {
return resolver.resolve(0);
}
-uint16_t
-cap_16_bits(uint32_t value) {
- return std::min(value, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()));
-}
-
}
void
@@ -285,68 +280,10 @@ RankProcessor::unpack_match_data(uint32_t docid, MatchData &matchData, QueryWrap
QueryTermData & qtd = static_cast<QueryTermData &>(term.getTerm()->getQueryItem());
const ITermData &td = qtd.getTermData();
- HitList list;
- const HitList & hitList = isPhrase
- ? term.getParent()->evaluateHits(list)
- : term.getTerm()->evaluateHits(list);
-
- if (hitList.size() > 0) { // only unpack if we have a hit
- LOG(debug, "Unpack match data for query term '%s:%s' (%s)",
- term.getTerm()->index().c_str(), term.getTerm()->getTerm(), isPhrase ? "phrase" : "term");
-
- uint32_t lastFieldId = -1;
- TermFieldMatchData *tmd = nullptr;
- uint32_t fieldLen = search::fef::FieldPositionsIterator::UNKNOWN_LENGTH;
- uint32_t num_occs = 0;
-
- // optimize for hitlist giving all hits for a single field in one chunk
- for (const Hit & hit : hitList) {
- uint32_t fieldId = hit.field_id();
- if (fieldId != lastFieldId) {
- // reset to notfound/unknown values
- tmd = nullptr;
- fieldLen = search::fef::FieldPositionsIterator::UNKNOWN_LENGTH;
- num_occs = 0;
-
- // setup for new field that had a hit
- const ITermFieldData *tfd = td.lookupField(fieldId);
- if (tfd != nullptr) {
- tmd = matchData.resolveTermField(tfd->getHandle());
- tmd->setFieldId(fieldId);
- // reset field match data, but only once per docId
- if (tmd->getDocId() != docid) {
- tmd->reset(docid);
- }
- }
- // find fieldLen for new field
- if (isPhrase) {
- if (fieldId < term.getParent()->getFieldInfoSize()) {
- auto& field_info = term.getParent()->getFieldInfo(fieldId);
- fieldLen = field_info.getFieldLength();
- num_occs = field_info.getHitCount();
- }
- } else {
- if (fieldId < term.getTerm()->getFieldInfoSize()) {
- auto& field_info = term.getTerm()->getFieldInfo(fieldId);
- fieldLen = field_info.getFieldLength();
- num_occs = field_info.getHitCount();
- }
- }
- lastFieldId = fieldId;
- }
- if (tmd != nullptr) {
- // adjust so that the position for phrase terms equals the match for the first term
- TermFieldMatchDataPosition pos(hit.element_id(), hit.position() - term.getPosAdjust(),
- hit.element_weight(), hit.element_length());
- tmd->appendPosition(pos);
- LOG(debug, "Append elemId(%u),position(%u), weight(%d), tfmd.weight(%d)",
- pos.getElementId(), pos.getPosition(), pos.getElementWeight(), tmd->getWeight());
- if (tmd->needs_interleaved_features()) {
- tmd->setFieldLength(cap_16_bits(fieldLen));
- tmd->setNumOccs(cap_16_bits(num_occs));
- }
- }
- }
+ if (isPhrase) {
+ term.getParent()->unpack_match_data(docid, td, matchData);
+ } else {
+ term.getTerm()->unpack_match_data(docid, td, matchData);
}
}
}