diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-02-09 23:31:57 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-09 23:31:57 +0100 |
commit | a59d72339a96ead18d336b290c8b4e89c768bfa7 (patch) | |
tree | b8b7df32d41024b422c24044191f2996e6f32ff4 | |
parent | fc1e1b3def4a26b15c27892fb1d696e453adf0fb (diff) | |
parent | 1f24d13afaeb37fac4cd810b76c1b1b9b5a4dc51 (diff) |
Merge pull request #30229 from vespa-engine/toregge/handle-equiv-query-node-as-a-leaf-in-streaming-query-tree
Handle search::streaming::EquivQueryNode as a leaf in the query tree.
21 files changed, 654 insertions, 151 deletions
diff --git a/searchlib/src/tests/query/streaming/CMakeLists.txt b/searchlib/src/tests/query/streaming/CMakeLists.txt index 7568e45d00a..5ed450ecbc8 100644 --- a/searchlib/src/tests/query/streaming/CMakeLists.txt +++ b/searchlib/src/tests/query/streaming/CMakeLists.txt @@ -1,5 +1,14 @@ # Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_query_streaming_equiv_query_node_test_app TEST + SOURCES + equiv_query_node_test.cpp + DEPENDS + searchlib + GTest::gtest +) +vespa_add_test(NAME searchlib_query_streaming_equiv_query_node_test_app COMMAND searchlib_query_streaming_equiv_query_node_test_app) + vespa_add_executable(searchlib_query_streaming_hit_iterator_test_app TEST SOURCES hit_iterator_test.cpp diff --git a/searchlib/src/tests/query/streaming/equiv_query_node_test.cpp b/searchlib/src/tests/query/streaming/equiv_query_node_test.cpp new file mode 100644 index 00000000000..72378385c78 --- /dev/null +++ b/searchlib/src/tests/query/streaming/equiv_query_node_test.cpp @@ -0,0 +1,209 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/query/streaming/equiv_query_node.h> +#include <vespa/searchlib/fef/matchdata.h> +#include <vespa/searchlib/fef/simpletermdata.h> +#include <vespa/searchlib/query/streaming/phrase_query_node.h> +#include <vespa/searchlib/query/streaming/query.h> +#include <vespa/searchlib/query/streaming/queryterm.h> +#include <vespa/searchlib/query/tree/querybuilder.h> +#include <vespa/searchlib/query/tree/simplequery.h> +#include <vespa/searchlib/query/tree/stackdumpcreator.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::fef::MatchData; +using search::fef::SimpleTermData; +using search::fef::TermFieldHandle; +using search::fef::TermFieldMatchDataPosition; +using search::query::QueryBuilder; +using search::query::Node; +using search::query::SimpleQueryNodeTypes; +using search::query::StackDumpCreator; +using search::query::Weight; +using search::streaming::EquivQueryNode; +using search::streaming::HitList; +using search::streaming::PhraseQueryNode; +using search::streaming::Query; +using search::streaming::QueryNodeResultFactory; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +class AllowRewrite : public QueryNodeResultFactory +{ +public: + bool allow_float_terms_rewrite(vespalib::stringref) const noexcept override { return true; } +}; + +class EquivQueryNodeTest : public ::testing::Test +{ +public: + EquivQueryNodeTest(); + ~EquivQueryNodeTest(); + + void assert_tfmd_pos(const vespalib::string label, + const TermFieldMatchDataPosition &tfmd_pos, + uint32_t exp_element_id, + uint32_t exp_position, + int32_t exp_element_weight, + uint32_t exp_element_length); + vespalib::string make_simple_equiv_stack_dump(); +}; + +EquivQueryNodeTest::EquivQueryNodeTest() + : ::testing::Test() +{ +} + +EquivQueryNodeTest::~EquivQueryNodeTest() = default; + +void +EquivQueryNodeTest::assert_tfmd_pos(const vespalib::string label, + const TermFieldMatchDataPosition &tfmd_pos, + uint32_t exp_element_id, + uint32_t exp_position, + int32_t exp_element_weight, + uint32_t exp_element_length) +{ + SCOPED_TRACE(label); + EXPECT_EQ(exp_element_id, tfmd_pos.getElementId()); + EXPECT_EQ(exp_position, tfmd_pos.getPosition()); + EXPECT_EQ(exp_element_weight, tfmd_pos.getElementWeight()); + EXPECT_EQ(exp_element_length, tfmd_pos.getElementLen()); +} + +vespalib::string +EquivQueryNodeTest::make_simple_equiv_stack_dump() +{ + QueryBuilder<SimpleQueryNodeTypes> builder; + builder.addEquiv(3, 0, Weight(0)); + { + builder.addStringTerm("2", "", 0, Weight(0)); + builder.addStringTerm("2.5", "", 0, Weight(0)); + builder.addStringTerm("3", "", 0, Weight(0)); + } + Node::UP node = builder.build(); + return StackDumpCreator::create(*node); +} + +TEST_F(EquivQueryNodeTest, test_equiv_evaluate_and_unpack) +{ + auto stack_dump = make_simple_equiv_stack_dump(); + QueryNodeResultFactory empty; + Query q(empty, stack_dump); + auto& eqn = dynamic_cast<EquivQueryNode&>(q.getRoot()); + auto& terms = eqn.get_terms(); + EXPECT_EQ(3, terms.size()); + for (auto& qt : terms) { + qt->resizeFieldId(1); + } + + /* + * Populate hit lists in query terms, emulating the result of + * having performed a streaming search. + */ + constexpr uint32_t field0 = 0; + constexpr uint32_t field1 = 1; + constexpr uint32_t elem0 = 0; + constexpr uint32_t elem1 = 1; + constexpr int32_t weight1 = 1; + constexpr int32_t weight2 = 2; + constexpr uint32_t pos5 = 5; + constexpr uint32_t pos6 = 6; + constexpr uint32_t pos3 = 3; + constexpr uint32_t pos4 = 4; + constexpr uint32_t field0_len = 100; + constexpr uint32_t field1_len = 200; + constexpr uint32_t field0_element0_len = 10; + constexpr uint32_t field0_element1_len = 30; + constexpr uint32_t field1_element0_len = 31; + // field 0 + terms[0]->add(field0, elem0, weight1, pos5); + terms[1]->add(field0, elem0, weight1, pos6); + terms[2]->add(field0, elem1, weight1, pos3); + // field 1 + terms[1]->add(field1, elem0, weight1, pos4); + terms[2]->add(field1, elem0, weight2, pos4); + + terms[0]->set_element_length(0, field0_element0_len); + terms[1]->set_element_length(0, field0_element0_len); + terms[1]->set_element_length(1, field1_element0_len); + terms[2]->set_element_length(0, field0_element1_len); + terms[2]->set_element_length(1, field1_element0_len); + + /* + * evaluateHits() should get the union of the hits for each query term + * but without duplicates. + */ + HitList hits; + eqn.evaluateHits(hits); + auto exp_hits = HitList{{field0,elem0,weight1,pos5},{field0,elem0,weight1,pos6},{field0,elem1,weight1,pos3},{field1,elem0,weight2,pos4}}; + exp_hits[0].set_element_length(field0_element0_len); + exp_hits[1].set_element_length(field0_element0_len); + exp_hits[2].set_element_length(field0_element1_len); + exp_hits[3].set_element_length(field1_element0_len); + ASSERT_EQ(exp_hits, hits); + EXPECT_TRUE(eqn.evaluate()); + + /* + * Verify that unpack_match_data() gives the expected term field + * match data information. + */ + SimpleTermData td; + constexpr TermFieldHandle handle0 = 27; + constexpr TermFieldHandle handle1 = 29; + constexpr TermFieldHandle handle_max = std::max(handle0, handle1); + td.addField(0).setHandle(handle0); + td.addField(1).setHandle(handle1); + terms[0]->resizeFieldId(field0); + terms[0]->getFieldInfo(field0).setFieldLength(field0_len); + terms[1]->resizeFieldId(field1); + terms[1]->getFieldInfo(field0).setFieldLength(field0_len); + terms[1]->getFieldInfo(field1).setFieldLength(field1_len); + terms[2]->resizeFieldId(field1); + terms[2]->getFieldInfo(field0).setFieldLength(field0_len); + terms[2]->getFieldInfo(field1).setFieldLength(field1_len); + auto md = MatchData::makeTestInstance(handle_max + 1, handle_max + 1); + auto tfmd0 = md->resolveTermField(handle0); + auto tfmd1 = md->resolveTermField(handle1); + tfmd0->setNeedInterleavedFeatures(true); + tfmd1->setNeedInterleavedFeatures(true); + eqn.unpack_match_data(2, td, *md); + EXPECT_EQ(2, tfmd0->getDocId()); + EXPECT_EQ(3, tfmd0->getNumOccs()); + EXPECT_EQ(3, tfmd0->end() - tfmd0->begin()); + auto itr = tfmd0->begin(); + assert_tfmd_pos("tmfd0[0]", *itr, elem0, pos5, weight1, field0_element0_len); + ++itr; + assert_tfmd_pos("tmfd0[1]", *itr, elem0, pos6, weight1, field0_element0_len); + ++itr; + assert_tfmd_pos("tmfd0[2]", *itr, elem1, pos3, weight1, field0_element1_len); + EXPECT_EQ(field0_len, tfmd0->getFieldLength()); + EXPECT_EQ(2, tfmd1->getDocId()); + EXPECT_EQ(1, tfmd1->getNumOccs()); + EXPECT_EQ(1, tfmd1->end() - tfmd1->begin()); + itr = tfmd1->begin(); + assert_tfmd_pos("tmfd1[0]", *itr, elem0, pos4, weight2, field1_element0_len); + EXPECT_EQ(field1_len, tfmd1->getFieldLength()); +} + +TEST_F(EquivQueryNodeTest, test_equiv_flattening) +{ + auto stack_dump = make_simple_equiv_stack_dump(); + AllowRewrite allow_rewrite; + Query q(allow_rewrite, stack_dump); + auto& eqn = dynamic_cast<EquivQueryNode&>(q.getRoot()); + auto& terms = eqn.get_terms(); + // Query is flattened to equiv("2", "2.5", phrase("2","5"), "3") + EXPECT_EQ(4, terms.size()); + EXPECT_EQ("2", terms[0]->getTermString()); + EXPECT_EQ("2.5", terms[1]->getTermString()); + auto phrase = dynamic_cast<PhraseQueryNode*>(terms[2].get()); + EXPECT_NE(phrase, nullptr); + EXPECT_EQ(2, phrase->get_terms().size()); + EXPECT_EQ("2", phrase->get_terms()[0]->getTermString()); + EXPECT_EQ("5", phrase->get_terms()[1]->getTermString()); + EXPECT_EQ("3", terms[3]->getTermString()); +} + + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp index 19a2a0876c6..5559e194c5e 100644 --- a/searchlib/src/tests/query/streaming_query_test.cpp +++ b/searchlib/src/tests/query/streaming_query_test.cpp @@ -3,6 +3,7 @@ #include <vespa/searchlib/fef/simpletermdata.h> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/searchlib/query/streaming/dot_product_term.h> +#include <vespa/searchlib/query/streaming/equiv_query_node.h> #include <vespa/searchlib/query/streaming/in_term.h> #include <vespa/searchlib/query/streaming/phrase_query_node.h> #include <vespa/searchlib/query/streaming/query.h> @@ -352,17 +353,17 @@ TEST(StreamingQueryTest, onedot0e_is_rewritten_if_allowed_too) const QueryNode & root = q.getRoot(); EXPECT_TRUE(dynamic_cast<const EquivQueryNode *>(&root) != nullptr); const auto & equiv = static_cast<const EquivQueryNode &>(root); - EXPECT_EQ(2u, equiv.size()); - EXPECT_TRUE(dynamic_cast<const QueryTerm *>(equiv[0].get()) != nullptr); + EXPECT_EQ(2u, equiv.get_terms().size()); + EXPECT_TRUE(dynamic_cast<const QueryTerm *>(equiv.get_terms()[0].get()) != nullptr); { - const auto & qt = static_cast<const QueryTerm &>(*equiv[0]); + const auto & qt = static_cast<const QueryTerm &>(*equiv.get_terms()[0]); EXPECT_EQ("c", qt.index()); EXPECT_EQ(vespalib::stringref("1.0e"), qt.getTerm()); EXPECT_EQ(3u, qt.uniqueId()); } - EXPECT_TRUE(dynamic_cast<const PhraseQueryNode *>(equiv[1].get()) != nullptr); + EXPECT_TRUE(dynamic_cast<const PhraseQueryNode *>(equiv.get_terms()[1].get()) != nullptr); { - const auto & phrase = static_cast<const PhraseQueryNode &>(*equiv[1]); + const auto & phrase = static_cast<const PhraseQueryNode &>(*equiv.get_terms()[1]); EXPECT_EQ(2u, phrase.get_terms().size()); { const auto & qt = *phrase.get_terms()[0]; diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt index 63d52cbdf9f..a2f0c8fd136 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt @@ -2,7 +2,9 @@ vespa_add_library(searchlib_query_streaming OBJECT SOURCES dot_product_term.cpp + equiv_query_node.cpp fuzzy_term.cpp + hit.cpp hit_iterator_pack.cpp in_term.cpp multi_term.cpp diff --git a/searchlib/src/vespa/searchlib/query/streaming/equiv_query_node.cpp b/searchlib/src/vespa/searchlib/query/streaming/equiv_query_node.cpp new file mode 100644 index 00000000000..939afec0463 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/equiv_query_node.cpp @@ -0,0 +1,102 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "equiv_query_node.h" +#include "phrase_query_node.h" +#include "queryterm.hpp" + +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataPosition; +using search::fef::ITermFieldData; + +namespace search::streaming { + +namespace { + +class HitWithFieldLength : public Hit +{ + uint32_t _field_length; +public: + HitWithFieldLength(const Hit& hit, uint32_t field_length) noexcept + : Hit(hit), + _field_length(field_length) + { + } + uint32_t get_field_length() const noexcept { return _field_length; } +}; + +template <typename HitType> +void merge_hits_from_children(std::vector<HitType>& hl, const MultiTerm& mt) +{ + HitList sub_hl_store; + for (auto& subterm : mt.get_terms()) { + auto *phrase = dynamic_cast<PhraseQueryNode*>(subterm.get()); + QueryTerm& fl_term = (phrase == nullptr) ? *subterm : *phrase->get_terms().front(); + auto& sub_hl = subterm->evaluateHits(sub_hl_store); + for (auto& h : sub_hl) { + if constexpr (std::is_same_v<Hit,HitType>) { + hl.emplace_back(h); + } else { + hl.emplace_back(h, extract_field_length(fl_term, h.field_id())); + } + } + } + std::sort(hl.begin(), hl.end()); + auto last = std::unique(hl.begin(), hl.end(), [](auto& lhs, auto &rhs) noexcept { return lhs.at_same_pos(rhs); }); + hl.erase(last, hl.end()); +} + +} + +EquivQueryNode::EquivQueryNode(std::unique_ptr<QueryNodeResultBase> result_base, uint32_t num_terms) + : MultiTerm(std::move(result_base), "", num_terms) +{ +} + +EquivQueryNode::~EquivQueryNode() = default; + +bool +EquivQueryNode::evaluate() const +{ + for (auto& subterm : get_terms()) { + if (subterm->evaluate()) { + return true; + } + } + return false; +} + +const HitList & +EquivQueryNode::evaluateHits(HitList & hl) const +{ + hl.clear(); + merge_hits_from_children(hl, *this); + return hl; +} + +void +EquivQueryNode::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) +{ + std::vector<HitWithFieldLength> hit_list; + merge_hits_from_children(hit_list, *this); + unpack_match_data_helper(docid, td, match_data, hit_list, *this); +} + +EquivQueryNode* +EquivQueryNode::as_equiv_query_node() noexcept +{ + return this; +} + +const EquivQueryNode* +EquivQueryNode::as_equiv_query_node() const noexcept +{ + return this; +} + +std::vector<std::unique_ptr<QueryTerm>> +EquivQueryNode::steal_terms() +{ + return std::move(_terms); +} + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/equiv_query_node.h b/searchlib/src/vespa/searchlib/query/streaming/equiv_query_node.h new file mode 100644 index 00000000000..b5cdb31274f --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/equiv_query_node.h @@ -0,0 +1,25 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "multi_term.h" + +namespace search::streaming { + +/** + N-ary "EQUIV" operator that merges terms from nodes below. +*/ +class EquivQueryNode : public MultiTerm +{ +public: + EquivQueryNode(std::unique_ptr<QueryNodeResultBase> result_base, uint32_t num_terms); + ~EquivQueryNode() override; + bool evaluate() const override; + const HitList & evaluateHits(HitList & hl) const override; + void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override; + EquivQueryNode* as_equiv_query_node() noexcept override; + const EquivQueryNode* as_equiv_query_node() const noexcept override; + std::vector<std::unique_ptr<QueryTerm>> steal_terms(); +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit.cpp b/searchlib/src/vespa/searchlib/query/streaming/hit.cpp new file mode 100644 index 00000000000..c05fda77476 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/hit.cpp @@ -0,0 +1,17 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "hit.h" +#include <ostream> + +namespace search::streaming { + +std::ostream& +operator<<(std::ostream& os, const Hit& hit) +{ + os << "{" << hit.field_id() << "," << hit.element_id() << "," << + hit.element_weight() << "," << hit.element_length() << "," << + hit.position() << "}"; + return os; +} + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit.h b/searchlib/src/vespa/searchlib/query/streaming/hit.h index 168c09a91ec..fc24c21f722 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/hit.h +++ b/searchlib/src/vespa/searchlib/query/streaming/hit.h @@ -2,6 +2,7 @@ #pragma once #include <cstdint> +#include <iosfwd> #include <vector> namespace search::streaming { @@ -27,8 +28,37 @@ public: uint32_t element_length() const { return _element_length; } uint32_t position() const { return _position; } void set_element_length(uint32_t value) { _element_length = value; } + bool operator<(const Hit& rhs) const noexcept { + if (_field_id != rhs._field_id) { + return _field_id < rhs._field_id; + } + if (_element_id != rhs._element_id) { + return _element_id < rhs._element_id; + } + if (_position != rhs._position) { + return _position < rhs._position; + } + if (_element_weight != rhs._element_weight) { + return _element_weight > rhs._element_weight; + } + return _element_length < rhs._element_length; + } + bool at_same_pos(const Hit& rhs) const noexcept { + return (_field_id == rhs._field_id) && + (_element_id == rhs._element_id) && + (_position == rhs._position); + } + bool operator==(const Hit& rhs) const noexcept { + return (_field_id == rhs._field_id) && + (_element_id == rhs._element_id) && + (_position == rhs._position) && + (_element_weight == rhs._element_weight) && + (_element_length == rhs._element_length); + } }; +std::ostream& operator<<(std::ostream& os, const Hit& hit); + using HitList = std::vector<Hit>; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp index 9cd8d41d33d..b090ca13225 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/phrase_query_node.cpp @@ -69,7 +69,9 @@ PhraseQueryNode::evaluateHits(HitList & hl) const void PhraseQueryNode::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) { - unpack_match_data_helper(docid, td, match_data, *get_terms().front()); + HitList list; + const HitList & hit_list = evaluateHits(list); + unpack_match_data_helper(docid, td, match_data, hit_list, *get_terms().front()); } } diff --git a/searchlib/src/vespa/searchlib/query/streaming/query.cpp b/searchlib/src/vespa/searchlib/query/streaming/query.cpp index 77424fb2d62..94d9acd02cd 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/query.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/query.cpp @@ -94,7 +94,6 @@ QueryConnector::create(ParseItem::ItemType type) case search::ParseItem::ITEM_AND: return std::make_unique<AndQueryNode>(); case search::ParseItem::ITEM_OR: case search::ParseItem::ITEM_WEAK_AND: return std::make_unique<OrQueryNode>(); - case search::ParseItem::ITEM_EQUIV: return std::make_unique<EquivQueryNode>(); case search::ParseItem::ITEM_NOT: return std::make_unique<AndNotQueryNode>(); case search::ParseItem::ITEM_SAME_ELEMENT: return std::make_unique<SameElementQueryNode>(); case search::ParseItem::ITEM_NEAR: return std::make_unique<NearQueryNode>(); @@ -158,12 +157,6 @@ RankWithQueryNode::evaluate() const { return firstOk; } -bool -EquivQueryNode::evaluate() const -{ - return OrQueryNode::evaluate(); -} - Query::Query() = default; Query::Query(const QueryNodeResultFactory & factory, vespalib::stringref queryRep) diff --git a/searchlib/src/vespa/searchlib/query/streaming/query.h b/searchlib/src/vespa/searchlib/query/streaming/query.h index e91a2f91dc5..a993a9a8a8a 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/query.h +++ b/searchlib/src/vespa/searchlib/query/streaming/query.h @@ -103,20 +103,6 @@ public: bool evaluate() const override; }; - -/** - N-ary "EQUIV" operator that merges terms from nodes below. -*/ -class EquivQueryNode : public OrQueryNode -{ -public: - EquivQueryNode() noexcept : OrQueryNode("EQUIV") { } - bool evaluate() const override; - bool isFlattenable(ParseItem::ItemType type) const override { - return (type == ParseItem::ITEM_EQUIV); - } -}; - /** Query packages the query tree. The usage pattern is like this. Construct the tree with the correct tree description. diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 0b277dbe221..dd3b1f84ad9 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -9,6 +9,7 @@ #include "same_element_query_node.h" #include <vespa/searchlib/parsequery/stackdumpiterator.h> #include <vespa/searchlib/query/streaming/dot_product_term.h> +#include <vespa/searchlib/query/streaming/equiv_query_node.h> #include <vespa/searchlib/query/streaming/in_term.h> #include <vespa/searchlib/query/streaming/wand_term.h> #include <vespa/searchlib/query/streaming/weighted_set_term.h> @@ -44,7 +45,6 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_AND: case ParseItem::ITEM_OR: case ParseItem::ITEM_WEAK_AND: - case ParseItem::ITEM_EQUIV: case ParseItem::ITEM_NOT: case ParseItem::ITEM_SAME_ELEMENT: case ParseItem::ITEM_NEAR: @@ -142,10 +142,10 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor auto dotPos = ssTerm.find('.'); phrase->add_term(std::make_unique<QueryTerm>(factory.create(), ssTerm.substr(0, dotPos), ssIndex, TermType::WORD, normalize_mode)); phrase->add_term(std::make_unique<QueryTerm>(factory.create(), ssTerm.substr(dotPos + 1), ssIndex, TermType::WORD, normalize_mode)); - auto orqn = std::make_unique<EquivQueryNode>(); - orqn->addChild(std::move(qt)); - orqn->addChild(std::move(phrase)); - qn = std::move(orqn); + auto eqn = std::make_unique<EquivQueryNode>(factory.create(), 2); + eqn->add_term(std::move(qt)); + eqn->add_term(std::move(phrase)); + qn = std::move(eqn); } else { qn = std::move(qt); } @@ -171,6 +171,9 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor case ParseItem::ITEM_PHRASE: qn = build_phrase_term(factory, queryRep); break; + case ParseItem::ITEM_EQUIV: + qn = build_equiv_term(factory, queryRep, allowRewrite); + break; default: skip_unknown(queryRep); break; @@ -282,6 +285,33 @@ QueryNode::build_phrase_term(const QueryNodeResultFactory& factory, SimpleQueryS return phrase; } +std::unique_ptr<QueryNode> +QueryNode::build_equiv_term(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep, bool allow_rewrite) +{ + auto eqn = std::make_unique<EquivQueryNode>(factory.create(), queryRep.getArity()); + auto arity = queryRep.getArity(); + eqn->setWeight(queryRep.GetWeight()); + eqn->setUniqueId(queryRep.getUniqueId()); + for (size_t i = 0; i < arity; ++i) { + queryRep.next(); + auto qn = Build(eqn.get(), factory, queryRep, allow_rewrite); + auto nested_eqn = dynamic_cast<EquivQueryNode*>(qn.get()); + if (nested_eqn != nullptr) { + auto stolen_terms = nested_eqn->steal_terms(); + for (auto& term : stolen_terms) { + eqn->add_term(std::move(term)); + } + continue; + } + auto qtp = dynamic_cast<QueryTerm*>(qn.get()); + assert(qtp != nullptr); + qn.release(); + std::unique_ptr<QueryTerm> qt(qtp); + eqn->add_term(std::move(qt)); + } + return eqn; +} + void QueryNode::skip_unknown(SimpleQueryStackDumpIterator& queryRep) { diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.h b/searchlib/src/vespa/searchlib/query/streaming/querynode.h index 4c7d9e88930..fff3bb15d10 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.h +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.h @@ -34,6 +34,7 @@ class QueryNode static std::unique_ptr<QueryNode> build_wand_term(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep); static std::unique_ptr<QueryNode> build_weighted_set_term(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep); static std::unique_ptr<QueryNode> build_phrase_term(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep); + static std::unique_ptr<QueryNode> build_equiv_term(const QueryNodeResultFactory& factory, SimpleQueryStackDumpIterator& queryRep, bool allow_rewrite); static void skip_unknown(SimpleQueryStackDumpIterator& queryRep); public: using UP = std::unique_ptr<QueryNode>; diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index f01f815e673..0d0f5a7c4ad 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -1,6 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "queryterm.h" +#include "queryterm.hpp" #include <vespa/searchlib/fef/itermdata.h> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/vespalib/objects/visit.h> @@ -113,89 +113,12 @@ QueryTerm::set_element_length(uint32_t hitlist_idx, uint32_t element_length) _hitList[hitlist_idx].set_element_length(element_length); } -namespace { - -uint16_t -cap_16_bits(uint32_t value) -{ - return std::min(value, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); -} - -uint32_t -extract_field_length(const QueryTerm& term, uint32_t field_id) -{ - return (field_id < term.getFieldInfoSize()) ? term.getFieldInfo(field_id).getFieldLength() : search::fef::FieldPositionsIterator::UNKNOWN_LENGTH; -} - -void -set_interleaved_features(TermFieldMatchData& tmd, uint32_t field_length, uint32_t num_occs) -{ - tmd.setFieldLength(cap_16_bits(field_length)); - tmd.setNumOccs(cap_16_bits(num_occs)); -} - -} - -void -QueryTerm::unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const -{ - HitList list; - const HitList & hitList = evaluateHits(list); - - if (!hitList.empty()) { // only unpack if we have a hit - LOG(debug, "Unpack match data for query term '%s:%s'", - index().c_str(), getTerm()); - - uint32_t lastFieldId = -1; - TermFieldMatchData *tmd = nullptr; - uint32_t num_occs = 0; - - // optimize for hitlist giving all hits for a single field in one chunk - for (const Hit & hit : hitList) { - uint32_t fieldId = hit.field_id(); - if (fieldId != lastFieldId) { - if (tmd != nullptr) { - if (tmd->needs_interleaved_features()) { - set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs); - } - // reset to notfound/unknown values - tmd = nullptr; - } - num_occs = 0; - - // setup for new field that had a hit - const ITermFieldData *tfd = td.lookupField(fieldId); - if (tfd != nullptr) { - tmd = match_data.resolveTermField(tfd->getHandle()); - tmd->setFieldId(fieldId); - // reset field match data, but only once per docId - if (tmd->getDocId() != docid) { - tmd->reset(docid); - } - } - lastFieldId = fieldId; - } - ++num_occs; - if (tmd != nullptr) { - TermFieldMatchDataPosition pos(hit.element_id(), hit.position(), - hit.element_weight(), hit.element_length()); - tmd->appendPosition(pos); - LOG(debug, "Append elemId(%u),position(%u), weight(%d), tfmd.weight(%d)", - pos.getElementId(), pos.getPosition(), pos.getElementWeight(), tmd->getWeight()); - } - } - if (tmd != nullptr) { - if (tmd->needs_interleaved_features()) { - set_interleaved_features(*tmd, extract_field_length(fl_term, lastFieldId), num_occs); - } - } - } -} - void QueryTerm::unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) { - unpack_match_data_helper(docid, td, match_data, *this); + HitList list; + const HitList & hit_list = evaluateHits(list); + unpack_match_data_helper(docid, td, match_data, hit_list, *this); } NearestNeighborQueryNode* @@ -222,4 +145,16 @@ QueryTerm::as_fuzzy_term() noexcept return nullptr; } +EquivQueryNode* +QueryTerm::as_equiv_query_node() noexcept +{ + return nullptr; +} + +const EquivQueryNode* +QueryTerm::as_equiv_query_node() const noexcept +{ + return nullptr; +} + } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 2eaecb86854..2cb4f2d2ebb 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -17,6 +17,7 @@ class MatchData; } namespace search::streaming { +class EquivQueryNode; class FuzzyTerm; class NearestNeighborQueryNode; class MultiTerm; @@ -100,9 +101,12 @@ public: virtual MultiTerm* as_multi_term() noexcept; virtual RegexpTerm* as_regexp_term() noexcept; virtual FuzzyTerm* as_fuzzy_term() noexcept; + virtual EquivQueryNode* as_equiv_query_node() noexcept; + virtual const EquivQueryNode* as_equiv_query_node() const noexcept; virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data); protected: - void unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const QueryTerm& fl_term) const; + template <typename HitListType> + static void unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const HitListType& hit_list, const QueryTerm& fl_term); using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>; string _index; EncodingBitMap _encoding; diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.hpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.hpp new file mode 100644 index 00000000000..dd6eff1f22b --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.hpp @@ -0,0 +1,94 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "queryterm.h" +#include <vespa/searchlib/fef/itermdata.h> +#include <vespa/searchlib/fef/matchdata.h> +#include <algorithm> +#include <limits> + + +namespace search::streaming { + +namespace { + +uint16_t +cap_16_bits(uint32_t value) +{ + return std::min(value, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); +} + +uint32_t +extract_field_length(const QueryTerm& term, uint32_t field_id) +{ + return (field_id < term.getFieldInfoSize()) ? term.getFieldInfo(field_id).getFieldLength() : search::fef::FieldPositionsIterator::UNKNOWN_LENGTH; +} + +void +set_interleaved_features(search::fef::TermFieldMatchData& tmd, uint32_t field_length, uint32_t num_occs) +{ + tmd.setFieldLength(cap_16_bits(field_length)); + tmd.setNumOccs(cap_16_bits(num_occs)); +} + +} + +template <typename HitListType> +void +QueryTerm::unpack_match_data_helper(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data, const HitListType& hit_list, const QueryTerm& fl_term) +{ + (void) fl_term; + if (!hit_list.empty()) { // only unpack if we have a hit + + uint32_t last_field_id = -1; + uint32_t last_field_length = 0; + search::fef::TermFieldMatchData *tmd = nullptr; + uint32_t num_occs = 0; + + // optimize for hitlist giving all hits for a single field in one chunk + for (const auto& hit : hit_list) { + uint32_t field_id = hit.field_id(); + if (field_id != last_field_id) { + if (tmd != nullptr) { + if (tmd->needs_interleaved_features()) { + set_interleaved_features(*tmd, last_field_length, num_occs); + } + // reset to notfound/unknown values + tmd = nullptr; + } + num_occs = 0; + + // setup for new field that had a hit + const search::fef::ITermFieldData *tfd = td.lookupField(field_id); + if (tfd != nullptr) { + tmd = match_data.resolveTermField(tfd->getHandle()); + tmd->setFieldId(field_id); + // reset field match data, but only once per docId + if (tmd->getDocId() != docid) { + tmd->reset(docid); + } + } + last_field_id = field_id; + if constexpr (std::is_same_v<HitList, HitListType>) { + last_field_length = extract_field_length(fl_term, field_id); + } else { + last_field_length = hit.get_field_length(); + } + } + ++num_occs; + if (tmd != nullptr) { + search::fef::TermFieldMatchDataPosition pos(hit.element_id(), hit.position(), + hit.element_weight(), hit.element_length()); + tmd->appendPosition(pos); + } + } + if (tmd != nullptr) { + if (tmd->needs_interleaved_features()) { + set_interleaved_features(*tmd, last_field_length, num_occs); + } + } + } +} + +} diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp index 3449df57513..a54d2adee78 100644 --- a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp @@ -4,7 +4,7 @@ #include "rankprocessor.h" #include <vespa/searchlib/fef/handle.h> #include <vespa/searchlib/fef/simpletermfielddata.h> -#include <vespa/searchlib/query/streaming/multi_term.h> +#include <vespa/searchlib/query/streaming/equiv_query_node.h> #include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h> #include <vespa/vsm/vsm/fieldsearchspec.h> #include <algorithm> @@ -56,6 +56,51 @@ getFeature(const RankProgram &rankProgram) { } void +RankProcessor::resolve_fields_from_children(QueryTermData& qtd, MultiTerm& mt) +{ + vespalib::hash_set<uint32_t> field_ids; + for (auto& subterm : mt.get_terms()) { + vespalib::string expandedIndexName = vsm::FieldSearchSpecMap::stripNonFields(subterm->index()); + const RankManager::View *view = _rankManagerSnapshot->getView(expandedIndexName); + if (view != nullptr) { + for (auto field_id : *view) { + field_ids.insert(field_id); + } + } else { + LOG(warning, "Could not find a view for index '%s'. Ranking no fields.", + getIndexName(subterm->index(), expandedIndexName).c_str()); + } + } + std::vector<uint32_t> sorted_field_ids; + sorted_field_ids.reserve(field_ids.size()); + for (auto field_id : field_ids) { + sorted_field_ids.emplace_back(field_id); + } + std::sort(sorted_field_ids.begin(), sorted_field_ids.end()); + for (auto field_id : sorted_field_ids) { + qtd.getTermData().addField(field_id).setHandle(_mdLayout.allocTermField(field_id)); + } +} + +void +RankProcessor::resolve_fields_from_term(QueryTermData& qtd, search::streaming::QueryTerm& term) +{ + vespalib::string expandedIndexName = vsm::FieldSearchSpecMap::stripNonFields(term.index()); + const RankManager::View *view = _rankManagerSnapshot->getView(expandedIndexName); + if (view != nullptr) { + for (auto field_id : *view) { + qtd.getTermData().addField(field_id).setHandle(_mdLayout.allocTermField(field_id)); + } + } else { + LOG(warning, "Could not find a view for index '%s'. Ranking no fields.", + getIndexName(term.index(), expandedIndexName).c_str()); + } + LOG(debug, "Setup query term '%s:%s'", + getIndexName(term.index(), expandedIndexName).c_str(), + term.getTerm()); +} + +void RankProcessor::initQueryEnvironment() { QueryWrapper::TermList & terms = _query.getTermList(); @@ -75,21 +120,12 @@ RankProcessor::initQueryEnvironment() if (nn_term != nullptr) { qtd.getTermData().set_query_tensor_name(nn_term->get_query_tensor_name()); } - - vespalib::string expandedIndexName = vsm::FieldSearchSpecMap::stripNonFields(term->index()); - const RankManager::View *view = _rankManagerSnapshot->getView(expandedIndexName); - if (view != nullptr) { - for (auto field_id : *view) { - qtd.getTermData().addField(field_id).setHandle(_mdLayout.allocTermField(field_id)); - } + auto* eqn = term->as_equiv_query_node(); + if (eqn != nullptr) { + resolve_fields_from_children(qtd, *eqn); } else { - LOG(warning, "Could not find a view for index '%s'. Ranking no fields.", - getIndexName(term->index(), expandedIndexName).c_str()); + resolve_fields_from_term(qtd, *term); } - - LOG(debug, "Setup query term '%s:%s'", - getIndexName(term->index(), expandedIndexName).c_str(), - term->getTerm()); _queryEnv.addTerm(&qtd.getTermData()); } _rankSetup.prepareSharedState(_queryEnv, _queryEnv.getObjectStore()); diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h index 5651917ce7a..bec70beca77 100644 --- a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h +++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h @@ -16,6 +16,8 @@ namespace streaming { +class QueryTermData; + /** * This class is associated with a query and a rank profile and * is used to calculate rank and feature set for matched documents. @@ -43,6 +45,8 @@ private: HitCollector::UP _hitCollector; std::unique_ptr<RankProgram> _match_features_program; + void resolve_fields_from_children(QueryTermData& qtd, search::streaming::MultiTerm& mt); + void resolve_fields_from_term(QueryTermData& qtd, search::streaming::QueryTerm& term); void initQueryEnvironment(); void initHitCollector(size_t wantedHitCount); void setupRankProgram(search::fef::RankProgram &program); diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index c75ab7fccd3..72807bc6c34 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -3,7 +3,7 @@ #include <vespa/vsm/vsm/fieldsearchspec.h> #include <vespa/document/fieldvalue/arrayfieldvalue.h> #include <vespa/document/fieldvalue/weightedsetfieldvalue.h> -#include <vespa/searchlib/query/streaming/multi_term.h> +#include <vespa/searchlib/query/streaming/equiv_query_node.h> #include <vespa/vespalib/stllike/hash_set.h> #include <cassert> @@ -190,6 +190,39 @@ FieldSearcher::init() } void +FieldIdTSearcherMap::prepare_term(const DocumentTypeIndexFieldMapT& difm, QueryTerm* qt, FieldIdT fid, vespalib::hash_set<const void*>& seen, QueryTermList& onlyInIndex) +{ + auto equiv = qt->as_equiv_query_node(); + if (equiv != nullptr) { + for (auto& subterm : equiv->get_terms()) { + prepare_term(difm, subterm.get(), fid, seen, onlyInIndex); + } + return; + } + for (const auto& doc_type_elem : difm) { + const IndexFieldMapT & fim = doc_type_elem.second; + auto found = fim.find(FieldSearchSpecMap::stripNonFields(qt->index())); + if (found != fim.end()) { + const FieldIdTList & index = found->second; + if ((find(index.begin(), index.end(), fid) != index.end()) && !seen.contains(qt)) { + seen.insert(qt); + auto multi_term = qt->as_multi_term(); + if (multi_term != nullptr) { + for (auto& subterm : multi_term->get_terms()) { + onlyInIndex.emplace_back(subterm.get()); + } + } else { + onlyInIndex.emplace_back(qt); + } + } + } else { + LOG(debug, "Could not find the requested index=%s in the index config map. Query does not fit search definition.", + qt->index().c_str()); + } + } +} + +void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf, Query& query, const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env) @@ -202,27 +235,7 @@ FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, const Share vespalib::hash_set<const void*> seen; FieldIdT fid = searcher->field(); for (auto qt : qtl) { - for (const auto& doc_type_elem : difm) { - const IndexFieldMapT & fim = doc_type_elem.second; - auto found = fim.find(FieldSearchSpecMap::stripNonFields(qt->index())); - if (found != fim.end()) { - const FieldIdTList & index = found->second; - if ((find(index.begin(), index.end(), fid) != index.end()) && !seen.contains(qt)) { - seen.insert(qt); - auto multi_term = qt->as_multi_term(); - if (multi_term != nullptr) { - for (auto& subterm : multi_term->get_terms()) { - onlyInIndex.emplace_back(subterm.get()); - } - } else { - onlyInIndex.emplace_back(qt); - } - } - } else { - LOG(debug, "Could not find the requested index=%s in the index config map. Query does not fit search definition.", - qt->index().c_str()); - } - } + prepare_term(difm, qt, fid, seen, onlyInIndex); } /// Should perhaps do a unique on onlyInIndex searcher->prepare(onlyInIndex, searcherBuf, field_paths, query_env); diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index 6f3ec3e1e73..042e47ef164 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -5,6 +5,7 @@ #include <vespa/searchlib/query/streaming/query.h> #include <vespa/vsm/common/document.h> #include <vespa/vsm/common/storagedocument.h> +#include <vespa/vespalib/stllike/hash_set.h> #include <vespa/vespalib/util/array.h> #include <utility> @@ -122,6 +123,7 @@ using FieldIdTSearcherMapT = std::vector<FieldSearcherContainer>; class FieldIdTSearcherMap : public FieldIdTSearcherMapT { + void prepare_term(const DocumentTypeIndexFieldMapT& difm, search::streaming::QueryTerm* qt, FieldIdT fid, vespalib::hash_set<const void*>& seen, search::streaming::QueryTermList& onlyInIndex); public: void prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf, search::streaming::Query& query, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 3ae4794e33f..c596b46a774 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "fieldsearchspec.h" +#include <vespa/searchlib/query/streaming/equiv_query_node.h> #include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vsm/searcher/boolfieldsearcher.h> #include <vespa/vsm/searcher/floatfieldsearcher.h> @@ -222,7 +223,14 @@ FieldSearchSpecMap::buildFieldsInQuery(const Query & query) const query.getLeaves(qtl); for (const auto & term : qtl) { - addFieldsFromIndex(term->index(), fieldsInQuery); + auto equiv = term->as_equiv_query_node(); + if (equiv != nullptr) { + for (const auto& subterm : equiv->get_terms()) { + addFieldsFromIndex(subterm->index(), fieldsInQuery); + } + } else { + addFieldsFromIndex(term->index(), fieldsInQuery); + } } return fieldsInQuery; } |