diff options
author | Tor Egge <Tor.Egge@online.no> | 2024-01-22 16:39:49 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2024-01-22 16:39:49 +0100 |
commit | b3a2230c45de8f0698dcbb93bd7a46422ed16731 (patch) | |
tree | a6bedb35e878ea210abfaeac9ad0f7f66b01e17f | |
parent | ccda952db487445f3522eecbcbfee4a6f6a90c32 (diff) |
Add hit iterator pack and use it for phrase search in streaming mode.
9 files changed, 366 insertions, 51 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index 5628db99171..7bec70c00cb 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -190,6 +190,7 @@ vespa_define_module( src/tests/postinglistbm src/tests/predicate src/tests/query + src/tests/query/streaming src/tests/queryeval src/tests/queryeval/blueprint src/tests/queryeval/dot_product diff --git a/searchlib/src/tests/query/streaming/CMakeLists.txt b/searchlib/src/tests/query/streaming/CMakeLists.txt new file mode 100644 index 00000000000..d40250fbc3f --- /dev/null +++ b/searchlib/src/tests/query/streaming/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_executable(searchlib_query_streaming_hit_iterator_test_app TEST + SOURCES + hit_iterator_test.cpp + DEPENDS + searchlib + GTest::gtest +) +vespa_add_test(NAME searchlib_query_streaming_hit_iterator_test_app COMMAND searchlib_query_streaming_hit_iterator_test_app) + +vespa_add_executable(searchlib_query_streaming_hit_iterator_pack_test_app TEST + SOURCES + hit_iterator_pack_test.cpp + DEPENDS + searchlib + GTest::gtest +) +vespa_add_test(NAME searchlib_query_streaming_hit_iterator_pack_test_app COMMAND searchlib_query_streaming_hit_iterator_pack_test_app) diff --git a/searchlib/src/tests/query/streaming/hit_iterator_pack_test.cpp b/searchlib/src/tests/query/streaming/hit_iterator_pack_test.cpp new file mode 100644 index 00000000000..7d7d8307920 --- /dev/null +++ b/searchlib/src/tests/query/streaming/hit_iterator_pack_test.cpp @@ -0,0 +1,44 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/query/streaming/hit_iterator_pack.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::streaming::HitIterator; +using search::streaming::HitIteratorPack; +using search::streaming::QueryNodeList; +using search::streaming::QueryTerm; +using search::streaming::QueryNodeResultBase; + +using FieldElement = HitIterator::FieldElement; + +TEST(HitIteratorPackTest, seek_to_matching_field_element) +{ + QueryNodeList qnl; + auto qt = std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), "7", "", QueryTerm::Type::WORD); + qt->add(11, 0, 10, 0); + qt->add(11, 0, 10, 5); + qt->add(11, 1, 12, 0); + qt->add(11, 1, 12, 0); + qt->add(12, 1, 13, 0); + qt->add(12, 1, 13, 0); + qnl.emplace_back(std::move(qt)); + qt = std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), "8", "", QueryTerm::Type::WORD); + qt->add(2, 0, 4, 0); + qt->add(11, 0, 10, 0); + qt->add(12, 1, 13, 0); + qt->add(12, 2, 14, 0); + qnl.emplace_back(std::move(qt)); + HitIteratorPack itr_pack(qnl); + EXPECT_TRUE(itr_pack.all_valid()); + EXPECT_TRUE(itr_pack.seek_to_matching_field_element()); + EXPECT_EQ(FieldElement(11, 0), itr_pack.get_field_element_ref()); + EXPECT_TRUE(itr_pack.seek_to_matching_field_element()); + EXPECT_EQ(FieldElement(11, 0), itr_pack.get_field_element_ref()); + ++itr_pack.get_field_element_ref().second; + EXPECT_TRUE(itr_pack.seek_to_matching_field_element()); + EXPECT_EQ(FieldElement(12, 1), itr_pack.get_field_element_ref()); + ++itr_pack.get_field_element_ref().second; + EXPECT_FALSE(itr_pack.seek_to_matching_field_element()); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/query/streaming/hit_iterator_test.cpp b/searchlib/src/tests/query/streaming/hit_iterator_test.cpp new file mode 100644 index 00000000000..a9588ea3d6c --- /dev/null +++ b/searchlib/src/tests/query/streaming/hit_iterator_test.cpp @@ -0,0 +1,122 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/query/streaming/hit_iterator.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::streaming::Hit; +using search::streaming::HitList; +using search::streaming::HitIterator; + +using FieldElement = HitIterator::FieldElement; + +namespace { + +HitList +make_hit_list() +{ + HitList hl; + hl.emplace_back(11, 0, 10, 0); + hl.emplace_back(11, 0, 10, 5); + hl.emplace_back(11, 1, 12, 0); + hl.emplace_back(11, 1, 12, 7); + hl.emplace_back(12, 1, 13, 0); + hl.emplace_back(12, 1, 13, 9); + return hl; +} + +void +check_seek_to_field_elem(HitIterator& it, const FieldElement& field_element, const Hit* exp_ptr, const vespalib::string& label) +{ + SCOPED_TRACE(label); + EXPECT_TRUE(it.seek_to_field_element(field_element)); + EXPECT_TRUE(it.valid()); + EXPECT_EQ(exp_ptr, &*it); +} + +void +check_seek_to_field_elem_failure(HitIterator& it, const FieldElement& field_element, const vespalib::string& label) +{ + SCOPED_TRACE(label); + EXPECT_FALSE(it.seek_to_field_element(field_element)); + EXPECT_FALSE(it.valid()); +} + +void +check_step_in_field_element(HitIterator& it, FieldElement& field_element, bool exp_success, const Hit* exp_ptr, const vespalib::string& label) +{ + SCOPED_TRACE(label); + EXPECT_EQ(exp_success, it.step_in_field_element(field_element)); + if (exp_ptr) { + EXPECT_TRUE(it.valid()); + EXPECT_EQ(it.get_field_element(), field_element); + EXPECT_EQ(exp_ptr, &*it); + } else { + EXPECT_FALSE(it.valid()); + } +} + +void +check_seek_in_field_element(HitIterator& it, uint32_t position, FieldElement& field_element, bool exp_success, const Hit* exp_ptr, const vespalib::string& label) +{ + SCOPED_TRACE(label); + EXPECT_EQ(exp_success, it.seek_in_field_element(position, field_element)); + if (exp_ptr) { + EXPECT_TRUE(it.valid()); + EXPECT_EQ(it.get_field_element(), field_element); + EXPECT_EQ(exp_ptr, &*it); + } else { + EXPECT_FALSE(it.valid()); + } +} + +} + +TEST(HitITeratorTest, seek_to_field_element) +{ + auto hl = make_hit_list(); + HitIterator it(hl); + EXPECT_TRUE(it.valid()); + EXPECT_EQ(&hl[0], &*it); + check_seek_to_field_elem(it, FieldElement(0, 0), &hl[0], "(0, 0)"); + check_seek_to_field_elem(it, FieldElement(11, 0), &hl[0], "(11, 0)"); + check_seek_to_field_elem(it, FieldElement(11, 1), &hl[2], "(11, 1)"); + check_seek_to_field_elem(it, FieldElement(11, 2), &hl[4], "(11, 2)"); + check_seek_to_field_elem(it, FieldElement(12, 0), &hl[4], "(12, 0)"); + check_seek_to_field_elem(it, FieldElement(12, 1), &hl[4], "(12, 1)"); + check_seek_to_field_elem_failure(it, FieldElement(12, 2), "(12, 2)"); + check_seek_to_field_elem_failure(it, FieldElement(13, 0), "(13, 0)"); +} + +TEST(HitIteratorTest, step_in_field_element) +{ + auto hl = make_hit_list(); + HitIterator it(hl); + auto field_element = it.get_field_element(); + check_step_in_field_element(it, field_element, true, &hl[1], "1"); + check_step_in_field_element(it, field_element, false, &hl[2], "2"); + check_step_in_field_element(it, field_element, true, &hl[3], "3"); + check_step_in_field_element(it, field_element, false, &hl[4], "4"); + check_step_in_field_element(it, field_element, true, &hl[5], "5"); + check_step_in_field_element(it, field_element, false, nullptr, "end"); +} + +TEST(hitIteratorTest, seek_in_field_elem) +{ + auto hl = make_hit_list(); + HitIterator it(hl); + auto field_element = it.get_field_element(); + check_seek_in_field_element(it, 0, field_element, true, &hl[0], "0a"); + check_seek_in_field_element(it, 2, field_element, true, &hl[1], "2"); + check_seek_in_field_element(it, 5, field_element, true, &hl[1], "5"); + check_seek_in_field_element(it, 6, field_element, false, &hl[2], "6"); + check_seek_in_field_element(it, 0, field_element, true, &hl[2], "0b"); + check_seek_in_field_element(it, 1, field_element, true, &hl[3], "1"); + check_seek_in_field_element(it, 7, field_element, true, &hl[3], "7"); + check_seek_in_field_element(it, 8, field_element, false, &hl[4], "8"); + check_seek_in_field_element(it, 0, field_element, true, &hl[4], "0c"); + check_seek_in_field_element(it, 3, field_element, true, &hl[5], "3"); + check_seek_in_field_element(it, 9, field_element, true, &hl[5], "9"); + check_seek_in_field_element(it, 10, field_element, false, nullptr, "end"); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt index 76119a6d58f..91b8c429800 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt @@ -3,6 +3,7 @@ vespa_add_library(searchlib_query_streaming OBJECT SOURCES dot_product_term.cpp fuzzy_term.cpp + hit_iterator_pack.cpp in_term.cpp multi_term.cpp nearest_neighbor_query_node.cpp diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator.h b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator.h new file mode 100644 index 00000000000..14fd19bba2a --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator.h @@ -0,0 +1,59 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "hit.h" + +namespace search::streaming { + +/* + * Iterator used over hit list for a term to support near, onear, phrase and + * same element query nodes. + */ +class HitIterator +{ + HitList::const_iterator _cur; + HitList::const_iterator _end; +public: + using FieldElement = std::pair<uint32_t, uint32_t>; + HitIterator(const HitList& hl) noexcept + : _cur(hl.begin()), + _end(hl.end()) + { } + bool valid() const noexcept { return _cur != _end; } + const Hit* operator->() const noexcept { return _cur.operator->(); } + const Hit& operator*() const noexcept { return _cur.operator*(); } + FieldElement get_field_element() const noexcept { return std::make_pair(_cur->field_id(), _cur->element_id()); } + bool seek_to_field_element(const FieldElement& field_element) noexcept { + while (valid()) { + if (!(get_field_element() < field_element)) { + return true; + } + ++_cur; + } + return false; + } + bool step_in_field_element(FieldElement& field_element) noexcept { + ++_cur; + if (!valid()) { + return false; + } + auto ife = get_field_element(); + if (field_element < ife) { + field_element = ife; + return false; + } + return true; + } + bool seek_in_field_element(uint32_t word_pos, FieldElement& field_element) { + while (_cur->position() < word_pos) { + if (!step_in_field_element(field_element)) { + return false; + } + } + return true; + } + HitIterator& operator++() { ++_cur; return *this; } +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.cpp b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.cpp new file mode 100644 index 00000000000..fabd992c379 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.cpp @@ -0,0 +1,57 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "hit_iterator_pack.h" + +namespace search::streaming { + + +HitIteratorPack::HitIteratorPack(const QueryNodeList& children) + : _iterators(), + _field_element(std::make_pair(0, 0)) +{ + auto num_children = children.size(); + _iterators.reserve(num_children); + for (auto& child : children) { + auto& curr = dynamic_cast<const QueryTerm&>(*child); + _iterators.emplace_back(curr.getHitList()); + } +} + +HitIteratorPack::~HitIteratorPack() = default; + +bool +HitIteratorPack::all_valid() const noexcept +{ + if (_iterators.empty()) { + return false; + } + for (auto& it : _iterators) { + if (!it.valid()) { + return false; + } + } + return true; +} + +bool +HitIteratorPack::seek_to_matching_field_element() noexcept +{ + bool retry = true; + while (retry) { + retry = false; + for (auto& it : _iterators) { + if (!it.seek_to_field_element(_field_element)) { + return false; + } + auto ife = it.get_field_element(); + if (_field_element < ife) { + _field_element = ife; + retry = true; + break; + } + } + } + return true; +} + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h new file mode 100644 index 00000000000..200d579930a --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/hit_iterator_pack.h @@ -0,0 +1,32 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "hit_iterator.h" +#include "queryterm.h" + +namespace search::streaming { + +/* + * Iterator pack used over hit list for a term to support near, onear, + * phrase and same element query nodes. + */ +class HitIteratorPack +{ + using iterator = typename std::vector<HitIterator>::iterator; + using FieldElement = HitIterator::FieldElement; + std::vector<HitIterator> _iterators; + FieldElement _field_element; +public: + explicit HitIteratorPack(const QueryNodeList& children); + ~HitIteratorPack(); + FieldElement& get_field_element_ref() noexcept { return _field_element; } + HitIterator& front() noexcept { return _iterators.front(); } + HitIterator& back() noexcept { return _iterators.back(); } + iterator begin() noexcept { return _iterators.begin(); } + iterator end() noexcept { return _iterators.end(); } + bool all_valid() const noexcept; + bool seek_to_matching_field_element() noexcept; +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/query.cpp b/searchlib/src/vespa/searchlib/query/streaming/query.cpp index 196de23c236..c58e1e62e57 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/query.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/query.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "query.h" +#include "hit_iterator_pack.h" #include <vespa/searchlib/parsequery/stackdumpiterator.h> #include <vespa/vespalib/objects/visit.hpp> #include <cassert> @@ -238,66 +239,45 @@ PhraseQueryNode::addChild(QueryNode::UP child) { AndQueryNode::addChild(std::move(child)); } -namespace { - -// TODO: Remove when rewriting PhraseQueryNode::evaluateHits -uint32_t legacy_pos(const Hit& hit) { - return ((hit.position() & 0xffffff) | ((hit.field_id() & 0xff) << 24)); -} - -} - const HitList & PhraseQueryNode::evaluateHits(HitList & hl) const { hl.clear(); _fieldInfo.clear(); - if ( ! AndQueryNode::evaluate()) return hl; - - HitList tmpHL; - const auto & children = getChildren(); - unsigned int fullPhraseLen = children.size(); - unsigned int currPhraseLen = 0; - std::vector<unsigned int> indexVector(fullPhraseLen, 0); - auto curr = static_cast<const QueryTerm *> (children[currPhraseLen].get()); - bool exhausted( curr->evaluateHits(tmpHL).empty()); - for (; !exhausted; ) { - auto next = static_cast<const QueryTerm *>(children[currPhraseLen+1].get()); - unsigned int & currIndex = indexVector[currPhraseLen]; - unsigned int & nextIndex = indexVector[currPhraseLen+1]; - - const auto & currHit = curr->evaluateHits(tmpHL)[currIndex]; - size_t firstPosition = legacy_pos(currHit); - uint32_t currElemId = currHit.element_id(); - uint32_t curr_field_id = currHit.field_id(); - - const HitList & nextHL = next->evaluateHits(tmpHL); - - int diff(0); - size_t nextIndexMax = nextHL.size(); - while ((nextIndex < nextIndexMax) && - ((nextHL[nextIndex].field_id() < curr_field_id) || - ((nextHL[nextIndex].field_id() == curr_field_id) && (nextHL[nextIndex].element_id() <= currElemId))) && - ((diff = legacy_pos(nextHL[nextIndex])-firstPosition) < 1)) - { - nextIndex++; - } - if ((diff == 1) && (nextHL[nextIndex].field_id() == curr_field_id) && (nextHL[nextIndex].element_id() == currElemId)) { - currPhraseLen++; - if ((currPhraseLen+1) == fullPhraseLen) { - Hit h = nextHL[indexVector[currPhraseLen]]; + HitIteratorPack itr_pack(getChildren()); + if (!itr_pack.all_valid()) { + return hl; + } + auto& last_child = dynamic_cast<const QueryTerm&>(*(*this)[size() - 1]); + while (itr_pack.seek_to_matching_field_element()) { + uint32_t first_position = itr_pack.front()->position(); + bool retry_element = true; + while (retry_element) { + uint32_t position_offset = 0; + bool match = true; + for (auto& it : itr_pack) { + if (!it.seek_in_field_element(first_position + position_offset, itr_pack.get_field_element_ref())) { + retry_element = false; + match = false; + break; + } + if (it->position() > first_position + position_offset) { + first_position = it->position() - position_offset; + match = false; + break; + } + ++position_offset; + } + if (match) { + auto h = *itr_pack.back(); hl.push_back(h); - const QueryTerm::FieldInfo & fi = next->getFieldInfo(h.field_id()); + auto& fi = last_child.getFieldInfo(h.field_id()); updateFieldInfo(h.field_id(), hl.size() - 1, fi.getFieldLength()); - currPhraseLen = 0; - indexVector[0]++; + if (!itr_pack.front().step_in_field_element(itr_pack.get_field_element_ref())) { + retry_element = false; + } } - } else { - currPhraseLen = 0; - indexVector[currPhraseLen]++; } - curr = static_cast<const QueryTerm *>(children[currPhraseLen].get()); - exhausted = (nextIndex >= nextIndexMax) || (indexVector[currPhraseLen] >= curr->evaluateHits(tmpHL).size()); } return hl; } |