diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-12-07 14:36:53 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-12-07 14:36:53 +0100 |
commit | 6fdfffbd6c8e205c67906cc5f7a5df9002be4a27 (patch) | |
tree | 69e9b2cee095783c86c7766674529b37057f4576 | |
parent | 10a580357998b4c750729f27d3ef2e224dd69af7 (diff) |
Add MultiTerm and InTerm for streaming search.
11 files changed, 239 insertions, 2 deletions
diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp index 020afc484e0..3c399f40f0a 100644 --- a/searchlib/src/tests/query/streaming_query_test.cpp +++ b/searchlib/src/tests/query/streaming_query_test.cpp @@ -1,10 +1,14 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/searchlib/fef/simpletermdata.h> +#include <vespa/searchlib/fef/matchdata.h> +#include <vespa/searchlib/query/streaming/in_term.h> #include <vespa/searchlib/query/streaming/query.h> #include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h> #include <vespa/searchlib/query/tree/querybuilder.h> #include <vespa/searchlib/query/tree/simplequery.h> #include <vespa/searchlib/query/tree/stackdumpcreator.h> +#include <vespa/searchlib/query/tree/string_term_vector.h> #include <vespa/vespalib/gtest/gtest.h> #include <limits> #include <cmath> @@ -13,6 +17,8 @@ using namespace search; using namespace search::query; using namespace search::streaming; using TermType = QueryTerm::Type; +using search::fef::SimpleTermData; +using search::fef::MatchData; void assertHit(const Hit & h, size_t expWordpos, size_t expContext, int32_t weight) { EXPECT_EQ(h.wordpos(), expWordpos); @@ -867,6 +873,28 @@ TEST(StreamingQueryTest, test_nearest_neighbor_query_node) EXPECT_FALSE(node->evaluate()); } +TEST(StreamingQueryTest, test_in_term) +{ + auto term_vector = std::make_unique<StringTermVector>(1); + term_vector->addTerm("7"); + search::streaming::InTerm term({}, "index", TermType::WORD, std::move(term_vector)); + SimpleTermData td; + td.addField(10); + td.addField(11); + td.addField(12); + td.lookupField(10)->setHandle(0); + td.lookupField(12)->setHandle(1); + auto& q = *term.get_terms().front(); + q.add(0, 11, 0, 1); + q.add(0, 12, 0, 1); + MatchData md(MatchData::params().numTermFields(2)); + term.unpack_match_data(23, td, md); + auto tmd0 = md.resolveTermField(0); + EXPECT_NE(23, tmd0->getDocId()); + auto tmd2 = md.resolveTermField(1); + EXPECT_EQ(23, tmd2->getDocId()); +} + TEST(StreamingQueryTest, control_the_size_of_query_terms) { EXPECT_EQ(112u, sizeof(QueryTermSimple)); diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt index 4c51a0a1e0a..7f57f4d15d1 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt @@ -1,6 +1,8 @@ # Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_query_streaming OBJECT SOURCES + in_term.cpp + multi_term.cpp nearest_neighbor_query_node.cpp query.cpp querynode.cpp diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp new file mode 100644 index 00000000000..f3c96fb9502 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp @@ -0,0 +1,52 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "in_term.h" +#include <vespa/searchlib/fef/itermdata.h> +#include <vespa/searchlib/fef/matchdata.h> +#include <vespa/searchlib/query/tree/term_vector.h> +#include <vespa/vespalib/stllike/hash_set.h> + +using search::fef::ITermData; +using search::fef::MatchData; +using search::query::TermVector; + +namespace search::streaming { + +InTerm::InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, std::unique_ptr<TermVector> terms) + : MultiTerm(std::move(result_base), index, type, std::move(terms)) +{ +} + +InTerm::~InTerm() = default; + +void +InTerm::unpack_match_data(uint32_t docid, const ITermData& td, MatchData& match_data) +{ + vespalib::hash_set<uint32_t> matching_field_ids; + HitList hl_store; + std::optional<uint32_t> prev_field_id; + for (const auto& term : _terms) { + auto& hl = term->evaluateHits(hl_store); + for (auto& hit : hl) { + if (!prev_field_id.has_value() || prev_field_id.value() != hit.context()) { + prev_field_id = hit.context(); + matching_field_ids.insert(hit.context()); + } + } + } + auto num_fields = td.numFields(); + for (uint32_t field_idx = 0; field_idx < num_fields; ++field_idx) { + auto& tfd = td.field(field_idx); + auto field_id = tfd.getFieldId(); + if (matching_field_ids.contains(field_id)) { + auto handle = tfd.getHandle(); + if (handle != fef::IllegalHandle) { + auto tmd = match_data.resolveTermField(tfd.getHandle()); + tmd->setFieldId(field_id); + tmd->reset(docid); + } + } + } +} + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.h b/searchlib/src/vespa/searchlib/query/streaming/in_term.h new file mode 100644 index 00000000000..00a57e61d30 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.h @@ -0,0 +1,19 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "multi_term.h" + +namespace search::streaming { + +/* + * Representation of YQL in operator for streaming search. + */ +class InTerm : public MultiTerm { +public: + InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string& index, Type type, std::unique_ptr<query::TermVector> terms); + ~InTerm() override; + void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override; +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp new file mode 100644 index 00000000000..ad5857b8c41 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp @@ -0,0 +1,66 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "multi_term.h" +#include <vespa/searchlib/query/tree/term_vector.h> + +using search::fef::ITermData; +using search::fef::MatchData; +using search::query::TermVector; + +namespace search::streaming { + +MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, uint32_t num_terms) + : QueryTerm(std::move(result_base), "", index, type), + _terms() +{ + _terms.reserve(num_terms); +} + +MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, std::unique_ptr<TermVector> terms) + : MultiTerm(std::move(result_base), index, type, terms->size()) +{ + auto num_terms = terms->size(); + for (uint32_t i = 0; i < num_terms; ++i) { + add_term(std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), terms->getAsString(i).first, "", QueryTermSimple::Type::WORD)); + } +} + +MultiTerm::~MultiTerm() = default; + +void +MultiTerm::add_term(std::unique_ptr<QueryTerm> term) +{ + _terms.emplace_back(std::move(term)); +} + +MultiTerm* +MultiTerm::as_multi_term() noexcept +{ + return this; +} + +void +MultiTerm::reset() +{ + for (auto& term : _terms) { + term->reset(); + } +} + +bool +MultiTerm::evaluate() const +{ + for (const auto& term : _terms) { + if (term->evaluate()) return true; + } + return false; +} + +const HitList& +MultiTerm::evaluateHits(HitList& hl) const +{ + hl.clear(); + return hl; +} + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h new file mode 100644 index 00000000000..4c3f1ea5b5a --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h @@ -0,0 +1,39 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "queryterm.h" + +namespace search::fef { + +class ITermData; +class MatchData; + +} + +namespace search::query { class TermVector; } + +namespace search::streaming { + +/* + * Base class for query term nodes that are considered leaf nodes by + * the ranking framework but still have multiple terms used for + * search. + */ +class MultiTerm : public QueryTerm { +protected: + std::vector<std::unique_ptr<QueryTerm>> _terms; +public: + MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, uint32_t num_terms); + MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, std::unique_ptr<query::TermVector> terms); + ~MultiTerm() override; + void add_term(std::unique_ptr<QueryTerm> term); + MultiTerm* as_multi_term() noexcept override; + void reset() override; + bool evaluate() const override; + const HitList& evaluateHits(HitList& hl) const override; + virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) = 0; + const std::vector<std::unique_ptr<QueryTerm>>& get_terms() const noexcept { return _terms; } +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 8e67ed5f0d3..e38dce4cc78 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -3,6 +3,8 @@ #include "query.h" #include "nearest_neighbor_query_node.h" #include <vespa/searchlib/parsequery/stackdumpiterator.h> +#include <vespa/searchlib/query/streaming/in_term.h> +#include <vespa/searchlib/query/tree/term_vector.h> #include <charconv> #include <vespa/log/log.h> LOG_SETUP(".vsm.querynode"); @@ -178,6 +180,10 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor } } break; + case ParseItem::ITEM_STRING_IN: + case ParseItem::ITEM_NUMERIC_IN: + qn = std::make_unique<InTerm>(factory.create(), queryRep.getIndexName(), QueryTermSimple::Type::WORD, queryRep.get_terms()); + break; default: { for (uint32_t skipCount = arity; (skipCount > 0) && queryRep.next(); skipCount--) { diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index 82f2c2906e0..9c45427d07d 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -98,4 +98,10 @@ QueryTerm::as_nearest_neighbor_query_node() noexcept return nullptr; } +MultiTerm* +QueryTerm::as_multi_term() noexcept +{ + return nullptr; +} + } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 96d83982cbb..6e91437b1f9 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -12,6 +12,7 @@ namespace search::streaming { class NearestNeighborQueryNode; +class MultiTerm; /** This is a leaf in the Query tree. All terms are leafs. @@ -89,6 +90,7 @@ public: void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; } void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; } virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept; + virtual MultiTerm* as_multi_term() noexcept; protected: using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>; string _index; diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp index 8805cc5b3ec..6b15b7cb88e 100644 --- a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp @@ -4,6 +4,7 @@ #include "rankprocessor.h" #include <vespa/searchlib/fef/handle.h> #include <vespa/searchlib/fef/simpletermfielddata.h> +#include <vespa/searchlib/query/streaming/multi_term.h> #include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h> #include <vespa/vsm/vsm/fieldsearchspec.h> #include <algorithm> @@ -26,6 +27,7 @@ using search::fef::TermFieldMatchData; using search::fef::TermFieldMatchDataPosition; using search::streaming::Hit; using search::streaming::HitList; +using search::streaming::MultiTerm; using search::streaming::Query; using search::streaming::QueryTerm; using search::streaming::QueryTermList; @@ -273,6 +275,10 @@ RankProcessor::unpack_match_data(uint32_t docid, MatchData &matchData, QueryWrap tmd->setRawScore(docid, raw_score.value()); } } + } else if (auto multi_term = term.getTerm()->as_multi_term()) { + auto& qtd = static_cast<QueryTermData &>(term.getTerm()->getQueryItem()); + auto& td = qtd.getTermData(); + multi_term->unpack_match_data(docid, td, matchData); } else if (!term.isPhraseTerm() || term.isFirstPhraseTerm()) { // consider 1 term data per phrase bool isPhrase = term.isFirstPhraseTerm(); QueryTermData & qtd = static_cast<QueryTermData &>(term.getTerm()->getQueryItem()); diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index c11f9e839cf..c797e6751ee 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -3,6 +3,8 @@ #include <vespa/vsm/vsm/fieldsearchspec.h> #include <vespa/document/fieldvalue/arrayfieldvalue.h> #include <vespa/document/fieldvalue/weightedsetfieldvalue.h> +#include <vespa/searchlib/query/streaming/multi_term.h> +#include <vespa/vespalib/stllike/hash_set.h> #include <vespa/log/log.h> LOG_SETUP(".vsm.searcher.fieldsearcher"); @@ -230,6 +232,7 @@ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, vespalib::string tmp; for (auto& searcher : *this) { QueryTermList onlyInIndex; + vespalib::hash_set<const void*> seen; FieldIdT fid = searcher->field(); for (auto qt : qtl) { for (const auto& doc_type_elem : difm) { @@ -237,8 +240,16 @@ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, auto found = fim.find(FieldSearchSpecMap::stripNonFields(qt->index())); if (found != fim.end()) { const FieldIdTList & index = found->second; - if ((find(index.begin(), index.end(), fid) != index.end()) && (find(onlyInIndex.begin(), onlyInIndex.end(), qt) == onlyInIndex.end())) { - onlyInIndex.push_back(qt); + if ((find(index.begin(), index.end(), fid) != index.end()) && !seen.contains(qt)) { + seen.insert(qt); + auto multi_term = qt->as_multi_term(); + if (multi_term != nullptr) { + for (auto& subterm : multi_term->get_terms()) { + onlyInIndex.emplace_back(subterm.get()); + } + } else { + onlyInIndex.emplace_back(qt); + } } } else { LOG(debug, "Could not find the requested index=%s in the index config map. Query does not fit search definition.", |