summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Egge <tegge@vespa.ai>2023-12-07 16:17:31 +0100
committerGitHub <noreply@github.com>2023-12-07 16:17:31 +0100
commit1eee8a86f9b8dc4f65c3931c4fba7f6fbb26954e (patch)
treebe468c0178f209aca4adc5fd54ffd80e357d5707
parent6b7fccbf8133a77bac22ea2b7a38fd349724a8c4 (diff)
parent3d8eaf0f2256e5bb559bad3182cf77f190357ad7 (diff)
Merge pull request #29582 from vespa-engine/toregge/add-multi-term-and-in-term-for-streaming-search
Add MultiTerm and InTerm for streaming search.
-rw-r--r--searchlib/src/tests/query/streaming_query_test.cpp30
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt2
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/in_term.cpp52
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/in_term.h19
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp66
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/multi_term.h39
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/querynode.cpp6
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp6
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.h2
-rw-r--r--streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp6
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp15
11 files changed, 241 insertions, 2 deletions
diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp
index 020afc484e0..02416d0fa56 100644
--- a/searchlib/src/tests/query/streaming_query_test.cpp
+++ b/searchlib/src/tests/query/streaming_query_test.cpp
@@ -1,10 +1,14 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/searchlib/fef/simpletermdata.h>
+#include <vespa/searchlib/fef/matchdata.h>
+#include <vespa/searchlib/query/streaming/in_term.h>
#include <vespa/searchlib/query/streaming/query.h>
#include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h>
#include <vespa/searchlib/query/tree/querybuilder.h>
#include <vespa/searchlib/query/tree/simplequery.h>
#include <vespa/searchlib/query/tree/stackdumpcreator.h>
+#include <vespa/searchlib/query/tree/string_term_vector.h>
#include <vespa/vespalib/gtest/gtest.h>
#include <limits>
#include <cmath>
@@ -13,6 +17,8 @@ using namespace search;
using namespace search::query;
using namespace search::streaming;
using TermType = QueryTerm::Type;
+using search::fef::SimpleTermData;
+using search::fef::MatchData;
void assertHit(const Hit & h, size_t expWordpos, size_t expContext, int32_t weight) {
EXPECT_EQ(h.wordpos(), expWordpos);
@@ -867,6 +873,30 @@ TEST(StreamingQueryTest, test_nearest_neighbor_query_node)
EXPECT_FALSE(node->evaluate());
}
+TEST(StreamingQueryTest, test_in_term) // Checks that InTerm::unpack_match_data() marks only the field that both received a hit and had a handle set in this test.
+{
+ auto term_vector = std::make_unique<StringTermVector>(1);
+ term_vector->addTerm("7");
+ search::streaming::InTerm term({}, "index", std::move(term_vector));
+ SimpleTermData td;
+ td.addField(10);
+ td.addField(11); // setHandle() is never called for field 11 below
+ td.addField(12);
+ td.lookupField(10)->setHandle(0);
+ td.lookupField(12)->setHandle(1);
+ EXPECT_FALSE(term.evaluate()); // no hit has been added to any subterm yet
+ auto& q = *term.get_terms().front(); // the subterm built from the one entry added to term_vector
+ q.add(0, 11, 0, 1); // presumably the second argument is the hit context (field id) -- TODO confirm against QueryTerm::add
+ q.add(0, 12, 0, 1);
+ EXPECT_TRUE(term.evaluate());
+ MatchData md(MatchData::params().numTermFields(2));
+ term.unpack_match_data(23, td, md);
+ auto tmd0 = md.resolveTermField(0); // handle 0 was assigned to field 10, which got no hit
+ EXPECT_NE(23, tmd0->getDocId());
+ auto tmd2 = md.resolveTermField(1); // handle 1 was assigned to field 12, which got a hit
+ EXPECT_EQ(23, tmd2->getDocId());
+}
+
TEST(StreamingQueryTest, control_the_size_of_query_terms)
{
EXPECT_EQ(112u, sizeof(QueryTermSimple));
diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
index 4c51a0a1e0a..7f57f4d15d1 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
@@ -1,6 +1,8 @@
# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(searchlib_query_streaming OBJECT
SOURCES
+ in_term.cpp
+ multi_term.cpp
nearest_neighbor_query_node.cpp
query.cpp
querynode.cpp
diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp
new file mode 100644
index 00000000000..36303d4e991
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp
@@ -0,0 +1,52 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "in_term.h"
+#include <vespa/searchlib/fef/itermdata.h>
+#include <vespa/searchlib/fef/matchdata.h>
+#include <vespa/searchlib/query/tree/term_vector.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+
+using search::fef::ITermData;
+using search::fef::MatchData;
+using search::query::TermVector;
+
+namespace search::streaming {
+
+InTerm::InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, std::unique_ptr<TermVector> terms) // Forwards all arguments to MultiTerm, always with Type::WORD.
+ : MultiTerm(std::move(result_base), index, Type::WORD, std::move(terms))
+{
+}
+
+InTerm::~InTerm() = default; // declared in in_term.h, defined here
+
+void
+InTerm::unpack_match_data(uint32_t docid, const ITermData& td, MatchData& match_data) // For each field in td that has a hit from any subterm and a legal handle: set the field id and reset the term field match data to docid.
+{
+ vespalib::hash_set<uint32_t> matching_field_ids; // hit contexts collected from all subterms; compared against field ids below
+ HitList hl_store; // scratch list handed to evaluateHits()
+ std::optional<uint32_t> prev_field_id; // most recently inserted context; avoids a set insert while consecutive hits share the same context
+ for (const auto& term : _terms) {
+ auto& hl = term->evaluateHits(hl_store);
+ for (auto& hit : hl) {
+ if (!prev_field_id.has_value() || prev_field_id.value() != hit.context()) {
+ prev_field_id = hit.context();
+ matching_field_ids.insert(hit.context());
+ }
+ }
+ }
+ auto num_fields = td.numFields();
+ for (uint32_t field_idx = 0; field_idx < num_fields; ++field_idx) {
+ auto& tfd = td.field(field_idx);
+ auto field_id = tfd.getFieldId();
+ if (matching_field_ids.contains(field_id)) {
+ auto handle = tfd.getHandle();
+ if (handle != fef::IllegalHandle) { // fields whose handle is fef::IllegalHandle are skipped
+ auto tmd = match_data.resolveTermField(tfd.getHandle()); // NOTE(review): getHandle() is called again here although `handle` was read just above
+ tmd->setFieldId(field_id);
+ tmd->reset(docid);
+ }
+ }
+ }
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.h b/searchlib/src/vespa/searchlib/query/streaming/in_term.h
new file mode 100644
index 00000000000..7d03ed989c7
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.h
@@ -0,0 +1,19 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "multi_term.h"
+
+namespace search::streaming {
+
+/*
+ * Representation of the YQL 'in' operator for streaming search; a MultiTerm that provides unpack_match_data().
+ */
+class InTerm : public MultiTerm {
+public:
+ InTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string& index, std::unique_ptr<query::TermVector> terms); // all arguments are handed on to the MultiTerm constructor
+ ~InTerm() override;
+ void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) override; // implements the pure virtual declared in MultiTerm
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp
new file mode 100644
index 00000000000..ad5857b8c41
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp
@@ -0,0 +1,66 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "multi_term.h"
+#include <vespa/searchlib/query/tree/term_vector.h>
+
+using search::fef::ITermData;
+using search::fef::MatchData;
+using search::query::TermVector;
+
+namespace search::streaming {
+
+MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, uint32_t num_terms) // Creates a MultiTerm without subterms; num_terms is only used to reserve space in _terms.
+ : QueryTerm(std::move(result_base), "", index, type), // the second QueryTerm argument is passed as an empty string
+ _terms()
+{
+ _terms.reserve(num_terms);
+}
+
+MultiTerm::MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, std::unique_ptr<TermVector> terms) // Creates one subterm per entry in terms. NOTE(review): terms is dereferenced without a null check.
+ : MultiTerm(std::move(result_base), index, type, terms->size())
+{
+ auto num_terms = terms->size();
+ for (uint32_t i = 0; i < num_terms; ++i) {
+ add_term(std::make_unique<QueryTerm>(std::unique_ptr<QueryNodeResultBase>(), terms->getAsString(i).first, "", QueryTermSimple::Type::WORD)); // subterm gets an empty result base, an empty string where the other constructor passes index, and Type::WORD regardless of `type`
+ }
+}
+
+MultiTerm::~MultiTerm() = default; // declared in multi_term.h, defined here
+
+void
+MultiTerm::add_term(std::unique_ptr<QueryTerm> term) // Appends term to the end of _terms; ownership moves into the vector.
+{
+ _terms.emplace_back(std::move(term));
+}
+
+MultiTerm*
+MultiTerm::as_multi_term() noexcept // Overrides the QueryTerm version (which returns nullptr) to return this object.
+{
+ return this;
+}
+
+void
+MultiTerm::reset() // Calls reset() on every subterm; QueryTerm::reset() is not invoked on this object here.
+{
+ for (auto& term : _terms) {
+ term->reset();
+ }
+}
+
+bool
+MultiTerm::evaluate() const // Returns true when at least one subterm evaluates to true; false otherwise, including when _terms is empty.
+{
+ for (const auto& term : _terms) {
+ if (term->evaluate()) return true; // stop at the first matching subterm
+ }
+ return false;
+}
+
+const HitList&
+MultiTerm::evaluateHits(HitList& hl) const // Clears hl and returns it; no subterm hits are copied into it.
+{
+ hl.clear();
+ return hl;
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
new file mode 100644
index 00000000000..4c3f1ea5b5a
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h
@@ -0,0 +1,39 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "queryterm.h"
+
+namespace search::fef {
+
+class ITermData;
+class MatchData;
+
+}
+
+namespace search::query { class TermVector; }
+
+namespace search::streaming {
+
+/*
+ * Base class for query term nodes that are considered leaf nodes by
+ * the ranking framework but still have multiple terms used for
+ * search.
+ */
+class MultiTerm : public QueryTerm {
+protected:
+ std::vector<std::unique_ptr<QueryTerm>> _terms; // owned subterms
+public:
+ MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, uint32_t num_terms); // starts without subterms; reserves room for num_terms
+ MultiTerm(std::unique_ptr<QueryNodeResultBase> result_base, const string & index, Type type, std::unique_ptr<query::TermVector> terms); // creates one subterm per entry in terms
+ ~MultiTerm() override;
+ void add_term(std::unique_ptr<QueryTerm> term); // appends term to _terms
+ MultiTerm* as_multi_term() noexcept override; // returns this
+ void reset() override; // resets every subterm
+ bool evaluate() const override; // true if any subterm evaluates to true
+ const HitList& evaluateHits(HitList& hl) const override; // clears hl and returns it
+ virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) = 0; // pure virtual; subclasses must provide it
+ const std::vector<std::unique_ptr<QueryTerm>>& get_terms() const noexcept { return _terms; } // const reference to the vector; the pointed-to QueryTerm objects are not const
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 8e67ed5f0d3..7e7d1f1f260 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -3,6 +3,8 @@
#include "query.h"
#include "nearest_neighbor_query_node.h"
#include <vespa/searchlib/parsequery/stackdumpiterator.h>
+#include <vespa/searchlib/query/streaming/in_term.h>
+#include <vespa/searchlib/query/tree/term_vector.h>
#include <charconv>
#include <vespa/log/log.h>
LOG_SETUP(".vsm.querynode");
@@ -178,6 +180,10 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
}
}
break;
+ case ParseItem::ITEM_STRING_IN:
+ case ParseItem::ITEM_NUMERIC_IN:
+ qn = std::make_unique<InTerm>(factory.create(), queryRep.getIndexName(), queryRep.get_terms());
+ break;
default:
{
for (uint32_t skipCount = arity; (skipCount > 0) && queryRep.next(); skipCount--) {
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index 82f2c2906e0..9c45427d07d 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -98,4 +98,10 @@ QueryTerm::as_nearest_neighbor_query_node() noexcept
return nullptr;
}
+MultiTerm*
+QueryTerm::as_multi_term() noexcept // Base version: a plain QueryTerm is not a MultiTerm, so this returns nullptr.
+{
+ return nullptr;
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 96d83982cbb..6e91437b1f9 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -12,6 +12,7 @@
namespace search::streaming {
class NearestNeighborQueryNode;
+class MultiTerm;
/**
This is a leaf in the Query tree. All terms are leafs.
@@ -89,6 +90,7 @@ public:
void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; }
void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; }
virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept;
+ virtual MultiTerm* as_multi_term() noexcept;
protected:
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;
diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp
index 8805cc5b3ec..6b15b7cb88e 100644
--- a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp
@@ -4,6 +4,7 @@
#include "rankprocessor.h"
#include <vespa/searchlib/fef/handle.h>
#include <vespa/searchlib/fef/simpletermfielddata.h>
+#include <vespa/searchlib/query/streaming/multi_term.h>
#include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h>
#include <vespa/vsm/vsm/fieldsearchspec.h>
#include <algorithm>
@@ -26,6 +27,7 @@ using search::fef::TermFieldMatchData;
using search::fef::TermFieldMatchDataPosition;
using search::streaming::Hit;
using search::streaming::HitList;
+using search::streaming::MultiTerm;
using search::streaming::Query;
using search::streaming::QueryTerm;
using search::streaming::QueryTermList;
@@ -273,6 +275,10 @@ RankProcessor::unpack_match_data(uint32_t docid, MatchData &matchData, QueryWrap
tmd->setRawScore(docid, raw_score.value());
}
}
+ } else if (auto multi_term = term.getTerm()->as_multi_term()) {
+ auto& qtd = static_cast<QueryTermData &>(term.getTerm()->getQueryItem());
+ auto& td = qtd.getTermData();
+ multi_term->unpack_match_data(docid, td, matchData);
} else if (!term.isPhraseTerm() || term.isFirstPhraseTerm()) { // consider 1 term data per phrase
bool isPhrase = term.isFirstPhraseTerm();
QueryTermData & qtd = static_cast<QueryTermData &>(term.getTerm()->getQueryItem());
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
index c11f9e839cf..c797e6751ee 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -3,6 +3,8 @@
#include <vespa/vsm/vsm/fieldsearchspec.h>
#include <vespa/document/fieldvalue/arrayfieldvalue.h>
#include <vespa/document/fieldvalue/weightedsetfieldvalue.h>
+#include <vespa/searchlib/query/streaming/multi_term.h>
+#include <vespa/vespalib/stllike/hash_set.h>
#include <vespa/log/log.h>
LOG_SETUP(".vsm.searcher.fieldsearcher");
@@ -230,6 +232,7 @@ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm,
vespalib::string tmp;
for (auto& searcher : *this) {
QueryTermList onlyInIndex;
+ vespalib::hash_set<const void*> seen;
FieldIdT fid = searcher->field();
for (auto qt : qtl) {
for (const auto& doc_type_elem : difm) {
@@ -237,8 +240,16 @@ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm,
auto found = fim.find(FieldSearchSpecMap::stripNonFields(qt->index()));
if (found != fim.end()) {
const FieldIdTList & index = found->second;
- if ((find(index.begin(), index.end(), fid) != index.end()) && (find(onlyInIndex.begin(), onlyInIndex.end(), qt) == onlyInIndex.end())) {
- onlyInIndex.push_back(qt);
+ if ((find(index.begin(), index.end(), fid) != index.end()) && !seen.contains(qt)) {
+ seen.insert(qt);
+ auto multi_term = qt->as_multi_term();
+ if (multi_term != nullptr) {
+ for (auto& subterm : multi_term->get_terms()) {
+ onlyInIndex.emplace_back(subterm.get());
+ }
+ } else {
+ onlyInIndex.emplace_back(qt);
+ }
}
} else {
LOG(debug, "Could not find the requested index=%s in the index config map. Query does not fit search definition.",