summaryrefslogtreecommitdiffstats
path: root/searchcore
diff options
context:
space:
mode:
author Håvard Pettersen <havardpe@oath.com> 2021-11-03 16:00:03 +0000
committer Håvard Pettersen <havardpe@oath.com> 2021-11-04 15:11:32 +0000
commit 2b2a16ac12b6fd40008bac37d59ec6fc89f66539 (patch)
tree 3559cd7f934940a4fb2dc7cb1299133b0acf6462 /searchcore
parent 6ebe77e2ceebd37aa26aa762f4c608fee22c1b40 (diff)
calculate match features
+ factor out feature extraction + improve summary feature testing + extract returned docids with ordering
Diffstat (limited to 'searchcore')
-rw-r--r-- searchcore/src/tests/proton/matching/matching_test.cpp 159
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt 1
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp 41
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp 177
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/extract_features.h 39
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/match_master.cpp 20
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp 12
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/match_tools.h 2
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp 13
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/result_processor.h 1
10 files changed, 387 insertions, 78 deletions
diff --git a/searchcore/src/tests/proton/matching/matching_test.cpp b/searchcore/src/tests/proton/matching/matching_test.cpp
index d690fb29795..5d084a2448f 100644
--- a/searchcore/src/tests/proton/matching/matching_test.cpp
+++ b/searchcore/src/tests/proton/matching/matching_test.cpp
@@ -143,10 +143,11 @@ struct MyWorld {
config.add(indexproperties::rank::FirstPhase::NAME, "attribute(a1)");
config.add(indexproperties::hitcollector::HeapSize::NAME, (vespalib::asciistream() << heapSize).str());
config.add(indexproperties::hitcollector::ArraySize::NAME, (vespalib::asciistream() << arraySize).str());
- config.add(indexproperties::summary::Feature::NAME, "attribute(a1)");
+ config.add(indexproperties::summary::Feature::NAME, "matches(f1)");
config.add(indexproperties::summary::Feature::NAME, "rankingExpression(\"reduce(tensor(x[3])(x),sum)\")");
config.add(indexproperties::summary::Feature::NAME, "rankingExpression(\"tensor(x[3])(x)\")");
config.add(indexproperties::summary::Feature::NAME, "value(100)");
+ config.add(indexproperties::summary::Feature::NAME, " attribute ( a1 ) "); // will be sorted and normalized
config.add(indexproperties::dump::IgnoreDefaultFeatures::NAME, "true");
config.add(indexproperties::dump::Feature::NAME, "attribute(a2)");
@@ -211,6 +212,44 @@ struct MyWorld {
config.import(cfg);
}
+    // Adds the five match features checked by verify_match_features() to the
+    // rank configuration, in the same order as the original explicit calls.
+    void setup_match_features() {
+        static const char *const match_features[] = {
+            "attribute(a1)",
+            "attribute(a2)",
+            "matches(a1)",
+            "matches(f1)",
+            "rankingExpression(\"tensor(x[3])(x)\")"
+        };
+        for (const char *feature : match_features) {
+            config.add(indexproperties::match::Feature::NAME, feature);
+        }
+    }
+
+    // Verifies the match features attached to a search reply.
+    //
+    // A reply without hits must carry neither feature names nor feature values.
+    // Otherwise the reply must list exactly five feature names and five
+    // consecutive values per hit. 'matched_field' is compared against "a1" and
+    // "f1" to decide which of the two matches() features is expected to be 1.
+    static void verify_match_features(SearchReply &reply, const vespalib::string &matched_field) {
+        if (reply.hits.empty()) {
+            EXPECT_EQUAL(reply.match_features.names.size(), 0u);
+            EXPECT_EQUAL(reply.match_features.values.size(), 0u);
+            return;
+        }
+        ASSERT_EQUAL(reply.match_features.names.size(), 5u);
+        EXPECT_EQUAL(reply.match_features.names[0], "attribute(a1)");
+        EXPECT_EQUAL(reply.match_features.names[1], "attribute(a2)");
+        EXPECT_EQUAL(reply.match_features.names[2], "matches(a1)");
+        EXPECT_EQUAL(reply.match_features.names[3], "matches(f1)");
+        EXPECT_EQUAL(reply.match_features.names[4], "rankingExpression(\"tensor(x[3])(x)\")");
+        ASSERT_EQUAL(reply.match_features.values.size(), 5 * reply.hits.size());
+        // The tensor feature is expected to be identical for every hit, so build the spec once.
+        const auto expect = TensorSpec("tensor(x[3])").add({{"x", 0}}, 0).add({{"x", 1}}, 1).add({{"x", 2}}, 2);
+        for (size_t hit = 0; hit < reply.hits.size(); ++hit) {
+            // values for one hit are stored as five consecutive entries
+            const auto *f = &reply.match_features.values[hit * 5];
+            EXPECT_GREATER(f[0].as_double(), 0.0);
+            EXPECT_GREATER(f[1].as_double(), 0.0);
+            EXPECT_EQUAL(f[0].as_double() * 2, f[1].as_double());
+            EXPECT_EQUAL(f[2].as_double(), double(matched_field == "a1"));
+            EXPECT_EQUAL(f[3].as_double(), double(matched_field == "f1"));
+            EXPECT_TRUE(f[4].is_data());
+            // decode the serialized tensor and compare it with the expected spec
+            nbostream buf(f[4].as_data().data, f[4].as_data().size);
+            auto actual = spec_from_value(*SimpleValue::from_stream(buf));
+            EXPECT_EQUAL(actual, expect);
+        }
+    }
+
void setup_match_phase_limiting(const vespalib::string &attribute, size_t max_hits, bool descending)
{
inject_match_phase_limiting(config, attribute, max_hits, descending);
@@ -442,6 +481,30 @@ TEST("require that matching is performed (multi-threaded)") {
}
}
+// Runs the same query with 1 to 16 match threads and checks that every reply
+// has hits with the expected match features.
+TEST("require that match features are calculated (multi-threaded)") {
+    constexpr size_t max_threads = 16;
+    for (size_t num_threads = 1; num_threads <= max_threads; ++num_threads) {
+        MyWorld world;
+        world.basicSetup();
+        world.basicResults();
+        world.setup_match_features();
+        SearchRequest::SP request = world.createSimpleRequest("f1", "spread");
+        SearchReply::UP reply = world.performSearch(request, num_threads);
+        EXPECT_GREATER(reply->hits.size(), 0u);
+        MyWorld::verify_match_features(*reply, "f1");
+    }
+}
+
+// A query that matches nothing must give a reply with neither match feature
+// names nor match feature values.
+TEST("require that no hits gives no match feature names") {
+    MyWorld world;
+    world.basicSetup();
+    world.basicResults();
+    world.setup_match_features();
+    SearchRequest::SP request = world.createSimpleRequest("f1", "not_found");
+    SearchReply::UP reply = world.performSearch(request, 1);
+    EXPECT_EQUAL(reply->hits.size(), 0u);
+    MyWorld::verify_match_features(*reply, "f1");
+}
+
TEST("require that matching also returns hits when only bitvector is used (multi-threaded)") {
for (size_t threads = 1; threads <= 16; ++threads) {
MyWorld world;
@@ -645,30 +708,36 @@ TEST("require that summary features are filled") {
world.basicResults();
DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
FeatureSet::SP fs = world.getSummaryFeatures(req);
- const FeatureSet::Value * f = NULL;
- EXPECT_EQUAL(4u, fs->numFeatures());
+ const FeatureSet::Value * f = nullptr;
+ EXPECT_EQUAL(5u, fs->numFeatures());
EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
- EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]);
- EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]);
- EXPECT_EQUAL("value(100)", fs->getNames()[3]);
- EXPECT_EQUAL(2u, fs->numDocs());
+ EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
+ EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]);
+ EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]);
+ EXPECT_EQUAL("value(100)", fs->getNames()[4]);
+ EXPECT_EQUAL(3u, fs->numDocs());
f = fs->getFeaturesByDocId(10);
- EXPECT_TRUE(f != NULL);
+ EXPECT_TRUE(f != nullptr);
EXPECT_EQUAL(10, f[0].as_double());
- EXPECT_EQUAL(100, f[3].as_double());
+ EXPECT_EQUAL(1, f[1].as_double());
+ EXPECT_EQUAL(100, f[4].as_double());
f = fs->getFeaturesByDocId(15);
- EXPECT_TRUE(f == NULL);
+ EXPECT_TRUE(f != nullptr);
+ EXPECT_EQUAL(15, f[0].as_double());
+ EXPECT_EQUAL(0, f[1].as_double());
+ EXPECT_EQUAL(100, f[4].as_double());
f = fs->getFeaturesByDocId(30);
- EXPECT_TRUE(f != NULL);
+ EXPECT_TRUE(f != nullptr);
EXPECT_EQUAL(30, f[0].as_double());
- EXPECT_EQUAL(100, f[3].as_double());
- EXPECT_TRUE(f[1].is_double());
- EXPECT_TRUE(!f[1].is_data());
- EXPECT_EQUAL(f[1].as_double(), 3.0); // 0 + 1 + 2
- EXPECT_TRUE(!f[2].is_double());
- EXPECT_TRUE(f[2].is_data());
+ EXPECT_EQUAL(1, f[1].as_double());
+ EXPECT_TRUE(f[2].is_double());
+ EXPECT_TRUE(!f[2].is_data());
+ EXPECT_EQUAL(f[2].as_double(), 3.0); // 0 + 1 + 2
+ EXPECT_TRUE(!f[3].is_double());
+ EXPECT_TRUE(f[3].is_data());
+ EXPECT_EQUAL(100, f[4].as_double());
{
- nbostream buf(f[2].as_data().data, f[2].as_data().size);
+ nbostream buf(f[3].as_data().data, f[3].as_data().size);
auto actual = spec_from_value(*SimpleValue::from_stream(buf));
auto expect = TensorSpec("tensor(x[3])").add({{"x", 0}}, 0).add({{"x", 1}}, 1).add({{"x", 2}}, 2);
EXPECT_EQUAL(actual, expect);
@@ -681,17 +750,18 @@ TEST("require that rank features are filled") {
world.basicResults();
DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
FeatureSet::SP fs = world.getRankFeatures(req);
- const FeatureSet::Value * f = NULL;
+ const FeatureSet::Value * f = nullptr;
EXPECT_EQUAL(1u, fs->numFeatures());
EXPECT_EQUAL("attribute(a2)", fs->getNames()[0]);
- EXPECT_EQUAL(2u, fs->numDocs());
+ EXPECT_EQUAL(3u, fs->numDocs());
f = fs->getFeaturesByDocId(10);
- EXPECT_TRUE(f != NULL);
+ EXPECT_TRUE(f != nullptr);
EXPECT_EQUAL(20, f[0].as_double());
f = fs->getFeaturesByDocId(15);
- EXPECT_TRUE(f == NULL);
+ EXPECT_TRUE(f != nullptr);
+ EXPECT_EQUAL(30, f[0].as_double());
f = fs->getFeaturesByDocId(30);
- EXPECT_TRUE(f != NULL);
+ EXPECT_TRUE(f != nullptr);
EXPECT_EQUAL(60, f[0].as_double());
}
@@ -727,29 +797,42 @@ TEST("require that getSummaryFeatures can use cached query setup") {
docsum_request->hits.back().docid = 30;
FeatureSet::SP fs = world.getSummaryFeatures(docsum_request);
- ASSERT_EQUAL(4u, fs->numFeatures());
+ ASSERT_EQUAL(5u, fs->numFeatures());
EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
- EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]);
- EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]);
- EXPECT_EQUAL("value(100)", fs->getNames()[3]);
+ EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
+ EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]);
+ EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]);
+ EXPECT_EQUAL("value(100)", fs->getNames()[4]);
ASSERT_EQUAL(1u, fs->numDocs());
const auto *f = fs->getFeaturesByDocId(30);
ASSERT_TRUE(f);
EXPECT_EQUAL(30, f[0].as_double());
- EXPECT_EQUAL(100, f[3].as_double());
+ EXPECT_EQUAL(100, f[4].as_double());
// getSummaryFeatures can be called multiple times.
fs = world.getSummaryFeatures(docsum_request);
- ASSERT_EQUAL(4u, fs->numFeatures());
+ ASSERT_EQUAL(5u, fs->numFeatures());
EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
- EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]);
- EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]);
- EXPECT_EQUAL("value(100)", fs->getNames()[3]);
+ EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
+ EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]);
+ EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]);
+ EXPECT_EQUAL("value(100)", fs->getNames()[4]);
ASSERT_EQUAL(1u, fs->numDocs());
f = fs->getFeaturesByDocId(30);
ASSERT_TRUE(f);
EXPECT_EQUAL(30, f[0].as_double());
- EXPECT_EQUAL(100, f[3].as_double());
+ EXPECT_EQUAL(100, f[4].as_double());
+}
+
+// Adds up the second feature of every document in the feature set, after
+// checking that this feature is named "matches(f1)".
+double count_f1_matches(FeatureSet &fs) {
+    ASSERT_TRUE(fs.getNames().size() > 1);
+    ASSERT_EQUAL(fs.getNames()[1], "matches(f1)");
+    double total = 0.0;
+    for (size_t doc = 0; doc != fs.numDocs(); ++doc) {
+        total += fs.getFeaturesByIndex(doc)[1].as_double();
+    }
+    return total;
}
TEST("require that getSummaryFeatures prefers cached query setup") {
@@ -765,16 +848,18 @@ TEST("require that getSummaryFeatures prefers cached query setup") {
req->sessionId = request->sessionId;
req->propertiesMap.lookupCreate(search::MapNames::CACHES).add("query", "true");
FeatureSet::SP fs = world.getSummaryFeatures(req);
- EXPECT_EQUAL(4u, fs->numFeatures());
- ASSERT_EQUAL(0u, fs->numDocs()); // "spread" has no hits
+ EXPECT_EQUAL(5u, fs->numFeatures());
+ EXPECT_EQUAL(3u, fs->numDocs());
+ EXPECT_EQUAL(0.0, count_f1_matches(*fs)); // "spread" has no hits
// Empty cache
auto pruneTime = vespalib::steady_clock::now() + 600s;
world.sessionManager->pruneTimedOutSessions(pruneTime);
fs = world.getSummaryFeatures(req);
- EXPECT_EQUAL(4u, fs->numFeatures());
- ASSERT_EQUAL(2u, fs->numDocs()); // "foo" has two hits
+ EXPECT_EQUAL(5u, fs->numFeatures());
+ EXPECT_EQUAL(3u, fs->numDocs());
+ EXPECT_EQUAL(2.0, count_f1_matches(*fs)); // "foo" has two hits
}
TEST("require that match params are set up straight with ranking on") {
diff --git a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt
index 5f0017293e7..41e2fe2105f 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt
+++ b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(searchcore_matching STATIC
docid_range_scheduler.cpp
docsum_matcher.cpp
document_scorer.cpp
+ extract_features.cpp
fakesearchcontext.cpp
handlerecorder.cpp
i_match_loop_communicator.cpp
diff --git a/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp b/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp
index 91b44b277f0..864e0a6b337 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp
@@ -3,6 +3,7 @@
#include "docsum_matcher.h"
#include "match_tools.h"
#include "search_session.h"
+#include "extract_features.h"
#include <vespa/eval/eval/value_codec.h>
#include <vespa/vespalib/objects/nbostream.h>
#include <vespa/searchcommon/attribute/i_search_context.h>
@@ -46,45 +47,7 @@ get_feature_set(const MatchToolsFactory &mtf,
} else {
matchTools->setup_dump();
}
- RankProgram &rankProgram = matchTools->rank_program();
-
- std::vector<vespalib::string> featureNames;
- FeatureResolver resolver(rankProgram.get_seeds(false));
- featureNames.reserve(resolver.num_features());
- for (size_t i = 0; i < resolver.num_features(); ++i) {
- featureNames.emplace_back(resolver.name_of(i));
- }
- auto retval = std::make_unique<FeatureSet>(featureNames, docs.size());
- if (docs.empty()) {
- return retval;
- }
- FeatureSet &fs = *retval;
-
- SearchIterator &search = matchTools->search();
- search.initRange(docs.front(), docs.back()+1);
- for (uint32_t i = 0; i < docs.size(); ++i) {
- if (search.seek(docs[i])) {
- uint32_t docId = search.getDocId();
- search.unpack(docId);
- auto * f = fs.getFeaturesByIndex(fs.addDocId(docId));
- for (uint32_t j = 0; j < featureNames.size(); ++j) {
- if (resolver.is_object(j)) {
- auto obj = resolver.resolve(j).as_object(docId);
- if (! obj.get().type().is_double()) {
- vespalib::nbostream buf;
- encode_value(obj.get(), buf);
- f[j].set_data(vespalib::Memory(buf.peek(), buf.size()));
- } else {
- f[j].set_double(obj.get().as_double());
- }
- } else {
- f[j].set_double(resolver.resolve(j).as_number(docId));
- }
- }
- } else {
- LOG(debug, "getFeatureSet: Did not find hit for docid '%u'. Skipping hit", docs[i]);
- }
- }
+ auto retval = ExtractFeatures::get_feature_set(matchTools->search(), matchTools->rank_program(), docs);
if (auto onSummaryTask = mtf.createOnSummaryTask()) {
onSummaryTask->run(docs);
}
diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp
new file mode 100644
index 00000000000..ef03fac2f6a
--- /dev/null
+++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp
@@ -0,0 +1,177 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "extract_features.h"
+#include "match_tools.h"
+#include <vespa/eval/eval/value_codec.h>
+#include <vespa/vespalib/objects/nbostream.h>
+#include <vespa/vespalib/util/runnable.h>
+#include <vespa/vespalib/util/thread_bundle.h>
+#include <vespa/searchlib/fef/feature_resolver.h>
+#include <vespa/searchlib/fef/rank_program.h>
+#include <vespa/searchlib/queryeval/searchiterator.h>
+
+using vespalib::Runnable;
+using vespalib::ThreadBundle;
+using search::FeatureSet;
+using search::FeatureValues;
+using search::fef::FeatureResolver;
+using search::fef::RankProgram;
+using search::queryeval::SearchIterator;
+
+namespace proton::matching {
+
+using OrderedDocs = ExtractFeatures::OrderedDocs;
+
+namespace {
+
+// Common base for one unit of feature extraction work. A chunk covers the
+// (docid, result index) pairs in [begin, end) and writes the feature values
+// of each document into the shared 'result' at its result index.
+struct MyChunk : Runnable {
+    const std::pair<uint32_t,uint32_t> *begin;
+    const std::pair<uint32_t,uint32_t> *end;
+    FeatureValues &result;
+    MyChunk(const std::pair<uint32_t,uint32_t> *begin_in,
+            const std::pair<uint32_t,uint32_t> *end_in,
+            FeatureValues &result_in)
+        : begin(begin_in),
+          end(end_in),
+          result(result_in)
+    {
+    }
+    // Resolve all features for a single document and store them in dst[0 .. num_features).
+    // Object features that are not plain doubles are stored in serialized form.
+    template <typename Value>
+    static void store_features(FeatureResolver &resolver, uint32_t docid, Value *dst, size_t num_features) {
+        for (uint32_t i = 0; i < num_features; ++i) {
+            if (!resolver.is_object(i)) {
+                dst[i].set_double(resolver.resolve(i).as_number(docid));
+                continue;
+            }
+            auto obj = resolver.resolve(i).as_object(docid);
+            if (obj.get().type().is_double()) {
+                dst[i].set_double(obj.get().as_double());
+            } else {
+                vespalib::nbostream buf;
+                encode_value(obj.get(), buf);
+                dst[i].set_data(vespalib::Memory(buf.peek(), buf.size()));
+            }
+        }
+    }
+    // Calculate features for all documents in this chunk. The chunk must be
+    // non-empty and the resolver must expose as many features as 'result' has names.
+    void calculate_features(SearchIterator &search, FeatureResolver &resolver) {
+        const size_t num_features = result.names.size();
+        assert(end > begin);
+        assert(num_features == resolver.num_features());
+        const uint32_t first_docid = begin->first;
+        const uint32_t last_docid = (end - 1)->first;
+        search.initRange(first_docid, last_docid + 1);
+        for (const auto *entry = begin; entry != end; ++entry) {
+            const uint32_t docid = entry->first;
+            search.unpack(docid);
+            // entry->second selects the slot in the result that belongs to this document
+            store_features(resolver, docid, &result.values[entry->second * num_features], num_features);
+        }
+    }
+};
+
+// Chunk that calculates features with a search iterator and feature resolver
+// supplied by whoever creates it.
+struct FirstChunk : MyChunk {
+    SearchIterator &search;
+    FeatureResolver &resolver;
+    FirstChunk(const OrderedDocs::value_type *begin_in,
+               const OrderedDocs::value_type *end_in,
+               FeatureValues &result_in,
+               SearchIterator &search_in,
+               FeatureResolver &resolver_in)
+        : MyChunk(begin_in, end_in, result_in),
+          search(search_in),
+          resolver(resolver_in)
+    {
+    }
+    void run() override {
+        calculate_features(search, resolver);
+    }
+};
+
+// Chunk that creates its own match tools from the factory when it is run,
+// and uses them to calculate features.
+struct LaterChunk : MyChunk {
+    const MatchToolsFactory &mtf;
+    LaterChunk(const OrderedDocs::value_type *begin_in,
+               const OrderedDocs::value_type *end_in,
+               FeatureValues &result_in,
+               const MatchToolsFactory &mtf_in)
+        : MyChunk(begin_in, end_in, result_in),
+          mtf(mtf_in)
+    {
+    }
+    void run() override {
+        auto my_tools = mtf.createMatchTools();
+        my_tools->setup_match_features();
+        auto &rank_program = my_tools->rank_program();
+        FeatureResolver my_resolver(rank_program.get_seeds(false));
+        calculate_features(my_tools->search(), my_resolver);
+    }
+};
+
+// Owns the chunks of work to be performed and runs them with a thread bundle.
+struct MyWork {
+    // number of threads in the bundle given at construction
+    size_t num_threads;
+    // work items, owned here; capacity for one item per thread is reserved up front
+    std::vector<Runnable::UP> chunks;
+    // explicit: a thread bundle should never silently convert into a work set
+    explicit MyWork(ThreadBundle &thread_bundle) : num_threads(thread_bundle.size()), chunks() {
+        chunks.reserve(num_threads);
+    }
+    // Run all chunks using the given thread bundle.
+    void run(ThreadBundle &thread_bundle) {
+        // the bundle is handed plain pointers; ownership stays with 'chunks'
+        std::vector<Runnable*> refs;
+        refs.reserve(chunks.size());
+        for (const auto &task: chunks) {
+            refs.push_back(task.get());
+        }
+        thread_bundle.run(refs);
+    }
+};
+
+} // unnamed
+
+// Extracts all seed features of 'rank_program' for each document in 'docs'.
+// The returned feature set has one entry per document, added in the order
+// the documents are listed. The search is unpacked for each docid before
+// its features are resolved.
+FeatureSet::UP
+ExtractFeatures::get_feature_set(SearchIterator &search, RankProgram &rank_program, const std::vector<uint32_t> &docs)
+{
+    std::vector<vespalib::string> names;
+    FeatureResolver resolver(rank_program.get_seeds(false));
+    const size_t num_features = resolver.num_features();
+    names.reserve(num_features);
+    for (size_t i = 0; i < num_features; ++i) {
+        names.emplace_back(resolver.name_of(i));
+    }
+    auto result = std::make_unique<FeatureSet>(names, docs.size());
+    if (docs.empty()) {
+        // nothing to evaluate; only the feature names are reported
+        return result;
+    }
+    search.initRange(docs.front(), docs.back()+1);
+    for (uint32_t docid: docs) {
+        search.unpack(docid);
+        auto *values = result->getFeaturesByIndex(result->addDocId(docid));
+        for (uint32_t i = 0; i < num_features; ++i) {
+            if (!resolver.is_object(i)) {
+                values[i].set_double(resolver.resolve(i).as_number(docid));
+                continue;
+            }
+            auto obj = resolver.resolve(i).as_object(docid);
+            if (obj.get().type().is_double()) {
+                values[i].set_double(obj.get().as_double());
+            } else {
+                // non-double objects are stored in serialized form
+                vespalib::nbostream buf;
+                encode_value(obj.get(), buf);
+                values[i].set_data(vespalib::Memory(buf.peek(), buf.size()));
+            }
+        }
+    }
+    return result;
+}
+
+// Calculates match features for 'docs' using the threads of 'thread_bundle'.
+//
+// The feature names are taken from a match program set up on tools created
+// from 'mtf'. The values of the document with result index r are written to
+// result.values[r * names.size() ...]. The documents are split into at most
+// thread_bundle.size() contiguous chunks of nearly equal size; the first chunk
+// reuses the tools created here, later chunks create their own tools.
+//
+// The number of documents is divided by the bundle size, so the bundle must
+// contain at least one thread.
+FeatureValues
+ExtractFeatures::get_match_features(const MatchToolsFactory &mtf, const OrderedDocs &docs, ThreadBundle &thread_bundle)
+{
+    FeatureValues result;
+    auto tools = mtf.createMatchTools();
+    tools->setup_match_features();
+    FeatureResolver resolver(tools->rank_program().get_seeds(false));
+    result.names.reserve(resolver.num_features());
+    for (size_t i = 0; i < resolver.num_features(); ++i) {
+        result.names.emplace_back(resolver.name_of(i));
+    }
+    result.values.resize(result.names.size() * docs.size());
+    MyWork work(thread_bundle);
+    size_t per_thread = docs.size() / work.num_threads;
+    size_t rest_docs = docs.size() % work.num_threads;
+    // Chunk boundaries are formed with pointer arithmetic on data(); the end of
+    // the last chunk is one past the last element, which must not be reached
+    // through operator[] (index == size() is outside its contract).
+    const auto *docs_begin = docs.data();
+    size_t idx = 0;
+    for (size_t i = 0; i < work.num_threads; ++i) {
+        // the first 'rest_docs' chunks get one extra document each
+        size_t chunk_size = per_thread + (i < rest_docs);
+        if (chunk_size == 0) {
+            break;
+        }
+        const auto *chunk_begin = docs_begin + idx;
+        const auto *chunk_end = chunk_begin + chunk_size;
+        if (i == 0) {
+            work.chunks.push_back(std::make_unique<FirstChunk>(chunk_begin, chunk_end, result, tools->search(), resolver));
+        } else {
+            work.chunks.push_back(std::make_unique<LaterChunk>(chunk_begin, chunk_end, result, mtf));
+        }
+        idx += chunk_size;
+    }
+    assert(idx == docs.size());
+    work.run(thread_bundle);
+    return result;
+}
+
+}
diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.h b/searchcore/src/vespa/searchcore/proton/matching/extract_features.h
new file mode 100644
index 00000000000..66e98d9db2d
--- /dev/null
+++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.h
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchlib/common/featureset.h>
+#include <vector>
+
+namespace vespalib { class ThreadBundle; };
+namespace search::queryeval { class SearchIterator; }
+namespace search::fef { class RankProgram; }
+
+namespace proton::matching {
+
+class MatchToolsFactory;
+
+struct ExtractFeatures {
+    using SearchIterator = search::queryeval::SearchIterator;
+    using RankProgram = search::fef::RankProgram;
+    using ThreadBundle = vespalib::ThreadBundle;
+    using FeatureSet = search::FeatureSet;
+    using FeatureValues = search::FeatureValues;
+
+    // List of (docid, result index) pairs; the list must be sorted on docid.
+    using OrderedDocs = std::vector<std::pair<uint32_t,uint32_t>>;
+
+    /**
+     * Calculate every seed feature of 'rank_program' for each of the
+     * documents in 'docs', which must be listed in ascending order.
+     * Unpack information is obtained from 'search'.
+     **/
+    static FeatureSet::UP get_feature_set(SearchIterator &search, RankProgram &rank_program, const std::vector<uint32_t> &docs);
+
+    /**
+     * Calculate match features for 'docs', spreading the work over
+     * the threads of 'thread_bundle'.
+     **/
+    static FeatureValues get_match_features(const MatchToolsFactory &mtf, const OrderedDocs &docs, ThreadBundle &thread_bundle);
+};
+
+}
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp
index 26ed94f1d73..827ff4b5aca 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp
@@ -5,7 +5,9 @@
#include "match_loop_communicator.h"
#include "match_thread.h"
#include "match_tools.h"
+#include "extract_features.h"
#include <vespa/searchlib/engine/trace.h>
+#include <vespa/searchlib/engine/searchreply.h>
#include <vespa/vespalib/util/thread_bundle.h>
#include <vespa/vespalib/util/issue.h>
#include <vespa/vespalib/data/slime/inserter.h>
@@ -21,6 +23,7 @@ namespace proton::matching {
using namespace search::fef;
using search::queryeval::SearchIterator;
using search::FeatureSet;
+using vespalib::ThreadBundle;
using vespalib::Issue;
namespace {
@@ -57,12 +60,25 @@ createScheduler(uint32_t numThreads, uint32_t numSearchPartitions, uint32_t numD
return std::make_unique<TaskDocidRangeScheduler>(numThreads, numSearchPartitions, numDocs);
}
+// Turns the full match result into a reply. When match features are enabled,
+// they are calculated for the returned documents and attached to the reply,
+// provided there is at least one document and the reply has a search reply object.
+auto make_reply(const MatchToolsFactory &mtf, ResultProcessor &processor, ThreadBundle &bundle, auto full_result) {
+    if (mtf.has_match_features()) {
+        // the docid ordering must be extracted before the result is handed over to makeReply
+        auto docs = processor.extract_docid_ordering(*full_result);
+        auto reply = processor.makeReply(std::move(full_result));
+        if (!docs.empty() && reply->_reply) {
+            reply->_reply->match_features = ExtractFeatures::get_match_features(mtf, docs, bundle);
+        }
+        return reply;
+    } else {
+        return processor.makeReply(std::move(full_result));
+    }
+}
+
} // namespace proton::matching::<unnamed>
ResultProcessor::Result::UP
MatchMaster::match(search::engine::Trace & trace,
const MatchParams &params,
- vespalib::ThreadBundle &threadBundle,
+ ThreadBundle &threadBundle,
const MatchToolsFactory &mtf,
ResultProcessor &resultProcessor,
uint32_t distributionKey,
@@ -87,7 +103,7 @@ MatchMaster::match(search::engine::Trace & trace,
}
resultProcessor.prepareThreadContextCreation(threadBundle.size());
threadBundle.run(targets);
- ResultProcessor::Result::UP reply = resultProcessor.makeReply(threadState[0]->extract_result());
+ auto reply = make_reply(mtf, resultProcessor, threadBundle, threadState[0]->extract_result());
double query_time_s = vespalib::to_s(query_latency_time.elapsed());
double rerank_time_s = vespalib::to_s(timedCommunicator.elapsed);
double match_time_s = 0.0;
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
index d652435cbca..accbb19669a 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
@@ -141,6 +141,12 @@ MatchTools::setup_second_phase()
}
void
+MatchTools::setup_match_features() // sets these tools up with the match program
+{
+ setup(_rankSetup.create_match_program()); // the rank setup creates the program; setup() receives it
+}
+
+void
MatchTools::setup_summary()
{
setup(_rankSetup.create_summary_program());
@@ -281,6 +287,12 @@ MatchToolsFactory::has_first_phase_rank() const {
return !_rankSetup.getFirstPhaseRank().empty();
}
+bool
+MatchToolsFactory::has_match_features() const // tells whether the rank setup has match features
+{
+ return _rankSetup.has_match_features(); // answer is forwarded unchanged from the rank setup
+}
+
AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext,
vespalib::stringref attribute, vespalib::stringref operation)
: _requestContext(requestContext),
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
index a403b4b1a78..d63c67ec1d0 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
@@ -67,6 +67,7 @@ public:
void tag_search_as_changed() { _search_has_changed = true; }
void setup_first_phase();
void setup_second_phase();
+ void setup_match_features();
void setup_summary();
void setup_dump();
};
@@ -129,6 +130,7 @@ public:
std::unique_ptr<search::queryeval::IDiversifier> createDiversifier(uint32_t heapSize) const;
search::queryeval::Blueprint::HitEstimate estimate() const { return _query.estimate(); }
bool has_first_phase_rank() const;
+ bool has_match_features() const;
std::unique_ptr<AttributeOperationTask> createOnMatchTask() const;
std::unique_ptr<AttributeOperationTask> createOnFirstPhaseTask() const;
std::unique_ptr<AttributeOperationTask> createOnSecondPhaseTask() const;
diff --git a/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp b/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp
index 3da0d17895a..da1e6a2d567 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp
@@ -102,6 +102,19 @@ ResultProcessor::createThreadContext(const vespalib::Doom & hardDoom, size_t thr
return std::make_unique<Context>(std::move(sort), std::move(result), std::move(groupingContext));
}
+// Lists the documents of 'result' from hit index '_offset' onwards as
+// (docid, result index) pairs, sorted on docid. The result index counts
+// from 0 for the first hit after the offset, so it records where each
+// document sits in the original hit order.
+std::vector<std::pair<uint32_t,uint32_t>>
+ResultProcessor::extract_docid_ordering(const PartialResult &result) const
+{
+    // clamp the offset so the estimate cannot underflow when it exceeds the result size
+    size_t est_size = result.size() - std::min(result.size(), _offset);
+    std::vector<std::pair<uint32_t,uint32_t>> list;
+    list.reserve(est_size);
+    for (size_t i = _offset; i < result.size(); ++i) {
+        // list.size() is the position this hit gets in the list
+        list.emplace_back(result.hit(i)._docId, list.size());
+    }
+    std::sort(list.begin(), list.end(), [](const auto &a, const auto &b){ return (a.first < b.first); });
+    return list;
+}
+
ResultProcessor::Result::UP
ResultProcessor::makeReply(PartialResultUP full_result)
{
diff --git a/searchcore/src/vespa/searchcore/proton/matching/result_processor.h b/searchcore/src/vespa/searchcore/proton/matching/result_processor.h
index e0220d53d1a..5ec11cd7acb 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/result_processor.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/result_processor.h
@@ -103,6 +103,7 @@ public:
size_t countFS4Hits();
void prepareThreadContextCreation(size_t num_threads);
Context::UP createThreadContext(const vespalib::Doom & hardDoom, size_t thread_id, uint32_t distributionKey);
+ std::vector<std::pair<uint32_t,uint32_t>> extract_docid_ordering(const PartialResult &result) const;
std::unique_ptr<Result> makeReply(PartialResultUP full_result);
};