diff options
author | Håvard Pettersen <havardpe@oath.com> | 2021-11-03 16:00:03 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2021-11-04 15:11:32 +0000 |
commit | 2b2a16ac12b6fd40008bac37d59ec6fc89f66539 (patch) | |
tree | 3559cd7f934940a4fb2dc7cb1299133b0acf6462 /searchcore | |
parent | 6ebe77e2ceebd37aa26aa762f4c608fee22c1b40 (diff) |
calculate match features
+ factor out feature extraction
+ improve summary feature testing
+ extract returned docids with ordering
Diffstat (limited to 'searchcore')
10 files changed, 387 insertions, 78 deletions
diff --git a/searchcore/src/tests/proton/matching/matching_test.cpp b/searchcore/src/tests/proton/matching/matching_test.cpp index d690fb29795..5d084a2448f 100644 --- a/searchcore/src/tests/proton/matching/matching_test.cpp +++ b/searchcore/src/tests/proton/matching/matching_test.cpp @@ -143,10 +143,11 @@ struct MyWorld { config.add(indexproperties::rank::FirstPhase::NAME, "attribute(a1)"); config.add(indexproperties::hitcollector::HeapSize::NAME, (vespalib::asciistream() << heapSize).str()); config.add(indexproperties::hitcollector::ArraySize::NAME, (vespalib::asciistream() << arraySize).str()); - config.add(indexproperties::summary::Feature::NAME, "attribute(a1)"); + config.add(indexproperties::summary::Feature::NAME, "matches(f1)"); config.add(indexproperties::summary::Feature::NAME, "rankingExpression(\"reduce(tensor(x[3])(x),sum)\")"); config.add(indexproperties::summary::Feature::NAME, "rankingExpression(\"tensor(x[3])(x)\")"); config.add(indexproperties::summary::Feature::NAME, "value(100)"); + config.add(indexproperties::summary::Feature::NAME, " attribute ( a1 ) "); // will be sorted and normalized config.add(indexproperties::dump::IgnoreDefaultFeatures::NAME, "true"); config.add(indexproperties::dump::Feature::NAME, "attribute(a2)"); @@ -211,6 +212,44 @@ struct MyWorld { config.import(cfg); } + void setup_match_features() { + config.add(indexproperties::match::Feature::NAME, "attribute(a1)"); + config.add(indexproperties::match::Feature::NAME, "attribute(a2)"); + config.add(indexproperties::match::Feature::NAME, "matches(a1)"); + config.add(indexproperties::match::Feature::NAME, "matches(f1)"); + config.add(indexproperties::match::Feature::NAME, "rankingExpression(\"tensor(x[3])(x)\")"); + } + + static void verify_match_features(SearchReply &reply, const vespalib::string &matched_field) { + if (reply.hits.empty()) { + EXPECT_EQUAL(reply.match_features.names.size(), 0u); + EXPECT_EQUAL(reply.match_features.values.size(), 0u); + } else { + 
ASSERT_EQUAL(reply.match_features.names.size(), 5u); + EXPECT_EQUAL(reply.match_features.names[0], "attribute(a1)"); + EXPECT_EQUAL(reply.match_features.names[1], "attribute(a2)"); + EXPECT_EQUAL(reply.match_features.names[2], "matches(a1)"); + EXPECT_EQUAL(reply.match_features.names[3], "matches(f1)"); + EXPECT_EQUAL(reply.match_features.names[4], "rankingExpression(\"tensor(x[3])(x)\")"); + ASSERT_EQUAL(reply.match_features.values.size(), 5 * reply.hits.size()); + for (size_t i = 0; i < reply.hits.size(); ++i) { + const auto *f = &reply.match_features.values[i * 5]; + EXPECT_GREATER(f[0].as_double(), 0.0); + EXPECT_GREATER(f[1].as_double(), 0.0); + EXPECT_EQUAL(f[0].as_double() * 2, f[1].as_double()); + EXPECT_EQUAL(f[2].as_double(), double(matched_field == "a1")); + EXPECT_EQUAL(f[3].as_double(), double(matched_field == "f1")); + EXPECT_TRUE(f[4].is_data()); + { + nbostream buf(f[4].as_data().data, f[4].as_data().size); + auto actual = spec_from_value(*SimpleValue::from_stream(buf)); + auto expect = TensorSpec("tensor(x[3])").add({{"x", 0}}, 0).add({{"x", 1}}, 1).add({{"x", 2}}, 2); + EXPECT_EQUAL(actual, expect); + } + } + } + } + void setup_match_phase_limiting(const vespalib::string &attribute, size_t max_hits, bool descending) { inject_match_phase_limiting(config, attribute, max_hits, descending); @@ -442,6 +481,30 @@ TEST("require that matching is performed (multi-threaded)") { } } +TEST("require that match features are calculated (multi-threaded)") { + for (size_t threads = 1; threads <= 16; ++threads) { + MyWorld world; + world.basicSetup(); + world.basicResults(); + world.setup_match_features(); + SearchRequest::SP request = world.createSimpleRequest("f1", "spread"); + SearchReply::UP reply = world.performSearch(request, threads); + EXPECT_GREATER(reply->hits.size(), 0u); + world.verify_match_features(*reply, "f1"); + } +} + +TEST("require that no hits gives no match feature names") { + MyWorld world; + world.basicSetup(); + world.basicResults(); + 
world.setup_match_features(); + SearchRequest::SP request = world.createSimpleRequest("f1", "not_found"); + SearchReply::UP reply = world.performSearch(request, 1); + EXPECT_EQUAL(reply->hits.size(), 0u); + world.verify_match_features(*reply, "f1"); +} + TEST("require that matching also returns hits when only bitvector is used (multi-threaded)") { for (size_t threads = 1; threads <= 16; ++threads) { MyWorld world; @@ -645,30 +708,36 @@ TEST("require that summary features are filled") { world.basicResults(); DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo"); FeatureSet::SP fs = world.getSummaryFeatures(req); - const FeatureSet::Value * f = NULL; - EXPECT_EQUAL(4u, fs->numFeatures()); + const FeatureSet::Value * f = nullptr; + EXPECT_EQUAL(5u, fs->numFeatures()); EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]); - EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]); - EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]); - EXPECT_EQUAL("value(100)", fs->getNames()[3]); - EXPECT_EQUAL(2u, fs->numDocs()); + EXPECT_EQUAL("matches(f1)", fs->getNames()[1]); + EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]); + EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]); + EXPECT_EQUAL("value(100)", fs->getNames()[4]); + EXPECT_EQUAL(3u, fs->numDocs()); f = fs->getFeaturesByDocId(10); - EXPECT_TRUE(f != NULL); + EXPECT_TRUE(f != nullptr); EXPECT_EQUAL(10, f[0].as_double()); - EXPECT_EQUAL(100, f[3].as_double()); + EXPECT_EQUAL(1, f[1].as_double()); + EXPECT_EQUAL(100, f[4].as_double()); f = fs->getFeaturesByDocId(15); - EXPECT_TRUE(f == NULL); + EXPECT_TRUE(f != nullptr); + EXPECT_EQUAL(15, f[0].as_double()); + EXPECT_EQUAL(0, f[1].as_double()); + EXPECT_EQUAL(100, f[4].as_double()); f = fs->getFeaturesByDocId(30); - EXPECT_TRUE(f != NULL); + EXPECT_TRUE(f != nullptr); EXPECT_EQUAL(30, f[0].as_double()); - EXPECT_EQUAL(100, f[3].as_double()); - 
EXPECT_TRUE(f[1].is_double()); - EXPECT_TRUE(!f[1].is_data()); - EXPECT_EQUAL(f[1].as_double(), 3.0); // 0 + 1 + 2 - EXPECT_TRUE(!f[2].is_double()); - EXPECT_TRUE(f[2].is_data()); + EXPECT_EQUAL(1, f[1].as_double()); + EXPECT_TRUE(f[2].is_double()); + EXPECT_TRUE(!f[2].is_data()); + EXPECT_EQUAL(f[2].as_double(), 3.0); // 0 + 1 + 2 + EXPECT_TRUE(!f[3].is_double()); + EXPECT_TRUE(f[3].is_data()); + EXPECT_EQUAL(100, f[4].as_double()); { - nbostream buf(f[2].as_data().data, f[2].as_data().size); + nbostream buf(f[3].as_data().data, f[3].as_data().size); auto actual = spec_from_value(*SimpleValue::from_stream(buf)); auto expect = TensorSpec("tensor(x[3])").add({{"x", 0}}, 0).add({{"x", 1}}, 1).add({{"x", 2}}, 2); EXPECT_EQUAL(actual, expect); @@ -681,17 +750,18 @@ TEST("require that rank features are filled") { world.basicResults(); DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo"); FeatureSet::SP fs = world.getRankFeatures(req); - const FeatureSet::Value * f = NULL; + const FeatureSet::Value * f = nullptr; EXPECT_EQUAL(1u, fs->numFeatures()); EXPECT_EQUAL("attribute(a2)", fs->getNames()[0]); - EXPECT_EQUAL(2u, fs->numDocs()); + EXPECT_EQUAL(3u, fs->numDocs()); f = fs->getFeaturesByDocId(10); - EXPECT_TRUE(f != NULL); + EXPECT_TRUE(f != nullptr); EXPECT_EQUAL(20, f[0].as_double()); f = fs->getFeaturesByDocId(15); - EXPECT_TRUE(f == NULL); + EXPECT_TRUE(f != nullptr); + EXPECT_EQUAL(30, f[0].as_double()); f = fs->getFeaturesByDocId(30); - EXPECT_TRUE(f != NULL); + EXPECT_TRUE(f != nullptr); EXPECT_EQUAL(60, f[0].as_double()); } @@ -727,29 +797,42 @@ TEST("require that getSummaryFeatures can use cached query setup") { docsum_request->hits.back().docid = 30; FeatureSet::SP fs = world.getSummaryFeatures(docsum_request); - ASSERT_EQUAL(4u, fs->numFeatures()); + ASSERT_EQUAL(5u, fs->numFeatures()); EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]); - EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]); - 
EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]); - EXPECT_EQUAL("value(100)", fs->getNames()[3]); + EXPECT_EQUAL("matches(f1)", fs->getNames()[1]); + EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]); + EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]); + EXPECT_EQUAL("value(100)", fs->getNames()[4]); ASSERT_EQUAL(1u, fs->numDocs()); const auto *f = fs->getFeaturesByDocId(30); ASSERT_TRUE(f); EXPECT_EQUAL(30, f[0].as_double()); - EXPECT_EQUAL(100, f[3].as_double()); + EXPECT_EQUAL(100, f[4].as_double()); // getSummaryFeatures can be called multiple times. fs = world.getSummaryFeatures(docsum_request); - ASSERT_EQUAL(4u, fs->numFeatures()); + ASSERT_EQUAL(5u, fs->numFeatures()); EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]); - EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]); - EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]); - EXPECT_EQUAL("value(100)", fs->getNames()[3]); + EXPECT_EQUAL("matches(f1)", fs->getNames()[1]); + EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]); + EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]); + EXPECT_EQUAL("value(100)", fs->getNames()[4]); ASSERT_EQUAL(1u, fs->numDocs()); f = fs->getFeaturesByDocId(30); ASSERT_TRUE(f); EXPECT_EQUAL(30, f[0].as_double()); - EXPECT_EQUAL(100, f[3].as_double()); + EXPECT_EQUAL(100, f[4].as_double()); +} + +double count_f1_matches(FeatureSet &fs) { + ASSERT_TRUE(fs.getNames().size() > 1); + ASSERT_EQUAL(fs.getNames()[1], "matches(f1)"); + double sum = 0.0; + for (size_t i = 0; i < fs.numDocs(); ++i) { + auto *f = fs.getFeaturesByIndex(i); + sum += f[1].as_double(); + } + return sum; } TEST("require that getSummaryFeatures prefers cached query setup") { @@ -765,16 +848,18 @@ TEST("require that getSummaryFeatures prefers cached query setup") { req->sessionId = request->sessionId; 
req->propertiesMap.lookupCreate(search::MapNames::CACHES).add("query", "true"); FeatureSet::SP fs = world.getSummaryFeatures(req); - EXPECT_EQUAL(4u, fs->numFeatures()); - ASSERT_EQUAL(0u, fs->numDocs()); // "spread" has no hits + EXPECT_EQUAL(5u, fs->numFeatures()); + EXPECT_EQUAL(3u, fs->numDocs()); + EXPECT_EQUAL(0.0, count_f1_matches(*fs)); // "spread" has no hits // Empty cache auto pruneTime = vespalib::steady_clock::now() + 600s; world.sessionManager->pruneTimedOutSessions(pruneTime); fs = world.getSummaryFeatures(req); - EXPECT_EQUAL(4u, fs->numFeatures()); - ASSERT_EQUAL(2u, fs->numDocs()); // "foo" has two hits + EXPECT_EQUAL(5u, fs->numFeatures()); + EXPECT_EQUAL(3u, fs->numDocs()); + EXPECT_EQUAL(2.0, count_f1_matches(*fs)); // "foo" has two hits } TEST("require that match params are set up straight with ranking on") { diff --git a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt index 5f0017293e7..41e2fe2105f 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt +++ b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(searchcore_matching STATIC docid_range_scheduler.cpp docsum_matcher.cpp document_scorer.cpp + extract_features.cpp fakesearchcontext.cpp handlerecorder.cpp i_match_loop_communicator.cpp diff --git a/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp b/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp index 91b44b277f0..864e0a6b337 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp @@ -3,6 +3,7 @@ #include "docsum_matcher.h" #include "match_tools.h" #include "search_session.h" +#include "extract_features.h" #include <vespa/eval/eval/value_codec.h> #include <vespa/vespalib/objects/nbostream.h> #include <vespa/searchcommon/attribute/i_search_context.h> @@ -46,45 
+47,7 @@ get_feature_set(const MatchToolsFactory &mtf, } else { matchTools->setup_dump(); } - RankProgram &rankProgram = matchTools->rank_program(); - - std::vector<vespalib::string> featureNames; - FeatureResolver resolver(rankProgram.get_seeds(false)); - featureNames.reserve(resolver.num_features()); - for (size_t i = 0; i < resolver.num_features(); ++i) { - featureNames.emplace_back(resolver.name_of(i)); - } - auto retval = std::make_unique<FeatureSet>(featureNames, docs.size()); - if (docs.empty()) { - return retval; - } - FeatureSet &fs = *retval; - - SearchIterator &search = matchTools->search(); - search.initRange(docs.front(), docs.back()+1); - for (uint32_t i = 0; i < docs.size(); ++i) { - if (search.seek(docs[i])) { - uint32_t docId = search.getDocId(); - search.unpack(docId); - auto * f = fs.getFeaturesByIndex(fs.addDocId(docId)); - for (uint32_t j = 0; j < featureNames.size(); ++j) { - if (resolver.is_object(j)) { - auto obj = resolver.resolve(j).as_object(docId); - if (! obj.get().type().is_double()) { - vespalib::nbostream buf; - encode_value(obj.get(), buf); - f[j].set_data(vespalib::Memory(buf.peek(), buf.size())); - } else { - f[j].set_double(obj.get().as_double()); - } - } else { - f[j].set_double(resolver.resolve(j).as_number(docId)); - } - } - } else { - LOG(debug, "getFeatureSet: Did not find hit for docid '%u'. Skipping hit", docs[i]); - } - } + auto retval = ExtractFeatures::get_feature_set(matchTools->search(), matchTools->rank_program(), docs); if (auto onSummaryTask = mtf.createOnSummaryTask()) { onSummaryTask->run(docs); } diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp new file mode 100644 index 00000000000..ef03fac2f6a --- /dev/null +++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp @@ -0,0 +1,177 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "extract_features.h" +#include "match_tools.h" +#include <vespa/eval/eval/value_codec.h> +#include <vespa/vespalib/objects/nbostream.h> +#include <vespa/vespalib/util/runnable.h> +#include <vespa/vespalib/util/thread_bundle.h> +#include <vespa/searchlib/fef/feature_resolver.h> +#include <vespa/searchlib/fef/rank_program.h> +#include <vespa/searchlib/queryeval/searchiterator.h> + +using vespalib::Runnable; +using vespalib::ThreadBundle; +using search::FeatureSet; +using search::FeatureValues; +using search::fef::FeatureResolver; +using search::fef::RankProgram; +using search::queryeval::SearchIterator; + +namespace proton::matching { + +using OrderedDocs = ExtractFeatures::OrderedDocs; + +namespace { + +struct MyChunk : Runnable { + const std::pair<uint32_t,uint32_t> *begin; + const std::pair<uint32_t,uint32_t> *end; + FeatureValues &result; + MyChunk(const std::pair<uint32_t,uint32_t> *begin_in, + const std::pair<uint32_t,uint32_t> *end_in, + FeatureValues &result_in) + : begin(begin_in), end(end_in), result(result_in) {} + void calculate_features(SearchIterator &search, FeatureResolver &resolver) { + size_t num_features = result.names.size(); + assert(end > begin); + assert(num_features == resolver.num_features()); + search.initRange(begin[0].first, end[-1].first + 1); + for (auto pos = begin; pos != end; ++pos) { + uint32_t docid = pos->first; + search.unpack(docid); + auto * f = &result.values[pos->second * num_features]; + for (uint32_t i = 0; i < num_features; ++i) { + if (resolver.is_object(i)) { + auto obj = resolver.resolve(i).as_object(docid); + if (!obj.get().type().is_double()) { + vespalib::nbostream buf; + encode_value(obj.get(), buf); + f[i].set_data(vespalib::Memory(buf.peek(), buf.size())); + } else { + f[i].set_double(obj.get().as_double()); + } + } else { + f[i].set_double(resolver.resolve(i).as_number(docid)); + } + } + } + } +}; + +struct FirstChunk : MyChunk { + SearchIterator &search; + FeatureResolver &resolver; + FirstChunk(const 
std::pair<uint32_t,uint32_t> *begin_in, + const std::pair<uint32_t,uint32_t> *end_in, + FeatureValues &result_in, + SearchIterator &search_in, + FeatureResolver &resolver_in) + : MyChunk(begin_in, end_in, result_in), + search(search_in), + resolver(resolver_in) {} + void run() override { calculate_features(search, resolver); } +}; + +struct LaterChunk : MyChunk { + const MatchToolsFactory &mtf; + LaterChunk(const std::pair<uint32_t,uint32_t> *begin_in, + const std::pair<uint32_t,uint32_t> *end_in, + FeatureValues &result_in, + const MatchToolsFactory &mtf_in) + : MyChunk(begin_in, end_in, result_in), + mtf(mtf_in) {} + void run() override { + auto tools = mtf.createMatchTools(); + tools->setup_match_features(); + FeatureResolver resolver(tools->rank_program().get_seeds(false)); + calculate_features(tools->search(), resolver); + } +}; + +struct MyWork { + size_t num_threads; + std::vector<Runnable::UP> chunks; + MyWork(ThreadBundle &thread_bundle) : num_threads(thread_bundle.size()), chunks() { + chunks.reserve(num_threads); + } + void run(ThreadBundle &thread_bundle) { + std::vector<Runnable*> refs; + refs.reserve(chunks.size()); + for (const auto &task: chunks) { + refs.push_back(task.get()); + } + thread_bundle.run(refs); + } +}; + +} // unnamed + +FeatureSet::UP +ExtractFeatures::get_feature_set(SearchIterator &search, RankProgram &rank_program, const std::vector<uint32_t> &docs) +{ + std::vector<vespalib::string> featureNames; + FeatureResolver resolver(rank_program.get_seeds(false)); + featureNames.reserve(resolver.num_features()); + for (size_t i = 0; i < resolver.num_features(); ++i) { + featureNames.emplace_back(resolver.name_of(i)); + } + auto result = std::make_unique<FeatureSet>(featureNames, docs.size()); + if (!docs.empty()) { + search.initRange(docs.front(), docs.back()+1); + for (uint32_t docid: docs) { + search.unpack(docid); + auto * f = result->getFeaturesByIndex(result->addDocId(docid)); + for (uint32_t i = 0; i < featureNames.size(); ++i) { + if 
(resolver.is_object(i)) { + auto obj = resolver.resolve(i).as_object(docid); + if (!obj.get().type().is_double()) { + vespalib::nbostream buf; + encode_value(obj.get(), buf); + f[i].set_data(vespalib::Memory(buf.peek(), buf.size())); + } else { + f[i].set_double(obj.get().as_double()); + } + } else { + f[i].set_double(resolver.resolve(i).as_number(docid)); + } + } + } + } + return result; +} + +FeatureValues +ExtractFeatures::get_match_features(const MatchToolsFactory &mtf, const OrderedDocs &docs, ThreadBundle &thread_bundle) +{ + FeatureValues result; + auto tools = mtf.createMatchTools(); + tools->setup_match_features(); + FeatureResolver resolver(tools->rank_program().get_seeds(false)); + result.names.reserve(resolver.num_features()); + for (size_t i = 0; i < resolver.num_features(); ++i) { + result.names.emplace_back(resolver.name_of(i)); + } + result.values.resize(result.names.size() * docs.size()); + MyWork work(thread_bundle); + size_t per_thread = docs.size() / work.num_threads; + size_t rest_docs = docs.size() % work.num_threads; + size_t idx = 0; + for (size_t i = 0; i < work.num_threads; ++i) { + size_t chunk_size = per_thread + (i < rest_docs); + if (chunk_size == 0) { + break; + } + if (i == 0) { + work.chunks.push_back(std::make_unique<FirstChunk>(&docs[idx], &docs[idx + chunk_size], result, tools->search(), resolver)); + } else { + work.chunks.push_back(std::make_unique<LaterChunk>(&docs[idx], &docs[idx + chunk_size], result, mtf)); + } + idx += chunk_size; + } + assert(idx == docs.size()); + work.run(thread_bundle); + return result; +} + +} diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.h b/searchcore/src/vespa/searchcore/proton/matching/extract_features.h new file mode 100644 index 00000000000..66e98d9db2d --- /dev/null +++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.h @@ -0,0 +1,39 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/searchlib/common/featureset.h> +#include <vector> + +namespace vespalib { class ThreadBundle; }; +namespace search::queryeval { class SearchIterator; } +namespace search::fef { class RankProgram; } + +namespace proton::matching { + +class MatchToolsFactory; + +struct ExtractFeatures { + using FeatureSet = search::FeatureSet; + using FeatureValues = search::FeatureValues; + using ThreadBundle = vespalib::ThreadBundle; + using SearchIterator = search::queryeval::SearchIterator; + using RankProgram = search::fef::RankProgram; + + /** + * Extract all seed features from a rank program for a list of + * documents (must be in ascending order) using unpack information + * from a search. + **/ + static FeatureSet::UP get_feature_set(SearchIterator &search, RankProgram &rank_program, const std::vector<uint32_t> &docs); + + // first: docid, second: result index (must be sorted on docid) + using OrderedDocs = std::vector<std::pair<uint32_t,uint32_t>>; + + /** + * Extract match features using multiple threads. 
+ **/ + static FeatureValues get_match_features(const MatchToolsFactory &mtf, const OrderedDocs &docs, ThreadBundle &thread_bundle); +}; + +} diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp index 26ed94f1d73..827ff4b5aca 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp @@ -5,7 +5,9 @@ #include "match_loop_communicator.h" #include "match_thread.h" #include "match_tools.h" +#include "extract_features.h" #include <vespa/searchlib/engine/trace.h> +#include <vespa/searchlib/engine/searchreply.h> #include <vespa/vespalib/util/thread_bundle.h> #include <vespa/vespalib/util/issue.h> #include <vespa/vespalib/data/slime/inserter.h> @@ -21,6 +23,7 @@ namespace proton::matching { using namespace search::fef; using search::queryeval::SearchIterator; using search::FeatureSet; +using vespalib::ThreadBundle; using vespalib::Issue; namespace { @@ -57,12 +60,25 @@ createScheduler(uint32_t numThreads, uint32_t numSearchPartitions, uint32_t numD return std::make_unique<TaskDocidRangeScheduler>(numThreads, numSearchPartitions, numDocs); } +auto make_reply(const MatchToolsFactory &mtf, ResultProcessor &processor, ThreadBundle &bundle, auto full_result) { + if (mtf.has_match_features()) { + auto docs = processor.extract_docid_ordering(*full_result); + auto reply = processor.makeReply(std::move(std::move(full_result))); + if ((docs.size() > 0) && reply->_reply) { + reply->_reply->match_features = ExtractFeatures::get_match_features(mtf, docs, bundle); + } + return reply; + } else { + return processor.makeReply(std::move(full_result)); + } +} + } // namespace proton::matching::<unnamed> ResultProcessor::Result::UP MatchMaster::match(search::engine::Trace & trace, const MatchParams &params, - vespalib::ThreadBundle &threadBundle, + ThreadBundle &threadBundle, const MatchToolsFactory &mtf, 
ResultProcessor &resultProcessor, uint32_t distributionKey, @@ -87,7 +103,7 @@ MatchMaster::match(search::engine::Trace & trace, } resultProcessor.prepareThreadContextCreation(threadBundle.size()); threadBundle.run(targets); - ResultProcessor::Result::UP reply = resultProcessor.makeReply(threadState[0]->extract_result()); + auto reply = make_reply(mtf, resultProcessor, threadBundle, threadState[0]->extract_result()); double query_time_s = vespalib::to_s(query_latency_time.elapsed()); double rerank_time_s = vespalib::to_s(timedCommunicator.elapsed); double match_time_s = 0.0; diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp index d652435cbca..accbb19669a 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp @@ -141,6 +141,12 @@ MatchTools::setup_second_phase() } void +MatchTools::setup_match_features() +{ + setup(_rankSetup.create_match_program()); +} + +void MatchTools::setup_summary() { setup(_rankSetup.create_summary_program()); @@ -281,6 +287,12 @@ MatchToolsFactory::has_first_phase_rank() const { return !_rankSetup.getFirstPhaseRank().empty(); } +bool +MatchToolsFactory::has_match_features() const +{ + return _rankSetup.has_match_features(); +} + AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext, vespalib::stringref attribute, vespalib::stringref operation) : _requestContext(requestContext), diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h index a403b4b1a78..d63c67ec1d0 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h @@ -67,6 +67,7 @@ public: void tag_search_as_changed() { _search_has_changed = true; } void setup_first_phase(); void setup_second_phase(); + void 
setup_match_features(); void setup_summary(); void setup_dump(); }; @@ -129,6 +130,7 @@ public: std::unique_ptr<search::queryeval::IDiversifier> createDiversifier(uint32_t heapSize) const; search::queryeval::Blueprint::HitEstimate estimate() const { return _query.estimate(); } bool has_first_phase_rank() const; + bool has_match_features() const; std::unique_ptr<AttributeOperationTask> createOnMatchTask() const; std::unique_ptr<AttributeOperationTask> createOnFirstPhaseTask() const; std::unique_ptr<AttributeOperationTask> createOnSecondPhaseTask() const; diff --git a/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp b/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp index 3da0d17895a..da1e6a2d567 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp @@ -102,6 +102,19 @@ ResultProcessor::createThreadContext(const vespalib::Doom & hardDoom, size_t thr return std::make_unique<Context>(std::move(sort), std::move(result), std::move(groupingContext)); } +std::vector<std::pair<uint32_t,uint32_t>> +ResultProcessor::extract_docid_ordering(const PartialResult &result) const +{ + size_t est_size = result.size() - std::min(result.size(), _offset); + std::vector<std::pair<uint32_t,uint32_t>> list; + list.reserve(est_size); + for (size_t i = _offset; i < result.size(); ++i) { + list.emplace_back(result.hit(i)._docId, list.size()); + } + std::sort(list.begin(), list.end(), [](const auto &a, const auto &b){ return (a.first < b.first); }); + return list; +}; + ResultProcessor::Result::UP ResultProcessor::makeReply(PartialResultUP full_result) { diff --git a/searchcore/src/vespa/searchcore/proton/matching/result_processor.h b/searchcore/src/vespa/searchcore/proton/matching/result_processor.h index e0220d53d1a..5ec11cd7acb 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/result_processor.h +++ 
b/searchcore/src/vespa/searchcore/proton/matching/result_processor.h @@ -103,6 +103,7 @@ public: size_t countFS4Hits(); void prepareThreadContextCreation(size_t num_threads); Context::UP createThreadContext(const vespalib::Doom & hardDoom, size_t thread_id, uint32_t distributionKey); + std::vector<std::pair<uint32_t,uint32_t>> extract_docid_ordering(const PartialResult &result) const; std::unique_ptr<Result> makeReply(PartialResultUP full_result); }; |