calculate match features

+ factor out feature extraction + improve summary feature testing + extract returned docids with ordering
author: Håvard Pettersen <havardpe@oath.com> 2021-11-03 16:00:03 +0000
committer: Håvard Pettersen <havardpe@oath.com> 2021-11-04 15:11:32 +0000
commit: 2b2a16ac12b6fd40008bac37d59ec6fc89f66539 (patch)
tree: 3559cd7f934940a4fb2dc7cb1299133b0acf6462 /searchcore
parent: 6ebe77e2ceebd37aa26aa762f4c608fee22c1b40 (diff)
10 files changed, 387 insertions, 78 deletions
diff --git a/searchcore/src/tests/proton/matching/matching_test.cpp b/searchcore/src/tests/proton/matching/matching_test.cpp
index d690fb29795..5d084a2448f 100644
--- a/searchcore/src/tests/proton/matching/matching_test.cpp
+++ b/searchcore/src/tests/proton/matching/matching_test.cpp
@@ -143,10 +143,11 @@ struct MyWorld {
         config.add(indexproperties::rank::FirstPhase::NAME, "attribute(a1)");
         config.add(indexproperties::hitcollector::HeapSize::NAME, (vespalib::asciistream() << heapSize).str());
         config.add(indexproperties::hitcollector::ArraySize::NAME, (vespalib::asciistream() << arraySize).str());
-        config.add(indexproperties::summary::Feature::NAME, "attribute(a1)");
+        config.add(indexproperties::summary::Feature::NAME, "matches(f1)");
         config.add(indexproperties::summary::Feature::NAME, "rankingExpression(\"reduce(tensor(x[3])(x),sum)\")");
         config.add(indexproperties::summary::Feature::NAME, "rankingExpression(\"tensor(x[3])(x)\")");
         config.add(indexproperties::summary::Feature::NAME, "value(100)");
+        config.add(indexproperties::summary::Feature::NAME, " attribute ( a1 ) "); // will be sorted and normalized
 
         config.add(indexproperties::dump::IgnoreDefaultFeatures::NAME, "true");
         config.add(indexproperties::dump::Feature::NAME, "attribute(a2)");
@@ -211,6 +212,44 @@ struct MyWorld {
         config.import(cfg);
     }
 
+    void setup_match_features() { // registers five match features; verify_match_features() checks the same names in this order
+        config.add(indexproperties::match::Feature::NAME, "attribute(a1)");
+        config.add(indexproperties::match::Feature::NAME, "attribute(a2)");
+        config.add(indexproperties::match::Feature::NAME, "matches(a1)");
+        config.add(indexproperties::match::Feature::NAME, "matches(f1)");
+        config.add(indexproperties::match::Feature::NAME, "rankingExpression(\"tensor(x[3])(x)\")"); // tensor-valued feature
+    }
+
+    static void verify_match_features(SearchReply &reply, const vespalib::string &matched_field) { // matched_field: presumably the field the query was run against - confirm with callers
+        if (reply.hits.empty()) { // without hits neither names nor values are expected in the reply
+            EXPECT_EQUAL(reply.match_features.names.size(), 0u);
+            EXPECT_EQUAL(reply.match_features.values.size(), 0u);
+        } else {
+            ASSERT_EQUAL(reply.match_features.names.size(), 5u); // ASSERT: the indexing below relies on exactly 5 names
+            EXPECT_EQUAL(reply.match_features.names[0], "attribute(a1)");
+            EXPECT_EQUAL(reply.match_features.names[1], "attribute(a2)");
+            EXPECT_EQUAL(reply.match_features.names[2], "matches(a1)");
+            EXPECT_EQUAL(reply.match_features.names[3], "matches(f1)");
+            EXPECT_EQUAL(reply.match_features.names[4], "rankingExpression(\"tensor(x[3])(x)\")");
+            ASSERT_EQUAL(reply.match_features.values.size(), 5 * reply.hits.size()); // values are flat: 5 per hit
+            for (size_t i = 0; i < reply.hits.size(); ++i) {
+                const auto *f = &reply.match_features.values[i * 5]; // f[0..4] are the feature values of hit i
+                EXPECT_GREATER(f[0].as_double(), 0.0);
+                EXPECT_GREATER(f[1].as_double(), 0.0);
+                EXPECT_EQUAL(f[0].as_double() * 2, f[1].as_double()); // attribute(a2) is expected to be twice attribute(a1)
+                EXPECT_EQUAL(f[2].as_double(), double(matched_field == "a1")); // matches(a1): 1.0 only when matched_field is "a1"
+                EXPECT_EQUAL(f[3].as_double(), double(matched_field == "f1")); // matches(f1): 1.0 only when matched_field is "f1"
+                EXPECT_TRUE(f[4].is_data()); // the tensor feature is expected as data, not as a double
+                {
+                    nbostream buf(f[4].as_data().data, f[4].as_data().size); // decode the data and compare it as a TensorSpec
+                    auto actual = spec_from_value(*SimpleValue::from_stream(buf));
+                    auto expect = TensorSpec("tensor(x[3])").add({{"x", 0}}, 0).add({{"x", 1}}, 1).add({{"x", 2}}, 2);
+                    EXPECT_EQUAL(actual, expect);
+                }
+            }
+        }
+    }
+
     void setup_match_phase_limiting(const vespalib::string &attribute, size_t max_hits, bool descending)
     {
         inject_match_phase_limiting(config, attribute, max_hits, descending);
@@ -442,6 +481,30 @@ TEST("require that matching is performed (multi-threaded)") {
     }
 }
 
+TEST("require that match features are calculated (multi-threaded)") { // runs the same search with 1 to 16 threads
+    for (size_t threads = 1; threads <= 16; ++threads) {
+        MyWorld world; // a fresh world is set up for every thread count
+        world.basicSetup();
+        world.basicResults();
+        world.setup_match_features();
+        SearchRequest::SP request = world.createSimpleRequest("f1", "spread");
+        SearchReply::UP reply = world.performSearch(request, threads);
+        EXPECT_GREATER(reply->hits.size(), 0u); // make sure the with-hits branch of verify_match_features is exercised
+        world.verify_match_features(*reply, "f1");
+    }
+}
+
+TEST("require that no hits gives no match feature names") { // single-threaded search that is expected to return nothing
+    MyWorld world;
+    world.basicSetup();
+    world.basicResults();
+    world.setup_match_features();
+    SearchRequest::SP request = world.createSimpleRequest("f1", "not_found");
+    SearchReply::UP reply = world.performSearch(request, 1);
+    EXPECT_EQUAL(reply->hits.size(), 0u); // make sure the no-hits branch of verify_match_features is exercised
+    world.verify_match_features(*reply, "f1");
+}
+
 TEST("require that matching also returns hits when only bitvector is used (multi-threaded)") {
     for (size_t threads = 1; threads <= 16; ++threads) {
         MyWorld world;
@@ -645,30 +708,36 @@ TEST("require that summary features are filled") {
     world.basicResults();
     DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
     FeatureSet::SP fs = world.getSummaryFeatures(req);
-    const FeatureSet::Value * f = NULL;
-    EXPECT_EQUAL(4u, fs->numFeatures());
+    const FeatureSet::Value * f = nullptr;
+    EXPECT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
-    EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]);
-    EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]);
-    EXPECT_EQUAL("value(100)", fs->getNames()[3]);
-    EXPECT_EQUAL(2u, fs->numDocs());
+    EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
+    EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]);
+    EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]);
+    EXPECT_EQUAL("value(100)", fs->getNames()[4]);
+    EXPECT_EQUAL(3u, fs->numDocs());
     f = fs->getFeaturesByDocId(10);
-    EXPECT_TRUE(f != NULL);
+    EXPECT_TRUE(f != nullptr);
     EXPECT_EQUAL(10, f[0].as_double());
-    EXPECT_EQUAL(100, f[3].as_double());
+    EXPECT_EQUAL(1, f[1].as_double());
+    EXPECT_EQUAL(100, f[4].as_double());
     f = fs->getFeaturesByDocId(15);
-    EXPECT_TRUE(f == NULL);
+    EXPECT_TRUE(f != nullptr);
+    EXPECT_EQUAL(15, f[0].as_double());
+    EXPECT_EQUAL(0, f[1].as_double());
+    EXPECT_EQUAL(100, f[4].as_double());
     f = fs->getFeaturesByDocId(30);
-    EXPECT_TRUE(f != NULL);
+    EXPECT_TRUE(f != nullptr);
     EXPECT_EQUAL(30, f[0].as_double());
-    EXPECT_EQUAL(100, f[3].as_double());
-    EXPECT_TRUE(f[1].is_double());
-    EXPECT_TRUE(!f[1].is_data());
-    EXPECT_EQUAL(f[1].as_double(), 3.0); // 0 + 1 + 2
-    EXPECT_TRUE(!f[2].is_double());
-    EXPECT_TRUE(f[2].is_data());
+    EXPECT_EQUAL(1, f[1].as_double());
+    EXPECT_TRUE(f[2].is_double());
+    EXPECT_TRUE(!f[2].is_data());
+    EXPECT_EQUAL(f[2].as_double(), 3.0); // 0 + 1 + 2
+    EXPECT_TRUE(!f[3].is_double());
+    EXPECT_TRUE(f[3].is_data());
+    EXPECT_EQUAL(100, f[4].as_double());
     {
-        nbostream buf(f[2].as_data().data, f[2].as_data().size);
+        nbostream buf(f[3].as_data().data, f[3].as_data().size);
         auto actual = spec_from_value(*SimpleValue::from_stream(buf));
         auto expect = TensorSpec("tensor(x[3])").add({{"x", 0}}, 0).add({{"x", 1}}, 1).add({{"x", 2}}, 2);
         EXPECT_EQUAL(actual, expect);
@@ -681,17 +750,18 @@ TEST("require that rank features are filled") {
     world.basicResults();
     DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
     FeatureSet::SP fs = world.getRankFeatures(req);
-    const FeatureSet::Value * f = NULL;
+    const FeatureSet::Value * f = nullptr;
     EXPECT_EQUAL(1u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a2)", fs->getNames()[0]);
-    EXPECT_EQUAL(2u, fs->numDocs());
+    EXPECT_EQUAL(3u, fs->numDocs());
     f = fs->getFeaturesByDocId(10);
-    EXPECT_TRUE(f != NULL);
+    EXPECT_TRUE(f != nullptr);
     EXPECT_EQUAL(20, f[0].as_double());
     f = fs->getFeaturesByDocId(15);
-    EXPECT_TRUE(f == NULL);
+    EXPECT_TRUE(f != nullptr);
+    EXPECT_EQUAL(30, f[0].as_double());
     f = fs->getFeaturesByDocId(30);
-    EXPECT_TRUE(f != NULL);
+    EXPECT_TRUE(f != nullptr);
     EXPECT_EQUAL(60, f[0].as_double());
 }
 
@@ -727,29 +797,42 @@ TEST("require that getSummaryFeatures can use cached query setup") {
     docsum_request->hits.back().docid = 30;
 
     FeatureSet::SP fs = world.getSummaryFeatures(docsum_request);
-    ASSERT_EQUAL(4u, fs->numFeatures());
+    ASSERT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
-    EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]);
-    EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]);
-    EXPECT_EQUAL("value(100)", fs->getNames()[3]);
+    EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
+    EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]);
+    EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]);
+    EXPECT_EQUAL("value(100)", fs->getNames()[4]);
     ASSERT_EQUAL(1u, fs->numDocs());
     const auto *f = fs->getFeaturesByDocId(30);
     ASSERT_TRUE(f);
     EXPECT_EQUAL(30, f[0].as_double());
-    EXPECT_EQUAL(100, f[3].as_double());
+    EXPECT_EQUAL(100, f[4].as_double());
 
     // getSummaryFeatures can be called multiple times.
     fs = world.getSummaryFeatures(docsum_request);
-    ASSERT_EQUAL(4u, fs->numFeatures());
+    ASSERT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
-    EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[1]);
-    EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[2]);
-    EXPECT_EQUAL("value(100)", fs->getNames()[3]);
+    EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
+    EXPECT_EQUAL("rankingExpression(\"reduce(tensor(x[3])(x),sum)\")", fs->getNames()[2]);
+    EXPECT_EQUAL("rankingExpression(\"tensor(x[3])(x)\")", fs->getNames()[3]);
+    EXPECT_EQUAL("value(100)", fs->getNames()[4]);
     ASSERT_EQUAL(1u, fs->numDocs());
     f = fs->getFeaturesByDocId(30);
     ASSERT_TRUE(f);
     EXPECT_EQUAL(30, f[0].as_double());
-    EXPECT_EQUAL(100, f[3].as_double());
+    EXPECT_EQUAL(100, f[4].as_double());
+}
+
+double count_f1_matches(FeatureSet &fs) { // sums feature 1 ("matches(f1)") over all docs in fs
+    ASSERT_TRUE(fs.getNames().size() > 1); // feature index 1 is read below
+    ASSERT_EQUAL(fs.getNames()[1], "matches(f1)"); // the sum is only meaningful if index 1 really is matches(f1)
+    double sum = 0.0;
+    for (size_t i = 0; i < fs.numDocs(); ++i) {
+        auto *f = fs.getFeaturesByIndex(i); // feature values of the i'th doc in the set
+        sum += f[1].as_double();
+    }
+    return sum;
 }
 
 TEST("require that getSummaryFeatures prefers cached query setup") {
@@ -765,16 +848,18 @@ TEST("require that getSummaryFeatures prefers cached query setup") {
     req->sessionId = request->sessionId;
     req->propertiesMap.lookupCreate(search::MapNames::CACHES).add("query", "true");
     FeatureSet::SP fs = world.getSummaryFeatures(req);
-    EXPECT_EQUAL(4u, fs->numFeatures());
-    ASSERT_EQUAL(0u, fs->numDocs());  // "spread" has no hits
+    EXPECT_EQUAL(5u, fs->numFeatures());
+    EXPECT_EQUAL(3u, fs->numDocs());
+    EXPECT_EQUAL(0.0, count_f1_matches(*fs)); // "spread" has no hits
 
     // Empty cache
     auto pruneTime = vespalib::steady_clock::now() + 600s;
     world.sessionManager->pruneTimedOutSessions(pruneTime);
 
     fs = world.getSummaryFeatures(req);
-    EXPECT_EQUAL(4u, fs->numFeatures());
-    ASSERT_EQUAL(2u, fs->numDocs());  // "foo" has two hits
+    EXPECT_EQUAL(5u, fs->numFeatures());
+    EXPECT_EQUAL(3u, fs->numDocs());
+    EXPECT_EQUAL(2.0, count_f1_matches(*fs)); // "foo" has two hits
 }
 
 TEST("require that match params are set up straight with ranking on") {
diff --git a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt
index 5f0017293e7..41e2fe2105f 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt
+++ b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(searchcore_matching STATIC
     docid_range_scheduler.cpp
     docsum_matcher.cpp
     document_scorer.cpp
+    extract_features.cpp
     fakesearchcontext.cpp
     handlerecorder.cpp
     i_match_loop_communicator.cpp
diff --git a/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp b/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp
index 91b44b277f0..864e0a6b337 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/docsum_matcher.cpp
@@ -3,6 +3,7 @@
 #include "docsum_matcher.h"
 #include "match_tools.h"
 #include "search_session.h"
+#include "extract_features.h"
 #include <vespa/eval/eval/value_codec.h>
 #include <vespa/vespalib/objects/nbostream.h>
 #include <vespa/searchcommon/attribute/i_search_context.h>
@@ -46,45 +47,7 @@ get_feature_set(const MatchToolsFactory &mtf,
     } else {
         matchTools->setup_dump();
     }
-    RankProgram &rankProgram = matchTools->rank_program();
-
-    std::vector<vespalib::string> featureNames;
-    FeatureResolver resolver(rankProgram.get_seeds(false));
-    featureNames.reserve(resolver.num_features());
-    for (size_t i = 0; i < resolver.num_features(); ++i) {
-        featureNames.emplace_back(resolver.name_of(i));
-    }
-    auto retval = std::make_unique<FeatureSet>(featureNames, docs.size());
-    if (docs.empty()) {
-        return retval;
-    }
-    FeatureSet &fs = *retval;
-
-    SearchIterator &search = matchTools->search();
-    search.initRange(docs.front(), docs.back()+1);
-    for (uint32_t i = 0; i < docs.size(); ++i) {
-        if (search.seek(docs[i])) {
-            uint32_t docId = search.getDocId();
-            search.unpack(docId);
-            auto * f = fs.getFeaturesByIndex(fs.addDocId(docId));
-            for (uint32_t j = 0; j < featureNames.size(); ++j) {
-                if (resolver.is_object(j)) {
-                    auto obj = resolver.resolve(j).as_object(docId);
-                    if (! obj.get().type().is_double()) {
-                        vespalib::nbostream buf;
-                        encode_value(obj.get(), buf);
-                        f[j].set_data(vespalib::Memory(buf.peek(), buf.size()));
-                    } else {
-                        f[j].set_double(obj.get().as_double());
-                    }
-                } else {
-                    f[j].set_double(resolver.resolve(j).as_number(docId));
-                }
-            }
-        } else {
-            LOG(debug, "getFeatureSet: Did not find hit for docid '%u'. Skipping hit", docs[i]);
-        }
-    }
+    auto retval = ExtractFeatures::get_feature_set(matchTools->search(), matchTools->rank_program(), docs);
     if (auto onSummaryTask = mtf.createOnSummaryTask()) {
         onSummaryTask->run(docs);
     }
diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp
new file mode 100644
index 00000000000..ef03fac2f6a
--- /dev/null
+++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp
@@ -0,0 +1,177 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "extract_features.h"
+#include "match_tools.h"
+#include <vespa/eval/eval/value_codec.h>
+#include <vespa/vespalib/objects/nbostream.h>
+#include <vespa/vespalib/util/runnable.h>
+#include <vespa/vespalib/util/thread_bundle.h>
+#include <vespa/searchlib/fef/feature_resolver.h>
+#include <vespa/searchlib/fef/rank_program.h>
+#include <vespa/searchlib/queryeval/searchiterator.h>
+
+using vespalib::Runnable;
+using vespalib::ThreadBundle;
+using search::FeatureSet;
+using search::FeatureValues;
+using search::fef::FeatureResolver;
+using search::fef::RankProgram;
+using search::queryeval::SearchIterator;
+
+namespace proton::matching {
+
+using OrderedDocs = ExtractFeatures::OrderedDocs;
+
+namespace {
+
+struct MyChunk : Runnable { // a slice [begin,end) of (docid, result index) pairs and the shared result it fills in
+    const std::pair<uint32_t,uint32_t> *begin;
+    const std::pair<uint32_t,uint32_t> *end;
+    FeatureValues &result; // referenced, not owned; rows are addressed by result index
+    MyChunk(const std::pair<uint32_t,uint32_t> *begin_in,
+           const std::pair<uint32_t,uint32_t> *end_in,
+           FeatureValues &result_in)
+      : begin(begin_in), end(end_in), result(result_in) {}
+    void calculate_features(SearchIterator &search, FeatureResolver &resolver) { // writes one row of feature values per doc in the slice
+        size_t num_features = result.names.size();
+        assert(end > begin); // slice must be non-empty; begin[0] and end[-1] are read below
+        assert(num_features == resolver.num_features());
+        search.initRange(begin[0].first, end[-1].first + 1); // range from first to last docid of the slice; relies on the slice being sorted on docid
+        for (auto pos = begin; pos != end; ++pos) {
+            uint32_t docid = pos->first;
+            search.unpack(docid); // unpack is called directly for each docid, without a preceding seek
+            auto * f = &result.values[pos->second * num_features]; // row selected by result index, not by position in the slice
+            for (uint32_t i = 0; i < num_features; ++i) {
+                if (resolver.is_object(i)) {
+                    auto obj = resolver.resolve(i).as_object(docid);
+                    if (!obj.get().type().is_double()) { // non-double objects are encoded into a buffer and stored as data
+                        vespalib::nbostream buf;
+                        encode_value(obj.get(), buf);
+                        f[i].set_data(vespalib::Memory(buf.peek(), buf.size()));
+                    } else {
+                        f[i].set_double(obj.get().as_double());
+                    }
+                } else {
+                    f[i].set_double(resolver.resolve(i).as_number(docid));
+                }
+            }
+        }
+    }
+};
+
+struct FirstChunk : MyChunk { // chunk that uses a search iterator and resolver handed in by its creator
+    SearchIterator &search; // referenced, not owned
+    FeatureResolver &resolver; // referenced, not owned
+    FirstChunk(const std::pair<uint32_t,uint32_t> *begin_in,
+               const std::pair<uint32_t,uint32_t> *end_in,
+               FeatureValues &result_in,
+               SearchIterator &search_in,
+               FeatureResolver &resolver_in)
+      : MyChunk(begin_in, end_in, result_in),
+        search(search_in),
+        resolver(resolver_in) {}
+    void run() override { calculate_features(search, resolver); } // no setup needed; everything was supplied up front
+};
+
+struct LaterChunk : MyChunk { // chunk that creates its own match tools from the factory when it is run
+    const MatchToolsFactory &mtf; // referenced, not owned
+    LaterChunk(const std::pair<uint32_t,uint32_t> *begin_in,
+               const std::pair<uint32_t,uint32_t> *end_in,
+               FeatureValues &result_in,
+               const MatchToolsFactory &mtf_in)
+      : MyChunk(begin_in, end_in, result_in),
+        mtf(mtf_in) {}
+    void run() override {
+        auto tools = mtf.createMatchTools(); // private tools per chunk; presumably because tools are not shareable between threads - confirm
+        tools->setup_match_features();
+        FeatureResolver resolver(tools->rank_program().get_seeds(false)); // resolver over the seeds of this chunk's own rank program
+        calculate_features(tools->search(), resolver);
+    }
+};
+
+struct MyWork { // owns a list of chunks and runs them on a thread bundle
+    size_t num_threads; // taken from thread_bundle.size() at construction
+    std::vector<Runnable::UP> chunks;
+    MyWork(ThreadBundle &thread_bundle) : num_threads(thread_bundle.size()), chunks() {
+        chunks.reserve(num_threads); // room for one chunk per thread
+    }
+    void run(ThreadBundle &thread_bundle) {
+        std::vector<Runnable*> refs; // raw pointers handed to the bundle; ownership stays in chunks
+        refs.reserve(chunks.size());
+        for (const auto &task: chunks) {
+            refs.push_back(task.get());
+        }
+        thread_bundle.run(refs);
+    }
+};
+
+} // unnamed
+
+FeatureSet::UP
+ExtractFeatures::get_feature_set(SearchIterator &search, RankProgram &rank_program, const std::vector<uint32_t> &docs)
+{
+    std::vector<vespalib::string> featureNames;
+    FeatureResolver resolver(rank_program.get_seeds(false)); // resolver over the seeds of the rank program
+    featureNames.reserve(resolver.num_features());
+    for (size_t i = 0; i < resolver.num_features(); ++i) {
+        featureNames.emplace_back(resolver.name_of(i));
+    }
+    auto result = std::make_unique<FeatureSet>(featureNames, docs.size()); // names are set even when docs is empty
+    if (!docs.empty()) {
+        search.initRange(docs.front(), docs.back()+1); // relies on docs being in ascending order (front lowest, back highest)
+        for (uint32_t docid: docs) {
+            search.unpack(docid); // unpack is called for every listed docid, without a preceding seek
+            auto * f = result->getFeaturesByIndex(result->addDocId(docid)); // add the doc and get its row of feature values
+            for (uint32_t i = 0; i < featureNames.size(); ++i) {
+                if (resolver.is_object(i)) {
+                    auto obj = resolver.resolve(i).as_object(docid);
+                    if (!obj.get().type().is_double()) { // non-double objects are encoded into a buffer and stored as data
+                        vespalib::nbostream buf;
+                        encode_value(obj.get(), buf);
+                        f[i].set_data(vespalib::Memory(buf.peek(), buf.size()));
+                    } else {
+                        f[i].set_double(obj.get().as_double());
+                    }
+                } else {
+                    f[i].set_double(resolver.resolve(i).as_number(docid));
+                }
+            }
+        }
+    }
+    return result;
+}
+
+FeatureValues
+ExtractFeatures::get_match_features(const MatchToolsFactory &mtf, const OrderedDocs &docs, ThreadBundle &thread_bundle)
+{
+    // Returns feature names plus a flat values array (names.size() * docs.size()); docs are split into chunks run on thread_bundle.
+    FeatureValues result;
+    auto tools = mtf.createMatchTools();
+    tools->setup_match_features();
+    FeatureResolver resolver(tools->rank_program().get_seeds(false));
+    result.names.reserve(resolver.num_features());
+    for (size_t i = 0; i < resolver.num_features(); ++i) {
+        result.names.emplace_back(resolver.name_of(i));
+    }
+    result.values.resize(result.names.size() * docs.size()); // sized up front; chunks write disjoint rows by result index
+    MyWork work(thread_bundle);
+    size_t per_thread = docs.size() / work.num_threads;
+    size_t rest_docs = docs.size() % work.num_threads;
+    size_t idx = 0;
+    for (size_t i = 0; (i < work.num_threads) && (idx < docs.size()); ++i) { // stops early when there are fewer docs than threads
+        size_t chunk_size = per_thread + (i < rest_docs); // the first rest_docs chunks get one extra doc
+        const auto *first = docs.data() + idx; // pointer arithmetic; &docs[docs.size()] would index out of range for the last chunk
+        if (i == 0) { // the first chunk reuses the tools and resolver created above; later chunks create their own
+            work.chunks.push_back(std::make_unique<FirstChunk>(first, first + chunk_size, result, tools->search(), resolver));
+        } else {
+            work.chunks.push_back(std::make_unique<LaterChunk>(first, first + chunk_size, result, mtf));
+        }
+        idx += chunk_size;
+    }
+    assert(idx == docs.size()); // every doc must belong to exactly one chunk
+    work.run(thread_bundle);
+    return result;
+}
+
+}
diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.h b/searchcore/src/vespa/searchcore/proton/matching/extract_features.h
new file mode 100644
index 00000000000..66e98d9db2d
--- /dev/null
+++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.h
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchlib/common/featureset.h>
+#include <vector>
+
+namespace vespalib { class ThreadBundle; };
+namespace search::queryeval { class SearchIterator; }
+namespace search::fef { class RankProgram; }
+
+namespace proton::matching {
+
+class MatchToolsFactory;
+
+struct ExtractFeatures {
+    using FeatureSet = search::FeatureSet;
+    using FeatureValues = search::FeatureValues;
+    using ThreadBundle = vespalib::ThreadBundle;
+    using SearchIterator = search::queryeval::SearchIterator;
+    using RankProgram = search::fef::RankProgram;
+
+    /**
+     * Extract all seed features from a rank program for a list of
+     * documents (must be in ascending order) using unpack information
+     * from a search.
+     **/
+    static FeatureSet::UP get_feature_set(SearchIterator &search, RankProgram &rank_program, const std::vector<uint32_t> &docs);
+
+    // first: docid, second: index of the result row to fill for that docid (list must be sorted on docid)
+    using OrderedDocs = std::vector<std::pair<uint32_t,uint32_t>>;
+
+    /**
+     * Extract match features for docs using the threads of thread_bundle; each doc's values are placed by its result index.
+     **/
+    static FeatureValues get_match_features(const MatchToolsFactory &mtf, const OrderedDocs &docs, ThreadBundle &thread_bundle);
+};
+
+}
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp
index 26ed94f1d73..827ff4b5aca 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp
@@ -5,7 +5,9 @@
 #include "match_loop_communicator.h"
 #include "match_thread.h"
 #include "match_tools.h"
+#include "extract_features.h"
 #include <vespa/searchlib/engine/trace.h>
+#include <vespa/searchlib/engine/searchreply.h>
 #include <vespa/vespalib/util/thread_bundle.h>
 #include <vespa/vespalib/util/issue.h>
 #include <vespa/vespalib/data/slime/inserter.h>
@@ -21,6 +23,7 @@ namespace proton::matching {
 using namespace search::fef;
 using search::queryeval::SearchIterator;
 using search::FeatureSet;
+using vespalib::ThreadBundle;
 using vespalib::Issue;
 
 namespace {
@@ -57,12 +60,25 @@ createScheduler(uint32_t numThreads, uint32_t numSearchPartitions, uint32_t numD
     return std::make_unique<TaskDocidRangeScheduler>(numThreads, numSearchPartitions, numDocs);
 }
 
+auto make_reply(const MatchToolsFactory &mtf, ResultProcessor &processor, ThreadBundle &bundle, auto full_result) { // builds the reply and adds match features when the factory reports any
+    if (mtf.has_match_features()) {
+        auto docs = processor.extract_docid_ordering(*full_result); // must be read before full_result is moved into makeReply
+        auto reply = processor.makeReply(std::move(full_result)); // single move; the nested std::move was redundant
+        if ((docs.size() > 0) && reply->_reply) { // nothing to calculate without docs or without a search reply
+            reply->_reply->match_features = ExtractFeatures::get_match_features(mtf, docs, bundle);
+        }
+        return reply;
+    } else {
+        return processor.makeReply(std::move(full_result));
+    }
+}
+
 } // namespace proton::matching::<unnamed>
 
 ResultProcessor::Result::UP
 MatchMaster::match(search::engine::Trace & trace,
                    const MatchParams &params,
-                   vespalib::ThreadBundle &threadBundle,
+                   ThreadBundle &threadBundle,
                    const MatchToolsFactory &mtf,
                    ResultProcessor &resultProcessor,
                    uint32_t distributionKey,
@@ -87,7 +103,7 @@ MatchMaster::match(search::engine::Trace & trace,
     }
     resultProcessor.prepareThreadContextCreation(threadBundle.size());
     threadBundle.run(targets);
-    ResultProcessor::Result::UP reply = resultProcessor.makeReply(threadState[0]->extract_result());
+    auto reply = make_reply(mtf, resultProcessor, threadBundle, threadState[0]->extract_result());
     double query_time_s = vespalib::to_s(query_latency_time.elapsed());
     double rerank_time_s = vespalib::to_s(timedCommunicator.elapsed);
     double match_time_s = 0.0;
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
index d652435cbca..accbb19669a 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
@@ -141,6 +141,12 @@ MatchTools::setup_second_phase()
 }
 
 void
+MatchTools::setup_match_features()
+{
+    setup(_rankSetup.create_match_program()); // same pattern as setup_summary(), but with the match program from the rank setup
+}
+
+void
 MatchTools::setup_summary()
 {
     setup(_rankSetup.create_summary_program());
@@ -281,6 +287,12 @@ MatchToolsFactory::has_first_phase_rank() const {
     return !_rankSetup.getFirstPhaseRank().empty();
 }
 
+bool
+MatchToolsFactory::has_match_features() const
+{
+    return _rankSetup.has_match_features(); // forwards the question to the rank setup
+}
+
 AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext,
                                                vespalib::stringref attribute, vespalib::stringref operation)
     : _requestContext(requestContext),
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
index a403b4b1a78..d63c67ec1d0 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
@@ -67,6 +67,7 @@ public:
     void tag_search_as_changed() { _search_has_changed = true; }
     void setup_first_phase();
     void setup_second_phase();
+    void setup_match_features();
     void setup_summary();
     void setup_dump();
 };
@@ -129,6 +130,7 @@ public:
     std::unique_ptr<search::queryeval::IDiversifier> createDiversifier(uint32_t heapSize) const;
     search::queryeval::Blueprint::HitEstimate estimate() const { return _query.estimate(); }
     bool has_first_phase_rank() const;
+    bool has_match_features() const;
     std::unique_ptr<AttributeOperationTask> createOnMatchTask() const;
     std::unique_ptr<AttributeOperationTask> createOnFirstPhaseTask() const;
     std::unique_ptr<AttributeOperationTask> createOnSecondPhaseTask() const;
diff --git a/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp b/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp
index 3da0d17895a..da1e6a2d567 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/result_processor.cpp
@@ -102,6 +102,19 @@ ResultProcessor::createThreadContext(const vespalib::Doom & hardDoom, size_t thr
     return std::make_unique<Context>(std::move(sort), std::move(result), std::move(groupingContext));
 }
 
+std::vector<std::pair<uint32_t,uint32_t>>
+ResultProcessor::extract_docid_ordering(const PartialResult &result) const
+{
+    size_t est_size = result.size() - std::min(result.size(), _offset); // number of hits from _offset onwards; 0 when _offset >= size
+    std::vector<std::pair<uint32_t,uint32_t>> list; // first: docid, second: position among the hits from _offset onwards
+    list.reserve(est_size);
+    for (size_t i = _offset; i < result.size(); ++i) {
+        list.emplace_back(result.hit(i)._docId, list.size()); // list.size() is the position before this entry is added
+    }
+    std::sort(list.begin(), list.end(), [](const auto &a, const auto &b){ return (a.first < b.first); }); // order by docid
+    return list;
+}
+
 ResultProcessor::Result::UP
 ResultProcessor::makeReply(PartialResultUP full_result)
 {
diff --git a/searchcore/src/vespa/searchcore/proton/matching/result_processor.h b/searchcore/src/vespa/searchcore/proton/matching/result_processor.h
index e0220d53d1a..5ec11cd7acb 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/result_processor.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/result_processor.h
@@ -103,6 +103,7 @@ public:
     size_t countFS4Hits();
     void prepareThreadContextCreation(size_t num_threads);
     Context::UP createThreadContext(const vespalib::Doom & hardDoom, size_t thread_id, uint32_t distributionKey);
+    std::vector<std::pair<uint32_t,uint32_t>> extract_docid_ordering(const PartialResult &result) const;
     std::unique_ptr<Result> makeReply(PartialResultUP full_result);
 };
author	Håvard Pettersen <havardpe@oath.com>	2021-11-03 16:00:03 +0000
committer	Håvard Pettersen <havardpe@oath.com>	2021-11-04 15:11:32 +0000
commit	2b2a16ac12b6fd40008bac37d59ec6fc89f66539 (patch)
tree	3559cd7f934940a4fb2dc7cb1299133b0acf6462 /searchcore
parent	6ebe77e2ceebd37aa26aa762f4c608fee22c1b40 (diff)