Merge pull request #23755 from vespa-engine/balder/also-track-dropped-docids

Balder/also track dropped docids
author: Henning Baldersheim <balder@yahoo-inc.com> 2022-08-24 13:34:20 +0200
committer: GitHub <noreply@github.com> 2022-08-24 13:34:20 +0200
commit: bb2a96b90f2ed0890de58dc955bb6c9bc433beee (patch)
tree: 57b7ca32a1314539107912f42ad55a251e0a1771
parent: 41a101bd2ccfa74e072a2de51101e7881dbe8b50 (diff)
parent: 0ecd1ebf6e06ea9ee0d5c8fda12b02f3c2f66c13 (diff)
6 files changed, 57 insertions, 37 deletions
diff --git a/searchcore/src/tests/proton/matching/matching_test.cpp b/searchcore/src/tests/proton/matching/matching_test.cpp
index 36b79c7bb8e..5f9a3163a16 100644
--- a/searchcore/src/tests/proton/matching/matching_test.cpp
+++ b/searchcore/src/tests/proton/matching/matching_test.cpp
@@ -108,8 +108,8 @@ vespalib::string make_same_element_stack_dump(const vespalib::string &a1_term, c
 const uint32_t NUM_DOCS = 1000;
 
 struct EmptyConstantValueRepo : public proton::matching::IConstantValueRepo {
-    virtual vespalib::eval::ConstantValue::UP getConstant(const vespalib::string &) const override {
-        return vespalib::eval::ConstantValue::UP(nullptr);
+    vespalib::eval::ConstantValue::UP getConstant(const vespalib::string &) const override {
+        return {};
     }
 };
 
@@ -196,7 +196,7 @@ struct MyWorld {
         }
 
         // grouping
-        sessionManager = SessionManager::SP(new SessionManager(100));
+        sessionManager = std::make_shared<SessionManager>(100);
 
         // metaStore
         for (uint32_t i = 0; i < NUM_DOCS; ++i) {
@@ -353,13 +353,13 @@ struct MyWorld {
     struct MySearchHandler : ISearchHandler {
         Matcher::SP _matcher;
 
-        MySearchHandler(Matcher::SP matcher) noexcept : _matcher(std::move(matcher)) {}
+        explicit MySearchHandler(Matcher::SP matcher) noexcept : _matcher(std::move(matcher)) {}
 
         DocsumReply::UP getDocsums(const DocsumRequest &) override {
-            return DocsumReply::UP();
+            return {};
         }
         SearchReply::UP match(const SearchRequest &, vespalib::ThreadBundle &) const override {
-            return SearchReply::UP();
+            return {};
         }
     };
 
@@ -420,15 +420,15 @@ struct MyWorld {
         return std::make_unique<FieldInfo>(*field);
     }
 
-    FeatureSet::SP getSummaryFeatures(DocsumRequest::SP req) {
+    FeatureSet::SP getSummaryFeatures(const DocsumRequest & req) {
         Matcher::SP matcher = createMatcher();
-        auto docsum_matcher = matcher->create_docsum_matcher(*req, searchContext, attributeContext, *sessionManager);
+        auto docsum_matcher = matcher->create_docsum_matcher(req, searchContext, attributeContext, *sessionManager);
         return docsum_matcher->get_summary_features();
     }
 
-    FeatureSet::SP getRankFeatures(DocsumRequest::SP req) {
+    FeatureSet::SP getRankFeatures(const DocsumRequest & req) {
         Matcher::SP matcher = createMatcher();
-        auto docsum_matcher = matcher->create_docsum_matcher(*req, searchContext, attributeContext, *sessionManager);
+        auto docsum_matcher = matcher->create_docsum_matcher(req, searchContext, attributeContext, *sessionManager);
         return docsum_matcher->get_rank_features();
     }
 
@@ -747,7 +747,7 @@ TEST("require that summary features are filled") {
     world.basicSetup();
     world.basicResults();
     DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
-    FeatureSet::SP fs = world.getSummaryFeatures(req);
+    FeatureSet::SP fs = world.getSummaryFeatures(*req);
     const FeatureSet::Value * f = nullptr;
     EXPECT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
@@ -789,7 +789,7 @@ TEST("require that rank features are filled") {
     world.basicSetup();
     world.basicResults();
     DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
-    FeatureSet::SP fs = world.getRankFeatures(req);
+    FeatureSet::SP fs = world.getRankFeatures(*req);
     const FeatureSet::Value * f = nullptr;
     EXPECT_EQUAL(1u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a2)", fs->getNames()[0]);
@@ -827,7 +827,7 @@ TEST("require that summary features can be renamed") {
     world.setup_feature_renames();
     world.basicResults();
     DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
-    FeatureSet::SP fs = world.getSummaryFeatures(req);
+    FeatureSet::SP fs = world.getSummaryFeatures(*req);
     const FeatureSet::Value * f = nullptr;
     EXPECT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
@@ -853,10 +853,10 @@ TEST("require that getSummaryFeatures can use cached query setup") {
     DocsumRequest::SP docsum_request(new DocsumRequest);  // no stack dump
     docsum_request->sessionId = request->sessionId;
     docsum_request->propertiesMap.lookupCreate(search::MapNames::CACHES).add("query", "true");
-    docsum_request->hits.push_back(DocsumRequest::Hit());
+    docsum_request->hits.emplace_back();
     docsum_request->hits.back().docid = 30;
 
-    FeatureSet::SP fs = world.getSummaryFeatures(docsum_request);
+    FeatureSet::SP fs = world.getSummaryFeatures(*docsum_request);
     ASSERT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
     EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
@@ -870,7 +870,7 @@ TEST("require that getSummaryFeatures can use cached query setup") {
     EXPECT_EQUAL(100, f[4].as_double());
 
     // getSummaryFeatures can be called multiple times.
-    fs = world.getSummaryFeatures(docsum_request);
+    fs = world.getSummaryFeatures(*docsum_request);
     ASSERT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL("attribute(a1)", fs->getNames()[0]);
     EXPECT_EQUAL("matches(f1)", fs->getNames()[1]);
@@ -907,7 +907,7 @@ TEST("require that getSummaryFeatures prefers cached query setup") {
     DocsumRequest::SP req = world.createSimpleDocsumRequest("f1", "foo");
     req->sessionId = request->sessionId;
     req->propertiesMap.lookupCreate(search::MapNames::CACHES).add("query", "true");
-    FeatureSet::SP fs = world.getSummaryFeatures(req);
+    FeatureSet::SP fs = world.getSummaryFeatures(*req);
     EXPECT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL(3u, fs->numDocs());
     EXPECT_EQUAL(0.0, count_f1_matches(*fs)); // "spread" has no hits
@@ -916,7 +916,7 @@ TEST("require that getSummaryFeatures prefers cached query setup") {
     auto pruneTime = vespalib::steady_clock::now() + 600s;
     world.sessionManager->pruneTimedOutSessions(pruneTime);
 
-    fs = world.getSummaryFeatures(req);
+    fs = world.getSummaryFeatures(*req);
     EXPECT_EQUAL(5u, fs->numFeatures());
     EXPECT_EQUAL(3u, fs->numDocs());
     EXPECT_EQUAL(2.0, count_f1_matches(*fs)); // "foo" has two hits
@@ -1123,7 +1123,7 @@ struct GlobalFilterParamsFixture {
        rank_properties.add(GlobalFilterLowerLimit::NAME, lower_limit);
        rank_properties.add(GlobalFilterUpperLimit::NAME, upper_limit);
    }
-   AttributeBlueprintParams extract(uint32_t active_docids = 9, uint32_t docid_limit = 10) {
+   AttributeBlueprintParams extract(uint32_t active_docids = 9, uint32_t docid_limit = 10) const {
        return MatchToolsFactory::extract_global_filter_params(rank_setup, rank_properties, active_docids, docid_limit);
    }
 };
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp
index 102da1d71e4..a6e4e366f7f 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp
@@ -61,14 +61,14 @@ MatchThread::Context::Context(double rankDropLimit, MatchTools &tools, HitCollec
     : matches(0),
       _matches_limit(tools.match_limiter().sample_hits_per_thread(num_threads)),
       _score_feature(get_score_feature(tools.rank_program())),
-      _ranking(tools.rank_program()),
       _rankDropLimit(rankDropLimit),
       _hits(hits),
-      _doom(tools.getDoom())
+      _doom(tools.getDoom()),
+      dropped()
 {
 }
 
-template <bool use_rank_drop_limit>
+template <MatchThread::RankDropLimitE use_rank_drop_limit>
 void
 MatchThread::Context::rankHit(uint32_t docId) {
     double score = _score_feature.as_number(docId);
@@ -76,9 +76,11 @@ MatchThread::Context::rankHit(uint32_t docId) {
     if (__builtin_expect(std::isnan(score) || std::isinf(score), false)) {
         score = -HUGE_VAL;
     }
-    if (use_rank_drop_limit) {
+    if (use_rank_drop_limit != RankDropLimitE::no) {
         if (__builtin_expect(score > _rankDropLimit, true)) {
             _hits.addHit(docId, score);
+        } else if (use_rank_drop_limit == RankDropLimitE::track) {
+            dropped.template emplace_back(docId);
         }
     } else {
         _hits.addHit(docId, score);
@@ -136,7 +138,8 @@ MatchThread::try_share(DocidRange &docid_range, uint32_t next_docid) {
     return false;
 }
 
-template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work, bool use_rank_drop_limit>
+template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work,
+          MatchThread::RankDropLimitE use_rank_drop_limit>
 uint32_t
 MatchThread::inner_match_loop(Context &context, MatchTools &tools, DocidRange &docid_range)
 {
@@ -164,7 +167,8 @@ MatchThread::inner_match_loop(Context &context, MatchTools &tools, DocidRange &d
     return docId;
 }
 
-template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work, bool use_rank_drop_limit>
+template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work,
+          MatchThread::RankDropLimitE use_rank_drop_limit>
 void
 MatchThread::match_loop(MatchTools &tools, HitCollector &hits)
 {
@@ -202,11 +206,16 @@ MatchThread::match_loop(MatchTools &tools, HitCollector &hits)
     if (do_rank) {
         thread_stats.docsRanked(matches);
     }
+    if (use_rank_drop_limit == RankDropLimitE::track) {
+        if (auto task = matchToolsFactory.createOnMatchTask()) {
+            task->run(std::move(context.dropped));
+        }
+    }
 }
 
 //-----------------------------------------------------------------------------
 
-template <bool do_rank, bool do_limit, bool do_share, bool use_rank_drop_limit>
+template <bool do_rank, bool do_limit, bool do_share, MatchThread::RankDropLimitE use_rank_drop_limit>
 void
 MatchThread::match_loop_helper_rank_limit_share_drop(MatchTools &tools, HitCollector &hits)
 {
@@ -218,9 +227,13 @@ void
 MatchThread::match_loop_helper_rank_limit_share(MatchTools &tools, HitCollector &hits)
 {
     if (matchParams.has_rank_drop_limit()) {
-        match_loop_helper_rank_limit_share_drop<do_rank, do_limit, do_share, true>(tools, hits);
+        if (matchToolsFactory.hasOnMatchTask()) {
+            match_loop_helper_rank_limit_share_drop<do_rank, do_limit, do_share, RankDropLimitE::track>(tools, hits);
+        } else {
+            match_loop_helper_rank_limit_share_drop<do_rank, do_limit, do_share, RankDropLimitE::yes>(tools, hits);
+        }
     } else {
-        match_loop_helper_rank_limit_share_drop<do_rank, do_limit, do_share, false>(tools, hits);
+        match_loop_helper_rank_limit_share_drop<do_rank, do_limit, do_share, RankDropLimitE::no>(tools, hits);
     }
 }
 
@@ -312,7 +325,7 @@ MatchThread::processResult(const Doom & doom,
                            ResultProcessor::Context &context)
 {
     if (doom.hard_doom()) return;
-    bool hasGrouping = (context.grouping.get() != 0);
+    bool hasGrouping = bool(context.grouping);
     if (context.sort->hasSortData() || hasGrouping) {
         result->mergeWithBitOverflow(fallback_rank_value());
     }
@@ -361,8 +374,6 @@ MatchThread::processResult(const Doom & doom,
     }
 
     if (auto task = matchToolsFactory.createOnMatchTask()) {
-        // This is not correct, as it should use the results before rank-drop-limit
-        // But keeping like this for now as on-first-phase should be a subset of on-match
         task->run(result->copyResult());
     }
     if (auto task = matchToolsFactory.createOnFirstPhaseTask()) {
@@ -426,7 +437,7 @@ MatchThread::run()
                         scheduler.total_size(thread_id),
                         result->getNumHits(),
                         resultContext->sort->hasSortData(),
-                        resultContext->grouping.get() != 0));
+                        bool(resultContext->grouping)));
         get_token_timer.done();
         trace->addEvent(5, "Start result processing");
         processResult(matchTools->getDoom(), std::move(result), *resultContext);
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_thread.h b/searchcore/src/vespa/searchcore/proton/matching/match_thread.h
index 2ef51f71b73..524dbfd4b39 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_thread.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_thread.h
@@ -48,6 +48,7 @@ public:
     using UniqueIssues = search::UniqueIssues;
 
 private:
+    enum class RankDropLimitE { no, yes, track};
     size_t                        thread_id;
     size_t                        num_threads;
     MatchParams                   matchParams;
@@ -71,7 +72,7 @@ private:
     public:
         Context(double rankDropLimit, MatchTools &tools, HitCollector &hits,
                 uint32_t num_threads) __attribute__((noinline));
-        template <bool use_rank_drop_limit>
+        template <RankDropLimitE use_rank_drop_limit>
         void rankHit(uint32_t docId);
         void addHit(uint32_t docId) { _hits.addHit(docId, search::zero_rank_value); }
         bool isBelowLimit() const { return matches < _matches_limit; }
@@ -82,10 +83,11 @@ private:
     private:
         uint32_t        _matches_limit;
         LazyValue       _score_feature;
-        RankProgram    &_ranking;
         double          _rankDropLimit;
         HitCollector   &_hits;
         const Doom     &_doom;
+    public:
+        std::vector<uint32_t> dropped;
     };
 
     double estimate_match_frequency(uint32_t matches, uint32_t searchedSoFar) __attribute__((noinline));
@@ -94,13 +96,13 @@ private:
     bool any_idle() const { return (idle_observer.get() > 0); }
     bool try_share(DocidRange &docid_range, uint32_t next_docid) __attribute__((noinline));
 
-    template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work, bool use_rank_drop_limit>
+    template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work, RankDropLimitE use_rank_drop_limit>
     uint32_t inner_match_loop(Context &context, MatchTools &tools, DocidRange &docid_range) __attribute__((noinline));
 
-    template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work, bool use_rank_drop_limit>
+    template <typename Strategy, bool do_rank, bool do_limit, bool do_share_work, RankDropLimitE use_rank_drop_limit>
     void match_loop(MatchTools &tools, HitCollector &hits) __attribute__((noinline));
 
-    template <bool do_rank, bool do_limit, bool do_share, bool use_rank_drop_limit>
+    template <bool do_rank, bool do_limit, bool do_share, RankDropLimitE use_rank_drop_limit>
     void match_loop_helper_rank_limit_share_drop(MatchTools &tools, HitCollector &hits);
     template <bool do_rank, bool do_limit, bool do_share> void match_loop_helper_rank_limit_share(MatchTools &tools, HitCollector &hits);
     template <bool do_rank, bool do_limit> void match_loop_helper_rank_limit(MatchTools &tools, HitCollector &hits);
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
index eecbf116e9d..4a0f4e7ae8b 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp
@@ -285,6 +285,11 @@ MatchToolsFactory::createOnSummaryTask() const {
 }
 
 bool
+MatchToolsFactory::hasOnMatchTask() const {
+    return _rankSetup.getMutateOnMatch().enabled();
+}
+
+bool
 MatchToolsFactory::has_first_phase_rank() const {
     return !_rankSetup.getFirstPhaseRank().empty();
 }
diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
index d7254d4b958..5d98fabeb11 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
+++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.h
@@ -148,6 +148,7 @@ public:
     search::queryeval::Blueprint::HitEstimate estimate() const { return _query.estimate(); }
     bool has_first_phase_rank() const;
     bool has_match_features() const;
+    bool hasOnMatchTask() const;
     std::unique_ptr<AttributeOperationTask> createOnMatchTask() const;
     std::unique_ptr<AttributeOperationTask> createOnFirstPhaseTask() const;
     std::unique_ptr<AttributeOperationTask> createOnSecondPhaseTask() const;
diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h
index 1ad2fe0a77a..5b39715d5a1 100644
--- a/searchlib/src/vespa/searchlib/fef/ranksetup.h
+++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h
@@ -31,6 +31,7 @@ public:
             : _attribute(attribute),
               _operation(operation)
         {}
+        bool enabled() const { return !_attribute.empty() && !_operation.empty(); }
         vespalib::string _attribute;
         vespalib::string _operation;
     };
author	Henning Baldersheim <balder@yahoo-inc.com>	2022-08-24 13:34:20 +0200
committer	GitHub <noreply@github.com>	2022-08-24 13:34:20 +0200
commit	bb2a96b90f2ed0890de58dc955bb6c9bc433beee (patch)
tree	57b7ca32a1314539107912f42ad55a251e0a1771
parent	41a101bd2ccfa74e072a2de51101e7881dbe8b50 (diff)
parent	0ecd1ebf6e06ea9ee0d5c8fda12b02f3c2f66c13 (diff)