diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2017-12-14 11:42:54 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-12-14 11:42:54 +0100 |
commit | b27bada7c6b22d83557c6592d8c00fea2a6c35fe (patch) | |
tree | 33313752bd099009d21abf83ed5e4737e5f89182 | |
parent | ebb38b4228dbedcfecc77a5a0729c15f73d0cf96 (diff) | |
parent | 23484cfa3285e5e73e51f9a8f7b3652df3c3a680 (diff) |
Merge pull request #4441 from vespa-engine/arnej/add-dedup-in-dispatch
do de-duplication on GlobalID in dispatch
3 files changed, 35 insertions, 11 deletions
diff --git a/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h b/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h index a19dcff025d..ca0053e0261 100644 --- a/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h +++ b/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h @@ -329,6 +329,10 @@ public: _util.CalcHitCount(); _util.AllocAlignedHitBuf(); } + void ST_AdjustNumHits(uint32_t numHits) { + _util.SetAlignedHitCount(numHits); + _util.CalcHitCount(); + } uint32_t ST_GetAlignedSearchOffset() const { return _util.GetAlignedSearchOffset(); } uint32_t ST_GetAlignedMaxHits() const { return _util.GetAlignedMaxHits(); } uint32_t ST_GetAlignedHitCount() const { return _util.GetAlignedHitCount(); } diff --git a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp index 846d95bd722..8c4a08a3bbb 100644 --- a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp +++ b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp @@ -5,6 +5,8 @@ #include "fnet_dataset.h" #include "fnet_search.h" #include <vespa/searchcore/util/stlishheap.h> +#include <vespa/vespalib/stllike/hash_set.h> +#include <vespa/vespalib/stllike/hash_set.hpp> #include <vespa/log/log.h> LOG_SETUP(".fdispatch.mergehits"); @@ -65,10 +67,17 @@ FastS_MergeCopyHit(typename T::HitType *src, dst->setDistributionKey(src->getDistributionKey()); } +struct GlobalIdHasher { + vespalib::hash_set<document::GlobalId, document::GlobalId::hash> seenSet; + bool insert(const document::GlobalId & g_id) { + return seenSet.insert(g_id).second; + } + GlobalIdHasher(size_t expected_size) : seenSet(expected_size * 3) {} +}; template <typename T, typename F> -void +size_t FastS_InternalMergeHits(FastS_HitMerger<T> *merger) { typename T::SearchType *search = merger->GetSearch(); @@ -89,18 +98,22 @@ FastS_InternalMergeHits(FastS_HitMerger<T> *merger) sortItr = sortRef; } + GlobalIdHasher seenGids(end - beg); + FastS_make_heap(heap, heapSize, FastS_MergeCompare<T, F>); - while (pt < end) { + while ((pt < end) && (heapSize > 0)) { node = *heap; - FastS_assert(heapSize > 0); + bool useHit = seenGids.insert(node->NT_GetHit()->HT_GetGlobalID()); if (F::UseSortData()) { - if (!F::DropSortData()) { + if (!F::DropSortData() && useHit) { FastS_MergeCopySortData<T>(node, sortItr++, sortDataLen); } node->NT_GetSortDataIterator()->Next(); } - FastS_MergeCopyHit<T>(node->NT_GetHit(), pt++); + if (useHit) { + FastS_MergeCopyHit<T>(node->NT_GetHit(), pt++); + } node->NT_NextHit(); if (node->NT_GetNumHitsLeft() > 0) { FastS_pop_push_heap(heap, heapSize, node, FastS_MergeCompare<T, F>); @@ -108,9 +121,13 @@ FastS_InternalMergeHits(FastS_HitMerger<T> *merger) FastS_pop_heap(heap, heapSize--, FastS_MergeCompare<T, F>); } } + if (pt != end) { + LOG(warning, "Duplicate removal lead to %zd missing hits (wanted %zd, got %zd)", + end - pt, end - beg, pt - beg); + } merger->SetLastNode(node); // source of last hit if (F::UseSortData()) { - FastS_assert(F::DropSortData() || sortItr == sortRef + (end - beg)); + FastS_assert(F::DropSortData() || sortItr == sortRef + (pt - beg)); } // generate merged sort data @@ -124,16 +141,17 @@ FastS_InternalMergeHits(FastS_HitMerger<T> *merger) char *sortData = search->ST_GetSortData(); sortItr = sortRef; - for (uint32_t residue = (end - beg); residue > 0; residue--) { + for (uint32_t residue = (pt - beg); residue > 0; residue--) { *sortIdx++ = offset; memcpy(sortData + offset, sortItr->_buf, sortItr->_len); offset += sortItr->_len; sortItr++; } *sortIdx = offset; - FastS_assert(sortItr == sortRef + (end - beg)); + FastS_assert(sortItr == sortRef + (pt - beg)); FastS_assert(offset == sortDataLen); } + return (pt - beg); } //----------------------------------------------------------------------------- @@ -219,16 +237,17 @@ FastS_HitMerger<T>::MergeHits() // do actual merging by invoking templated function if (useSortData) { if (dropSortData) { - FastS_InternalMergeHits + numDocs = FastS_InternalMergeHits <T, FastS_MergeFeatures<true, true> >(this); } else { - FastS_InternalMergeHits + numDocs = FastS_InternalMergeHits <T, FastS_MergeFeatures<true, false> >(this); } } else { - FastS_InternalMergeHits + numDocs = FastS_InternalMergeHits <T, FastS_MergeFeatures<false, false> >(this); } + _search->ST_AdjustNumHits(numDocs); // detect incomplete/fuzzy results if (_search->ST_ShouldLimitHitsPerNode()) { diff --git a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h index 79157acb175..306229b4730 100644 --- a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h +++ b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h @@ -64,6 +64,7 @@ struct FastS_MergeHits_DummySearch bool ST_ShouldDropSortData() { return false; } bool ST_ShouldLimitHitsPerNode() { return false; } void ST_SetNumHits(uint32_t numHits) { (void) numHits; } + void ST_AdjustNumHits(uint32_t nH) { (void) nH; } uint32_t ST_GetAlignedSearchOffset() { return 0; } uint32_t ST_GetAlignedMaxHits() { return 0; } uint32_t ST_GetAlignedHitCount() { return 0; } |