summary | refs | log | tree | commit | diff | stats
path: root/searchcore
diff options
context:
space:
mode:
author Arne Juul <arnej@yahoo-inc.com> 2017-12-12 12:18:55 +0000
committer Arne Juul <arnej@yahoo-inc.com> 2017-12-13 14:55:49 +0000
commit a1ed0cca2a7776f68c16e784e01330d7e7d2a197 (patch)
tree bf29d6d208a5ddac219dcf03d371066248e6ec00 /searchcore
parent ad36b949f9af80844afc53e4a36b740bd6ed8fc7 (diff)
do de-duplication on GlobalID in dispatch
Diffstat (limited to 'searchcore')
-rw-r--r-- searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h 4
-rw-r--r-- searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp 40
-rw-r--r-- searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h 1
3 files changed, 36 insertions, 9 deletions
diff --git a/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h b/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h
index a19dcff025d..ca0053e0261 100644
--- a/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h
+++ b/searchcore/src/vespa/searchcore/fdispatch/search/fnet_search.h
@@ -329,6 +329,10 @@ public:
_util.CalcHitCount();
_util.AllocAlignedHitBuf();
}
+ void ST_AdjustNumHits(uint32_t numHits) { // Pushes a revised hit count (numHits) into _util.
+ _util.SetAlignedHitCount(numHits); // record numHits as the aligned hit count
+ _util.CalcHitCount(); // ask _util to recompute its hit count afterwards; presumably derived from the aligned count - TODO confirm in _util's class
+ }
uint32_t ST_GetAlignedSearchOffset() const { return _util.GetAlignedSearchOffset(); }
uint32_t ST_GetAlignedMaxHits() const { return _util.GetAlignedMaxHits(); }
uint32_t ST_GetAlignedHitCount() const { return _util.GetAlignedHitCount(); }
diff --git a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp
index 846d95bd722..9a5f663c120 100644
--- a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp
+++ b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.cpp
@@ -5,6 +5,8 @@
#include "fnet_dataset.h"
#include "fnet_search.h"
#include <vespa/searchcore/util/stlishheap.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+#include <vespa/vespalib/stllike/hash_set.hpp>
#include <vespa/log/log.h>
LOG_SETUP(".fdispatch.mergehits");
@@ -65,10 +67,16 @@ FastS_MergeCopyHit(typename T::HitType *src,
dst->setDistributionKey(src->getDistributionKey());
}
+struct GlobalIdHasher { // Thin wrapper around a hash_set of document::GlobalId values.
+ vespalib::hash_set<document::GlobalId, document::GlobalId::hash> seenSet; // every GlobalId passed to insert() so far
+ bool insert(const document::GlobalId & g_id) { // Adds g_id to seenSet and reports the outcome of that insert.
+ return seenSet.insert(g_id).second; // `.second` of the insert result; presumably true only when g_id was not already present (std::set-like contract) - TODO confirm for vespalib::hash_set
+ }
+};
template <typename T, typename F>
-void
+size_t
FastS_InternalMergeHits(FastS_HitMerger<T> *merger)
{
typename T::SearchType *search = merger->GetSearch();
@@ -89,28 +97,40 @@ FastS_InternalMergeHits(FastS_HitMerger<T> *merger)
sortItr = sortRef;
}
+ GlobalIdHasher seenGids;
+
FastS_make_heap(heap, heapSize, FastS_MergeCompare<T, F>);
while (pt < end) {
node = *heap;
FastS_assert(heapSize > 0);
+ bool useHit = seenGids.insert(node->NT_GetHit()->HT_GetGlobalID());
if (F::UseSortData()) {
- if (!F::DropSortData()) {
+ if (!F::DropSortData() && useHit) {
FastS_MergeCopySortData<T>(node, sortItr++, sortDataLen);
}
node->NT_GetSortDataIterator()->Next();
}
- FastS_MergeCopyHit<T>(node->NT_GetHit(), pt++);
+ if (useHit) {
+ FastS_MergeCopyHit<T>(node->NT_GetHit(), pt++);
+ }
node->NT_NextHit();
if (node->NT_GetNumHitsLeft() > 0) {
FastS_pop_push_heap(heap, heapSize, node, FastS_MergeCompare<T, F>);
} else {
FastS_pop_heap(heap, heapSize--, FastS_MergeCompare<T, F>);
+ if (heapSize == 0) {
+ break;
+ }
}
}
+ if (pt != end) {
+ LOG(warning, "Duplicate removal lead to %zd missing hits (wanted %zd, got %zd)",
+ end - pt, end - beg, pt - beg);
+ }
merger->SetLastNode(node); // source of last hit
if (F::UseSortData()) {
- FastS_assert(F::DropSortData() || sortItr == sortRef + (end - beg));
+ FastS_assert(F::DropSortData() || sortItr == sortRef + (pt - beg));
}
// generate merged sort data
@@ -124,16 +144,17 @@ FastS_InternalMergeHits(FastS_HitMerger<T> *merger)
char *sortData = search->ST_GetSortData();
sortItr = sortRef;
- for (uint32_t residue = (end - beg); residue > 0; residue--) {
+ for (uint32_t residue = (pt - beg); residue > 0; residue--) {
*sortIdx++ = offset;
memcpy(sortData + offset, sortItr->_buf, sortItr->_len);
offset += sortItr->_len;
sortItr++;
}
*sortIdx = offset;
- FastS_assert(sortItr == sortRef + (end - beg));
+ FastS_assert(sortItr == sortRef + (pt - beg));
FastS_assert(offset == sortDataLen);
}
+ return (pt - beg);
}
//-----------------------------------------------------------------------------
@@ -219,16 +240,17 @@ FastS_HitMerger<T>::MergeHits()
// do actual merging by invoking templated function
if (useSortData) {
if (dropSortData) {
- FastS_InternalMergeHits
+ numDocs = FastS_InternalMergeHits
<T, FastS_MergeFeatures<true, true> >(this);
} else {
- FastS_InternalMergeHits
+ numDocs = FastS_InternalMergeHits
<T, FastS_MergeFeatures<true, false> >(this);
}
} else {
- FastS_InternalMergeHits
+ numDocs = FastS_InternalMergeHits
<T, FastS_MergeFeatures<false, false> >(this);
}
+ _search->ST_AdjustNumHits(numDocs);
// detect incomplete/fuzzy results
if (_search->ST_ShouldLimitHitsPerNode()) {
diff --git a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h
index 79157acb175..306229b4730 100644
--- a/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h
+++ b/searchcore/src/vespa/searchcore/fdispatch/search/mergehits.h
@@ -64,6 +64,7 @@ struct FastS_MergeHits_DummySearch
bool ST_ShouldDropSortData() { return false; }
bool ST_ShouldLimitHitsPerNode() { return false; }
void ST_SetNumHits(uint32_t numHits) { (void) numHits; }
+ void ST_AdjustNumHits(uint32_t nH) { (void) nH; } // no-op in the dummy search; the (void) cast only marks nH as intentionally unused
uint32_t ST_GetAlignedSearchOffset() { return 0; }
uint32_t ST_GetAlignedMaxHits() { return 0; }
uint32_t ST_GetAlignedHitCount() { return 0; }