summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorArne Juul <arnej@verizonmedia.com>2019-11-21 10:20:28 +0000
committerArne Juul <arnej@verizonmedia.com>2019-11-25 12:32:19 +0000
commit2ecd9513fa13a596c51a96fac6aacd4229d912cd (patch)
tree139f7426f0e934bb66eba036648f871924543dc7 /searchlib
parent2debaa455d8393559d1def6126ff719a221fd7dc (diff)
add NearestNeighborIterator
* add shared state in blueprint * set estimate in blueprint * disallow unpack in blueprint
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/CMakeLists.txt1
-rw-r--r--searchlib/src/tests/queryeval/nearest_neighbor/CMakeLists.txt9
-rw-r--r--searchlib/src/tests/queryeval/nearest_neighbor/nearest_neighbor_test.cpp177
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp28
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h3
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_distance_heap.h41
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp47
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.h79
9 files changed, 381 insertions, 5 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index e81d83ddefa..fadd05ec50d 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -188,6 +188,7 @@ vespa_define_module(
src/tests/queryeval/getnodeweight
src/tests/queryeval/monitoring_search_iterator
src/tests/queryeval/multibitvectoriterator
+ src/tests/queryeval/nearest_neighbor
src/tests/queryeval/parallel_weak_and
src/tests/queryeval/predicate
src/tests/queryeval/same_element
diff --git a/searchlib/src/tests/queryeval/nearest_neighbor/CMakeLists.txt b/searchlib/src/tests/queryeval/nearest_neighbor/CMakeLists.txt
new file mode 100644
index 00000000000..5dd5f03a778
--- /dev/null
+++ b/searchlib/src/tests/queryeval/nearest_neighbor/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+vespa_add_executable(searchlib_nearest_neighbor_test_app TEST
+ SOURCES
+ nearest_neighbor_test.cpp
+ DEPENDS
+ searchlib
+)
+vespa_add_test(NAME searchlib_nearest_neighbor_test_app COMMAND searchlib_nearest_neighbor_test_app)
diff --git a/searchlib/src/tests/queryeval/nearest_neighbor/nearest_neighbor_test.cpp b/searchlib/src/tests/queryeval/nearest_neighbor/nearest_neighbor_test.cpp
new file mode 100644
index 00000000000..a5f1ab1747d
--- /dev/null
+++ b/searchlib/src/tests/queryeval/nearest_neighbor/nearest_neighbor_test.cpp
@@ -0,0 +1,177 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/vespalib/util/stringfmt.h>
+
+#include <vespa/eval/tensor/default_tensor_engine.h>
+#include <vespa/eval/tensor/dense/dense_tensor.h>
+#include <vespa/eval/tensor/tensor.h>
+#include <vespa/searchlib/common/feature.h>
+#include <vespa/searchlib/fef/matchdata.h>
+#include <vespa/searchlib/queryeval/nearest_neighbor_iterator.h>
+#include <vespa/searchlib/queryeval/simpleresult.h>
+#include <vespa/searchlib/tensor/dense_tensor_attribute.h>
+#include <vespa/vespalib/test/insertion_operators.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP("nearest_neighbor_test");
+
+using search::feature_t;
+using search::tensor::DenseTensorAttribute;
+using search::AttributeVector;
+using vespalib::eval::ValueType;
+using vespalib::eval::TensorSpec;
+using vespalib::tensor::Tensor;
+using vespalib::tensor::DenseTensorView;
+using vespalib::tensor::DefaultTensorEngine;
+
+using namespace search::fef;
+using namespace search::queryeval;
+
+vespalib::string denseSpec("tensor(x[2])");
+
+std::unique_ptr<DenseTensorView> createTensor(const TensorSpec &spec) {
+ auto value = DefaultTensorEngine::ref().from_spec(spec);
+ DenseTensorView *tensor = dynamic_cast<DenseTensorView*>(value.get());
+ ASSERT_TRUE(tensor != nullptr);
+ value.release();
+ return std::unique_ptr<DenseTensorView>(tensor);
+}
+
+struct Fixture
+{
+ using BasicType = search::attribute::BasicType;
+ using CollectionType = search::attribute::CollectionType;
+ using Config = search::attribute::Config;
+
+ Config _cfg;
+ vespalib::string _name;
+ vespalib::string _typeSpec;
+ std::shared_ptr<DenseTensorAttribute> _tensorAttr;
+ std::shared_ptr<AttributeVector> _attr;
+
+ Fixture(const vespalib::string &typeSpec)
+ : _cfg(BasicType::TENSOR, CollectionType::SINGLE),
+ _name("test"),
+ _typeSpec(typeSpec),
+ _tensorAttr(),
+ _attr()
+ {
+ _cfg.setTensorType(ValueType::from_spec(typeSpec));
+ _tensorAttr = makeAttr();
+ _attr = _tensorAttr;
+ _attr->addReservedDoc();
+ }
+
+ ~Fixture() {}
+
+ std::shared_ptr<DenseTensorAttribute> makeAttr() {
+ return std::make_shared<DenseTensorAttribute>(_name, _cfg);
+ }
+
+ void ensureSpace(uint32_t docId) {
+ while (_attr->getNumDocs() <= docId) {
+ uint32_t newDocId = 0u;
+ _attr->addDoc(newDocId);
+ _attr->commit();
+ }
+ }
+
+ void setTensor(uint32_t docId, const Tensor &tensor) {
+ ensureSpace(docId);
+ _tensorAttr->setTensor(docId, tensor);
+ _attr->commit();
+ }
+
+ void setTensor(uint32_t docId, double v1, double v2) {
+ auto t = createTensor(TensorSpec(denseSpec)
+ .add({{"x", 0}}, v1)
+ .add({{"x", 1}}, v2));
+ setTensor(docId, *t);
+ }
+};
+
+template <bool strict>
+SimpleResult find_matches(Fixture &env) {
+ using NNI = NearestNeighborIterator<strict>;
+ auto md = MatchData::makeTestInstance(2, 2);
+ auto qt = createTensor(TensorSpec(denseSpec));
+
+ auto &tfmd = *(md->resolveTermField(0));
+ const DenseTensorView &qtv = *qt;
+ auto &attr = *(env._tensorAttr);
+
+ NearestNeighborDistanceHeap dh(2);
+ NNI search(tfmd, qtv, attr, dh);
+ if (strict) {
+ return SimpleResult().searchStrict(search, attr.getNumDocs());
+ } else {
+ return SimpleResult().search(search, attr.getNumDocs());
+ }
+}
+
+TEST("require that NearestNeighborIterator returns expected results") {
+ Fixture fixture(denseSpec);
+ fixture.ensureSpace(6);
+ fixture.setTensor(1, 3.0, 4.0);
+ fixture.setTensor(2, 6.0, 8.0);
+ fixture.setTensor(3, 5.0, 12.0);
+ fixture.setTensor(4, 4.0, 3.0);
+ fixture.setTensor(5, 8.0, 6.0);
+ fixture.setTensor(6, 4.0, 3.0);
+ SimpleResult result = find_matches<true>(fixture);
+ SimpleResult expect({1,2,4,6});
+ EXPECT_EQUAL(result, expect);
+ result = find_matches<false>(fixture);
+ EXPECT_EQUAL(result, expect);
+}
+
+template <bool strict>
+std::vector<feature_t> get_rawscores(Fixture &env) {
+ using NNI = NearestNeighborIterator<strict>;
+ auto md = MatchData::makeTestInstance(2, 2);
+ auto qt = createTensor(TensorSpec(denseSpec));
+ auto &tfmd = *(md->resolveTermField(0));
+ const DenseTensorView &qtv = *qt;
+ auto &attr = *(env._tensorAttr);
+ NearestNeighborDistanceHeap dh(2);
+ NNI search(tfmd, qtv, attr, dh);
+ uint32_t limit = attr.getNumDocs();
+ uint32_t docid = 1;
+ search.initRange(docid, limit);
+ std::vector<feature_t> rv;
+ while (docid < limit) {
+ if (strict) {
+ search.seek(docid);
+ if (search.isAtEnd()) break;
+ docid = search.getDocId();
+ search.unpack(docid);
+ rv.push_back(tfmd.getRawScore());
+ } else {
+ if (search.seek(docid)) {
+ search.unpack(docid);
+ rv.push_back(tfmd.getRawScore());
+ }
+ }
+ ++docid;
+ }
+ return rv;
+}
+
+TEST("require that NearestNeighborIterator sets expected rawscore") {
+ Fixture fixture(denseSpec);
+ fixture.ensureSpace(6);
+ fixture.setTensor(1, 3.0, 4.0);
+ fixture.setTensor(2, 5.0, 12.0);
+ fixture.setTensor(3, 6.0, 8.0);
+ fixture.setTensor(4, 5.0, 12.0);
+ fixture.setTensor(5, 8.0, 6.0);
+ fixture.setTensor(6, 4.0, 3.0);
+ std::vector<feature_t> got = get_rawscores<true>(fixture);
+ std::vector<feature_t> expected{5.0, 13.0, 10.0, 10.0, 5.0};
+ EXPECT_EQUAL(got, expected);
+ got = get_rawscores<false>(fixture);
+ EXPECT_EQUAL(got, expected);
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt
index 8b2d5691886..d8774d86edf 100644
--- a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt
@@ -29,6 +29,7 @@ vespa_add_library(searchlib_queryeval OBJECT
multibitvectoriterator.cpp
multisearch.cpp
nearest_neighbor_blueprint.cpp
+ nearest_neighbor_iterator.cpp
nearsearch.cpp
orsearch.cpp
predicate_blueprint.cpp
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
index 3cfec55483e..1d2805d3dfe 100644
--- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
@@ -2,6 +2,7 @@
#include "emptysearch.h"
#include "nearest_neighbor_blueprint.h"
+#include "nearest_neighbor_iterator.h"
#include <vespa/eval/tensor/dense/dense_tensor_view.h>
#include <vespa/searchlib/tensor/dense_tensor_attribute.h>
@@ -14,8 +15,10 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f
: ComplexLeafBlueprint(field),
_attr_tensor(attr_tensor),
_query_tensor(std::move(query_tensor)),
- _target_num_hits(target_num_hits)
+ _target_num_hits(target_num_hits),
+ _distance_heap(target_num_hits)
{
+ setEstimate(HitEstimate(_attr_tensor.getNumDocs(), false));
}
NearestNeighborBlueprint::~NearestNeighborBlueprint() = default;
@@ -23,10 +26,18 @@ NearestNeighborBlueprint::~NearestNeighborBlueprint() = default;
std::unique_ptr<SearchIterator>
NearestNeighborBlueprint::createLeafSearch(const search::fef::TermFieldMatchDataArray& tfmda, bool strict) const
{
- (void) tfmda;
- (void) strict;
- // TODO (geirst): implement
- return std::make_unique<EmptySearch>();
+ using StrictNN = NearestNeighborIterator<true>;
+ using UnStrict = NearestNeighborIterator<false>;
+
+ assert(tfmda.size() == 1);
+ fef::TermFieldMatchData &tfmd = *tfmda[0]; // always search in only one field
+ const vespalib::tensor::DenseTensorView &qT = *_query_tensor;
+
+ if (strict) {
+ return std::make_unique<StrictNN>(tfmd, qT, _attr_tensor, _distance_heap);
+ } else {
+ return std::make_unique<UnStrict>(tfmd, qT, _attr_tensor, _distance_heap);
+ }
}
void
@@ -35,6 +46,13 @@ NearestNeighborBlueprint::visitMembers(vespalib::ObjectVisitor& visitor) const
ComplexLeafBlueprint::visitMembers(visitor);
visitor.visitString("attribute_tensor", _attr_tensor.getTensorType().to_spec());
visitor.visitString("query_tensor", _query_tensor->type().to_spec());
+ visitor.visitInt("target_num_hits", _target_num_hits);
+}
+
+bool
+NearestNeighborBlueprint::always_needs_unpack() const
+{
+ return true;
}
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h
index 028817dfdf3..019f8e31842 100644
--- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h
@@ -2,6 +2,7 @@
#pragma once
#include "blueprint.h"
+#include "nearest_neighbor_distance_heap.h"
namespace vespalib::tensor { class DenseTensorView; }
namespace search::tensor { class DenseTensorAttribute; }
@@ -19,6 +20,7 @@ private:
const tensor::DenseTensorAttribute& _attr_tensor;
std::unique_ptr<vespalib::tensor::DenseTensorView> _query_tensor;
uint32_t _target_num_hits;
+ mutable NearestNeighborDistanceHeap _distance_heap;
public:
NearestNeighborBlueprint(const queryeval::FieldSpec& field,
@@ -35,6 +37,7 @@ public:
std::unique_ptr<SearchIterator> createLeafSearch(const search::fef::TermFieldMatchDataArray& tfmda,
bool strict) const override;
void visitMembers(vespalib::ObjectVisitor& visitor) const override;
+ bool always_needs_unpack() const override;
};
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_distance_heap.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_distance_heap.h
new file mode 100644
index 00000000000..3937dfba2ca
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_distance_heap.h
@@ -0,0 +1,41 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <mutex>
+#include <limits>
+#include <vespa/vespalib/util/priority_queue.h>
+
+namespace search::queryeval {
+
+/**
+ * A heap of the K closest distances that can be shared between multiple search iterators.
+ **/
+class NearestNeighborDistanceHeap {
+private:
+ std::mutex _lock;
+ size_t _size;
+ vespalib::PriorityQueue<double, std::greater<double>> _priQ;
+public:
+ explicit NearestNeighborDistanceHeap(size_t maxSize) : _size(maxSize), _priQ() {
+ _priQ.reserve(maxSize);
+ }
+ double distanceLimit() {
+ std::lock_guard<std::mutex> guard(_lock);
+ if (_priQ.size() < _size) {
+ return std::numeric_limits<double>::max();
+ }
+ return _priQ.front();
+ }
+ void used(double distance) {
+ std::lock_guard<std::mutex> guard(_lock);
+ if (_priQ.size() < _size) {
+ _priQ.push(distance);
+ } else if (distance < _priQ.front()) {
+ _priQ.front() = distance;
+ _priQ.adjust();
+ }
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp
new file mode 100644
index 00000000000..5a105f8e793
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp
@@ -0,0 +1,47 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "nearest_neighbor_iterator.h"
+
+using vespalib::ConstArrayRef;
+using vespalib::tensor::TypedCells;
+
+namespace {
+
+struct SumSq
+{
+ template <typename LCT, typename RCT>
+ static double
+ call(const ConstArrayRef<LCT> &lhs, const ConstArrayRef<RCT> &rhs)
+ {
+ double sum = 0.0;
+ size_t sz = lhs.size();
+ assert(sz == rhs.size());
+ for (size_t i = 0; i < sz; ++i) {
+ double diff = lhs[i] - rhs[i];
+ sum += diff*diff;
+ }
+ return sum;
+ }
+};
+
+}
+
+namespace search::queryeval {
+
+template <bool strict>
+NearestNeighborIterator<strict>::~NearestNeighborIterator() = default;
+
+template <bool strict>
+double
+NearestNeighborIterator<strict>::computeDistance(uint32_t docId)
+{
+ _tensorAttribute.getTensor(docId, _fieldTensor);
+ TypedCells lhsCells = _queryTensor.cellsRef();
+ TypedCells rhsCells = _fieldTensor.cellsRef();
+ return vespalib::tensor::dispatch_2<SumSq>(lhsCells, rhsCells);
+}
+
+template class NearestNeighborIterator<false>;
+template class NearestNeighborIterator<true>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.h
new file mode 100644
index 00000000000..a89cbcf5144
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.h
@@ -0,0 +1,79 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "searchiterator.h"
+#include "nearest_neighbor_distance_heap.h"
+#include <vespa/eval/tensor/dense/dense_tensor_view.h>
+#include <vespa/eval/tensor/dense/mutable_dense_tensor_view.h>
+#include <vespa/searchlib/fef/termfieldmatchdata.h>
+#include <vespa/searchlib/tensor/dense_tensor_attribute.h>
+#include <vespa/vespalib/util/priority_queue.h>
+#include <cmath>
+
+namespace search::queryeval {
+
+/**
+ * Search iterator for K nearest neighbor matching.
+ **/
+template <bool strict>
+class NearestNeighborIterator : public SearchIterator
+{
+public:
+ using DenseTensorView = vespalib::tensor::DenseTensorView;
+ using DenseTensorAttribute = search::tensor::DenseTensorAttribute;
+ using MutableDenseTensorView = vespalib::tensor::MutableDenseTensorView;
+
+ NearestNeighborIterator(fef::TermFieldMatchData &tfmd,
+ const DenseTensorView &queryTensor,
+ const DenseTensorAttribute &tensorAttribute,
+ NearestNeighborDistanceHeap &distanceHeap)
+ : _tfmd(tfmd),
+ _queryTensor(queryTensor),
+ _tensorAttribute(tensorAttribute),
+ _fieldTensor(_tensorAttribute.getTensorType()),
+ _distanceHeap(distanceHeap),
+ _lastScore(0.0)
+ {
+ assert(_fieldTensor.fast_type() == _queryTensor.fast_type());
+ }
+
+ ~NearestNeighborIterator();
+
+ void doSeek(uint32_t docId) override {
+ double distanceLimit = _distanceHeap.distanceLimit();
+ while (__builtin_expect((docId < getEndId()), true)) {
+ double d = computeDistance(docId);
+ if (d <= distanceLimit) {
+ _lastScore = d;
+ setDocId(docId);
+ return;
+ }
+ if (strict) {
+ ++docId;
+ } else {
+ return;
+ }
+ }
+ setAtEnd();
+ }
+
+ void doUnpack(uint32_t docId) override {
+ _tfmd.setRawScore(docId, sqrt(_lastScore));
+ _distanceHeap.used(_lastScore);
+ }
+
+ Trinary is_strict() const override { return strict ? Trinary::True : Trinary::False ; }
+
+private:
+ double computeDistance(uint32_t docId);
+
+ fef::TermFieldMatchData &_tfmd;
+ const DenseTensorView &_queryTensor;
+ const DenseTensorAttribute &_tensorAttribute;
+ MutableDenseTensorView _fieldTensor;
+ NearestNeighborDistanceHeap &_distanceHeap;
+ double _lastScore;
+};
+
+}