diff options
author | Arne Juul <arnej@verizonmedia.com> | 2019-11-21 10:20:28 +0000 |
---|---|---|
committer | Arne Juul <arnej@verizonmedia.com> | 2019-11-25 12:32:19 +0000 |
commit | 2ecd9513fa13a596c51a96fac6aacd4229d912cd (patch) | |
tree | 139f7426f0e934bb66eba036648f871924543dc7 /searchlib | |
parent | 2debaa455d8393559d1def6126ff719a221fd7dc (diff) |
add NearestNeighborIterator
* add shared state in blueprint
* set estimate in blueprint
* disallow unpack in blueprint
Diffstat (limited to 'searchlib')
9 files changed, 381 insertions, 5 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index e81d83ddefa..fadd05ec50d 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -188,6 +188,7 @@ vespa_define_module( src/tests/queryeval/getnodeweight src/tests/queryeval/monitoring_search_iterator src/tests/queryeval/multibitvectoriterator + src/tests/queryeval/nearest_neighbor src/tests/queryeval/parallel_weak_and src/tests/queryeval/predicate src/tests/queryeval/same_element diff --git a/searchlib/src/tests/queryeval/nearest_neighbor/CMakeLists.txt b/searchlib/src/tests/queryeval/nearest_neighbor/CMakeLists.txt new file mode 100644 index 00000000000..5dd5f03a778 --- /dev/null +++ b/searchlib/src/tests/queryeval/nearest_neighbor/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_executable(searchlib_nearest_neighbor_test_app TEST + SOURCES + nearest_neighbor_test.cpp + DEPENDS + searchlib +) +vespa_add_test(NAME searchlib_nearest_neighbor_test_app COMMAND searchlib_nearest_neighbor_test_app) diff --git a/searchlib/src/tests/queryeval/nearest_neighbor/nearest_neighbor_test.cpp b/searchlib/src/tests/queryeval/nearest_neighbor/nearest_neighbor_test.cpp new file mode 100644 index 00000000000..a5f1ab1747d --- /dev/null +++ b/searchlib/src/tests/queryeval/nearest_neighbor/nearest_neighbor_test.cpp @@ -0,0 +1,177 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/vespalib/util/stringfmt.h> + +#include <vespa/eval/tensor/default_tensor_engine.h> +#include <vespa/eval/tensor/dense/dense_tensor.h> +#include <vespa/eval/tensor/tensor.h> +#include <vespa/searchlib/common/feature.h> +#include <vespa/searchlib/fef/matchdata.h> +#include <vespa/searchlib/queryeval/nearest_neighbor_iterator.h> +#include <vespa/searchlib/queryeval/simpleresult.h> +#include <vespa/searchlib/tensor/dense_tensor_attribute.h> +#include <vespa/vespalib/test/insertion_operators.h> + +#include <vespa/log/log.h> +LOG_SETUP("nearest_neighbor_test"); + +using search::feature_t; +using search::tensor::DenseTensorAttribute; +using search::AttributeVector; +using vespalib::eval::ValueType; +using vespalib::eval::TensorSpec; +using vespalib::tensor::Tensor; +using vespalib::tensor::DenseTensorView; +using vespalib::tensor::DefaultTensorEngine; + +using namespace search::fef; +using namespace search::queryeval; + +vespalib::string denseSpec("tensor(x[2])"); + +std::unique_ptr<DenseTensorView> createTensor(const TensorSpec &spec) { + auto value = DefaultTensorEngine::ref().from_spec(spec); + DenseTensorView *tensor = dynamic_cast<DenseTensorView*>(value.get()); + ASSERT_TRUE(tensor != nullptr); + value.release(); + return std::unique_ptr<DenseTensorView>(tensor); +} + +struct Fixture +{ + using BasicType = search::attribute::BasicType; + using CollectionType = search::attribute::CollectionType; + using Config = search::attribute::Config; + + Config _cfg; + vespalib::string _name; + vespalib::string _typeSpec; + std::shared_ptr<DenseTensorAttribute> _tensorAttr; + std::shared_ptr<AttributeVector> _attr; + + Fixture(const vespalib::string &typeSpec) + : _cfg(BasicType::TENSOR, CollectionType::SINGLE), + _name("test"), + _typeSpec(typeSpec), + _tensorAttr(), + _attr() + { + _cfg.setTensorType(ValueType::from_spec(typeSpec)); + _tensorAttr = makeAttr(); + _attr = _tensorAttr; + _attr->addReservedDoc(); + } + + ~Fixture() {} + + std::shared_ptr<DenseTensorAttribute> makeAttr() { + return std::make_shared<DenseTensorAttribute>(_name, _cfg); + } + + void ensureSpace(uint32_t docId) { + while (_attr->getNumDocs() <= docId) { + uint32_t newDocId = 0u; + _attr->addDoc(newDocId); + _attr->commit(); + } + } + + void setTensor(uint32_t docId, const Tensor &tensor) { + ensureSpace(docId); + _tensorAttr->setTensor(docId, tensor); + _attr->commit(); + } + + void setTensor(uint32_t docId, double v1, double v2) { + auto t = createTensor(TensorSpec(denseSpec) + .add({{"x", 0}}, v1) + .add({{"x", 1}}, v2)); + setTensor(docId, *t); + } +}; + +template <bool strict> +SimpleResult find_matches(Fixture &env) { + using NNI = NearestNeighborIterator<strict>; + auto md = MatchData::makeTestInstance(2, 2); + auto qt = createTensor(TensorSpec(denseSpec)); + + auto &tfmd = *(md->resolveTermField(0)); + const DenseTensorView &qtv = *qt; + auto &attr = *(env._tensorAttr); + + NearestNeighborDistanceHeap dh(2); + NNI search(tfmd, qtv, attr, dh); + if (strict) { + return SimpleResult().searchStrict(search, attr.getNumDocs()); + } else { + return SimpleResult().search(search, attr.getNumDocs()); + } +} + +TEST("require that NearestNeighborIterator returns expected results") { + Fixture fixture(denseSpec); + fixture.ensureSpace(6); + fixture.setTensor(1, 3.0, 4.0); + fixture.setTensor(2, 6.0, 8.0); + fixture.setTensor(3, 5.0, 12.0); + fixture.setTensor(4, 4.0, 3.0); + fixture.setTensor(5, 8.0, 6.0); + fixture.setTensor(6, 4.0, 3.0); + SimpleResult result = find_matches<true>(fixture); + SimpleResult expect({1,2,4,6}); + EXPECT_EQUAL(result, expect); + result = find_matches<false>(fixture); + EXPECT_EQUAL(result, expect); +} + +template <bool strict> +std::vector<feature_t> get_rawscores(Fixture &env) { + using NNI = NearestNeighborIterator<strict>; + auto md = MatchData::makeTestInstance(2, 2); + auto qt = createTensor(TensorSpec(denseSpec)); + auto &tfmd = *(md->resolveTermField(0)); + const DenseTensorView &qtv = *qt; + auto &attr = *(env._tensorAttr); + NearestNeighborDistanceHeap dh(2); + NNI search(tfmd, qtv, attr, dh); + uint32_t limit = attr.getNumDocs(); + uint32_t docid = 1; + search.initRange(docid, limit); + std::vector<feature_t> rv; + while (docid < limit) { + if (strict) { + search.seek(docid); + if (search.isAtEnd()) break; + docid = search.getDocId(); + search.unpack(docid); + rv.push_back(tfmd.getRawScore()); + } else { + if (search.seek(docid)) { + search.unpack(docid); + rv.push_back(tfmd.getRawScore()); + } + } + ++docid; + } + return rv; +} + +TEST("require that NearestNeighborIterator sets expected rawscore") { + Fixture fixture(denseSpec); + fixture.ensureSpace(6); + fixture.setTensor(1, 3.0, 4.0); + fixture.setTensor(2, 5.0, 12.0); + fixture.setTensor(3, 6.0, 8.0); + fixture.setTensor(4, 5.0, 12.0); + fixture.setTensor(5, 8.0, 6.0); + fixture.setTensor(6, 4.0, 3.0); + std::vector<feature_t> got = get_rawscores<true>(fixture); + std::vector<feature_t> expected{5.0, 13.0, 10.0, 10.0, 5.0}; + EXPECT_EQUAL(got, expected); + got = get_rawscores<false>(fixture); + EXPECT_EQUAL(got, expected); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt index 8b2d5691886..d8774d86edf 100644 --- a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt @@ -29,6 +29,7 @@ vespa_add_library(searchlib_queryeval OBJECT multibitvectoriterator.cpp multisearch.cpp nearest_neighbor_blueprint.cpp + nearest_neighbor_iterator.cpp nearsearch.cpp orsearch.cpp predicate_blueprint.cpp diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp index 3cfec55483e..1d2805d3dfe 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp @@ -2,6 +2,7 @@ #include "emptysearch.h" #include "nearest_neighbor_blueprint.h" +#include "nearest_neighbor_iterator.h" #include <vespa/eval/tensor/dense/dense_tensor_view.h> #include <vespa/searchlib/tensor/dense_tensor_attribute.h> @@ -14,8 +15,10 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f : ComplexLeafBlueprint(field), _attr_tensor(attr_tensor), _query_tensor(std::move(query_tensor)), - _target_num_hits(target_num_hits) + _target_num_hits(target_num_hits), + _distance_heap(target_num_hits) { + setEstimate(HitEstimate(_attr_tensor.getNumDocs(), false)); } NearestNeighborBlueprint::~NearestNeighborBlueprint() = default; @@ -23,10 +26,18 @@ NearestNeighborBlueprint::~NearestNeighborBlueprint() = default; std::unique_ptr<SearchIterator> NearestNeighborBlueprint::createLeafSearch(const search::fef::TermFieldMatchDataArray& tfmda, bool strict) const { - (void) tfmda; - (void) strict; - // TODO (geirst): implement - return std::make_unique<EmptySearch>(); + using StrictNN = NearestNeighborIterator<true>; + using UnStrict = NearestNeighborIterator<false>; + + assert(tfmda.size() == 1); + fef::TermFieldMatchData &tfmd = *tfmda[0]; // always search in only one field + const vespalib::tensor::DenseTensorView &qT = *_query_tensor; + + if (strict) { + return std::make_unique<StrictNN>(tfmd, qT, _attr_tensor, _distance_heap); + } else { + return std::make_unique<UnStrict>(tfmd, qT, _attr_tensor, _distance_heap); + } } void @@ -35,6 +46,13 @@ NearestNeighborBlueprint::visitMembers(vespalib::ObjectVisitor& visitor) const ComplexLeafBlueprint::visitMembers(visitor); visitor.visitString("attribute_tensor", _attr_tensor.getTensorType().to_spec()); visitor.visitString("query_tensor", _query_tensor->type().to_spec()); + visitor.visitInt("target_num_hits", _target_num_hits); +} + +bool +NearestNeighborBlueprint::always_needs_unpack() const +{ + return true; } } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h index 028817dfdf3..019f8e31842 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h @@ -2,6 +2,7 @@ #pragma once #include "blueprint.h" +#include "nearest_neighbor_distance_heap.h" namespace vespalib::tensor { class DenseTensorView; } namespace search::tensor { class DenseTensorAttribute; } @@ -19,6 +20,7 @@ private: const tensor::DenseTensorAttribute& _attr_tensor; std::unique_ptr<vespalib::tensor::DenseTensorView> _query_tensor; uint32_t _target_num_hits; + mutable NearestNeighborDistanceHeap _distance_heap; public: NearestNeighborBlueprint(const queryeval::FieldSpec& field, @@ -35,6 +37,7 @@ public: std::unique_ptr<SearchIterator> createLeafSearch(const search::fef::TermFieldMatchDataArray& tfmda, bool strict) const override; void visitMembers(vespalib::ObjectVisitor& visitor) const override; + bool always_needs_unpack() const override; }; } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_distance_heap.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_distance_heap.h new file mode 100644 index 00000000000..3937dfba2ca --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_distance_heap.h @@ -0,0 +1,41 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <mutex> +#include <limits> +#include <vespa/vespalib/util/priority_queue.h> + +namespace search::queryeval { + +/** + * A heap of the K closest distances that can be shared between multiple search iterators. + **/ +class NearestNeighborDistanceHeap { +private: + std::mutex _lock; + size_t _size; + vespalib::PriorityQueue<double, std::greater<double>> _priQ; +public: + explicit NearestNeighborDistanceHeap(size_t maxSize) : _size(maxSize), _priQ() { + _priQ.reserve(maxSize); + } + double distanceLimit() { + std::lock_guard<std::mutex> guard(_lock); + if (_priQ.size() < _size) { + return std::numeric_limits<double>::max(); + } + return _priQ.front(); + } + void used(double distance) { + std::lock_guard<std::mutex> guard(_lock); + if (_priQ.size() < _size) { + _priQ.push(distance); + } else if (distance < _priQ.front()) { + _priQ.front() = distance; + _priQ.adjust(); + } + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp new file mode 100644 index 00000000000..5a105f8e793 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp @@ -0,0 +1,47 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "nearest_neighbor_iterator.h" + +using vespalib::ConstArrayRef; +using vespalib::tensor::TypedCells; + +namespace { + +struct SumSq +{ + template <typename LCT, typename RCT> + static double + call(const ConstArrayRef<LCT> &lhs, const ConstArrayRef<RCT> &rhs) + { + double sum = 0.0; + size_t sz = lhs.size(); + assert(sz == rhs.size()); + for (size_t i = 0; i < sz; ++i) { + double diff = lhs[i] - rhs[i]; + sum += diff*diff; + } + return sum; + } +}; + +} + +namespace search::queryeval { + +template <bool strict> +NearestNeighborIterator<strict>::~NearestNeighborIterator() = default; + +template <bool strict> +double +NearestNeighborIterator<strict>::computeDistance(uint32_t docId) +{ + _tensorAttribute.getTensor(docId, _fieldTensor); + TypedCells lhsCells = _queryTensor.cellsRef(); + TypedCells rhsCells = _fieldTensor.cellsRef(); + return vespalib::tensor::dispatch_2<SumSq>(lhsCells, rhsCells); +} + +template class NearestNeighborIterator<false>; +template class NearestNeighborIterator<true>; + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.h new file mode 100644 index 00000000000..a89cbcf5144 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.h @@ -0,0 +1,79 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "searchiterator.h" +#include "nearest_neighbor_distance_heap.h" +#include <vespa/eval/tensor/dense/dense_tensor_view.h> +#include <vespa/eval/tensor/dense/mutable_dense_tensor_view.h> +#include <vespa/searchlib/fef/termfieldmatchdata.h> +#include <vespa/searchlib/tensor/dense_tensor_attribute.h> +#include <vespa/vespalib/util/priority_queue.h> +#include <cmath> + +namespace search::queryeval { + +/** + * Search iterator for K nearest neighbor matching. + **/ +template <bool strict> +class NearestNeighborIterator : public SearchIterator +{ +public: + using DenseTensorView = vespalib::tensor::DenseTensorView; + using DenseTensorAttribute = search::tensor::DenseTensorAttribute; + using MutableDenseTensorView = vespalib::tensor::MutableDenseTensorView; + + NearestNeighborIterator(fef::TermFieldMatchData &tfmd, + const DenseTensorView &queryTensor, + const DenseTensorAttribute &tensorAttribute, + NearestNeighborDistanceHeap &distanceHeap) + : _tfmd(tfmd), + _queryTensor(queryTensor), + _tensorAttribute(tensorAttribute), + _fieldTensor(_tensorAttribute.getTensorType()), + _distanceHeap(distanceHeap), + _lastScore(0.0) + { + assert(_fieldTensor.fast_type() == _queryTensor.fast_type()); + } + + ~NearestNeighborIterator(); + + void doSeek(uint32_t docId) override { + double distanceLimit = _distanceHeap.distanceLimit(); + while (__builtin_expect((docId < getEndId()), true)) { + double d = computeDistance(docId); + if (d <= distanceLimit) { + _lastScore = d; + setDocId(docId); + return; + } + if (strict) { + ++docId; + } else { + return; + } + } + setAtEnd(); + } + + void doUnpack(uint32_t docId) override { + _tfmd.setRawScore(docId, sqrt(_lastScore)); + _distanceHeap.used(_lastScore); + } + + Trinary is_strict() const override { return strict ? Trinary::True : Trinary::False ; } + +private: + double computeDistance(uint32_t docId); + + fef::TermFieldMatchData &_tfmd; + const DenseTensorView &_queryTensor; + const DenseTensorAttribute &_tensorAttribute; + MutableDenseTensorView _fieldTensor; + NearestNeighborDistanceHeap &_distanceHeap; + double _lastScore; +}; + +} |