From 67e1b35011e14b0c1dafefe44d3ae4989796bf4b Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Thu, 14 Nov 2019 07:42:53 +0000 Subject: add timing/quality benchmark tool for ANN --- eval/CMakeLists.txt | 1 + eval/src/tests/ann/.gitignore | 2 + eval/src/tests/ann/CMakeLists.txt | 10 + eval/src/tests/ann/doc_vector_access.h | 11 + eval/src/tests/ann/for-sift-hit.h | 10 + eval/src/tests/ann/for-sift-top-k.h | 26 +++ eval/src/tests/ann/nns-l2.h | 37 ++++ eval/src/tests/ann/nns.h | 60 ++++++ eval/src/tests/ann/sift_benchmark.cpp | 293 ++++++++++++++++++++++++++ eval/src/tests/ann/std-random.h | 21 ++ eval/src/tests/ann/xp-annoy-nns.cpp | 372 +++++++++++++++++++++++++++++++++ eval/src/tests/ann/xp-lsh-nns.cpp | 243 +++++++++++++++++++++ 12 files changed, 1086 insertions(+) create mode 100644 eval/src/tests/ann/.gitignore create mode 100644 eval/src/tests/ann/CMakeLists.txt create mode 100644 eval/src/tests/ann/doc_vector_access.h create mode 100644 eval/src/tests/ann/for-sift-hit.h create mode 100644 eval/src/tests/ann/for-sift-top-k.h create mode 100644 eval/src/tests/ann/nns-l2.h create mode 100644 eval/src/tests/ann/nns.h create mode 100644 eval/src/tests/ann/sift_benchmark.cpp create mode 100644 eval/src/tests/ann/std-random.h create mode 100644 eval/src/tests/ann/xp-annoy-nns.cpp create mode 100644 eval/src/tests/ann/xp-lsh-nns.cpp (limited to 'eval') diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index b3d4585a176..1fbf8ec110f 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -10,6 +10,7 @@ vespa_define_module( src/apps/tensor_conformance TESTS + src/tests/ann src/tests/eval/aggr src/tests/eval/compile_cache src/tests/eval/compiled_function diff --git a/eval/src/tests/ann/.gitignore b/eval/src/tests/ann/.gitignore new file mode 100644 index 00000000000..9249517f4b3 --- /dev/null +++ b/eval/src/tests/ann/.gitignore @@ -0,0 +1,2 @@ +*_app +log* diff --git a/eval/src/tests/ann/CMakeLists.txt b/eval/src/tests/ann/CMakeLists.txt new file mode 100644 index 00000000000..d82b2311b22 --- /dev/null +++ b/eval/src/tests/ann/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_executable(eval_sift_benchmark_app + SOURCES + sift_benchmark.cpp + xp-annoy-nns.cpp + xp-lsh-nns.cpp + DEPENDS + vespaeval +) diff --git a/eval/src/tests/ann/doc_vector_access.h b/eval/src/tests/ann/doc_vector_access.h new file mode 100644 index 00000000000..c168279bc5e --- /dev/null +++ b/eval/src/tests/ann/doc_vector_access.h @@ -0,0 +1,11 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once +#include + +template +struct DocVectorAccess +{ + virtual vespalib::ConstArrayRef get(uint32_t docid) const = 0; + virtual ~DocVectorAccess() {} +}; diff --git a/eval/src/tests/ann/for-sift-hit.h b/eval/src/tests/ann/for-sift-hit.h new file mode 100644 index 00000000000..1f2c0f21851 --- /dev/null +++ b/eval/src/tests/ann/for-sift-hit.h @@ -0,0 +1,10 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +struct Hit { + uint32_t docid; + double distance; + Hit() : docid(0u), distance(0.0) {} + Hit(int id, double dist) : docid(id), distance(dist) {} +}; diff --git a/eval/src/tests/ann/for-sift-top-k.h b/eval/src/tests/ann/for-sift-top-k.h new file mode 100644 index 00000000000..ba91cb2aebc --- /dev/null +++ b/eval/src/tests/ann/for-sift-top-k.h @@ -0,0 +1,26 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +struct TopK { + static constexpr size_t K = 100; + Hit hits[K]; + + size_t recall(const TopK &other) { + size_t overlap = 0; + size_t i = 0; + size_t j = 0; + while (i < K && j < K) { + if (hits[i].docid == other.hits[j].docid) { + ++overlap; + ++i; + ++j; + } else if (hits[i].distance < other.hits[j].distance) { + ++i; + } else { + ++j; + } + } + return overlap; + } +}; diff --git a/eval/src/tests/ann/nns-l2.h b/eval/src/tests/ann/nns-l2.h new file mode 100644 index 00000000000..cfa5fed704f --- /dev/null +++ b/eval/src/tests/ann/nns-l2.h @@ -0,0 +1,37 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once +#include + +template +struct L2DistCalc { + vespalib::hwaccelrated::IAccelrated::UP _hw; + + L2DistCalc() : _hw(vespalib::hwaccelrated::IAccelrated::getAccelrator()) {} + + using Arr = vespalib::ArrayRef; + using ConstArr = vespalib::ConstArrayRef; + + double product(ConstArr v1, ConstArr v2) { + const FltType *p1 = v1.begin(); + const FltType *p2 = v2.begin(); + return _hw->dotProduct(p1, p2, v1.size()); + } + double l2sq(ConstArr vector) { + const FltType *v = vector.begin(); + return _hw->dotProduct(v, v, vector.size()); + } + double l2sq_dist(ConstArr v1, ConstArr v2, Arr tmp) { + for (size_t i = 0; i < v1.size(); ++i) { + tmp[i] = (v1[i] - v2[i]); + } + return l2sq(tmp); + } + double l2sq_dist(ConstArr v1, ConstArr v2) { + std::vector tmp; + tmp.resize(v1.size()); + return l2sq_dist(v1, v2, Arr(tmp)); + } +}; + +static L2DistCalc l2distCalc; diff --git a/eval/src/tests/ann/nns.h b/eval/src/tests/ann/nns.h new file mode 100644 index 00000000000..2e6666309bd --- /dev/null +++ b/eval/src/tests/ann/nns.h @@ -0,0 +1,60 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once +#include +#include "doc_vector_access.h" +#include "nns-l2.h" +#include + +struct NnsHit { + uint32_t docid; + double sqDistance; + NnsHit(uint32_t di, double sqD) + : docid(di), sqDistance(sqD) {} +}; +struct NnsHitComparatorLessDistance { + bool operator() (const NnsHit &lhs, const NnsHit& rhs) const { + if (lhs.sqDistance > rhs.sqDistance) return false; + if (lhs.sqDistance < rhs.sqDistance) return true; + return (lhs.docid > rhs.docid); + } +}; +struct NnsHitComparatorGreaterDistance { + bool operator() (const NnsHit &lhs, const NnsHit& rhs) const { + if (lhs.sqDistance < rhs.sqDistance) return false; + if (lhs.sqDistance > rhs.sqDistance) return true; + return (lhs.docid > rhs.docid); + } +}; +struct NnsHitComparatorLessDocid { + bool operator() (const NnsHit &lhs, const NnsHit& rhs) const { + return (lhs.docid < rhs.docid); + } +}; + +template +class NNS +{ +public: + NNS(uint32_t numDims, const DocVectorAccess &dva) + : _numDims(numDims), _dva(dva) + {} + + virtual void addDoc(uint32_t docid) = 0; + virtual void removeDoc(uint32_t docid) = 0; + + using Vector = vespalib::ConstArrayRef; + virtual std::vector topK(uint32_t k, Vector vector, uint32_t search_k) = 0; + virtual ~NNS() {} +protected: + uint32_t _numDims; + const DocVectorAccess &_dva; +}; + +extern +std::unique_ptr> +make_annoy_nns(uint32_t numDims, const DocVectorAccess &dva); + +extern +std::unique_ptr> +make_rplsh_nns(uint32_t numDims, const DocVectorAccess &dva); diff --git a/eval/src/tests/ann/sift_benchmark.cpp b/eval/src/tests/ann/sift_benchmark.cpp new file mode 100644 index 00000000000..f64351166c1 --- /dev/null +++ b/eval/src/tests/ann/sift_benchmark.cpp @@ -0,0 +1,293 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_DIMS 128 +#define NUM_DOCS 1000000 +#define NUM_Q 1000 + +#include "doc_vector_access.h" +#include "nns.h" +#include "for-sift-hit.h" +#include "for-sift-top-k.h" + +std::vector bruteforceResults; + +struct PointVector { + float v[NUM_DIMS]; + using ConstArr = vespalib::ConstArrayRef; + operator ConstArr() const { return ConstArr(v, NUM_DIMS); } +}; + +static PointVector *generatedQueries = + (PointVector *) malloc(NUM_Q * sizeof(PointVector)); + +static PointVector *generatedDocs = + (PointVector *) malloc(NUM_DOCS * sizeof(PointVector)); + +struct DocVectorAdapter : public DocVectorAccess +{ + vespalib::ConstArrayRef get(uint32_t docid) const override { + ASSERT_TRUE(docid < NUM_DOCS); + return generatedDocs[docid]; + } +}; + +double computeDistance(const PointVector &query, uint32_t docid) { + const PointVector &docvector = generatedDocs[docid]; + return l2distCalc.l2sq_dist(query, docvector); +} + +void read_queries(std::string fn) { + int fd = open(fn.c_str(), O_RDONLY); + ASSERT_TRUE(fd > 0); + int d; + size_t rv; + fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str()); + for (uint32_t qid = 0; qid < NUM_Q; ++qid) { + rv = read(fd, &d, 4); + ASSERT_EQUAL(rv, 4u); + ASSERT_EQUAL(d, NUM_DIMS); + rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float)); + ASSERT_EQUAL(rv, sizeof(PointVector)); + } + close(fd); +} + +void read_docs(std::string fn) { + int fd = open(fn.c_str(), O_RDONLY); + ASSERT_TRUE(fd > 0); + int d; + size_t rv; + fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str()); + for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) { + rv = read(fd, &d, 4); + ASSERT_EQUAL(rv, 4u); + ASSERT_EQUAL(d, NUM_DIMS); + rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float)); + ASSERT_EQUAL(rv, sizeof(PointVector)); + } + close(fd); +} + +using TimePoint = std::chrono::steady_clock::time_point; +using Duration = std::chrono::steady_clock::duration; + +double to_ms(Duration elapsed) { + std::chrono::duration ms(elapsed); + return ms.count(); +} + +void read_data(std::string dir) { + TimePoint bef = std::chrono::steady_clock::now(); + read_queries(dir + "/sift_query.fvecs"); + TimePoint aft = std::chrono::steady_clock::now(); + fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef)); + bef = std::chrono::steady_clock::now(); + read_docs(dir + "/sift_base.fvecs"); + aft = std::chrono::steady_clock::now(); + fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef)); +} + + +struct BfHitComparator { + bool operator() (const Hit &lhs, const Hit& rhs) const { + if (lhs.distance < rhs.distance) return false; + if (lhs.distance > rhs.distance) return true; + return (lhs.docid > rhs.docid); + } +}; + +class BfHitHeap { +private: + size_t _size; + vespalib::PriorityQueue _priQ; +public: + explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() { + _priQ.reserve(maxSize); + } + ~BfHitHeap() {} + void maybe_use(const Hit &hit) { + if (_priQ.size() < _size) { + _priQ.push(hit); + } else if (hit.distance < _priQ.front().distance) { + _priQ.front() = hit; + _priQ.adjust(); + } + } + std::vector bestHits() { + std::vector result; + size_t i = _priQ.size(); + result.resize(i); + while (i-- > 0) { + result[i] = _priQ.front(); + _priQ.pop_front(); + } + return result; + } +}; + +TopK bruteforce_nns(const PointVector &query) { + TopK result; + BfHitHeap heap(result.K); + std::vector tmp_v(NUM_DIMS); + for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) { + const PointVector &docvector = generatedDocs[docid]; + double d = l2distCalc.l2sq_dist(query, docvector, tmp_v); + Hit h(docid, d); + heap.maybe_use(h); + } + std::vector best = heap.bestHits(); + for (size_t i = 0; i < result.K; ++i) { + result.hits[i] = best[i]; + } + return result; +} + +void verifyBF(uint32_t qid) { + const PointVector &query = generatedQueries[qid]; + TopK &result = bruteforceResults[qid]; + double min_distance = result.hits[0].distance; + std::vector all_c2; + for (uint32_t i = 0; i < NUM_DOCS; ++i) { + double dist = computeDistance(query, i); + if (dist < min_distance) { + fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance); + } + EXPECT_FALSE(dist+0.000001 < min_distance); + if (qid == 6) all_c2.push_back(dist / min_distance); + } + if (all_c2.size() != NUM_DOCS) return; + std::sort(all_c2.begin(), all_c2.end()); + for (uint32_t idx : { 1, 3, 10, 30, 100, 300, 1000, 3000, NUM_DOCS/2, NUM_DOCS-1}) { + fprintf(stderr, "c2-factor[%u] = %.3f\n", idx, all_c2[idx]); + } +} + +TEST("require that brute force works") { + TimePoint bef = std::chrono::steady_clock::now(); + for (uint32_t cnt = 0; cnt < NUM_Q; ++cnt) { + const PointVector &query = generatedQueries[cnt]; + bruteforceResults.emplace_back(bruteforce_nns(query)); + } + TimePoint aft = std::chrono::steady_clock::now(); + fprintf(stderr, "timing for brute force: %.3f ms = %.3f ms per query\n", + to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); + for (int cnt = 0; cnt < NUM_Q; cnt = (cnt+1)*2) { + verifyBF(cnt); + } +} + +using NNS_API = NNS; + +TopK find_with_nns(uint32_t sk, NNS_API &nns, uint32_t qid) { + TopK result; + const PointVector &qv = generatedQueries[qid]; + vespalib::ConstArrayRef query(qv.v, NUM_DIMS); + auto rv = nns.topK(result.K, query, sk); + for (size_t i = 0; i < result.K; ++i) { + result.hits[i] = Hit(rv[i].docid, rv[i].sqDistance); + } + return result; +} + +void verify_nns_quality(uint32_t sk, NNS_API &nns, uint32_t qid) { + TopK perfect = bruteforceResults[qid]; + TopK result = find_with_nns(sk, nns, qid); + int recall = perfect.recall(result); + EXPECT_TRUE(recall > 40); + double sum_error = 0.0; + double c_factor = 1.0; + for (size_t i = 0; i < result.K; ++i) { + double factor = (result.hits[i].distance / perfect.hits[i].distance); + sum_error += factor; + c_factor = std::max(c_factor, factor); + } + EXPECT_TRUE(c_factor < 1.5); + fprintf(stderr, "quality sk=%u: query %u: recall %d, c2-factor %.3f, avg c2: %.3f\n", + sk, qid, recall, c_factor, sum_error / result.K); + if (qid == 6) { + for (size_t i = 0; i < 10; ++i) { + fprintf(stderr, "topk[%zu] BF{%u %.3f} index{%u %.3f}\n", + i, + perfect.hits[i].docid, perfect.hits[i].distance, + result.hits[i].docid, result.hits[i].distance); + } + } +} + +TEST("require that Locality Sensitive Hashing mostly works") { + TimePoint bef = std::chrono::steady_clock::now(); + DocVectorAdapter adapter; + std::unique_ptr nns = make_rplsh_nns(NUM_DIMS, adapter); + for (uint32_t i = 0; i < NUM_DOCS; ++i) { + nns->addDoc(i); + } + fprintf(stderr, "added %u documents...\n", NUM_DOCS); + TimePoint aft = std::chrono::steady_clock::now(); + fprintf(stderr, "build RPLSH index: %.3f ms\n", to_ms(aft - bef)); + + for (uint32_t search_k : { 200, 1000 }) { + bef = std::chrono::steady_clock::now(); + for (int cnt = 0; cnt < NUM_Q; ++cnt) { + find_with_nns(search_k, *nns, cnt); + } + aft = std::chrono::steady_clock::now(); + fprintf(stderr, "timing for RPLSH search_k=%u: %.3f ms = %.3f ms per query\n", + search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); + for (int cnt = 0; cnt < NUM_Q; ++cnt) { + verify_nns_quality(search_k, *nns, cnt); + } + } +} + +TEST("require that Indexing via NNS api mostly works") { + fprintf(stderr, "trying indexing...\n"); + TimePoint bef = std::chrono::steady_clock::now(); + DocVectorAdapter adapter; + std::unique_ptr nns = make_annoy_nns(NUM_DIMS, adapter); + for (uint32_t i = 0; i < NUM_DOCS; ++i) { + nns->addDoc(i); + } + fprintf(stderr, "added %u documents...\n", NUM_DOCS); + nns->topK(1, adapter.get(0), 1); + TimePoint aft = std::chrono::steady_clock::now(); + fprintf(stderr, "build annoy index: %.3f ms\n", to_ms(aft - bef)); + + for (uint32_t search_k : { 10000, 20000 }) { + bef = std::chrono::steady_clock::now(); + for (int cnt = 0; cnt < NUM_Q; ++cnt) { + find_with_nns(search_k, *nns, cnt); + } + aft = std::chrono::steady_clock::now(); + fprintf(stderr, "timing for index search_k=%u: %.3f ms = %.3f ms per query\n", + search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q); + for (int cnt = 0; cnt < NUM_Q; ++cnt) { + verify_nns_quality(search_k, *nns, cnt); + } + } +} + +int main(int argc, char **argv) { + TEST_MASTER.init(__FILE__); + std::string sift_dir = "."; + if (argc > 1) { + sift_dir = argv[1]; + } else { + char *home = getenv("HOME"); + if (home) { + sift_dir = home; + sift_dir += "/sift"; + } + } + read_data(sift_dir); + TEST_RUN_ALL(); + return (TEST_MASTER.fini() ? 0 : 1); +} diff --git a/eval/src/tests/ann/std-random.h b/eval/src/tests/ann/std-random.h new file mode 100644 index 00000000000..e48a56006fd --- /dev/null +++ b/eval/src/tests/ann/std-random.h @@ -0,0 +1,21 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once +#include + +class RndGen { +private: + std::mt19937_64 urng; + std::normal_distribution normRng; + std::uniform_real_distribution uf; +public: + RndGen() : urng(0x1234deadbeef5678uLL), normRng(), uf(0.0, 1.0) {} + + double nextNormal() { + return normRng(urng); + } + + double nextUniform() { + return uf(urng); + } +}; diff --git a/eval/src/tests/ann/xp-annoy-nns.cpp b/eval/src/tests/ann/xp-annoy-nns.cpp new file mode 100644 index 00000000000..e5661c0c044 --- /dev/null +++ b/eval/src/tests/ann/xp-annoy-nns.cpp @@ -0,0 +1,372 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "nns.h" +#include "std-random.h" +#include +#include +#include +#include + +using V = vespalib::ConstArrayRef; +class AnnoyLikeNns; +struct Node; + +using QueueNode = std::pair; +using NodeQueue = std::priority_queue; + +struct Node { + Node() {} + virtual ~Node() {} + virtual Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) = 0; + virtual int remove(uint32_t docid, V vector) = 0; + virtual void findCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist) const = 0; +}; + +struct LeafNode : public Node { + std::vector docids; + + LeafNode() : Node(), docids() { docids.reserve(128); } + + Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) override; + int remove(uint32_t docid, V vector) override; + void findCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist) const override; + + Node *split(AnnoyLikeNns &meta); +}; + +struct SplitNode : public Node { + std::vector hyperPlane; + double offsetFromOrigo; + Node *leftChildren; + Node *rightChildren; + + SplitNode() : Node(), hyperPlane(), offsetFromOrigo(), leftChildren(), rightChildren() {} + ~SplitNode(); + + Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) override; + int remove(uint32_t docid, V vector) override; + void findCandidates(std::set &cands, V vector, NodeQueue &queue, double minDist) const override; + + double planeDistance(V vector) const; +}; + +class AnnoyLikeNns : public NNS +{ +private: + std::vector _roots; + RndGen _rndGen; + static constexpr size_t numRoots = 50; + +public: + AnnoyLikeNns(uint32_t numDims, const DocVectorAccess &dva) + : NNS(numDims, dva), _roots(), _rndGen() + { + _roots.reserve(numRoots); + for (size_t i = 0; i < numRoots; ++i) { + _roots.push_back(new LeafNode()); + } + } + + ~AnnoyLikeNns() { + for (Node *root : _roots) { + delete root; + } + } + + void addDoc(uint32_t docid) override { + V vector = _dva.get(docid); + for (Node * &root : _roots) { + root = root->addDoc(docid, vector, *this); + } + } + + void removeDoc(uint32_t docid) override { + V vector = _dva.get(docid); + for (Node * root : _roots) { + root->remove(docid, vector); + } + } + std::vector topK(uint32_t k, Vector vector, uint32_t search_k) override; + + V getVector(uint32_t docid) const { return _dva.get(docid); } + double uniformRnd() { return _rndGen.nextUniform(); } + uint32_t dims() const { return _numDims; } +}; + + +double +SplitNode::planeDistance(V vector) const +{ + assert(vector.size() == hyperPlane.size()); + double dp = 0.0; + for (size_t i = 0; i < vector.size(); ++i) { + dp += vector[i] * hyperPlane[i]; + } + return dp - offsetFromOrigo; +} + + +Node * +LeafNode::addDoc(uint32_t docid, V, AnnoyLikeNns &meta) +{ + docids.push_back(docid); + if (docids.size() > 127) { + return split(meta); + } + return this; +} + +struct WeightedCentroid { + uint32_t cnt; + std::vector sum_point; + std::vector tmp_vector; + WeightedCentroid(V vector) + : cnt(1), sum_point(), tmp_vector(vector.size()) + { + sum_point.reserve(vector.size()); + for (float val : vector) { + sum_point.push_back(val); + } + } + void add_v(V vector) { + ++cnt; + for (size_t i = 0; i < vector.size(); ++i) { + sum_point[i] += vector[i]; + } + } + std::vector norm_diff(WeightedCentroid other) { + std::vector r; + const size_t sz = sum_point.size(); + double my_inv = 1.0 / cnt; + double ot_inv = 1.0 / other.cnt; + double sumSq = 0.0; + r.reserve(sz); + for (size_t i = 0; i < sz; ++i) { + double d = (sum_point[i] * my_inv) - (other.sum_point[i] * ot_inv); + r.push_back(d); + sumSq += d*d; + } + if (sumSq > 0) { + double invnorm = 1.0 / sqrt(sumSq); + for (size_t i = 0; i < sz; ++i) { + r[i] *= invnorm; + } + } + return r; + } + std::vector midpoint(WeightedCentroid other) { + std::vector r; + size_t sz = sum_point.size(); + r.reserve(sz); + double my_inv = 1.0 / cnt; + double ot_inv = 1.0 / other.cnt; + for (size_t i = 0; i < sz; ++i) { + double mp = (sum_point[i] * my_inv) + (other.sum_point[i] * ot_inv); + r.push_back(mp * 0.5); + } + return r; + } + double weightedDistance(V vector) { + size_t sz = vector.size(); + for (size_t i = 0; i < sz; ++i) { + tmp_vector[i] = vector[i] * cnt; + } + return l2distCalc.l2sq_dist(tmp_vector, sum_point) / cnt; + } + ~WeightedCentroid() {} +}; + +Node * +LeafNode::split(AnnoyLikeNns &meta) +{ + uint32_t dims = meta.dims(); + uint32_t retries = 3; +retry: + uint32_t p1i = uint32_t(meta.uniformRnd() * docids.size()); + uint32_t p2i = uint32_t(meta.uniformRnd() * (docids.size()-1)); + if (p2i >= p1i) ++p2i; + uint32_t p1d = docids[p1i]; + uint32_t p2d = docids[p2i]; + V p1 = meta.getVector(p1d); + V p2 = meta.getVector(p2d); + + double sumsq = 0; + for (size_t i = 0; i < dims; ++i) { + double d = p1[i] - p2[i]; + sumsq += d*d; + } + if ((!(sumsq > 0)) && (retries-- > 0)) { + goto retry; + } + WeightedCentroid centroid1(p1); + WeightedCentroid centroid2(p2); +#if 1 + for (size_t i = 0; (i * 1) < docids.size(); ++i) { + size_t p3i = (p1i + p2i + i) % docids.size(); + uint32_t p3d = docids[p3i]; + V p3 = meta.getVector(p3d); + double dist_c1 = centroid1.weightedDistance(p3); + double dist_c2 = centroid2.weightedDistance(p3); + bool use_c1 = false; + if (dist_c1 < dist_c2) { + use_c1 = true; + } else if (dist_c1 > dist_c2) { + use_c1 = false; + } else if (centroid1.cnt < centroid2.cnt) { + use_c1 = true; + } + if (use_c1) { + centroid1.add_v(p3); + } else { + centroid2.add_v(p3); + } + } +#endif + std::vector diff = centroid1.norm_diff(centroid2); + std::vector mp = centroid1.midpoint(centroid2); + double off = l2distCalc.product(diff, mp); + + SplitNode *s = new SplitNode(); + s->hyperPlane = std::move(diff); + s->offsetFromOrigo = off; + + std::vector leftDs; + std::vector rightDs; + + for (uint32_t docid : docids) { + V vector = meta.getVector(docid); + double dist = s->planeDistance(vector); + bool left = false; + if (dist < 0) { + left = true; + } else if (!(dist > 0)) { + left = (leftDs.size() < rightDs.size()); + } + if (left) { + leftDs.push_back(docid); + } else { + rightDs.push_back(docid); + } + } + +#if 0 + fprintf(stderr, "splitting leaf node numChildren %u\n", numChildren); + fprintf(stderr, "dims = %u\n", dims); + fprintf(stderr, "p1 idx=%u, docid=%u VSZ=%zu\n", p1i, p1d, p1.size()); + fprintf(stderr, "p2 idx=%u, docid=%u VSZ=%zu\n", p2i, p2d, p2.size()); + fprintf(stderr, "diff %zu sumsq = %g\n", diff.size(), sumsq); + fprintf(stderr, "offset from origo = %g\n", off); + fprintf(stderr, "split left=%zu, right=%zu\n", leftDs.size(), rightDs.size()); +#endif + + LeafNode *newRightNode = new LeafNode(); + newRightNode->docids = rightDs; + s->rightChildren = newRightNode; + this->docids = leftDs; + s->leftChildren = this; + return s; +} + +int +LeafNode::remove(uint32_t docid, V) +{ + auto iter = std::remove(docids.begin(), docids.end(), docid); + int removed = docids.end() - iter; + docids.erase(iter, docids.end()); + return removed; +} + +void +LeafNode::findCandidates(std::set &cands, V, NodeQueue &, double) const +{ + for (uint32_t d : docids) { + cands.insert(d); + } +} + +SplitNode::~SplitNode() +{ + delete leftChildren; + delete rightChildren; +} + +Node * +SplitNode::addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) +{ + double d = planeDistance(vector); + if (d < 0) { + leftChildren = leftChildren->addDoc(docid, vector, meta); + } else { + rightChildren = rightChildren->addDoc(docid, vector, meta); + } + return this; +} + +int +SplitNode::remove(uint32_t docid, V vector) +{ + double d = planeDistance(vector); + if (d < 0) { + int r = leftChildren->remove(docid, vector); + return r; + } else { + int r = rightChildren->remove(docid, vector); + return r; + } +} + +void +SplitNode::findCandidates(std::set &, V vector, NodeQueue &queue, double minDist) const +{ + double d = planeDistance(vector); + // fprintf(stderr, "push 2 nodes dist %g\n", d); + queue.push(std::make_pair(std::min(-d, minDist), leftChildren)); + queue.push(std::make_pair(std::min(d, minDist), rightChildren)); +} + +std::vector +AnnoyLikeNns::topK(uint32_t k, Vector vector, uint32_t search_k) +{ + std::vector tmp; + tmp.resize(_numDims); + vespalib::ArrayRef tmpArr(tmp); + + std::vector r; + r.reserve(k); + std::set candidates; + NodeQueue queue; + // fprintf(stderr, "find %u candidates\n", k); + for (Node *root : _roots) { + double dist = std::numeric_limits::max(); + queue.push(std::make_pair(dist, root)); + } + while ((candidates.size() < std::max(k, search_k)) && (queue.size() > 0)) { + const QueueNode& top = queue.top(); + double md = top.first; + // fprintf(stderr, "find candidates: node with min distance %g\n", md); + Node *n = top.second; + queue.pop(); + n->findCandidates(candidates, vector, queue, md); + } +#if 0 + while (queue.size() > 0) { + const QueueNode& top = queue.top(); + fprintf(stderr, "discard candidates: node with distance %g\n", top.first); + queue.pop(); + } +#endif + for (uint32_t docid : candidates) { + double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid), tmpArr); + NnsHit hit(docid, dist); + r.push_back(hit); + } + std::sort(r.begin(), r.end(), NnsHitComparatorLessDistance()); + while (r.size() > k) r.pop_back(); + return r; +} + +std::unique_ptr> +make_annoy_nns(uint32_t numDims, const DocVectorAccess &dva) +{ + return std::make_unique(numDims, dva); +} diff --git a/eval/src/tests/ann/xp-lsh-nns.cpp b/eval/src/tests/ann/xp-lsh-nns.cpp new file mode 100644 index 00000000000..285985167c0 --- /dev/null +++ b/eval/src/tests/ann/xp-lsh-nns.cpp @@ -0,0 +1,243 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "nns.h" +#include "std-random.h" +#include +#include +#include +#include +#include +#include + +using V = vespalib::ConstArrayRef; + +#define NUM_HASH_WORDS 4 +#define IGNORE_BITS 32 +#define HIST_SIZE (64*NUM_HASH_WORDS + 1) + +struct LsMaskHash { + uint64_t bits[NUM_HASH_WORDS]; + uint64_t mask[NUM_HASH_WORDS]; + LsMaskHash() { + memset(bits, 0xff, sizeof bits); + memset(mask, 0xff, sizeof mask); + } +}; + +static inline int hash_dist(const LsMaskHash &h1, const LsMaskHash &h2) { + int cnt = 0; + for (size_t o = 0; o < NUM_HASH_WORDS; ++o) { + uint64_t hx = h1.bits[o] ^ h2.bits[o]; + hx &= (h1.mask[o] | h2.mask[o]); + cnt += __builtin_popcountl(hx); + } + return cnt; +} + +struct Multiplier { + std::vector multiplier; + Multiplier(size_t dims) : multiplier(dims, 0.0) {} +}; + +LsMaskHash mask_hash_from_pv(V p, std::vector rpMatrix) { + LsMaskHash result; + float transformed[NUM_HASH_WORDS][64]; + std::vector squares; + for (size_t o = 0; o < NUM_HASH_WORDS; ++o) { + uint64_t hash = 0; + for (size_t bit = 0; bit < 64; ++bit) { + hash <<= 1u; + V m = rpMatrix[bit+64*o].multiplier; + double dotproduct = l2distCalc.product(m, p); + if (dotproduct > 0.0) { + hash |= 1u; + } + double sq = dotproduct * dotproduct; + transformed[o][bit] = sq; + squares.push_back(sq); + } + result.bits[o] = hash; + } + std::sort(squares.begin(), squares.end()); + double lim = squares[IGNORE_BITS*NUM_HASH_WORDS-1]; + for (size_t o = 0; o < NUM_HASH_WORDS; ++o) { + uint64_t mask = 0; + for (size_t bit = 0; bit < 64; ++bit) { + mask <<= 1u; + if (transformed[o][bit] > lim) { + mask |= 1u; + } + } + result.mask[o] = mask; + } + return result; +} + +class RpLshNns : public NNS +{ +private: + RndGen _rndGen; + std::vector _transformationMatrix; + std::vector _generated_doc_hashes; + +public: + RpLshNns(uint32_t numDims, const DocVectorAccess &dva) + : NNS(numDims, dva), _rndGen() + { + _transformationMatrix.reserve(NUM_HASH_WORDS*64); + for (size_t i = 0; i < NUM_HASH_WORDS*64; i++) { + _transformationMatrix.emplace_back(numDims); + Multiplier &mult = _transformationMatrix.back(); + for (float &v : mult.multiplier) { + v = _rndGen.nextNormal(); + } + } + fprintf(stderr, "ignore bits for lsh: %d*%d=%d\n", + IGNORE_BITS, NUM_HASH_WORDS, IGNORE_BITS*NUM_HASH_WORDS); + _generated_doc_hashes.reserve(100000); + } + + ~RpLshNns() { + } + + void addDoc(uint32_t docid) override { + V vector = _dva.get(docid); + LsMaskHash hash = mask_hash_from_pv(vector, _transformationMatrix); + if (_generated_doc_hashes.size() == docid) { + _generated_doc_hashes.push_back(hash); + return; + } + while (_generated_doc_hashes.size() <= docid) { + _generated_doc_hashes.push_back(LsMaskHash()); + } + _generated_doc_hashes[docid] = hash; + } + void removeDoc(uint32_t docid) override { + if (_generated_doc_hashes.size() > docid) { + _generated_doc_hashes[docid] = LsMaskHash(); + } + } + std::vector topK(uint32_t k, Vector vector, uint32_t search_k) override; + + V getVector(uint32_t docid) const { return _dva.get(docid); } + double uniformRnd() { return _rndGen.nextUniform(); } + uint32_t dims() const { return _numDims; } +}; + + +struct LshHit { + double distance; + uint32_t docid; + int hash_distance; + LshHit() : distance(0.0), docid(0u), hash_distance(0) {} + LshHit(int id, double dist, int hd = 0) + : distance(dist), docid(id), hash_distance(hd) {} +}; + +struct LshHitComparator { + bool operator() (const LshHit &lhs, const LshHit& rhs) const { + if (lhs.distance < rhs.distance) return false; + if (lhs.distance > rhs.distance) return true; + return (lhs.docid > rhs.docid); + } +}; + +class LshHitHeap { +private: + size_t _size; + vespalib::PriorityQueue _priQ; + std::vector hd_histogram; +public: + explicit LshHitHeap(size_t maxSize) : _size(maxSize), _priQ() { + _priQ.reserve(maxSize); + } + ~LshHitHeap() {} + bool maybe_use(const LshHit &hit) { + if (_priQ.size() < _size) { + _priQ.push(hit); + uint32_t newHd = hit.hash_distance; + while (hd_histogram.size() <= newHd) { + hd_histogram.push_back(0); + } + hd_histogram[newHd]++; + } else if (hit.distance < _priQ.front().distance) { + uint32_t oldHd = _priQ.front().hash_distance; + uint32_t newHd = hit.hash_distance; + while (hd_histogram.size() <= newHd) { + hd_histogram.push_back(0); + } + hd_histogram[newHd]++; + hd_histogram[oldHd]--; + _priQ.front() = hit; + _priQ.adjust(); + return true; + } + return false; + } + int limitHashDistance() { + size_t sz = _priQ.size(); + uint32_t sum = 0; + for (uint32_t i = 0; i < hd_histogram.size(); ++i) { + sum += hd_histogram[i]; + if (sum >= ((3*sz)/4)) return i; + } + return 99999; + } + std::vector bestLshHits() { + std::vector result; + size_t sz = _priQ.size(); + result.resize(sz); + for (size_t i = sz; i-- > 0; ) { + result[i] = _priQ.front(); + _priQ.pop_front(); + } + return result; + } +}; + +std::vector +RpLshNns::topK(uint32_t k, Vector vector, uint32_t search_k) +{ + std::vector result; + result.reserve(k); + + std::vector tmp(_numDims); + vespalib::ArrayRef tmpArr(tmp); + + LsMaskHash query_hash = mask_hash_from_pv(vector, _transformationMatrix); + LshHitHeap heap(std::max(k, search_k)); + int limit_hash_dist = 99999; + int histogram[HIST_SIZE]; + int skipCnt = 0; + int fullCnt = 0; + int whdcCnt = 0; + memset(histogram, 0, sizeof histogram); + size_t docidLimit = _generated_doc_hashes.size(); + for (uint32_t docid = 0; docid < docidLimit; ++docid) { + int hd = hash_dist(query_hash, _generated_doc_hashes[docid]); + histogram[hd]++; + if (hd <= limit_hash_dist) { + ++fullCnt; + double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid), tmpArr); + LshHit h(docid, dist, hd); + if (heap.maybe_use(h)) { + ++whdcCnt; + limit_hash_dist = heap.limitHashDistance(); + } + } else { + ++skipCnt; + } + } + std::vector best = heap.bestLshHits(); + size_t numHits = std::min((size_t)k, best.size()); + for (size_t i = 0; i < numHits; ++i) { + result.emplace_back(best[i].docid, best[i].distance); + } + return result; +} + +std::unique_ptr> +make_rplsh_nns(uint32_t numDims, const DocVectorAccess &dva) +{ + return std::make_unique(numDims, dva); +} -- cgit v1.2.3