Merge pull request #11602 from vespa-engine/arnej/add-xp-workbench-2

add timing/quality benchmark tool for ANN
author: Arne H Juul <arnej27959@users.noreply.github.com> 2019-12-20 13:21:01 +0100
committer: GitHub <noreply@github.com> 2019-12-20 13:21:01 +0100
commit: 7fcc9552456bc4ea862c2647541c48fca0eb11f6 (patch)
tree: 0c4cfec33c1b14bcdf856f26e85fc5171fa25fb0 /eval
parent: 24d224ff74817ccf47ba5356cd002fecc2cec1c6 (diff)
parent: 67e1b35011e14b0c1dafefe44d3ae4989796bf4b (diff)
12 files changed, 1086 insertions, 0 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index 35731ef5cfb..b1212650a3f 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -10,6 +10,7 @@ vespa_define_module(
     src/apps/tensor_conformance
 
     TESTS
+    src/tests/ann
     src/tests/eval/aggr
     src/tests/eval/compile_cache
     src/tests/eval/compiled_function
diff --git a/eval/src/tests/ann/.gitignore b/eval/src/tests/ann/.gitignore
new file mode 100644
index 00000000000..9249517f4b3
--- /dev/null
+++ b/eval/src/tests/ann/.gitignore
@@ -0,0 +1,2 @@
+*_app
+log*
diff --git a/eval/src/tests/ann/CMakeLists.txt b/eval/src/tests/ann/CMakeLists.txt
new file mode 100644
index 00000000000..d82b2311b22
--- /dev/null
+++ b/eval/src/tests/ann/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+vespa_add_executable(eval_sift_benchmark_app
+    SOURCES
+    sift_benchmark.cpp
+    xp-annoy-nns.cpp
+    xp-lsh-nns.cpp
+    DEPENDS
+    vespaeval
+)
diff --git a/eval/src/tests/ann/doc_vector_access.h b/eval/src/tests/ann/doc_vector_access.h
new file mode 100644
index 00000000000..c168279bc5e
--- /dev/null
+++ b/eval/src/tests/ann/doc_vector_access.h
@@ -0,0 +1,11 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+#include <vespa/vespalib/util/arrayref.h>
+
+template <typename FltType = float>
+struct DocVectorAccess
+{
+    virtual vespalib::ConstArrayRef<FltType> get(uint32_t docid) const = 0;
+    virtual ~DocVectorAccess() {}
+};
diff --git a/eval/src/tests/ann/for-sift-hit.h b/eval/src/tests/ann/for-sift-hit.h
new file mode 100644
index 00000000000..1f2c0f21851
--- /dev/null
+++ b/eval/src/tests/ann/for-sift-hit.h
@@ -0,0 +1,10 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+struct Hit {
+    uint32_t docid;
+    double distance;
+    Hit() : docid(0u), distance(0.0) {}
+    Hit(int id, double dist) : docid(id), distance(dist) {}
+};
diff --git a/eval/src/tests/ann/for-sift-top-k.h b/eval/src/tests/ann/for-sift-top-k.h
new file mode 100644
index 00000000000..ba91cb2aebc
--- /dev/null
+++ b/eval/src/tests/ann/for-sift-top-k.h
@@ -0,0 +1,26 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+struct TopK {
+    static constexpr size_t K = 100;
+    Hit hits[K];
+
+    size_t recall(const TopK &other) {
+        size_t overlap = 0;
+        size_t i = 0;
+        size_t j = 0;
+        while (i < K && j < K) {
+            if (hits[i].docid == other.hits[j].docid) {
+                ++overlap;
+                ++i;
+                ++j;
+            } else if (hits[i].distance < other.hits[j].distance) {
+                ++i;
+            } else {
+                ++j;
+            }
+        }
+        return overlap;
+    }
+};
diff --git a/eval/src/tests/ann/nns-l2.h b/eval/src/tests/ann/nns-l2.h
new file mode 100644
index 00000000000..cfa5fed704f
--- /dev/null
+++ b/eval/src/tests/ann/nns-l2.h
@@ -0,0 +1,37 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+
+template <typename FltType = float>
+struct L2DistCalc {
+    vespalib::hwaccelrated::IAccelrated::UP _hw;
+
+    L2DistCalc() : _hw(vespalib::hwaccelrated::IAccelrated::getAccelrator()) {}
+
+    using Arr = vespalib::ArrayRef<FltType>;
+    using ConstArr = vespalib::ConstArrayRef<FltType>;
+    
+    double product(ConstArr v1, ConstArr v2) {
+        const FltType *p1 = v1.begin();
+        const FltType *p2 = v2.begin();
+        return _hw->dotProduct(p1, p2, v1.size());
+    }
+    double l2sq(ConstArr vector) {
+        const FltType *v = vector.begin();
+        return _hw->dotProduct(v, v, vector.size());
+    }
+    double l2sq_dist(ConstArr v1, ConstArr v2, Arr tmp) {
+        for (size_t i = 0; i < v1.size(); ++i) {
+            tmp[i] = (v1[i] - v2[i]);
+        }
+        return l2sq(tmp);
+    }
+    double l2sq_dist(ConstArr v1, ConstArr v2) {
+        std::vector<FltType> tmp;
+        tmp.resize(v1.size());
+        return l2sq_dist(v1, v2, Arr(tmp));
+    }
+};
+
+static L2DistCalc l2distCalc;
diff --git a/eval/src/tests/ann/nns.h b/eval/src/tests/ann/nns.h
new file mode 100644
index 00000000000..2e6666309bd
--- /dev/null
+++ b/eval/src/tests/ann/nns.h
@@ -0,0 +1,60 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+#include <vespa/vespalib/util/arrayref.h>
+#include "doc_vector_access.h"
+#include "nns-l2.h"
+#include <memory>
+
+struct NnsHit {
+    uint32_t docid;
+    double sqDistance;
+    NnsHit(uint32_t di, double sqD)
+        : docid(di), sqDistance(sqD) {}
+};
+struct NnsHitComparatorLessDistance {
+    bool operator() (const NnsHit &lhs, const NnsHit& rhs) const {
+        if (lhs.sqDistance > rhs.sqDistance) return false;
+        if (lhs.sqDistance < rhs.sqDistance) return true;
+        return (lhs.docid > rhs.docid);
+    }
+};
+struct NnsHitComparatorGreaterDistance {
+    bool operator() (const NnsHit &lhs, const NnsHit& rhs) const {
+        if (lhs.sqDistance < rhs.sqDistance) return false;
+        if (lhs.sqDistance > rhs.sqDistance) return true;
+        return (lhs.docid > rhs.docid);
+    }
+};
+struct NnsHitComparatorLessDocid {
+    bool operator() (const NnsHit &lhs, const NnsHit& rhs) const {
+        return (lhs.docid < rhs.docid);
+    }
+};
+
+template <typename FltType = float>
+class NNS
+{
+public:
+    NNS(uint32_t numDims, const DocVectorAccess<FltType> &dva)
+      : _numDims(numDims), _dva(dva)
+    {}
+
+    virtual void addDoc(uint32_t docid) = 0;
+    virtual void removeDoc(uint32_t docid) = 0;
+
+    using Vector = vespalib::ConstArrayRef<FltType>;
+    virtual std::vector<NnsHit> topK(uint32_t k, Vector vector, uint32_t search_k) = 0;
+    virtual ~NNS() {}
+protected:
+    uint32_t _numDims;
+    const DocVectorAccess<FltType> &_dva;
+};
+
+extern
+std::unique_ptr<NNS<float>>
+make_annoy_nns(uint32_t numDims, const DocVectorAccess<float> &dva);
+
+extern
+std::unique_ptr<NNS<float>>
+make_rplsh_nns(uint32_t numDims, const DocVectorAccess<float> &dva);
diff --git a/eval/src/tests/ann/sift_benchmark.cpp b/eval/src/tests/ann/sift_benchmark.cpp
new file mode 100644
index 00000000000..f64351166c1
--- /dev/null
+++ b/eval/src/tests/ann/sift_benchmark.cpp
@@ -0,0 +1,293 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/vespalib/util/priority_queue.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <chrono>
+
+#define NUM_DIMS 128
+#define NUM_DOCS 1000000
+#define NUM_Q 1000
+
+#include "doc_vector_access.h"
+#include "nns.h"
+#include "for-sift-hit.h"
+#include "for-sift-top-k.h"
+
+std::vector<TopK> bruteforceResults;
+
+struct PointVector {
+    float v[NUM_DIMS];
+    using ConstArr = vespalib::ConstArrayRef<float>;
+    operator ConstArr() const { return ConstArr(v, NUM_DIMS); }
+};
+
+static PointVector *generatedQueries =
+    (PointVector *) malloc(NUM_Q * sizeof(PointVector));
+
+static PointVector *generatedDocs =
+    (PointVector *) malloc(NUM_DOCS * sizeof(PointVector));
+
+struct DocVectorAdapter : public DocVectorAccess<float>
+{
+    vespalib::ConstArrayRef<float> get(uint32_t docid) const override {
+        ASSERT_TRUE(docid < NUM_DOCS);
+        return generatedDocs[docid];
+    }
+};
+
+double computeDistance(const PointVector &query, uint32_t docid) {
+    const PointVector &docvector = generatedDocs[docid];
+    return l2distCalc.l2sq_dist(query, docvector);
+}
+
+void read_queries(std::string fn) {
+    int fd = open(fn.c_str(), O_RDONLY);
+    ASSERT_TRUE(fd > 0);
+    int d;
+    size_t rv;
+    fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str());
+    for (uint32_t qid = 0; qid < NUM_Q; ++qid) {
+        rv = read(fd, &d, 4);
+        ASSERT_EQUAL(rv, 4u);
+        ASSERT_EQUAL(d, NUM_DIMS);
+        rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float));
+        ASSERT_EQUAL(rv, sizeof(PointVector));
+    }
+    close(fd);
+}
+
+void read_docs(std::string fn) {
+    int fd = open(fn.c_str(), O_RDONLY);
+    ASSERT_TRUE(fd > 0);
+    int d;
+    size_t rv;
+    fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str());
+    for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
+        rv = read(fd, &d, 4);
+        ASSERT_EQUAL(rv, 4u);
+        ASSERT_EQUAL(d, NUM_DIMS);
+        rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float));
+        ASSERT_EQUAL(rv, sizeof(PointVector));
+    }
+    close(fd);
+}
+
+using TimePoint = std::chrono::steady_clock::time_point;
+using Duration = std::chrono::steady_clock::duration;
+
+double to_ms(Duration elapsed) {
+    std::chrono::duration<double, std::milli> ms(elapsed);
+    return ms.count();
+}
+
+void read_data(std::string dir) {
+    TimePoint bef = std::chrono::steady_clock::now();
+    read_queries(dir + "/sift_query.fvecs");
+    TimePoint aft = std::chrono::steady_clock::now();
+    fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef));
+    bef = std::chrono::steady_clock::now();
+    read_docs(dir + "/sift_base.fvecs");
+    aft = std::chrono::steady_clock::now();
+    fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef));
+}
+
+
+struct BfHitComparator {
+    bool operator() (const Hit &lhs, const Hit& rhs) const {
+        if (lhs.distance < rhs.distance) return false;
+        if (lhs.distance > rhs.distance) return true;
+        return (lhs.docid > rhs.docid);
+    }
+};
+
+class BfHitHeap {
+private:
+    size_t _size;
+    vespalib::PriorityQueue<Hit, BfHitComparator> _priQ;
+public:
+    explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() {
+        _priQ.reserve(maxSize);
+    }
+    ~BfHitHeap() {}
+    void maybe_use(const Hit &hit) {
+        if (_priQ.size() < _size) {
+            _priQ.push(hit);
+        } else if (hit.distance < _priQ.front().distance) {
+            _priQ.front() = hit;
+            _priQ.adjust();
+        }
+    }
+    std::vector<Hit> bestHits() {
+        std::vector<Hit> result;
+        size_t i = _priQ.size();
+        result.resize(i);
+        while (i-- > 0) {
+            result[i] = _priQ.front();
+            _priQ.pop_front();
+        }
+        return result;
+    }
+};
+
+TopK bruteforce_nns(const PointVector &query) {
+    TopK result;
+    BfHitHeap heap(result.K);
+    std::vector<float> tmp_v(NUM_DIMS);
+    for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
+        const PointVector &docvector = generatedDocs[docid];
+        double d = l2distCalc.l2sq_dist(query, docvector, tmp_v);
+        Hit h(docid, d);
+        heap.maybe_use(h);
+    }
+    std::vector<Hit> best = heap.bestHits();
+    for (size_t i = 0; i < result.K; ++i) {
+        result.hits[i] = best[i];
+    }
+    return result;
+}
+
+void verifyBF(uint32_t qid) {
+    const PointVector &query = generatedQueries[qid];
+    TopK &result = bruteforceResults[qid];
+    double min_distance = result.hits[0].distance;
+    std::vector<double> all_c2;
+    for (uint32_t i = 0; i < NUM_DOCS; ++i) {
+        double dist = computeDistance(query, i);
+        if (dist < min_distance) {
+            fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance);
+        }
+        EXPECT_FALSE(dist+0.000001 < min_distance);
+        if (qid == 6) all_c2.push_back(dist / min_distance);
+    }
+    if (all_c2.size() != NUM_DOCS) return;
+    std::sort(all_c2.begin(), all_c2.end());
+    for (uint32_t idx : { 1, 3, 10, 30, 100, 300, 1000, 3000, NUM_DOCS/2, NUM_DOCS-1}) {
+        fprintf(stderr, "c2-factor[%u] = %.3f\n", idx, all_c2[idx]);
+    }
+}
+
+TEST("require that brute force works") {
+    TimePoint bef = std::chrono::steady_clock::now();
+    for (uint32_t cnt = 0; cnt < NUM_Q; ++cnt) {
+        const PointVector &query = generatedQueries[cnt];
+        bruteforceResults.emplace_back(bruteforce_nns(query));
+    }
+    TimePoint aft = std::chrono::steady_clock::now();
+    fprintf(stderr, "timing for brute force: %.3f ms = %.3f ms per query\n",
+            to_ms(aft - bef), to_ms(aft - bef)/NUM_Q);
+    for (int cnt = 0; cnt < NUM_Q; cnt = (cnt+1)*2) {
+        verifyBF(cnt);
+    }
+}
+
+using NNS_API = NNS<float>;
+
+TopK find_with_nns(uint32_t sk, NNS_API &nns, uint32_t qid) {
+    TopK result;
+    const PointVector &qv = generatedQueries[qid];
+    vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
+    auto rv = nns.topK(result.K, query, sk);
+    for (size_t i = 0; i < result.K; ++i) {
+        result.hits[i] = Hit(rv[i].docid, rv[i].sqDistance);
+    }
+    return result;
+}
+
+void verify_nns_quality(uint32_t sk, NNS_API &nns, uint32_t qid) {
+    TopK perfect = bruteforceResults[qid];
+    TopK result = find_with_nns(sk, nns, qid);
+    int recall = perfect.recall(result);
+    EXPECT_TRUE(recall > 40);
+    double sum_error = 0.0;
+    double c_factor = 1.0;
+    for (size_t i = 0; i < result.K; ++i) {
+        double factor = (result.hits[i].distance / perfect.hits[i].distance);
+        sum_error += factor;
+        c_factor = std::max(c_factor, factor);
+    }
+    EXPECT_TRUE(c_factor < 1.5);
+    fprintf(stderr, "quality sk=%u: query %u: recall %d, c2-factor %.3f, avg c2: %.3f\n",
+            sk, qid, recall, c_factor, sum_error / result.K);
+    if (qid == 6) {
+        for (size_t i = 0; i < 10; ++i) {
+            fprintf(stderr, "topk[%zu] BF{%u %.3f} index{%u %.3f}\n",
+                    i,
+                    perfect.hits[i].docid, perfect.hits[i].distance,
+                    result.hits[i].docid, result.hits[i].distance);
+        }
+    }
+}
+
+TEST("require that Locality Sensitive Hashing mostly works") {
+    TimePoint bef = std::chrono::steady_clock::now();
+    DocVectorAdapter adapter;
+    std::unique_ptr<NNS_API> nns = make_rplsh_nns(NUM_DIMS, adapter);
+    for (uint32_t i = 0; i < NUM_DOCS; ++i) {
+        nns->addDoc(i);
+    }
+    fprintf(stderr, "added %u documents...\n", NUM_DOCS);
+    TimePoint aft = std::chrono::steady_clock::now();
+    fprintf(stderr, "build RPLSH index: %.3f ms\n", to_ms(aft - bef));
+
+    for (uint32_t search_k : { 200, 1000 }) {
+        bef = std::chrono::steady_clock::now();
+        for (int cnt = 0; cnt < NUM_Q; ++cnt) {
+            find_with_nns(search_k, *nns, cnt);
+        }
+        aft = std::chrono::steady_clock::now();
+        fprintf(stderr, "timing for RPLSH search_k=%u: %.3f ms = %.3f ms per query\n",
+                search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q);
+        for (int cnt = 0; cnt < NUM_Q; ++cnt) {
+            verify_nns_quality(search_k, *nns, cnt);
+        }
+    }
+}
+
+TEST("require that Indexing via NNS api mostly works") {
+    fprintf(stderr, "trying indexing...\n");
+    TimePoint bef = std::chrono::steady_clock::now();
+    DocVectorAdapter adapter;
+    std::unique_ptr<NNS_API> nns = make_annoy_nns(NUM_DIMS, adapter);
+    for (uint32_t i = 0; i < NUM_DOCS; ++i) {
+        nns->addDoc(i);
+    }
+    fprintf(stderr, "added %u documents...\n", NUM_DOCS);
+    nns->topK(1, adapter.get(0), 1);
+    TimePoint aft = std::chrono::steady_clock::now();
+    fprintf(stderr, "build annoy index: %.3f ms\n", to_ms(aft - bef));
+
+    for (uint32_t search_k : { 10000, 20000 }) {
+        bef = std::chrono::steady_clock::now();
+        for (int cnt = 0; cnt < NUM_Q; ++cnt) {
+            find_with_nns(search_k, *nns, cnt);
+        }
+        aft = std::chrono::steady_clock::now();
+        fprintf(stderr, "timing for index search_k=%u: %.3f ms = %.3f ms per query\n",
+                search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q);
+        for (int cnt = 0; cnt < NUM_Q; ++cnt) {
+            verify_nns_quality(search_k, *nns, cnt);
+        }
+    }
+}
+
+int main(int argc, char **argv) {
+    TEST_MASTER.init(__FILE__);
+    std::string sift_dir = ".";
+    if (argc > 1) {
+        sift_dir = argv[1];
+    } else {
+        char *home = getenv("HOME");
+        if (home) {
+            sift_dir = home;
+            sift_dir += "/sift";
+        }
+    }
+    read_data(sift_dir);
+    TEST_RUN_ALL();
+    return (TEST_MASTER.fini() ? 0 : 1);
+}
diff --git a/eval/src/tests/ann/std-random.h b/eval/src/tests/ann/std-random.h
new file mode 100644
index 00000000000..e48a56006fd
--- /dev/null
+++ b/eval/src/tests/ann/std-random.h
@@ -0,0 +1,21 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+#include <random>
+
+class RndGen {
+private:
+    std::mt19937_64 urng;
+    std::normal_distribution<double> normRng;
+    std::uniform_real_distribution<double> uf;
+public:
+    RndGen() : urng(0x1234deadbeef5678uLL), normRng(), uf(0.0, 1.0) {}
+
+    double nextNormal() {
+        return normRng(urng);
+    }
+
+    double nextUniform() {
+        return uf(urng);
+    }
+};
diff --git a/eval/src/tests/ann/xp-annoy-nns.cpp b/eval/src/tests/ann/xp-annoy-nns.cpp
new file mode 100644
index 00000000000..e5661c0c044
--- /dev/null
+++ b/eval/src/tests/ann/xp-annoy-nns.cpp
@@ -0,0 +1,372 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "nns.h"
+#include "std-random.h"
+#include <assert.h>
+#include <algorithm>
+#include <queue>
+#include <set>
+
+using V = vespalib::ConstArrayRef<float>;
+class AnnoyLikeNns;
+struct Node;
+
+using QueueNode = std::pair<double, Node *>;
+using NodeQueue = std::priority_queue<QueueNode>;
+
+struct Node {
+    Node() {}
+    virtual ~Node() {}
+    virtual Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) = 0;
+    virtual int remove(uint32_t docid, V vector) = 0;
+    virtual void findCandidates(std::set<uint32_t> &cands, V vector, NodeQueue &queue, double minDist) const = 0;
+};
+
+struct LeafNode : public Node {
+    std::vector<uint32_t> docids;
+
+    LeafNode() : Node(), docids() { docids.reserve(128); }
+
+    Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) override;
+    int remove(uint32_t docid, V vector) override;
+    void findCandidates(std::set<uint32_t> &cands, V vector, NodeQueue &queue, double minDist) const override;
+
+    Node *split(AnnoyLikeNns &meta);
+};
+
+struct SplitNode : public Node {
+    std::vector<float> hyperPlane;
+    double offsetFromOrigo;
+    Node *leftChildren;
+    Node *rightChildren;
+
+    SplitNode() : Node(), hyperPlane(), offsetFromOrigo(), leftChildren(), rightChildren() {}
+    ~SplitNode();
+
+    Node *addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta) override;
+    int remove(uint32_t docid, V vector) override;
+    void findCandidates(std::set<uint32_t> &cands, V vector, NodeQueue &queue, double minDist) const override;
+
+    double planeDistance(V vector) const;
+};
+
+class AnnoyLikeNns : public NNS<float>
+{
+private:
+    std::vector<Node *> _roots;
+    RndGen _rndGen;
+    static constexpr size_t numRoots = 50;
+
+public:
+    AnnoyLikeNns(uint32_t numDims, const DocVectorAccess<float> &dva)
+        : NNS(numDims, dva), _roots(), _rndGen()
+    {
+        _roots.reserve(numRoots);
+        for (size_t i = 0; i < numRoots; ++i) {
+            _roots.push_back(new LeafNode());
+        }
+    }
+
+    ~AnnoyLikeNns() {
+        for (Node *root : _roots) {
+            delete root;
+        }
+    }
+
+    void addDoc(uint32_t docid) override {
+        V vector = _dva.get(docid);
+        for (Node * &root : _roots) {
+            root = root->addDoc(docid, vector, *this);
+        }
+    }
+
+    void removeDoc(uint32_t docid) override {
+        V vector = _dva.get(docid);
+        for (Node * root : _roots) {
+            root->remove(docid, vector);
+        }
+    }
+    std::vector<NnsHit> topK(uint32_t k, Vector vector, uint32_t search_k) override;
+
+    V getVector(uint32_t docid) const { return _dva.get(docid); }
+    double uniformRnd() { return _rndGen.nextUniform(); } 
+    uint32_t dims() const { return _numDims; }
+};
+
+
+double
+SplitNode::planeDistance(V vector) const
+{
+    assert(vector.size() == hyperPlane.size());
+    double dp = 0.0;
+    for (size_t i = 0; i < vector.size(); ++i) {
+        dp += vector[i] * hyperPlane[i];
+    }
+    return dp - offsetFromOrigo;
+}
+
+
+Node *
+LeafNode::addDoc(uint32_t docid, V, AnnoyLikeNns &meta)
+{
+    docids.push_back(docid);
+    if (docids.size() > 127) {
+        return split(meta);
+    }
+    return this;
+}
+
+struct WeightedCentroid {
+    uint32_t cnt;
+    std::vector<float> sum_point;
+    std::vector<float> tmp_vector;
+    WeightedCentroid(V vector)
+        : cnt(1), sum_point(), tmp_vector(vector.size())
+    {
+        sum_point.reserve(vector.size());
+        for (float val : vector) {
+            sum_point.push_back(val);
+        }
+    }
+    void add_v(V vector) {
+        ++cnt;
+        for (size_t i = 0; i < vector.size(); ++i) {
+            sum_point[i] += vector[i];
+        }
+    }
+    std::vector<float> norm_diff(WeightedCentroid other) {
+        std::vector<float> r;
+        const size_t sz = sum_point.size();
+        double my_inv = 1.0 / cnt;
+        double ot_inv = 1.0 / other.cnt;
+        double sumSq = 0.0;
+        r.reserve(sz);
+        for (size_t i = 0; i < sz; ++i) {
+            double d = (sum_point[i] * my_inv) - (other.sum_point[i] * ot_inv);
+            r.push_back(d);
+            sumSq += d*d;
+        }
+        if (sumSq > 0) {
+            double invnorm = 1.0 / sqrt(sumSq);
+            for (size_t i = 0; i < sz; ++i) {
+                r[i] *= invnorm;
+            }
+        }
+        return r;
+    }
+    std::vector<float> midpoint(WeightedCentroid other) {
+        std::vector<float> r;
+        size_t sz = sum_point.size();
+        r.reserve(sz);
+        double my_inv = 1.0 / cnt;
+        double ot_inv = 1.0 / other.cnt;
+        for (size_t i = 0; i < sz; ++i) {
+            double mp = (sum_point[i] * my_inv) + (other.sum_point[i] * ot_inv);
+            r.push_back(mp * 0.5);
+        }
+        return r;
+    }
+    double weightedDistance(V vector) {
+        size_t sz = vector.size();
+        for (size_t i = 0; i < sz; ++i) {
+            tmp_vector[i] = vector[i] * cnt;
+        }
+        return l2distCalc.l2sq_dist(tmp_vector, sum_point) / cnt;
+    }
+    ~WeightedCentroid() {}
+};
+
+Node *
+LeafNode::split(AnnoyLikeNns &meta)
+{
+    uint32_t dims = meta.dims();
+    uint32_t retries = 3;
+retry:
+    uint32_t p1i = uint32_t(meta.uniformRnd() * docids.size());
+    uint32_t p2i = uint32_t(meta.uniformRnd() * (docids.size()-1));
+    if (p2i >= p1i) ++p2i;
+    uint32_t p1d = docids[p1i];
+    uint32_t p2d = docids[p2i];
+    V p1 = meta.getVector(p1d);
+    V p2 = meta.getVector(p2d);
+
+    double sumsq = 0;
+    for (size_t i = 0; i < dims; ++i) {
+        double d = p1[i] - p2[i];
+        sumsq += d*d;
+    }
+    if ((!(sumsq > 0)) && (retries-- > 0)) {
+        goto retry;
+    }
+    WeightedCentroid centroid1(p1);
+    WeightedCentroid centroid2(p2);
+#if 1
+    for (size_t i = 0; (i * 1) < docids.size(); ++i) {
+        size_t p3i = (p1i + p2i + i) % docids.size();
+        uint32_t p3d = docids[p3i];
+        V p3 = meta.getVector(p3d);
+        double dist_c1 = centroid1.weightedDistance(p3);
+        double dist_c2 = centroid2.weightedDistance(p3);
+        bool use_c1 = false;
+        if (dist_c1 < dist_c2) {
+            use_c1 = true;
+        } else if (dist_c1 > dist_c2) {
+            use_c1 = false;
+        } else if (centroid1.cnt < centroid2.cnt) {
+            use_c1 = true;
+        }
+        if (use_c1) {
+            centroid1.add_v(p3);
+        } else {
+            centroid2.add_v(p3);
+        }
+    }
+#endif
+    std::vector<float> diff = centroid1.norm_diff(centroid2);
+    std::vector<float> mp = centroid1.midpoint(centroid2);
+    double off = l2distCalc.product(diff, mp);
+
+    SplitNode *s = new SplitNode();
+    s->hyperPlane = std::move(diff);
+    s->offsetFromOrigo = off;
+
+    std::vector<uint32_t> leftDs;
+    std::vector<uint32_t> rightDs;
+
+    for (uint32_t docid : docids) {
+        V vector = meta.getVector(docid);
+        double dist = s->planeDistance(vector);
+        bool left = false;
+        if (dist < 0) {
+            left = true;
+        } else if (!(dist > 0)) {
+            left = (leftDs.size() < rightDs.size());
+        }
+        if (left) {
+            leftDs.push_back(docid);
+        } else {
+            rightDs.push_back(docid);
+        }
+    }
+
+#if 0
+    fprintf(stderr, "splitting leaf node numChildren %u\n", numChildren);
+    fprintf(stderr, "dims = %u\n", dims);
+    fprintf(stderr, "p1 idx=%u, docid=%u VSZ=%zu\n", p1i, p1d, p1.size());
+    fprintf(stderr, "p2 idx=%u, docid=%u VSZ=%zu\n", p2i, p2d, p2.size());
+    fprintf(stderr, "diff %zu sumsq = %g\n", diff.size(), sumsq);
+    fprintf(stderr, "offset from origo = %g\n", off);
+    fprintf(stderr, "split left=%zu, right=%zu\n", leftDs.size(), rightDs.size());
+#endif
+
+    LeafNode *newRightNode = new LeafNode();
+    newRightNode->docids = rightDs;
+    s->rightChildren = newRightNode;
+    this->docids = leftDs;
+    s->leftChildren = this;
+    return s;
+}
+
+int
+LeafNode::remove(uint32_t docid, V)
+{
+    auto iter = std::remove(docids.begin(), docids.end(), docid);
+    int removed = docids.end() - iter;
+    docids.erase(iter, docids.end());
+    return removed;
+}
+
+void
+LeafNode::findCandidates(std::set<uint32_t> &cands, V, NodeQueue &, double) const
+{
+    for (uint32_t d : docids) {
+        cands.insert(d);
+    }
+}
+
+SplitNode::~SplitNode()
+{
+    delete leftChildren;
+    delete rightChildren;
+}
+
+Node *
+SplitNode::addDoc(uint32_t docid, V vector, AnnoyLikeNns &meta)
+{
+    double d = planeDistance(vector);
+    if (d < 0) {
+        leftChildren = leftChildren->addDoc(docid, vector, meta);
+    } else {
+        rightChildren = rightChildren->addDoc(docid, vector, meta);
+    }
+    return this;
+}
+
+int
+SplitNode::remove(uint32_t docid, V vector)
+{
+    double d = planeDistance(vector);
+    if (d < 0) {
+        int r = leftChildren->remove(docid, vector);
+        return r;
+    } else {
+        int r = rightChildren->remove(docid, vector);
+        return r;
+    }
+}
+
+void
+SplitNode::findCandidates(std::set<uint32_t> &, V vector, NodeQueue &queue, double minDist) const
+{
+    double d = planeDistance(vector);
+    // fprintf(stderr, "push 2 nodes dist %g\n", d);
+    queue.push(std::make_pair(std::min(-d, minDist), leftChildren));
+    queue.push(std::make_pair(std::min(d, minDist), rightChildren));
+}
+
+std::vector<NnsHit>
+AnnoyLikeNns::topK(uint32_t k, Vector vector, uint32_t search_k)
+{
+    std::vector<float> tmp;
+    tmp.resize(_numDims);
+    vespalib::ArrayRef<float> tmpArr(tmp);
+
+    std::vector<NnsHit> r;
+    r.reserve(k);
+    std::set<uint32_t> candidates;
+    NodeQueue queue;
+    // fprintf(stderr, "find %u candidates\n", k);
+    for (Node *root : _roots) {
+        double dist = std::numeric_limits<double>::max();
+        queue.push(std::make_pair(dist, root));
+    }
+    while ((candidates.size() < std::max(k, search_k)) && (queue.size() > 0)) {
+        const QueueNode& top = queue.top();
+        double md = top.first;
+        // fprintf(stderr, "find candidates: node with min distance %g\n", md);
+        Node *n = top.second;
+        queue.pop();
+        n->findCandidates(candidates, vector, queue, md);
+    }
+#if 0
+    while (queue.size() > 0) {
+        const QueueNode& top = queue.top();
+        fprintf(stderr, "discard candidates: node with distance %g\n", top.first);
+        queue.pop();
+    }
+#endif
+    for (uint32_t docid : candidates) {
+        double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid), tmpArr);
+        NnsHit hit(docid, dist);
+        r.push_back(hit);
+    }
+    std::sort(r.begin(), r.end(), NnsHitComparatorLessDistance());
+    while (r.size() > k) r.pop_back();
+    return r;
+}
+
+std::unique_ptr<NNS<float>>
+make_annoy_nns(uint32_t numDims, const DocVectorAccess<float> &dva)
+{
+    return std::make_unique<AnnoyLikeNns>(numDims, dva);
+}
diff --git a/eval/src/tests/ann/xp-lsh-nns.cpp b/eval/src/tests/ann/xp-lsh-nns.cpp
new file mode 100644
index 00000000000..285985167c0
--- /dev/null
+++ b/eval/src/tests/ann/xp-lsh-nns.cpp
@@ -0,0 +1,243 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "nns.h"
+#include "std-random.h"
+#include <assert.h>
+#include <string.h>
+#include <algorithm>
+#include <queue>
+#include <set>
+#include <vespa/vespalib/util/priority_queue.h>
+
+using V = vespalib::ConstArrayRef<float>;
+
+#define NUM_HASH_WORDS 4
+#define IGNORE_BITS 32
+#define HIST_SIZE (64*NUM_HASH_WORDS + 1)
+
+struct LsMaskHash {
+    uint64_t bits[NUM_HASH_WORDS];
+    uint64_t mask[NUM_HASH_WORDS];
+    LsMaskHash() {
+        memset(bits, 0xff, sizeof bits); 
+        memset(mask, 0xff, sizeof mask); 
+    }
+};
+
+static inline int hash_dist(const LsMaskHash &h1, const LsMaskHash &h2) {
+    int cnt = 0;
+    for (size_t o = 0; o < NUM_HASH_WORDS; ++o) {
+        uint64_t hx = h1.bits[o] ^ h2.bits[o];
+        hx &= (h1.mask[o] | h2.mask[o]);
+        cnt += __builtin_popcountl(hx);
+    }
+    return cnt;
+}
+
+struct Multiplier {
+    std::vector<float> multiplier;
+    Multiplier(size_t dims) : multiplier(dims, 0.0) {}
+};
+
+LsMaskHash mask_hash_from_pv(V p, std::vector<Multiplier> rpMatrix) {
+    LsMaskHash result;
+    float transformed[NUM_HASH_WORDS][64];
+    std::vector<double> squares;
+    for (size_t o = 0; o < NUM_HASH_WORDS; ++o) {
+        uint64_t hash = 0;
+        for (size_t bit = 0; bit < 64; ++bit) {
+            hash <<= 1u;
+            V m = rpMatrix[bit+64*o].multiplier;
+            double dotproduct = l2distCalc.product(m, p);
+            if (dotproduct > 0.0) {
+                hash |= 1u;
+            }
+            double sq = dotproduct * dotproduct;
+            transformed[o][bit] = sq;
+            squares.push_back(sq);
+        }
+        result.bits[o] = hash;
+    }
+    std::sort(squares.begin(), squares.end());
+    double lim = squares[IGNORE_BITS*NUM_HASH_WORDS-1];
+    for (size_t o = 0; o < NUM_HASH_WORDS; ++o) {
+        uint64_t mask = 0;
+        for (size_t bit = 0; bit < 64; ++bit) {
+            mask <<= 1u;
+            if (transformed[o][bit] > lim) {
+                mask |= 1u;
+            }
+        }
+        result.mask[o] = mask;
+    }
+    return result;
+}
+
+class RpLshNns : public NNS<float>
+{
+private:
+    RndGen _rndGen;
+    std::vector<Multiplier> _transformationMatrix;
+    std::vector<LsMaskHash> _generated_doc_hashes;
+
+public:
+    RpLshNns(uint32_t numDims, const DocVectorAccess<float> &dva)
+        : NNS(numDims, dva), _rndGen()
+    {
+        _transformationMatrix.reserve(NUM_HASH_WORDS*64);
+        for (size_t i = 0; i < NUM_HASH_WORDS*64; i++) {
+            _transformationMatrix.emplace_back(numDims);
+            Multiplier &mult = _transformationMatrix.back();
+            for (float &v : mult.multiplier) {
+                v = _rndGen.nextNormal();
+            }
+        }
+        fprintf(stderr, "ignore bits for lsh: %d*%d=%d\n",
+                IGNORE_BITS, NUM_HASH_WORDS, IGNORE_BITS*NUM_HASH_WORDS);
+        _generated_doc_hashes.reserve(100000);
+    }
+
+    ~RpLshNns() {
+    }
+
+    void addDoc(uint32_t docid) override {
+        V vector = _dva.get(docid);
+        LsMaskHash hash = mask_hash_from_pv(vector, _transformationMatrix);
+        if (_generated_doc_hashes.size() == docid) {
+            _generated_doc_hashes.push_back(hash);
+            return;
+        }
+        while (_generated_doc_hashes.size() <= docid) {
+            _generated_doc_hashes.push_back(LsMaskHash());
+        }
+        _generated_doc_hashes[docid] = hash;
+    }
+    void removeDoc(uint32_t docid) override {
+        if (_generated_doc_hashes.size() > docid) {
+            _generated_doc_hashes[docid] = LsMaskHash();
+        }
+    }
+    std::vector<NnsHit> topK(uint32_t k, Vector vector, uint32_t search_k) override;
+
+    V getVector(uint32_t docid) const { return _dva.get(docid); }
+    double uniformRnd() { return _rndGen.nextUniform(); } 
+    uint32_t dims() const { return _numDims; }
+};
+
+
+struct LshHit {
+    double distance;
+    uint32_t docid;
+    int hash_distance;
+    LshHit() : distance(0.0), docid(0u), hash_distance(0) {}
+    LshHit(int id, double dist, int hd = 0)
+        : distance(dist), docid(id), hash_distance(hd) {}
+};
+
+struct LshHitComparator {
+    bool operator() (const LshHit &lhs, const LshHit& rhs) const {
+        if (lhs.distance < rhs.distance) return false;
+        if (lhs.distance > rhs.distance) return true;
+        return (lhs.docid > rhs.docid);
+    }
+};
+
+class LshHitHeap {
+private:
+    size_t _size;
+    vespalib::PriorityQueue<LshHit, LshHitComparator> _priQ;
+    std::vector<int> hd_histogram;
+public:
+    explicit LshHitHeap(size_t maxSize) : _size(maxSize), _priQ() {
+        _priQ.reserve(maxSize);
+    }
+    ~LshHitHeap() {}
+    bool maybe_use(const LshHit &hit) {
+        if (_priQ.size() < _size) {
+            _priQ.push(hit);
+            uint32_t newHd = hit.hash_distance;
+            while (hd_histogram.size() <= newHd) {
+                hd_histogram.push_back(0);
+            }
+            hd_histogram[newHd]++;
+        } else if (hit.distance < _priQ.front().distance) {
+            uint32_t oldHd = _priQ.front().hash_distance;
+            uint32_t newHd = hit.hash_distance;
+            while (hd_histogram.size() <= newHd) {
+                hd_histogram.push_back(0);
+            }
+            hd_histogram[newHd]++;
+            hd_histogram[oldHd]--;
+            _priQ.front() = hit;
+            _priQ.adjust();
+            return true;
+        }
+        return false;
+    }
+    int limitHashDistance() {
+        size_t sz = _priQ.size();
+        uint32_t sum = 0;
+        for (uint32_t i = 0; i < hd_histogram.size(); ++i) {
+            sum += hd_histogram[i];
+            if (sum >= ((3*sz)/4)) return i;
+        }
+        return 99999;
+    }
+    std::vector<LshHit> bestLshHits() {
+        std::vector<LshHit> result;
+        size_t sz = _priQ.size();
+        result.resize(sz);
+        for (size_t i = sz; i-- > 0; ) {
+            result[i] = _priQ.front();
+            _priQ.pop_front();
+        }
+        return result;
+    }
+};
+
+std::vector<NnsHit>
+RpLshNns::topK(uint32_t k, Vector vector, uint32_t search_k)
+{
+    std::vector<NnsHit> result;
+    result.reserve(k);
+
+    std::vector<float> tmp(_numDims);
+    vespalib::ArrayRef<float> tmpArr(tmp);
+
+    LsMaskHash query_hash = mask_hash_from_pv(vector, _transformationMatrix);
+    LshHitHeap heap(std::max(k, search_k));
+    int limit_hash_dist = 99999;
+    int histogram[HIST_SIZE];
+    int skipCnt = 0;
+    int fullCnt = 0;
+    int whdcCnt = 0;
+    memset(histogram, 0, sizeof histogram);
+    size_t docidLimit = _generated_doc_hashes.size();
+    for (uint32_t docid = 0; docid < docidLimit; ++docid) {
+        int hd = hash_dist(query_hash, _generated_doc_hashes[docid]);
+        histogram[hd]++;
+        if (hd <= limit_hash_dist) {
+            ++fullCnt;
+            double dist = l2distCalc.l2sq_dist(vector, _dva.get(docid), tmpArr);
+            LshHit h(docid, dist, hd);
+            if (heap.maybe_use(h)) {
+                ++whdcCnt;
+                limit_hash_dist = heap.limitHashDistance();
+            }
+        } else {
+            ++skipCnt;
+        }
+    }
+    std::vector<LshHit> best = heap.bestLshHits();
+    size_t numHits = std::min((size_t)k, best.size());
+    for (size_t i = 0; i < numHits; ++i) {
+        result.emplace_back(best[i].docid, best[i].distance);
+    }
+    return result;
+}
+
+std::unique_ptr<NNS<float>>
+make_rplsh_nns(uint32_t numDims, const DocVectorAccess<float> &dva)
+{
+    return std::make_unique<RpLshNns>(numDims, dva);
+}
author	Arne H Juul <arnej27959@users.noreply.github.com>	2019-12-20 13:21:01 +0100
committer	GitHub <noreply@github.com>	2019-12-20 13:21:01 +0100
commit	7fcc9552456bc4ea862c2647541c48fca0eb11f6 (patch)
tree	0c4cfec33c1b14bcdf856f26e85fc5171fa25fb0 /eval
parent	24d224ff74817ccf47ba5356cd002fecc2cec1c6 (diff)
parent	67e1b35011e14b0c1dafefe44d3ae4989796bf4b (diff)