summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArne Juul <arnej@verizonmedia.com>2020-02-11 10:10:10 +0000
committerArne Juul <arnej@verizonmedia.com>2020-02-11 11:23:45 +0000
commit47710bc9914f2a4fefef940ba53bbb5ecb60fc86 (patch)
tree762a7d07b358f2b3ce2a8e30c065983922e1dda9
parent0a95e7d96c558de508c670d65e18f1dddd4ee417 (diff)
* add "remove" benchmark
* redo ops tracking * use std::aligned_alloc * more stats - measure reach This reverts commit 37fd87978ab1c3abfa840403e4e8f289d5ea4a20.
-rw-r--r--eval/src/tests/ann/CMakeLists.txt10
-rw-r--r--eval/src/tests/ann/remove-bm.cpp514
-rw-r--r--eval/src/tests/ann/sift_benchmark.cpp14
-rw-r--r--eval/src/tests/ann/xp-annoy-nns.cpp20
-rw-r--r--eval/src/tests/ann/xp-hnsw-wrap.cpp5
-rw-r--r--eval/src/tests/ann/xp-hnswlike-nns.cpp161
6 files changed, 652 insertions, 72 deletions
diff --git a/eval/src/tests/ann/CMakeLists.txt b/eval/src/tests/ann/CMakeLists.txt
index 05256d19f00..52b4d675d9c 100644
--- a/eval/src/tests/ann/CMakeLists.txt
+++ b/eval/src/tests/ann/CMakeLists.txt
@@ -9,3 +9,13 @@ vespa_add_executable(eval_sift_benchmark_app
DEPENDS
vespaeval
)
+
+vespa_add_executable(eval_remove_bm_app
+ SOURCES
+ remove-bm.cpp
+ xp-annoy-nns.cpp
+ xp-hnswlike-nns.cpp
+ xp-lsh-nns.cpp
+ DEPENDS
+ vespaeval
+)
diff --git a/eval/src/tests/ann/remove-bm.cpp b/eval/src/tests/ann/remove-bm.cpp
new file mode 100644
index 00000000000..2da735f1929
--- /dev/null
+++ b/eval/src/tests/ann/remove-bm.cpp
@@ -0,0 +1,514 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/vespalib/util/priority_queue.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <chrono>
+
+#define NUM_DIMS 960
+#define NUM_DOCS 250000
+#define NUM_DOCS_REMOVE 50000
+#define EFFECTIVE_DOCS (NUM_DOCS - NUM_DOCS_REMOVE)
+#define NUM_Q 1000
+
+#include "doc_vector_access.h"
+#include "nns.h"
+#include "for-sift-hit.h"
+#include "for-sift-top-k.h"
+
+std::vector<TopK> bruteforceResults;
+std::vector<float> tmp_v(NUM_DIMS);
+
+struct PointVector {
+ float v[NUM_DIMS];
+ using ConstArr = vespalib::ConstArrayRef<float>;
+ operator ConstArr() const { return ConstArr(v, NUM_DIMS); }
+};
+
+static PointVector *aligned_alloc(size_t num) {
+ size_t num_bytes = num * sizeof(PointVector);
+ double mega_bytes = num_bytes / (1024.0*1024.0);
+ fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes);
+ char *mem = (char *)malloc(num_bytes + 512);
+ mem += 512;
+ size_t val = (size_t)mem;
+ size_t unalign = val % 512;
+ mem -= unalign;
+ return (PointVector *)mem;
+}
+
+static PointVector *generatedQueries = aligned_alloc(NUM_Q);
+static PointVector *generatedDocs = aligned_alloc(NUM_DOCS);
+
+struct DocVectorAdapter : public DocVectorAccess<float>
+{
+ vespalib::ConstArrayRef<float> get(uint32_t docid) const override {
+ ASSERT_TRUE(docid < NUM_DOCS);
+ return generatedDocs[docid];
+ }
+};
+
+double computeDistance(const PointVector &query, uint32_t docid) {
+ const PointVector &docvector = generatedDocs[docid];
+ return l2distCalc.l2sq_dist(query, docvector, tmp_v);
+}
+
+void read_queries(std::string fn) {
+ int fd = open(fn.c_str(), O_RDONLY);
+ ASSERT_TRUE(fd > 0);
+ int d;
+ size_t rv;
+ fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str());
+ for (uint32_t qid = 0; qid < NUM_Q; ++qid) {
+ rv = read(fd, &d, 4);
+ ASSERT_EQUAL(rv, 4u);
+ ASSERT_EQUAL(d, NUM_DIMS);
+ rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float));
+ ASSERT_EQUAL(rv, sizeof(PointVector));
+ }
+ close(fd);
+}
+
+void read_docs(std::string fn) {
+ int fd = open(fn.c_str(), O_RDONLY);
+ ASSERT_TRUE(fd > 0);
+ int d;
+ size_t rv;
+ fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str());
+ for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
+ rv = read(fd, &d, 4);
+ ASSERT_EQUAL(rv, 4u);
+ ASSERT_EQUAL(d, NUM_DIMS);
+ rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float));
+ ASSERT_EQUAL(rv, sizeof(PointVector));
+ }
+ close(fd);
+}
+
+using TimePoint = std::chrono::steady_clock::time_point;
+using Duration = std::chrono::steady_clock::duration;
+
+double to_ms(Duration elapsed) {
+ std::chrono::duration<double, std::milli> ms(elapsed);
+ return ms.count();
+}
+
+void read_data(std::string dir) {
+ TimePoint bef = std::chrono::steady_clock::now();
+ read_queries(dir + "/gist_query.fvecs");
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef));
+ bef = std::chrono::steady_clock::now();
+ read_docs(dir + "/gist_base.fvecs");
+ aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef));
+}
+
+
+struct BfHitComparator {
+ bool operator() (const Hit &lhs, const Hit& rhs) const {
+ if (lhs.distance < rhs.distance) return false;
+ if (lhs.distance > rhs.distance) return true;
+ return (lhs.docid > rhs.docid);
+ }
+};
+
+class BfHitHeap {
+private:
+ size_t _size;
+ vespalib::PriorityQueue<Hit, BfHitComparator> _priQ;
+public:
+ explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() {
+ _priQ.reserve(maxSize);
+ }
+ ~BfHitHeap() {}
+ void maybe_use(const Hit &hit) {
+ if (_priQ.size() < _size) {
+ _priQ.push(hit);
+ } else if (hit.distance < _priQ.front().distance) {
+ _priQ.front() = hit;
+ _priQ.adjust();
+ }
+ }
+ std::vector<Hit> bestHits() {
+ std::vector<Hit> result;
+ size_t i = _priQ.size();
+ result.resize(i);
+ while (i-- > 0) {
+ result[i] = _priQ.front();
+ _priQ.pop_front();
+ }
+ return result;
+ }
+};
+
+TopK bruteforce_nns(const PointVector &query) {
+ TopK result;
+ BfHitHeap heap(result.K);
+ for (uint32_t docid = 0; docid < EFFECTIVE_DOCS; ++docid) {
+ const PointVector &docvector = generatedDocs[docid];
+ double d = l2distCalc.l2sq_dist(query, docvector, tmp_v);
+ Hit h(docid, d);
+ heap.maybe_use(h);
+ }
+ std::vector<Hit> best = heap.bestHits();
+ for (size_t i = 0; i < result.K; ++i) {
+ result.hits[i] = best[i];
+ }
+ return result;
+}
+
+void verifyBF(uint32_t qid) {
+ const PointVector &query = generatedQueries[qid];
+ TopK &result = bruteforceResults[qid];
+ double min_distance = result.hits[0].distance;
+ std::vector<double> all_c2;
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
+ double dist = computeDistance(query, i);
+ if (dist < min_distance) {
+ fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance);
+ }
+ EXPECT_FALSE(dist+0.000001 < min_distance);
+ if (min_distance > 0.0) all_c2.push_back(dist / min_distance);
+ }
+ if (all_c2.size() != EFFECTIVE_DOCS) return;
+ std::sort(all_c2.begin(), all_c2.end());
+ for (uint32_t idx : { 1, 3, 10, 30, 100, 300, 1000, 3000, EFFECTIVE_DOCS/2, EFFECTIVE_DOCS-1}) {
+ fprintf(stderr, "c2-factor[%u] = %.3f\n", idx, all_c2[idx]);
+ }
+}
+
+using NNS_API = NNS<float>;
+
+#if 1
+TEST("require that HNSW via NNS api remove all works") {
+ DocVectorAdapter adapter;
+ std::unique_ptr<NNS_API> nns = make_hnsw_nns(NUM_DIMS, adapter);
+ fprintf(stderr, "adding and removing all docs forward...\n");
+ for (uint32_t i = 0; i < 1000; ++i) {
+ nns->addDoc(i);
+ }
+ for (uint32_t i = 0; i < 1000; ++i) {
+ nns->removeDoc(i);
+ }
+ fprintf(stderr, "adding and removing all docs reverse...\n");
+ for (uint32_t i = 1000; i < 2000; ++i) {
+ nns->addDoc(i);
+ }
+ for (uint32_t i = 2000; i-- > 1000; ) {
+ nns->removeDoc(i);
+ }
+}
+#endif
+
+TEST("require that brute force works") {
+ TimePoint bef = std::chrono::steady_clock::now();
+ fprintf(stderr, "generating %u brute force results\n", NUM_Q);
+ bruteforceResults.reserve(NUM_Q);
+ for (uint32_t cnt = 0; cnt < NUM_Q; ++cnt) {
+ const PointVector &query = generatedQueries[cnt];
+ bruteforceResults.emplace_back(bruteforce_nns(query));
+ }
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "timing for brute force: %.3f ms = %.3f ms per query\n",
+ to_ms(aft - bef), to_ms(aft - bef)/NUM_Q);
+ for (int cnt = 0; cnt < NUM_Q; cnt = (cnt+1)*2) {
+ verifyBF(cnt);
+ }
+}
+
+bool reach_with_nns_1(NNS_API &nns, uint32_t docid) {
+ const PointVector &qv = generatedDocs[docid];
+ vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
+ auto rv = nns.topK(1, query, 1);
+ if (rv.size() != 1) {
+ fprintf(stderr, "Result/A from query for %u is %zu hits\n", docid, rv.size());
+ return false;
+ }
+ if (rv[0].docid != docid) {
+ if (rv[0].sq.distance != 0.0)
+ fprintf(stderr, "Expected/A to find %u but got %u with sq distance %.3f\n",
+ docid, rv[0].docid, rv[0].sq.distance);
+ }
+ return (rv[0].docid == docid || rv[0].sq.distance == 0.0);
+}
+
+bool reach_with_nns_100(NNS_API &nns, uint32_t docid) {
+ const PointVector &qv = generatedDocs[docid];
+ vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
+ auto rv = nns.topK(10, query, 100);
+ if (rv.size() != 10) {
+ fprintf(stderr, "Result/B from query for %u is %zu hits\n", docid, rv.size());
+ }
+ if (rv[0].docid != docid) {
+ if (rv[0].sq.distance != 0.0)
+ fprintf(stderr, "Expected/B to find %u but got %u with sq distance %.3f\n",
+ docid, rv[0].docid, rv[0].sq.distance);
+ }
+ return (rv[0].docid == docid || rv[0].sq.distance == 0.0);
+}
+
+bool reach_with_nns_1k(NNS_API &nns, uint32_t docid) {
+ const PointVector &qv = generatedDocs[docid];
+ vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
+ auto rv = nns.topK(10, query, 1000);
+ if (rv.size() != 10) {
+ fprintf(stderr, "Result/C from query for %u is %zu hits\n", docid, rv.size());
+ }
+ if (rv[0].docid != docid) {
+ if (rv[0].sq.distance != 0.0)
+ fprintf(stderr, "Expected/C to find %u but got %u with sq distance %.3f\n",
+ docid, rv[0].docid, rv[0].sq.distance);
+ }
+ return (rv[0].docid == docid || rv[0].sq.distance == 0.0);
+}
+
+TopK find_with_nns(uint32_t sk, NNS_API &nns, uint32_t qid) {
+ TopK result;
+ const PointVector &qv = generatedQueries[qid];
+ vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
+ auto rv = nns.topK(result.K, query, sk);
+ for (size_t i = 0; i < result.K; ++i) {
+ result.hits[i] = Hit(rv[i].docid, rv[i].sq.distance);
+ }
+ return result;
+}
+
+void verify_nns_quality(uint32_t sk, NNS_API &nns, uint32_t qid) {
+ TopK perfect = bruteforceResults[qid];
+ TopK result = find_with_nns(sk, nns, qid);
+ int recall = perfect.recall(result);
+ EXPECT_TRUE(recall > 40);
+ double sum_error = 0.0;
+ double c_factor = 1.0;
+ for (size_t i = 0; i < result.K; ++i) {
+ double factor = (result.hits[i].distance / perfect.hits[i].distance);
+ if (factor < 0.99 || factor > 25) {
+ fprintf(stderr, "hit[%zu] got distance %.3f, expected %.3f\n",
+ i, result.hits[i].distance, perfect.hits[i].distance);
+ }
+ sum_error += factor;
+ c_factor = std::max(c_factor, factor);
+ }
+ EXPECT_TRUE(c_factor < 1.5);
+ fprintf(stderr, "quality sk=%u: query %u: recall %d c2-factor %.3f avg c2: %.3f\n",
+ sk, qid, recall, c_factor, sum_error / result.K);
+}
+
+void timing_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) {
+ for (uint32_t search_k : sk_list) {
+ TimePoint bef = std::chrono::steady_clock::now();
+ for (int cnt = 0; cnt < NUM_Q; ++cnt) {
+ find_with_nns(search_k, nns, cnt);
+ }
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "timing for %s search_k=%u: %.3f ms = %.3f ms/q\n",
+ name, search_k, to_ms(aft - bef), to_ms(aft - bef)/NUM_Q);
+ }
+}
+
+void quality_nns(NNS_API &nns, std::vector<uint32_t> sk_list) {
+ for (uint32_t search_k : sk_list) {
+ for (int cnt = 0; cnt < NUM_Q; ++cnt) {
+ verify_nns_quality(search_k, nns, cnt);
+ }
+ }
+ uint32_t reached = 0;
+ for (uint32_t i = 0; i < 20000; ++i) {
+ if (reach_with_nns_1(nns, i)) ++reached;
+ }
+ fprintf(stderr, "Could reach %u of 20000 first documents with k=1\n", reached);
+ reached = 0;
+ for (uint32_t i = 0; i < 20000; ++i) {
+ if (reach_with_nns_100(nns, i)) ++reached;
+ }
+ fprintf(stderr, "Could reach %u of 20000 first documents with k=100\n", reached);
+ reached = 0;
+ for (uint32_t i = 0; i < 20000; ++i) {
+ if (reach_with_nns_1k(nns, i)) ++reached;
+ }
+ fprintf(stderr, "Could reach %u of 20000 first documents with k=1000\n", reached);
+}
+
+void benchmark_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) {
+ fprintf(stderr, "trying %s indexing...\n", name);
+
+#if 0
+ TimePoint bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(EFFECTIVE_DOCS + i);
+ }
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(i);
+ }
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.removeDoc(EFFECTIVE_DOCS + i);
+ nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i);
+ }
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
+
+ timing_nns(name, nns, sk_list);
+ fprintf(stderr, "Quality for %s realistic build with %u documents:\n", name, EFFECTIVE_DOCS);
+ quality_nns(nns, sk_list);
+#endif
+
+#if 1
+ TimePoint bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
+ nns.addDoc(i);
+ }
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
+
+ timing_nns(name, nns, sk_list);
+ fprintf(stderr, "Quality for %s clean build with %u documents:\n", name, EFFECTIVE_DOCS);
+ quality_nns(nns, sk_list);
+
+ bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(EFFECTIVE_DOCS + i);
+ }
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.removeDoc(EFFECTIVE_DOCS + i);
+ }
+ aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index add then remove %u docs: %.3f ms\n",
+ name, NUM_DOCS_REMOVE, to_ms(aft - bef));
+
+ timing_nns(name, nns, sk_list);
+ fprintf(stderr, "Quality for %s remove-damaged build with %u documents:\n", name, EFFECTIVE_DOCS);
+ quality_nns(nns, sk_list);
+#endif
+
+#if 0
+ TimePoint bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
+ nns.addDoc(i);
+ }
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
+
+ timing_nns(name, nns, sk_list);
+ fprintf(stderr, "Quality for %s clean build with %u documents:\n", name, EFFECTIVE_DOCS);
+ quality_nns(nns, sk_list);
+
+ bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
+ nns.removeDoc(i);
+ }
+ aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index removed %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
+
+ const uint32_t addFirst = NUM_DOCS - (NUM_DOCS_REMOVE * 3);
+ const uint32_t addSecond = NUM_DOCS - (NUM_DOCS_REMOVE * 2);
+
+ bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < addFirst; ++i) {
+ nns.addDoc(i);
+ }
+ aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, addFirst, to_ms(aft - bef));
+
+ bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(EFFECTIVE_DOCS + i);
+ nns.addDoc(addFirst + i);
+ }
+ aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index added %u docs: %.3f ms\n",
+ name, 2 * NUM_DOCS_REMOVE, to_ms(aft - bef));
+
+ bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.removeDoc(EFFECTIVE_DOCS + i);
+ nns.addDoc(addSecond + i);
+ }
+ aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index added %u and removed %u docs: %.3f ms\n",
+ name, NUM_DOCS_REMOVE, NUM_DOCS_REMOVE, to_ms(aft - bef));
+
+ timing_nns(name, nns, sk_list);
+ fprintf(stderr, "Quality for %s with %u documents some churn:\n", name, EFFECTIVE_DOCS);
+ quality_nns(nns, sk_list);
+
+#endif
+
+#if 0
+ bef = std::chrono::steady_clock::now();
+ fprintf(stderr, "removing and adding %u documents...\n", EFFECTIVE_DOCS);
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
+ nns.removeDoc(i);
+ nns.addDoc(i);
+ }
+ aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index rem/add %u docs: %.3f ms\n",
+ name, EFFECTIVE_DOCS, to_ms(aft - bef));
+
+ timing_nns(name, nns, sk_list);
+ fprintf(stderr, "Quality for %s with %u documents full churn:\n", name, EFFECTIVE_DOCS);
+ quality_nns(nns, sk_list);
+#endif
+}
+
+#if 0
+TEST("require that Locality Sensitive Hashing mostly works") {
+ DocVectorAdapter adapter;
+ std::unique_ptr<NNS_API> nns = make_rplsh_nns(NUM_DIMS, adapter);
+ benchmark_nns("RPLSH", *nns, { 200, 1000 });
+}
+#endif
+
+#if 0
+TEST("require that Annoy via NNS api mostly works") {
+ DocVectorAdapter adapter;
+ std::unique_ptr<NNS_API> nns = make_annoy_nns(NUM_DIMS, adapter);
+ benchmark_nns("Annoy", *nns, { 8000, 10000 });
+}
+#endif
+
+#if 1
+TEST("require that HNSW via NNS api mostly works") {
+ DocVectorAdapter adapter;
+ std::unique_ptr<NNS_API> nns = make_hnsw_nns(NUM_DIMS, adapter);
+ benchmark_nns("HNSW-like", *nns, { 100, 150, 200 });
+}
+#endif
+
+#if 0
+TEST("require that HNSW wrapped api mostly works") {
+ DocVectorAdapter adapter;
+ std::unique_ptr<NNS_API> nns = make_hnsw_wrap(NUM_DIMS, adapter);
+ benchmark_nns("HNSW-wrap", *nns, { 100, 150, 200 });
+}
+#endif
+
+/**
+ * Before running the benchmark the ANN_GIST1M data set must be downloaded and extracted:
+ * wget ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz
+ * tar -xf gist.tar.gz
+ *
+ * The benchmark program will load the data set from $HOME/gist if no directory is specified.
+ *
+ * More information about the dataset is found here: http://corpus-texmex.irisa.fr/.
+ */
+int main(int argc, char **argv) {
+ TEST_MASTER.init(__FILE__);
+ std::string gist_dir = ".";
+ if (argc > 1) {
+ gist_dir = argv[1];
+ } else {
+ char *home = getenv("HOME");
+ if (home) {
+ gist_dir = home;
+ gist_dir += "/gist";
+ }
+ }
+ read_data(gist_dir);
+ TEST_RUN_ALL();
+ return (TEST_MASTER.fini() ? 0 : 1);
+}
diff --git a/eval/src/tests/ann/sift_benchmark.cpp b/eval/src/tests/ann/sift_benchmark.cpp
index f3570eb9247..7c060d86371 100644
--- a/eval/src/tests/ann/sift_benchmark.cpp
+++ b/eval/src/tests/ann/sift_benchmark.cpp
@@ -8,6 +8,7 @@
#include <unistd.h>
#include <stdio.h>
#include <chrono>
+#include <cstdlib>
#define NUM_DIMS 128
#define NUM_DOCS 1000000
@@ -28,11 +29,12 @@ struct PointVector {
};
static PointVector *aligned_alloc(size_t num) {
- char *mem = (char *)malloc(num * sizeof(PointVector) + 512);
- mem += 512;
- size_t val = (size_t)mem;
- size_t unalign = val % 512;
- mem -= unalign;
+ size_t sz = num * sizeof(PointVector);
+ size_t align = 512;
+ while ((sz % align) != 0) { align /= 2; }
+ double mega_bytes = sz / (1024.0*1024.0);
+ fprintf(stderr, "allocate %.2f MB of vectors (align %zu)\n", mega_bytes, align);
+ void *mem = std::aligned_alloc(align, sz);
return reinterpret_cast<PointVector *>(mem);
}
@@ -169,7 +171,7 @@ void verifyBF(uint32_t qid) {
fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance);
}
EXPECT_FALSE(dist+0.000001 < min_distance);
- if (qid == 6) all_c2.push_back(dist / min_distance);
+ if (min_distance > 0) all_c2.push_back(dist / min_distance);
}
if (all_c2.size() != NUM_DOCS) return;
std::sort(all_c2.begin(), all_c2.end());
diff --git a/eval/src/tests/ann/xp-annoy-nns.cpp b/eval/src/tests/ann/xp-annoy-nns.cpp
index c34f9f6eb36..f022aae5974 100644
--- a/eval/src/tests/ann/xp-annoy-nns.cpp
+++ b/eval/src/tests/ann/xp-annoy-nns.cpp
@@ -12,11 +12,11 @@ using V = vespalib::ConstArrayRef<float>;
class AnnoyLikeNns;
struct Node;
-static uint64_t plane_dist_cnt = 0;
-static uint64_t w_cen_dist_cnt = 0;
-static uint64_t leaf_split_cnt = 0;
-static uint64_t find_top_k_cnt = 0;
-static uint64_t find_cand_cnt = 0;
+static size_t plane_dist_cnt = 0;
+static size_t w_cen_dist_cnt = 0;
+static size_t leaf_split_cnt = 0;
+static size_t find_top_k_cnt = 0;
+static size_t find_cand_cnt = 0;
using QueueNode = std::pair<double, Node *>;
using NodeQueue = std::priority_queue<QueueNode>;
@@ -390,11 +390,11 @@ AnnoyLikeNns::topK(uint32_t k, Vector vector, uint32_t search_k)
void
AnnoyLikeNns::dumpStats() {
fprintf(stderr, "stats for AnnoyLikeNns:\n");
- fprintf(stderr, "planeDistance() calls: %" PRIu64 "\n", plane_dist_cnt);
- fprintf(stderr, "weightedDistance() calls: %" PRIu64 "\n", w_cen_dist_cnt);
- fprintf(stderr, "leaf split() calls: %" PRIu64 "\n", leaf_split_cnt);
- fprintf(stderr, "topK() calls: %" PRIu64 "\n", find_top_k_cnt);
- fprintf(stderr, "findCandidates() calls: %" PRIu64 "\n", find_cand_cnt);
+ fprintf(stderr, "planeDistance() calls: %zu\n", plane_dist_cnt);
+ fprintf(stderr, "weightedDistance() calls: %zu\n", w_cen_dist_cnt);
+ fprintf(stderr, "leaf split() calls: %zu\n", leaf_split_cnt);
+ fprintf(stderr, "topK() calls: %zu\n", find_top_k_cnt);
+ fprintf(stderr, "findCandidates() calls: %zu\n", find_cand_cnt);
std::vector<uint32_t> depths;
_roots[0]->stats(depths);
std::vector<uint32_t> counts;
diff --git a/eval/src/tests/ann/xp-hnsw-wrap.cpp b/eval/src/tests/ann/xp-hnsw-wrap.cpp
index 33895b2bd7c..3eb01142dcd 100644
--- a/eval/src/tests/ann/xp-hnsw-wrap.cpp
+++ b/eval/src/tests/ann/xp-hnsw-wrap.cpp
@@ -15,7 +15,7 @@ public:
HnswWrapNns(uint32_t numDims, const DocVectorAccess<float> &dva)
: NNS(numDims, dva),
_l2space(numDims),
- _hnsw(&_l2space, 1000000, 16, 200)
+ _hnsw(&_l2space, 2500000, 16, 200)
{
}
@@ -32,7 +32,8 @@ public:
std::vector<NnsHit> topK(uint32_t k, Vector vector, uint32_t search_k) override {
std::vector<NnsHit> reversed;
- auto priQ = _hnsw.searchKnn(vector.cbegin(), std::max(k, search_k));
+ _hnsw.setEf(search_k);
+ auto priQ = _hnsw.searchKnn(vector.cbegin(), k);
while (! priQ.empty()) {
auto pair = priQ.top();
reversed.emplace_back(pair.second, SqDist(pair.first));
diff --git a/eval/src/tests/ann/xp-hnswlike-nns.cpp b/eval/src/tests/ann/xp-hnswlike-nns.cpp
index 72b3fdb21f9..5cdbdd8efa3 100644
--- a/eval/src/tests/ann/xp-hnswlike-nns.cpp
+++ b/eval/src/tests/ann/xp-hnswlike-nns.cpp
@@ -7,13 +7,31 @@
#include "std-random.h"
#include "nns.h"
-static uint64_t distcalls_simple;
-static uint64_t distcalls_search_layer;
-static uint64_t distcalls_other;
-static uint64_t distcalls_heuristic;
-static uint64_t distcalls_shrink;
-static uint64_t distcalls_refill;
-static uint64_t refill_needed_calls;
+/*
+ Todo:
+
+ measure effect of:
+ 1) removing leftover backlinks during "shrink" operation
+ 2) refilling to low-watermark after 1) happens
+ 3) refilling to mid-watermark after 1) happens
+ 4) adding then removing 20% extra documents
+ 5) removing 20% first-added documents
+ 6) removing first-added documents while inserting new ones
+
+ 7) auto-tune search_k to ensure >= 50% recall on 1000 Q with k=100
+ 8) auto-tune search_k to ensure avg 90% recall on 1000 Q with k=100
+ 9) auto-tune search_k to ensure >= 90% reachability of 10000 docids
+
+ 10) timings for SIFT, GIST, and DEEP data (100k, 200k, 300k, 500k, 700k, 1000k)
+ */
+
+static size_t distcalls_simple;
+static size_t distcalls_search_layer;
+static size_t distcalls_other;
+static size_t distcalls_heuristic;
+static size_t distcalls_shrink;
+static size_t distcalls_refill;
+static size_t refill_needed_calls;
struct LinkList : std::vector<uint32_t>
{
@@ -31,6 +49,7 @@ struct LinkList : std::vector<uint32_t>
}
}
fprintf(stderr, "BAD missing link to remove: %u\n", id);
+ abort();
}
};
@@ -130,6 +149,7 @@ private:
double _levelMultiplier;
RndGen _rndGen;
VisitedSetPool _visitedSetPool;
+ size_t _ops_counter;
double distance(Vector v, uint32_t id) const;
@@ -144,6 +164,7 @@ private:
return (int) r;
}
+ uint32_t count_reachable() const;
void dumpStats() const;
public:
@@ -155,9 +176,9 @@ public:
_M(16),
_efConstruction(200),
_levelMultiplier(1.0 / log(1.0 * _M)),
- _rndGen()
+ _rndGen(),
+ _ops_counter(0)
{
- _nodes.reserve(1234567);
}
~HnswLikeNns() { dumpStats(); }
@@ -217,6 +238,7 @@ public:
if (_entryLevel < 0) {
_entryId = docid;
_entryLevel = level;
+ track_ops();
return;
}
int searchLevel = _entryLevel;
@@ -241,18 +263,23 @@ public:
_entryLevel = level;
_entryId = docid;
}
- if (_nodes.size() % 10000 == 0) {
- double div = _nodes.size();
- fprintf(stderr, "added docs: %d\n", (int)div);
- fprintf(stderr, "distance calls for layer: %" PRIu64 " is %.3f per doc\n", distcalls_search_layer, distcalls_search_layer/ div);
- fprintf(stderr, "distance calls for heuristic: %" PRIu64 " is %.3f per doc\n", distcalls_heuristic, distcalls_heuristic / div);
- fprintf(stderr, "distance calls for simple: %" PRIu64 " is %.3f per doc\n", distcalls_simple, distcalls_simple / div);
- fprintf(stderr, "distance calls for shrink: %" PRIu64 " is %.3f per doc\n", distcalls_shrink, distcalls_shrink / div);
- fprintf(stderr, "distance calls for refill: %" PRIu64 " is %.3f per doc\n", distcalls_refill, distcalls_refill / div);
- fprintf(stderr, "distance calls for other: %" PRIu64 " is %.3f per doc\n", distcalls_other, distcalls_other / div);
- fprintf(stderr, "refill needed calls: %" PRIu64 " is %.3f per doc\n", refill_needed_calls, refill_needed_calls / div);
+ track_ops();
+ }
+
+ void track_ops() {
+ _ops_counter++;
+ if ((_ops_counter % 10000) == 0) {
+ double div = _ops_counter;
+ fprintf(stderr, "add / remove ops: %zu\n", _ops_counter);
+ fprintf(stderr, "distance calls for layer: %zu is %.3f per op\n", distcalls_search_layer, distcalls_search_layer/ div);
+ fprintf(stderr, "distance calls for heuristic: %zu is %.3f per op\n", distcalls_heuristic, distcalls_heuristic / div);
+ fprintf(stderr, "distance calls for simple: %zu is %.3f per op\n", distcalls_simple, distcalls_simple / div);
+ fprintf(stderr, "distance calls for shrink: %zu is %.3f per op\n", distcalls_shrink, distcalls_shrink / div);
+ fprintf(stderr, "distance calls for refill: %zu is %.3f per op\n", distcalls_refill, distcalls_refill / div);
+ fprintf(stderr, "distance calls for other: %zu is %.3f per op\n", distcalls_other, distcalls_other / div);
+ fprintf(stderr, "refill needed calls: %zu is %.3f per op\n", refill_needed_calls, refill_needed_calls / div);
}
- }
+ }
void remove_link_from(uint32_t from_id, uint32_t remove_id, uint32_t level) {
LinkList &links = getLinkList(from_id, level);
@@ -267,9 +294,10 @@ public:
if (repl_id == my_id) continue;
if (my_links.has_link_to(repl_id)) continue;
LinkList &other_links = getLinkList(repl_id, level);
- if (other_links.size() >= _M) continue;
+ if (other_links.size() + 1 >= _M) continue;
other_links.push_back(my_id);
my_links.push_back(repl_id);
+ if (my_links.size() >= _M) return;
}
}
}
@@ -299,14 +327,17 @@ public:
Node &node = _nodes[docid];
bool need_new_entrypoint = (docid == _entryId);
for (int level = node._links.size(); level-- > 0; ) {
- const LinkList &my_links = node._links[level];
+ LinkList my_links;
+ my_links.swap(node._links[level]);
for (uint32_t n_id : my_links) {
if (need_new_entrypoint) {
_entryId = n_id;
_entryLevel = level;
- need_new_entrypoint = false;
+ need_new_entrypoint = false;
}
remove_link_from(n_id, docid, level);
+ }
+ for (uint32_t n_id : my_links) {
refill_ifneeded(n_id, my_links, level);
}
}
@@ -322,6 +353,7 @@ public:
}
}
}
+ track_ops();
}
std::vector<NnsHit> topK(uint32_t k, Vector vector, uint32_t search_k) override {
@@ -331,12 +363,12 @@ public:
++distcalls_other;
HnswHit entryPoint(_entryId, SqDist(entryDist));
int searchLevel = _entryLevel;
+ FurthestPriQ w;
+ w.push(entryPoint);
while (searchLevel > 0) {
- entryPoint = search_layer_simple(vector, entryPoint, searchLevel);
+ search_layer(vector, w, std::min(k, search_k), searchLevel);
--searchLevel;
}
- FurthestPriQ w;
- w.push(entryPoint);
search_layer(vector, w, std::max(k, search_k), 0);
while (w.size() > k) {
w.pop();
@@ -490,66 +522,87 @@ HnswLikeNns::connect_new_node(uint32_t id, const LinkList &neighbors, uint32_t l
}
}
+uint32_t
+HnswLikeNns::count_reachable() const {
+ VisitedSet visited(_nodes.size());
+ int level = _entryLevel;
+ LinkList curList;
+ curList.push_back(_entryId);
+ visited.mark(_entryId);
+ uint32_t idx = 0;
+ while (level >= 0) {
+ while (idx < curList.size()) {
+ uint32_t id = curList[idx++];
+ const LinkList &links = getLinkList(id, level);
+ for (uint32_t n_id : links) {
+ if (visited.isMarked(n_id)) continue;
+ visited.mark(n_id);
+ curList.push_back(n_id);
+ }
+ }
+ --level;
+ idx = 0;
+ }
+ return curList.size();
+}
+
void
HnswLikeNns::dumpStats() const {
- std::vector<uint32_t> inLinkCounters;
- inLinkCounters.resize(_nodes.size());
- std::vector<uint32_t> outLinkCounters;
- outLinkCounters.resize(_nodes.size());
std::vector<uint32_t> levelCounts;
levelCounts.resize(_entryLevel + 2);
std::vector<uint32_t> outLinkHist;
outLinkHist.resize(2 * _M + 2);
+ uint32_t symmetrics = 0;
+ uint32_t level1links = 0;
+ uint32_t both_l_links = 0;
fprintf(stderr, "stats for HnswLikeNns with %zu nodes, entry level = %d, entry id = %u\n",
_nodes.size(), _entryLevel, _entryId);
+
for (uint32_t id = 0; id < _nodes.size(); ++id) {
const auto &node = _nodes[id];
uint32_t levels = node._links.size();
levelCounts[levels]++;
if (levels < 1) {
- outLinkCounters[id] = 0;
outLinkHist[0]++;
continue;
}
const LinkList &link_list = getLinkList(id, 0);
uint32_t numlinks = link_list.size();
- outLinkCounters[id] = numlinks;
outLinkHist[numlinks]++;
- if (numlinks < 2) {
+ if (numlinks < 1) {
fprintf(stderr, "node with %u links: id %u\n", numlinks, id);
- for (uint32_t n_id : link_list) {
- const LinkList &neigh_list = getLinkList(n_id, 0);
- fprintf(stderr, "neighbor id %u has %zu links\n", n_id, neigh_list.size());
- if (! neigh_list.has_link_to(id)) {
- fprintf(stderr, "BAD neighbor %u is missing backlink\n", n_id);
- }
- }
}
+ bool all_sym = true;
for (uint32_t n_id : link_list) {
- inLinkCounters[n_id]++;
+ const LinkList &neigh_list = getLinkList(n_id, 0);
+ if (! neigh_list.has_link_to(id)) {
+ fprintf(stderr, "BAD: %u has link to neighbor %u, but backlink is missing\n", id, n_id);
+ all_sym = false;
+ }
+ }
+ if (all_sym) ++symmetrics;
+ if (levels < 2) continue;
+ const LinkList &link_list_1 = getLinkList(id, 1);
+ for (uint32_t n_id : link_list_1) {
+ ++level1links;
+ if (link_list.has_link_to(n_id)) ++both_l_links;
}
}
for (uint32_t l = 0; l < levelCounts.size(); ++l) {
fprintf(stderr, "Nodes on %u levels: %u\n", l, levelCounts[l]);
}
+ fprintf(stderr, "reachable nodes %u / %zu\n",
+ count_reachable(), _nodes.size() - levelCounts[0]);
+ fprintf(stderr, "level 1 links overlapping on l0: %u / total: %u\n",
+ both_l_links, level1links);
for (uint32_t l = 0; l < outLinkHist.size(); ++l) {
- fprintf(stderr, "Nodes with %u outward links on L0: %u\n", l, outLinkHist[l]);
- }
- uint32_t symmetrics = 0;
- std::vector<uint32_t> inLinkHist;
- for (uint32_t id = 0; id < _nodes.size(); ++id) {
- uint32_t cnt = inLinkCounters[id];
- while (cnt >= inLinkHist.size()) inLinkHist.push_back(0);
- inLinkHist[cnt]++;
- if (cnt == outLinkCounters[id]) ++symmetrics;
- }
- for (uint32_t l = 0; l < inLinkHist.size(); ++l) {
- fprintf(stderr, "Nodes with %u inward links on L0: %u\n", l, inLinkHist[l]);
+ if (outLinkHist[l] != 0) {
+ fprintf(stderr, "Nodes with %u outward links on L0: %u\n", l, outLinkHist[l]);
+ }
}
fprintf(stderr, "Symmetric in-out nodes: %u\n", symmetrics);
}
-
std::unique_ptr<NNS<float>>
make_hnsw_nns(uint32_t numDims, const DocVectorAccess<float> &dva)
{