summary refs log tree commit diff stats
path: root/eval/src/tests/ann/remove-bm.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'eval/src/tests/ann/remove-bm.cpp')
-rw-r--r-- eval/src/tests/ann/remove-bm.cpp 258
1 files changed, 73 insertions, 185 deletions
diff --git a/eval/src/tests/ann/remove-bm.cpp b/eval/src/tests/ann/remove-bm.cpp
index be010552ab8..005f3804af9 100644
--- a/eval/src/tests/ann/remove-bm.cpp
+++ b/eval/src/tests/ann/remove-bm.cpp
@@ -13,6 +13,7 @@
#define NUM_DOCS 250000
#define NUM_DOCS_REMOVE 50000
#define EFFECTIVE_DOCS (NUM_DOCS - NUM_DOCS_REMOVE)
+#define NUM_REACH 10000
#define NUM_Q 1000
#include "doc_vector_access.h"
@@ -30,10 +31,10 @@ struct PointVector {
};
static PointVector *aligned_alloc(size_t num) {
- size_t sz = num * sizeof(PointVector);
- double mega_bytes = sz / (1024.0*1024.0);
+ size_t num_bytes = num * sizeof(PointVector);
+ double mega_bytes = num_bytes / (1024.0*1024.0);
fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes);
- char *mem = (char *)malloc(sz + 512);
+ char *mem = (char *)malloc(num_bytes + 512);
mem += 512;
size_t val = (size_t)mem;
size_t unalign = val % 512;
@@ -221,83 +222,8 @@ TEST("require that brute force works") {
}
}
-bool reach_with_nns_1(NNS_API &nns, uint32_t docid) {
- const PointVector &qv = generatedDocs[docid];
- vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
- auto rv = nns.topK(1, query, 1);
- if (rv.size() != 1) {
- fprintf(stderr, "Result/A from query for %u is %zu hits\n", docid, rv.size());
- return false;
- }
- if (rv[0].docid != docid) {
- if (rv[0].sq.distance != 0.0)
- fprintf(stderr, "Expected/A to find %u but got %u with sq distance %.3f\n",
- docid, rv[0].docid, rv[0].sq.distance);
- }
- return (rv[0].docid == docid || rv[0].sq.distance == 0.0);
-}
-
-bool reach_with_nns_100(NNS_API &nns, uint32_t docid) {
- const PointVector &qv = generatedDocs[docid];
- vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
- auto rv = nns.topK(10, query, 100);
- if (rv.size() != 10) {
- fprintf(stderr, "Result/B from query for %u is %zu hits\n", docid, rv.size());
- }
- if (rv[0].docid != docid) {
- if (rv[0].sq.distance != 0.0)
- fprintf(stderr, "Expected/B to find %u but got %u with sq distance %.3f\n",
- docid, rv[0].docid, rv[0].sq.distance);
- }
- return (rv[0].docid == docid || rv[0].sq.distance == 0.0);
-}
-
-bool reach_with_nns_1k(NNS_API &nns, uint32_t docid) {
- const PointVector &qv = generatedDocs[docid];
- vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
- auto rv = nns.topK(10, query, 1000);
- if (rv.size() != 10) {
- fprintf(stderr, "Result/C from query for %u is %zu hits\n", docid, rv.size());
- }
- if (rv[0].docid != docid) {
- if (rv[0].sq.distance != 0.0)
- fprintf(stderr, "Expected/C to find %u but got %u with sq distance %.3f\n",
- docid, rv[0].docid, rv[0].sq.distance);
- }
- return (rv[0].docid == docid || rv[0].sq.distance == 0.0);
-}
-
-TopK find_with_nns(uint32_t sk, NNS_API &nns, uint32_t qid) {
- TopK result;
- const PointVector &qv = generatedQueries[qid];
- vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS);
- auto rv = nns.topK(result.K, query, sk);
- for (size_t i = 0; i < result.K; ++i) {
- result.hits[i] = Hit(rv[i].docid, rv[i].sq.distance);
- }
- return result;
-}
-
-void verify_nns_quality(uint32_t sk, NNS_API &nns, uint32_t qid) {
- TopK perfect = bruteforceResults[qid];
- TopK result = find_with_nns(sk, nns, qid);
- int recall = perfect.recall(result);
- EXPECT_TRUE(recall > 40);
- double sum_error = 0.0;
- double c_factor = 1.0;
- for (size_t i = 0; i < result.K; ++i) {
- double factor = (result.hits[i].distance / perfect.hits[i].distance);
- if (factor < 0.99 || factor > 25) {
- fprintf(stderr, "hit[%zu] got distance %.3f, expected %.3f\n",
- i, result.hits[i].distance, perfect.hits[i].distance);
- }
- sum_error += factor;
- c_factor = std::max(c_factor, factor);
- }
- EXPECT_TRUE(c_factor < 1.5);
- fprintf(stderr, "quality sk=%u: query %u: recall %d c2-factor %.3f avg c2: %.3f\n",
- sk, qid, recall, c_factor, sum_error / result.K);
-}
+#include "find-with-nns.h"
+#include "verify-top-k.h"
void timing_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) {
for (uint32_t search_k : sk_list) {
@@ -311,64 +237,22 @@ void timing_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) {
}
}
-void quality_nns(NNS_API &nns, std::vector<uint32_t> sk_list) {
- for (uint32_t search_k : sk_list) {
- for (int cnt = 0; cnt < NUM_Q; ++cnt) {
- verify_nns_quality(search_k, nns, cnt);
- }
- }
- uint32_t reached = 0;
- for (uint32_t i = 0; i < 20000; ++i) {
- if (reach_with_nns_1(nns, i)) ++reached;
- }
- fprintf(stderr, "Could reach %u of 20000 first documents with k=1\n", reached);
- reached = 0;
- for (uint32_t i = 0; i < 20000; ++i) {
- if (reach_with_nns_100(nns, i)) ++reached;
- }
- fprintf(stderr, "Could reach %u of 20000 first documents with k=100\n", reached);
- reached = 0;
- for (uint32_t i = 0; i < 20000; ++i) {
- if (reach_with_nns_1k(nns, i)) ++reached;
- }
- fprintf(stderr, "Could reach %u of 20000 first documents with k=1000\n", reached);
-}
+#include "quality-nns.h"
-void benchmark_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) {
+template <typename FUNC>
+void bm_nns_simple(const char *name, FUNC creator, std::vector<uint32_t> sk_list) {
+ std::unique_ptr<NNS_API> nnsp = creator();
+ NNS_API &nns = *nnsp;
fprintf(stderr, "trying %s indexing...\n", name);
-
-#if 0
- TimePoint bef = std::chrono::steady_clock::now();
- for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
- nns.addDoc(EFFECTIVE_DOCS + i);
- }
- for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) {
- nns.addDoc(i);
- }
- for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
- nns.removeDoc(EFFECTIVE_DOCS + i);
- nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i);
- }
- TimePoint aft = std::chrono::steady_clock::now();
- fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
-
- timing_nns(name, nns, sk_list);
- fprintf(stderr, "Quality for %s realistic build with %u documents:\n", name, EFFECTIVE_DOCS);
- quality_nns(nns, sk_list);
-#endif
-
-#if 1
TimePoint bef = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
nns.addDoc(i);
}
TimePoint aft = std::chrono::steady_clock::now();
fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
-
timing_nns(name, nns, sk_list);
- fprintf(stderr, "Quality for %s clean build with %u documents:\n", name, EFFECTIVE_DOCS);
+ fprintf(stderr, "Quality for %s [A] clean build with %u documents:\n", name, EFFECTIVE_DOCS);
quality_nns(nns, sk_list);
-
bef = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
nns.addDoc(EFFECTIVE_DOCS + i);
@@ -379,111 +263,115 @@ void benchmark_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list
aft = std::chrono::steady_clock::now();
fprintf(stderr, "build %s index add then remove %u docs: %.3f ms\n",
name, NUM_DOCS_REMOVE, to_ms(aft - bef));
-
timing_nns(name, nns, sk_list);
- fprintf(stderr, "Quality for %s remove-damaged build with %u documents:\n", name, EFFECTIVE_DOCS);
+ fprintf(stderr, "Quality for %s [B] remove-damaged build with %u documents:\n", name, EFFECTIVE_DOCS);
quality_nns(nns, sk_list);
-#endif
+}
-#if 0
+template <typename FUNC>
+void bm_nns_remove_old(const char *name, FUNC creator, std::vector<uint32_t> sk_list) {
+ std::unique_ptr<NNS_API> nnsp = creator();
+ NNS_API &nns = *nnsp;
TimePoint bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(EFFECTIVE_DOCS + i);
+ }
for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
nns.addDoc(i);
}
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.removeDoc(EFFECTIVE_DOCS + i);
+ }
TimePoint aft = std::chrono::steady_clock::now();
fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
-
timing_nns(name, nns, sk_list);
- fprintf(stderr, "Quality for %s clean build with %u documents:\n", name, EFFECTIVE_DOCS);
+ fprintf(stderr, "Quality for %s [C] remove-oldest build with %u documents:\n", name, EFFECTIVE_DOCS);
quality_nns(nns, sk_list);
+}
- bef = std::chrono::steady_clock::now();
- for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
- nns.removeDoc(i);
- }
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "build %s index removed %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
-
- const uint32_t addFirst = NUM_DOCS - (NUM_DOCS_REMOVE * 3);
- const uint32_t addSecond = NUM_DOCS - (NUM_DOCS_REMOVE * 2);
-
- bef = std::chrono::steady_clock::now();
- for (uint32_t i = 0; i < addFirst; ++i) {
- nns.addDoc(i);
- }
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, addFirst, to_ms(aft - bef));
-
- bef = std::chrono::steady_clock::now();
+template <typename FUNC>
+void bm_nns_interleave(const char *name, FUNC creator, std::vector<uint32_t> sk_list) {
+ std::unique_ptr<NNS_API> nnsp = creator();
+ NNS_API &nns = *nnsp;
+ TimePoint bef = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
nns.addDoc(EFFECTIVE_DOCS + i);
- nns.addDoc(addFirst + i);
}
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "build %s index added %u docs: %.3f ms\n",
- name, 2 * NUM_DOCS_REMOVE, to_ms(aft - bef));
-
- bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(i);
+ }
for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
nns.removeDoc(EFFECTIVE_DOCS + i);
- nns.addDoc(addSecond + i);
+ nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i);
}
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "build %s index added %u and removed %u docs: %.3f ms\n",
- name, NUM_DOCS_REMOVE, NUM_DOCS_REMOVE, to_ms(aft - bef));
-
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
timing_nns(name, nns, sk_list);
- fprintf(stderr, "Quality for %s with %u documents some churn:\n", name, EFFECTIVE_DOCS);
+ fprintf(stderr, "Quality for %s [D] realistic build with %u documents:\n", name, EFFECTIVE_DOCS);
quality_nns(nns, sk_list);
+}
-#endif
-
-#if 0
- bef = std::chrono::steady_clock::now();
- fprintf(stderr, "removing and adding %u documents...\n", EFFECTIVE_DOCS);
- for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
- nns.removeDoc(i);
+template <typename FUNC>
+void bm_nns_remove_old_add_new(const char *name, FUNC creator, std::vector<uint32_t> sk_list) {
+ std::unique_ptr<NNS_API> nnsp = creator();
+ NNS_API &nns = *nnsp;
+ TimePoint bef = std::chrono::steady_clock::now();
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(EFFECTIVE_DOCS + i);
+ }
+ for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) {
nns.addDoc(i);
}
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "build %s index rem/add %u docs: %.3f ms\n",
- name, EFFECTIVE_DOCS, to_ms(aft - bef));
-
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.removeDoc(EFFECTIVE_DOCS + i);
+ }
+ for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) {
+ nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i);
+ }
+ TimePoint aft = std::chrono::steady_clock::now();
+ fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef));
timing_nns(name, nns, sk_list);
- fprintf(stderr, "Quality for %s with %u documents full churn:\n", name, EFFECTIVE_DOCS);
+ fprintf(stderr, "Quality for %s [E] remove old, add new build with %u documents:\n", name, EFFECTIVE_DOCS);
quality_nns(nns, sk_list);
-#endif
+}
+
+template <typename FUNC>
+void benchmark_nns(const char *name, FUNC creator, std::vector<uint32_t> sk_list) {
+ bm_nns_simple(name, creator, sk_list);
+ bm_nns_remove_old(name, creator, sk_list);
+ bm_nns_interleave(name, creator, sk_list);
+ bm_nns_remove_old_add_new(name, creator, sk_list);
}
#if 0
TEST("require that Locality Sensitive Hashing mostly works") {
DocVectorAdapter adapter;
- std::unique_ptr<NNS_API> nns = make_rplsh_nns(NUM_DIMS, adapter);
- benchmark_nns("RPLSH", *nns, { 200, 1000 });
+ auto creator = [&adapter]() { return make_rplsh_nns(NUM_DIMS, adapter); };
+ benchmark_nns("RPLSH", creator, { 200, 1000 });
}
#endif
#if 0
TEST("require that Annoy via NNS api mostly works") {
DocVectorAdapter adapter;
- std::unique_ptr<NNS_API> nns = make_annoy_nns(NUM_DIMS, adapter);
- benchmark_nns("Annoy", *nns, { 8000, 10000 });
+ auto creator = [&adapter]() { return make_annoy_nns(NUM_DIMS, adapter); };
+ benchmark_nns("Annoy", creator, { 8000, 10000 });
}
#endif
#if 1
TEST("require that HNSW via NNS api mostly works") {
DocVectorAdapter adapter;
- std::unique_ptr<NNS_API> nns = make_hnsw_nns(NUM_DIMS, adapter);
- benchmark_nns("HNSW-like", *nns, { 100, 150, 200 });
+ auto creator = [&adapter]() { return make_hnsw_nns(NUM_DIMS, adapter); };
+ benchmark_nns("HNSW-like", creator, { 100, 150, 200 });
}
#endif
#if 0
TEST("require that HNSW wrapped api mostly works") {
DocVectorAdapter adapter;
- std::unique_ptr<NNS_API> nns = make_hnsw_wrap(NUM_DIMS, adapter);
- benchmark_nns("HNSW-wrap", *nns, { 100, 150, 200 });
+ auto creator = [&adapter]() { return make_hnsw_wrap(NUM_DIMS, adapter); };
+ benchmark_nns("HNSW-wrap", creator, { 100, 150, 200 });
}
#endif