diff options
Diffstat (limited to 'eval/src/tests/ann/remove-bm.cpp')
-rw-r--r-- | eval/src/tests/ann/remove-bm.cpp | 434 |
1 files changed, 84 insertions, 350 deletions
diff --git a/eval/src/tests/ann/remove-bm.cpp b/eval/src/tests/ann/remove-bm.cpp index be010552ab8..546c2cfd75e 100644 --- a/eval/src/tests/ann/remove-bm.cpp +++ b/eval/src/tests/ann/remove-bm.cpp @@ -13,174 +13,17 @@ #define NUM_DOCS 250000 #define NUM_DOCS_REMOVE 50000 #define EFFECTIVE_DOCS (NUM_DOCS - NUM_DOCS_REMOVE) +#define NUM_REACH 10000 #define NUM_Q 1000 #include "doc_vector_access.h" #include "nns.h" #include "for-sift-hit.h" #include "for-sift-top-k.h" - -std::vector<TopK> bruteforceResults; -std::vector<float> tmp_v(NUM_DIMS); - -struct PointVector { - float v[NUM_DIMS]; - using ConstArr = vespalib::ConstArrayRef<float>; - operator ConstArr() const { return ConstArr(v, NUM_DIMS); } -}; - -static PointVector *aligned_alloc(size_t num) { - size_t sz = num * sizeof(PointVector); - double mega_bytes = sz / (1024.0*1024.0); - fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes); - char *mem = (char *)malloc(sz + 512); - mem += 512; - size_t val = (size_t)mem; - size_t unalign = val % 512; - mem -= unalign; - return reinterpret_cast<PointVector *>(mem); -} - -static PointVector *generatedQueries = aligned_alloc(NUM_Q); -static PointVector *generatedDocs = aligned_alloc(NUM_DOCS); - -struct DocVectorAdapter : public DocVectorAccess<float> -{ - vespalib::ConstArrayRef<float> get(uint32_t docid) const override { - ASSERT_TRUE(docid < NUM_DOCS); - return generatedDocs[docid]; - } -}; - -double computeDistance(const PointVector &query, uint32_t docid) { - const PointVector &docvector = generatedDocs[docid]; - return l2distCalc.l2sq_dist(query, docvector, tmp_v); -} - -void read_queries(std::string fn) { - int fd = open(fn.c_str(), O_RDONLY); - ASSERT_TRUE(fd > 0); - int d; - size_t rv; - fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str()); - for (uint32_t qid = 0; qid < NUM_Q; ++qid) { - rv = read(fd, &d, 4); - ASSERT_EQUAL(rv, 4u); - ASSERT_EQUAL(d, NUM_DIMS); - rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float)); - ASSERT_EQUAL(rv, sizeof(PointVector)); - } - close(fd); -} - -void read_docs(std::string fn) { - int fd = open(fn.c_str(), O_RDONLY); - ASSERT_TRUE(fd > 0); - int d; - size_t rv; - fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str()); - for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) { - rv = read(fd, &d, 4); - ASSERT_EQUAL(rv, 4u); - ASSERT_EQUAL(d, NUM_DIMS); - rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float)); - ASSERT_EQUAL(rv, sizeof(PointVector)); - } - close(fd); -} - -using TimePoint = std::chrono::steady_clock::time_point; -using Duration = std::chrono::steady_clock::duration; - -double to_ms(Duration elapsed) { - std::chrono::duration<double, std::milli> ms(elapsed); - return ms.count(); -} - -void read_data(std::string dir) { - TimePoint bef = std::chrono::steady_clock::now(); - read_queries(dir + "/gist_query.fvecs"); - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef)); - bef = std::chrono::steady_clock::now(); - read_docs(dir + "/gist_base.fvecs"); - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef)); -} - - -struct BfHitComparator { - bool operator() (const Hit &lhs, const Hit& rhs) const { - if (lhs.distance < rhs.distance) return false; - if (lhs.distance > rhs.distance) return true; - return (lhs.docid > rhs.docid); - } -}; - -class BfHitHeap { -private: - size_t _size; - vespalib::PriorityQueue<Hit, BfHitComparator> _priQ; -public: - explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() { - _priQ.reserve(maxSize); - } - ~BfHitHeap() {} - void maybe_use(const Hit &hit) { - if (_priQ.size() < _size) { - _priQ.push(hit); - } else if (hit.distance < _priQ.front().distance) { - _priQ.front() = hit; - _priQ.adjust(); - } - } - std::vector<Hit> bestHits() { - std::vector<Hit> result; - size_t i = _priQ.size(); - result.resize(i); - while (i-- > 0) { - result[i] = _priQ.front(); - _priQ.pop_front(); - } - return result; - } -}; - -TopK bruteforce_nns(const PointVector &query) { - TopK result; - BfHitHeap heap(result.K); - for (uint32_t docid = 0; docid < EFFECTIVE_DOCS; ++docid) { - const PointVector &docvector = generatedDocs[docid]; - double d = l2distCalc.l2sq_dist(query, docvector, tmp_v); - Hit h(docid, d); - heap.maybe_use(h); - } - std::vector<Hit> best = heap.bestHits(); - for (size_t i = 0; i < result.K; ++i) { - result.hits[i] = best[i]; - } - return result; -} - -void verifyBF(uint32_t qid) { - const PointVector &query = generatedQueries[qid]; - TopK &result = bruteforceResults[qid]; - double min_distance = result.hits[0].distance; - std::vector<double> all_c2; - for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { - double dist = computeDistance(query, i); - if (dist < min_distance) { - fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance); - } - EXPECT_FALSE(dist+0.000001 < min_distance); - if (min_distance > 0.0) all_c2.push_back(dist / min_distance); - } - if (all_c2.size() != EFFECTIVE_DOCS) return; - std::sort(all_c2.begin(), all_c2.end()); - for (uint32_t idx : { 1, 3, 10, 30, 100, 300, 1000, 3000, EFFECTIVE_DOCS/2, EFFECTIVE_DOCS-1}) { - fprintf(stderr, "c2-factor[%u] = %.3f\n", idx, all_c2[idx]); - } -} +#include "time-util.h" +#include "point-vector.h" +#include "read-vecs.h" +#include "bruteforce-nns.h" using NNS_API = NNS<float>; @@ -221,83 +64,8 @@ TEST("require that brute force works") { } } -bool reach_with_nns_1(NNS_API &nns, uint32_t docid) { - const PointVector &qv = generatedDocs[docid]; - vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS); - auto rv = nns.topK(1, query, 1); - if (rv.size() != 1) { - fprintf(stderr, "Result/A from query for %u is %zu hits\n", docid, rv.size()); - return false; - } - if (rv[0].docid != docid) { - if (rv[0].sq.distance != 0.0) - fprintf(stderr, "Expected/A to find %u but got %u with sq distance %.3f\n", - docid, rv[0].docid, rv[0].sq.distance); - } - return (rv[0].docid == docid || rv[0].sq.distance == 0.0); -} - -bool reach_with_nns_100(NNS_API &nns, uint32_t docid) { - const PointVector &qv = generatedDocs[docid]; - vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS); - auto rv = nns.topK(10, query, 100); - if (rv.size() != 10) { - fprintf(stderr, "Result/B from query for %u is %zu hits\n", docid, rv.size()); - } - if (rv[0].docid != docid) { - if (rv[0].sq.distance != 0.0) - fprintf(stderr, "Expected/B to find %u but got %u with sq distance %.3f\n", - docid, rv[0].docid, rv[0].sq.distance); - } - return (rv[0].docid == docid || rv[0].sq.distance == 0.0); -} - -bool reach_with_nns_1k(NNS_API &nns, uint32_t docid) { - const PointVector &qv = generatedDocs[docid]; - vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS); - auto rv = nns.topK(10, query, 1000); - if (rv.size() != 10) { - fprintf(stderr, "Result/C from query for %u is %zu hits\n", docid, rv.size()); - } - if (rv[0].docid != docid) { - if (rv[0].sq.distance != 0.0) - fprintf(stderr, "Expected/C to find %u but got %u with sq distance %.3f\n", - docid, rv[0].docid, rv[0].sq.distance); - } - return (rv[0].docid == docid || rv[0].sq.distance == 0.0); -} - -TopK find_with_nns(uint32_t sk, NNS_API &nns, uint32_t qid) { - TopK result; - const PointVector &qv = generatedQueries[qid]; - vespalib::ConstArrayRef<float> query(qv.v, NUM_DIMS); - auto rv = nns.topK(result.K, query, sk); - for (size_t i = 0; i < result.K; ++i) { - result.hits[i] = Hit(rv[i].docid, rv[i].sq.distance); - } - return result; -} - -void verify_nns_quality(uint32_t sk, NNS_API &nns, uint32_t qid) { - TopK perfect = bruteforceResults[qid]; - TopK result = find_with_nns(sk, nns, qid); - int recall = perfect.recall(result); - EXPECT_TRUE(recall > 40); - double sum_error = 0.0; - double c_factor = 1.0; - for (size_t i = 0; i < result.K; ++i) { - double factor = (result.hits[i].distance / perfect.hits[i].distance); - if (factor < 0.99 || factor > 25) { - fprintf(stderr, "hit[%zu] got distance %.3f, expected %.3f\n", - i, result.hits[i].distance, perfect.hits[i].distance); - } - sum_error += factor; - c_factor = std::max(c_factor, factor); - } - EXPECT_TRUE(c_factor < 1.5); - fprintf(stderr, "quality sk=%u: query %u: recall %d c2-factor %.3f avg c2: %.3f\n", - sk, qid, recall, c_factor, sum_error / result.K); -} +#include "find-with-nns.h" +#include "verify-top-k.h" void timing_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) { for (uint32_t search_k : sk_list) { @@ -311,64 +79,22 @@ void timing_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) { } } -void quality_nns(NNS_API &nns, std::vector<uint32_t> sk_list) { - for (uint32_t search_k : sk_list) { - for (int cnt = 0; cnt < NUM_Q; ++cnt) { - verify_nns_quality(search_k, nns, cnt); - } - } - uint32_t reached = 0; - for (uint32_t i = 0; i < 20000; ++i) { - if (reach_with_nns_1(nns, i)) ++reached; - } - fprintf(stderr, "Could reach %u of 20000 first documents with k=1\n", reached); - reached = 0; - for (uint32_t i = 0; i < 20000; ++i) { - if (reach_with_nns_100(nns, i)) ++reached; - } - fprintf(stderr, "Could reach %u of 20000 first documents with k=100\n", reached); - reached = 0; - for (uint32_t i = 0; i < 20000; ++i) { - if (reach_with_nns_1k(nns, i)) ++reached; - } - fprintf(stderr, "Could reach %u of 20000 first documents with k=1000\n", reached); -} +#include "quality-nns.h" -void benchmark_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list) { +template <typename FUNC> +void bm_nns_simple(const char *name, FUNC creator, std::vector<uint32_t> sk_list) { + std::unique_ptr<NNS_API> nnsp = creator(); + NNS_API &nns = *nnsp; fprintf(stderr, "trying %s indexing...\n", name); - -#if 0 - TimePoint bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.addDoc(EFFECTIVE_DOCS + i); - } - for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) { - nns.addDoc(i); - } - for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { - nns.removeDoc(EFFECTIVE_DOCS + i); - nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i); - } - TimePoint aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s realistic build with %u documents:\n", name, EFFECTIVE_DOCS); - quality_nns(nns, sk_list); -#endif - -#if 1 TimePoint bef = std::chrono::steady_clock::now(); for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { nns.addDoc(i); } TimePoint aft = std::chrono::steady_clock::now(); fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s clean build with %u documents:\n", name, EFFECTIVE_DOCS); + fprintf(stderr, "Quality for %s [A] clean build with %u documents:\n", name, EFFECTIVE_DOCS); quality_nns(nns, sk_list); - bef = std::chrono::steady_clock::now(); for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { nns.addDoc(EFFECTIVE_DOCS + i); @@ -379,111 +105,115 @@ void benchmark_nns(const char *name, NNS_API &nns, std::vector<uint32_t> sk_list aft = std::chrono::steady_clock::now(); fprintf(stderr, "build %s index add then remove %u docs: %.3f ms\n", name, NUM_DOCS_REMOVE, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s remove-damaged build with %u documents:\n", name, EFFECTIVE_DOCS); + fprintf(stderr, "Quality for %s [B] remove-damaged build with %u documents:\n", name, EFFECTIVE_DOCS); quality_nns(nns, sk_list); -#endif +} -#if 0 +template <typename FUNC> +void bm_nns_remove_old(const char *name, FUNC creator, std::vector<uint32_t> sk_list) { + std::unique_ptr<NNS_API> nnsp = creator(); + NNS_API &nns = *nnsp; TimePoint bef = std::chrono::steady_clock::now(); + for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { + nns.addDoc(EFFECTIVE_DOCS + i); + } for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { nns.addDoc(i); } + for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { + nns.removeDoc(EFFECTIVE_DOCS + i); + } TimePoint aft = std::chrono::steady_clock::now(); fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s clean build with %u documents:\n", name, EFFECTIVE_DOCS); + fprintf(stderr, "Quality for %s [C] remove-oldest build with %u documents:\n", name, EFFECTIVE_DOCS); quality_nns(nns, sk_list); +} - bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { - nns.removeDoc(i); - } - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index removed %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); - - const uint32_t addFirst = NUM_DOCS - (NUM_DOCS_REMOVE * 3); - const uint32_t addSecond = NUM_DOCS - (NUM_DOCS_REMOVE * 2); - - bef = std::chrono::steady_clock::now(); - for (uint32_t i = 0; i < addFirst; ++i) { - nns.addDoc(i); - } - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, addFirst, to_ms(aft - bef)); - - bef = std::chrono::steady_clock::now(); +template <typename FUNC> +void bm_nns_interleave(const char *name, FUNC creator, std::vector<uint32_t> sk_list) { + std::unique_ptr<NNS_API> nnsp = creator(); + NNS_API &nns = *nnsp; + TimePoint bef = std::chrono::steady_clock::now(); for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { nns.addDoc(EFFECTIVE_DOCS + i); - nns.addDoc(addFirst + i); } - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index added %u docs: %.3f ms\n", - name, 2 * NUM_DOCS_REMOVE, to_ms(aft - bef)); - - bef = std::chrono::steady_clock::now(); + for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) { + nns.addDoc(i); + } for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { nns.removeDoc(EFFECTIVE_DOCS + i); - nns.addDoc(addSecond + i); + nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i); } - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index added %u and removed %u docs: %.3f ms\n", - name, NUM_DOCS_REMOVE, NUM_DOCS_REMOVE, to_ms(aft - bef)); - + TimePoint aft = std::chrono::steady_clock::now(); + fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s with %u documents some churn:\n", name, EFFECTIVE_DOCS); + fprintf(stderr, "Quality for %s [D] realistic build with %u documents:\n", name, EFFECTIVE_DOCS); quality_nns(nns, sk_list); +} -#endif - -#if 0 - bef = std::chrono::steady_clock::now(); - fprintf(stderr, "removing and adding %u documents...\n", EFFECTIVE_DOCS); - for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) { - nns.removeDoc(i); +template <typename FUNC> +void bm_nns_remove_old_add_new(const char *name, FUNC creator, std::vector<uint32_t> sk_list) { + std::unique_ptr<NNS_API> nnsp = creator(); + NNS_API &nns = *nnsp; + TimePoint bef = std::chrono::steady_clock::now(); + for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { + nns.addDoc(EFFECTIVE_DOCS + i); + } + for (uint32_t i = 0; i < EFFECTIVE_DOCS - NUM_DOCS_REMOVE; ++i) { nns.addDoc(i); } - aft = std::chrono::steady_clock::now(); - fprintf(stderr, "build %s index rem/add %u docs: %.3f ms\n", - name, EFFECTIVE_DOCS, to_ms(aft - bef)); - + for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { + nns.removeDoc(EFFECTIVE_DOCS + i); + } + for (uint32_t i = 0; i < NUM_DOCS_REMOVE; ++i) { + nns.addDoc(EFFECTIVE_DOCS - NUM_DOCS_REMOVE + i); + } + TimePoint aft = std::chrono::steady_clock::now(); + fprintf(stderr, "build %s index with %u docs: %.3f ms\n", name, EFFECTIVE_DOCS, to_ms(aft - bef)); timing_nns(name, nns, sk_list); - fprintf(stderr, "Quality for %s with %u documents full churn:\n", name, EFFECTIVE_DOCS); + fprintf(stderr, "Quality for %s [E] remove old, add new build with %u documents:\n", name, EFFECTIVE_DOCS); quality_nns(nns, sk_list); -#endif +} + +template <typename FUNC> +void benchmark_nns(const char *name, FUNC creator, std::vector<uint32_t> sk_list) { + bm_nns_simple(name, creator, sk_list); + bm_nns_remove_old(name, creator, sk_list); + bm_nns_interleave(name, creator, sk_list); + bm_nns_remove_old_add_new(name, creator, sk_list); } #if 0 TEST("require that Locality Sensitive Hashing mostly works") { DocVectorAdapter adapter; - std::unique_ptr<NNS_API> nns = make_rplsh_nns(NUM_DIMS, adapter); - benchmark_nns("RPLSH", *nns, { 200, 1000 }); + auto creator = [&adapter]() { return make_rplsh_nns(NUM_DIMS, adapter); }; + benchmark_nns("RPLSH", creator, { 200, 1000 }); } #endif #if 0 TEST("require that Annoy via NNS api mostly works") { DocVectorAdapter adapter; - std::unique_ptr<NNS_API> nns = make_annoy_nns(NUM_DIMS, adapter); - benchmark_nns("Annoy", *nns, { 8000, 10000 }); + auto creator = [&adapter]() { return make_annoy_nns(NUM_DIMS, adapter); }; + benchmark_nns("Annoy", creator, { 8000, 10000 }); } #endif #if 1 TEST("require that HNSW via NNS api mostly works") { DocVectorAdapter adapter; - std::unique_ptr<NNS_API> nns = make_hnsw_nns(NUM_DIMS, adapter); - benchmark_nns("HNSW-like", *nns, { 100, 150, 200 }); + auto creator = [&adapter]() { return make_hnsw_nns(NUM_DIMS, adapter); }; + benchmark_nns("HNSW-like", creator, { 100, 150, 200 }); } #endif #if 0 TEST("require that HNSW wrapped api mostly works") { DocVectorAdapter adapter; - std::unique_ptr<NNS_API> nns = make_hnsw_wrap(NUM_DIMS, adapter); - benchmark_nns("HNSW-wrap", *nns, { 100, 150, 200 }); + auto creator = [&adapter]() { return make_hnsw_wrap(NUM_DIMS, adapter); }; + benchmark_nns("HNSW-wrap", creator, { 100, 150, 200 }); } #endif @@ -498,17 +228,21 @@ TEST("require that HNSW wrapped api mostly works") { */ int main(int argc, char **argv) { TEST_MASTER.init(__FILE__); - std::string gist_dir = "."; - if (argc > 1) { - gist_dir = argv[1]; + std::string data_set = "gist"; + std::string data_dir = "."; + if (argc > 2) { + data_set = argv[1]; + data_dir = argv[2]; + } else if (argc > 1) { + data_dir = argv[1]; } else { char *home = getenv("HOME"); if (home) { - gist_dir = home; - gist_dir += "/gist"; + data_dir = home; + data_dir += "/" + data_set; } } - read_data(gist_dir); + read_data(data_dir, data_set); TEST_RUN_ALL(); return (TEST_MASTER.fini() ? 0 : 1); } |