summary | refs | log | tree | commit | diff | stats
path: root/eval
diff options
context:
space:
mode:
author: Arne Juul <arnej@verizonmedia.com> 2020-02-25 14:11:00 +0000
committer: Arne Juul <arnej@verizonmedia.com> 2020-02-25 14:12:03 +0000
commit: 44fef3325d3e9bfa673d71b87721f0979f8404c8 (patch)
tree: 7f67a2fe14362cd0b31daf75333cad1e1ea83035 /eval
parent: cc3c709d6278ebd699d4f4c67f8f769c9b6fa177 (diff)
split out common subroutines
Diffstat (limited to 'eval')
-rw-r--r-- eval/src/tests/ann/bruteforce-nns.h 74
-rw-r--r-- eval/src/tests/ann/gist_benchmark.cpp 181
-rw-r--r-- eval/src/tests/ann/point-vector.h 30
-rw-r--r-- eval/src/tests/ann/read-vecs.h 45
-rw-r--r-- eval/src/tests/ann/remove-bm.cpp 182
-rw-r--r-- eval/src/tests/ann/sift_benchmark.cpp 160
-rw-r--r-- eval/src/tests/ann/time-util.h 9
7 files changed, 190 insertions, 491 deletions
diff --git a/eval/src/tests/ann/bruteforce-nns.h b/eval/src/tests/ann/bruteforce-nns.h
new file mode 100644
index 00000000000..0c7c48654f7
--- /dev/null
+++ b/eval/src/tests/ann/bruteforce-nns.h
@@ -0,0 +1,74 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+std::vector<TopK> bruteforceResults; // one TopK per query, looked up by query id in verifyBF; it is populated outside this header
+
+// Distance between `query` and the stored document vector with index `docid`,
+// as reported by l2distCalc.l2sq_dist(). No bounds check is done on `docid`.
+double computeDistance(const PointVector &query, uint32_t docid) {
+    return l2distCalc.l2sq_dist(query, generatedDocs[docid]);
+}
+
+// Ordering predicate for Hit objects, used as the comparator of the priority
+// queue inside BfHitHeap.
+// Returns true when lhs has the larger distance; when neither distance is
+// smaller than the other, returns true when lhs has the larger docid.
+// NOTE(review): presumably this puts the farthest hit at the queue front -
+// confirm against the ordering contract of vespalib::PriorityQueue.
+struct BfHitComparator {
+    bool operator() (const Hit &lhs, const Hit& rhs) const {
+        const bool lhs_nearer = (lhs.distance < rhs.distance);
+        const bool lhs_farther = (lhs.distance > rhs.distance);
+        // Written with two explicit comparisons so unordered values (NaN)
+        // still fall through to the docid tie-break.
+        return !lhs_nearer && (lhs_farther || (lhs.docid > rhs.docid));
+    }
+};
+
+// Bounded collection that retains at most `maxSize` hits, preferring hits
+// with smaller distance. The front of the internal queue is treated as the
+// worst hit currently retained.
+class BfHitHeap {
+private:
+    size_t _size;   // capacity: maximum number of hits kept
+    vespalib::PriorityQueue<Hit, BfHitComparator> _priQ;
+public:
+    // maxSize: number of hits to keep; 0 is accepted and keeps nothing.
+    explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() {
+        _priQ.reserve(maxSize);
+    }
+    // No user-declared destructor: the members clean up after themselves.
+
+    // Offers a hit. While below capacity it is always stored; once full it
+    // replaces the front entry only if its distance is strictly smaller.
+    void maybe_use(const Hit &hit) {
+        if (_priQ.size() < _size) {
+            _priQ.push(hit);
+        } else if (_size > 0 && hit.distance < _priQ.front().distance) {
+            // The _size > 0 guard matters: with zero capacity the queue is
+            // empty and front() must not be touched.
+            _priQ.front() = hit;
+            _priQ.adjust();
+        }
+    }
+    // Drains the queue and returns its former contents. The entry popped
+    // first is written to the last slot, so the returned vector is in the
+    // reverse of pop order. The heap is empty afterwards.
+    std::vector<Hit> bestHits() {
+        std::vector<Hit> result(_priQ.size());
+        for (size_t slot = result.size(); slot-- > 0; ) {
+            result[slot] = _priQ.front();
+            _priQ.pop_front();
+        }
+        return result;
+    }
+};
+
+// Exhaustive scan: computes the distance from `query` to every one of the
+// NUM_DOCS document vectors and returns the best hits collected by a
+// BfHitHeap of capacity TopK::K.
+// If fewer than K hits are available (NUM_DOCS < K) only the available ones
+// are copied; the remaining slots keep whatever TopK's default construction
+// gave them.
+TopK bruteforce_nns(const PointVector &query) {
+    TopK result;
+    BfHitHeap heap(result.K);
+    for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
+        // Same computation as before, via the shared helper.
+        heap.maybe_use(Hit(docid, computeDistance(query, docid)));
+    }
+    const std::vector<Hit> best = heap.bestHits();
+    // Never index past the end of `best`, which can be shorter than K.
+    for (size_t i = 0; i < result.K && i < best.size(); ++i) {
+        result.hits[i] = best[i];
+    }
+    return result;
+}
+
+// Sanity check of the stored brute-force result for query `qid`: recomputes
+// the distance to every document and expects none of them to be smaller than
+// the distance of the first stored hit by more than 0.000001. Any smaller
+// distance is also logged to stderr as a warning.
+void verifyBF(uint32_t qid) {
+    const PointVector &query = generatedQueries[qid];
+    const double min_distance = bruteforceResults[qid].hits[0].distance;
+    for (uint32_t docid = 0; docid != NUM_DOCS; ++docid) {
+        const double dist = computeDistance(query, docid);
+        if (dist < min_distance) {
+            fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance);
+        }
+        EXPECT_FALSE(dist+0.000001 < min_distance);
+    }
+}
diff --git a/eval/src/tests/ann/gist_benchmark.cpp b/eval/src/tests/ann/gist_benchmark.cpp
index 45559fc2557..de8bff877e6 100644
--- a/eval/src/tests/ann/gist_benchmark.cpp
+++ b/eval/src/tests/ann/gist_benchmark.cpp
@@ -18,167 +18,10 @@
#include "nns.h"
#include "for-sift-hit.h"
#include "for-sift-top-k.h"
-
-std::vector<TopK> bruteforceResults;
-
-struct PointVector {
- float v[NUM_DIMS];
- using ConstArr = vespalib::ConstArrayRef<float>;
- operator ConstArr() const { return ConstArr(v, NUM_DIMS); }
-};
-
-static PointVector *aligned_alloc(size_t num) {
- size_t num_bytes = num * sizeof(PointVector);
- double mega_bytes = num_bytes / (1024.0*1024.0);
- fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes);
- char *mem = (char *)malloc(num_bytes + 512);
- mem += 512;
- size_t val = (size_t)mem;
- size_t unalign = val % 512;
- mem -= unalign;
- return reinterpret_cast<PointVector *>(mem);
-}
-
-static PointVector *generatedQueries = aligned_alloc(NUM_Q);
-static PointVector *generatedDocs = aligned_alloc(NUM_DOCS);
-
-struct DocVectorAdapter : public DocVectorAccess<float>
-{
- vespalib::ConstArrayRef<float> get(uint32_t docid) const override {
- ASSERT_TRUE(docid < NUM_DOCS);
- return generatedDocs[docid];
- }
-};
-
-double computeDistance(const PointVector &query, uint32_t docid) {
- const PointVector &docvector = generatedDocs[docid];
- return l2distCalc.l2sq_dist(query, docvector);
-}
-
-void read_queries(std::string fn) {
- int fd = open(fn.c_str(), O_RDONLY);
- ASSERT_TRUE(fd > 0);
- int d;
- size_t rv;
- fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str());
- for (uint32_t qid = 0; qid < NUM_Q; ++qid) {
- rv = read(fd, &d, 4);
- ASSERT_EQUAL(rv, 4u);
- ASSERT_EQUAL(d, NUM_DIMS);
- rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float));
- ASSERT_EQUAL(rv, sizeof(PointVector));
- }
- close(fd);
-}
-
-void read_docs(std::string fn) {
- int fd = open(fn.c_str(), O_RDONLY);
- ASSERT_TRUE(fd > 0);
- int d;
- size_t rv;
- fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str());
- for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
- rv = read(fd, &d, 4);
- ASSERT_EQUAL(rv, 4u);
- ASSERT_EQUAL(d, NUM_DIMS);
- rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float));
- ASSERT_EQUAL(rv, sizeof(PointVector));
- }
- close(fd);
-}
-
-using TimePoint = std::chrono::steady_clock::time_point;
-using Duration = std::chrono::steady_clock::duration;
-
-double to_ms(Duration elapsed) {
- std::chrono::duration<double, std::milli> ms(elapsed);
- return ms.count();
-}
-
-void read_data(std::string dir) {
- TimePoint bef = std::chrono::steady_clock::now();
- read_queries(dir + "/gist_query.fvecs");
- TimePoint aft = std::chrono::steady_clock::now();
- fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef));
- bef = std::chrono::steady_clock::now();
- read_docs(dir + "/gist_base.fvecs");
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef));
-}
-
-
-struct BfHitComparator {
- bool operator() (const Hit &lhs, const Hit& rhs) const {
- if (lhs.distance < rhs.distance) return false;
- if (lhs.distance > rhs.distance) return true;
- return (lhs.docid > rhs.docid);
- }
-};
-
-class BfHitHeap {
-private:
- size_t _size;
- vespalib::PriorityQueue<Hit, BfHitComparator> _priQ;
-public:
- explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() {
- _priQ.reserve(maxSize);
- }
- ~BfHitHeap() {}
- void maybe_use(const Hit &hit) {
- if (_priQ.size() < _size) {
- _priQ.push(hit);
- } else if (hit.distance < _priQ.front().distance) {
- _priQ.front() = hit;
- _priQ.adjust();
- }
- }
- std::vector<Hit> bestHits() {
- std::vector<Hit> result;
- size_t i = _priQ.size();
- result.resize(i);
- while (i-- > 0) {
- result[i] = _priQ.front();
- _priQ.pop_front();
- }
- return result;
- }
-};
-
-TopK bruteforce_nns(const PointVector &query) {
- TopK result;
- BfHitHeap heap(result.K);
- for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
- const PointVector &docvector = generatedDocs[docid];
- double d = l2distCalc.l2sq_dist(query, docvector);
- Hit h(docid, d);
- heap.maybe_use(h);
- }
- std::vector<Hit> best = heap.bestHits();
- for (size_t i = 0; i < result.K; ++i) {
- result.hits[i] = best[i];
- }
- return result;
-}
-
-void verifyBF(uint32_t qid) {
- const PointVector &query = generatedQueries[qid];
- TopK &result = bruteforceResults[qid];
- double min_distance = result.hits[0].distance;
- std::vector<double> all_c2;
- for (uint32_t i = 0; i < NUM_DOCS; ++i) {
- double dist = computeDistance(query, i);
- if (dist < min_distance) {
- fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance);
- }
- EXPECT_FALSE(dist+0.000001 < min_distance);
- if (min_distance > 0.0) all_c2.push_back(dist / min_distance);
- }
- if (all_c2.size() != NUM_DOCS) return;
- std::sort(all_c2.begin(), all_c2.end());
- for (uint32_t idx : { 1, 3, 10, 30, 100, 300, 1000, 3000, NUM_DOCS/2, NUM_DOCS-1}) {
- fprintf(stderr, "c2-factor[%u] = %.3f\n", idx, all_c2[idx]);
- }
-}
+#include "time-util.h"
+#include "point-vector.h"
+#include "read-vecs.h"
+#include "bruteforce-nns.h"
using NNS_API = NNS<float>;
@@ -279,17 +122,21 @@ TEST("require that HNSW wrapped api mostly works") {
*/
int main(int argc, char **argv) {
TEST_MASTER.init(__FILE__);
- std::string gist_dir = ".";
- if (argc > 1) {
- gist_dir = argv[1];
+ std::string data_set = "gist";
+ std::string data_dir = ".";
+ if (argc > 2) {
+ data_set = argv[1];
+ data_dir = argv[2];
+ } else if (argc > 1) {
+ data_dir = argv[1];
} else {
char *home = getenv("HOME");
if (home) {
- gist_dir = home;
- gist_dir += "/gist";
+ data_dir = home;
+ data_dir += "/" + data_set;
}
}
- read_data(gist_dir);
+ read_data(data_dir, data_set);
TEST_RUN_ALL();
return (TEST_MASTER.fini() ? 0 : 1);
}
diff --git a/eval/src/tests/ann/point-vector.h b/eval/src/tests/ann/point-vector.h
new file mode 100644
index 00000000000..eca60e11194
--- /dev/null
+++ b/eval/src/tests/ann/point-vector.h
@@ -0,0 +1,30 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+// One vector of NUM_DIMS float components, stored inline.
+struct PointVector {
+    using ConstArr = vespalib::ConstArrayRef<float>;
+    float v[NUM_DIMS];
+    // Implicit view of the components as a read-only array reference of
+    // length NUM_DIMS; the view refers to this object's own storage.
+    operator ConstArr() const {
+        return ConstArr(v, NUM_DIMS);
+    }
+};
+
+// Allocates uninitialized room for `num` PointVector objects and returns a
+// pointer whose address is a multiple of 512. The request is over-allocated
+// by 512 bytes so the start can be moved forward to a 512-byte boundary.
+// Logs the requested size to stderr. Aborts the process if malloc fails.
+// NOTE: the pointer returned by malloc is not retained, so the returned
+// buffer cannot be handed to free(); in this file it only backs globals.
+static PointVector *aligned_alloc(size_t num) {
+    const size_t num_bytes = num * sizeof(PointVector);
+    const double mega_bytes = num_bytes / (1024.0*1024.0);
+    fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes);
+    char *mem = static_cast<char *>(malloc(num_bytes + 512));
+    if (mem == nullptr) {
+        // Without this check the arithmetic below would run on a null pointer.
+        fprintf(stderr, "failed to allocate %.2f MB of vectors\n", mega_bytes);
+        abort();
+    }
+    mem += 512;
+    // Step back by the remainder so the address lands on a 512-byte boundary
+    // that still lies inside the allocation.
+    const size_t unalign = reinterpret_cast<size_t>(mem) % 512;
+    mem -= unalign;
+    return reinterpret_cast<PointVector *>(mem);
+}
+
+static PointVector *generatedQueries = aligned_alloc(NUM_Q); // storage for NUM_Q query vectors, allocated during static initialization
+static PointVector *generatedDocs = aligned_alloc(NUM_DOCS); // storage for NUM_DOCS document vectors, allocated during static initialization
+
+// DocVectorAccess<float> implementation backed by the generatedDocs array.
+struct DocVectorAdapter : public DocVectorAccess<float>
+{
+    // Returns a read-only view of the vector stored for `docid`.
+    // Asserts that docid is below NUM_DOCS.
+    vespalib::ConstArrayRef<float> get(uint32_t docid) const override {
+        ASSERT_TRUE(docid < NUM_DOCS);
+        const PointVector &stored = generatedDocs[docid];
+        return stored;
+    }
+};
diff --git a/eval/src/tests/ann/read-vecs.h b/eval/src/tests/ann/read-vecs.h
new file mode 100644
index 00000000000..39c2a332710
--- /dev/null
+++ b/eval/src/tests/ann/read-vecs.h
@@ -0,0 +1,45 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+// Shared reader for both vector files: reads `count` records from file `fn`
+// into dest[0..count). Each record is a 4-byte int that must equal NUM_DIMS,
+// followed by NUM_DIMS floats. Every step is checked with an ASSERT_* macro.
+static void read_vectors(const std::string &fn, PointVector *dest, uint32_t count) {
+    int fd = open(fn.c_str(), O_RDONLY);
+    // open() signals failure with a negative value; 0 is a valid descriptor.
+    ASSERT_TRUE(fd >= 0);
+    for (uint32_t idx = 0; idx < count; ++idx) {
+        int d = 0;
+        // A failed read() returns -1, which becomes a huge size_t and
+        // therefore fails the equality checks below.
+        size_t rv = read(fd, &d, 4);
+        ASSERT_EQUAL(rv, 4u);
+        ASSERT_EQUAL(d, NUM_DIMS);
+        rv = read(fd, &dest[idx].v, NUM_DIMS*sizeof(float));
+        ASSERT_EQUAL(rv, sizeof(PointVector));
+    }
+    close(fd);
+}
+
+// Loads NUM_Q query vectors from file `fn` into generatedQueries.
+void read_queries(std::string fn) {
+    fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str());
+    read_vectors(fn, generatedQueries, NUM_Q);
+}
+
+// Loads NUM_DOCS document vectors from file `fn` into generatedDocs.
+void read_docs(std::string fn) {
+    fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str());
+    read_vectors(fn, generatedDocs, NUM_DOCS);
+}
+
+// Loads the query file "<dir>/<data_set>_query.fvecs" and then the document
+// file "<dir>/<data_set>_base.fvecs", reporting to stderr how long each of
+// the two reads took in milliseconds.
+void read_data(const std::string& dir, const std::string& data_set) {
+    fprintf(stderr, "read data set '%s' from directory '%s'\n", data_set.c_str(), dir.c_str());
+    const std::string prefix = dir + "/" + data_set;
+
+    TimePoint started = std::chrono::steady_clock::now();
+    read_queries(prefix + "_query.fvecs");
+    TimePoint finished = std::chrono::steady_clock::now();
+    fprintf(stderr, "read queries: %.3f ms\n", to_ms(finished - started));
+
+    started = std::chrono::steady_clock::now();
+    read_docs(prefix + "_base.fvecs");
+    finished = std::chrono::steady_clock::now();
+    fprintf(stderr, "read docs: %.3f ms\n", to_ms(finished - started));
+}
diff --git a/eval/src/tests/ann/remove-bm.cpp b/eval/src/tests/ann/remove-bm.cpp
index 005f3804af9..546c2cfd75e 100644
--- a/eval/src/tests/ann/remove-bm.cpp
+++ b/eval/src/tests/ann/remove-bm.cpp
@@ -20,168 +20,10 @@
#include "nns.h"
#include "for-sift-hit.h"
#include "for-sift-top-k.h"
-
-std::vector<TopK> bruteforceResults;
-std::vector<float> tmp_v(NUM_DIMS);
-
-struct PointVector {
- float v[NUM_DIMS];
- using ConstArr = vespalib::ConstArrayRef<float>;
- operator ConstArr() const { return ConstArr(v, NUM_DIMS); }
-};
-
-static PointVector *aligned_alloc(size_t num) {
- size_t num_bytes = num * sizeof(PointVector);
- double mega_bytes = num_bytes / (1024.0*1024.0);
- fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes);
- char *mem = (char *)malloc(num_bytes + 512);
- mem += 512;
- size_t val = (size_t)mem;
- size_t unalign = val % 512;
- mem -= unalign;
- return reinterpret_cast<PointVector *>(mem);
-}
-
-static PointVector *generatedQueries = aligned_alloc(NUM_Q);
-static PointVector *generatedDocs = aligned_alloc(NUM_DOCS);
-
-struct DocVectorAdapter : public DocVectorAccess<float>
-{
- vespalib::ConstArrayRef<float> get(uint32_t docid) const override {
- ASSERT_TRUE(docid < NUM_DOCS);
- return generatedDocs[docid];
- }
-};
-
-double computeDistance(const PointVector &query, uint32_t docid) {
- const PointVector &docvector = generatedDocs[docid];
- return l2distCalc.l2sq_dist(query, docvector, tmp_v);
-}
-
-void read_queries(std::string fn) {
- int fd = open(fn.c_str(), O_RDONLY);
- ASSERT_TRUE(fd > 0);
- int d;
- size_t rv;
- fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str());
- for (uint32_t qid = 0; qid < NUM_Q; ++qid) {
- rv = read(fd, &d, 4);
- ASSERT_EQUAL(rv, 4u);
- ASSERT_EQUAL(d, NUM_DIMS);
- rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float));
- ASSERT_EQUAL(rv, sizeof(PointVector));
- }
- close(fd);
-}
-
-void read_docs(std::string fn) {
- int fd = open(fn.c_str(), O_RDONLY);
- ASSERT_TRUE(fd > 0);
- int d;
- size_t rv;
- fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str());
- for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
- rv = read(fd, &d, 4);
- ASSERT_EQUAL(rv, 4u);
- ASSERT_EQUAL(d, NUM_DIMS);
- rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float));
- ASSERT_EQUAL(rv, sizeof(PointVector));
- }
- close(fd);
-}
-
-using TimePoint = std::chrono::steady_clock::time_point;
-using Duration = std::chrono::steady_clock::duration;
-
-double to_ms(Duration elapsed) {
- std::chrono::duration<double, std::milli> ms(elapsed);
- return ms.count();
-}
-
-void read_data(std::string dir) {
- TimePoint bef = std::chrono::steady_clock::now();
- read_queries(dir + "/gist_query.fvecs");
- TimePoint aft = std::chrono::steady_clock::now();
- fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef));
- bef = std::chrono::steady_clock::now();
- read_docs(dir + "/gist_base.fvecs");
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef));
-}
-
-
-struct BfHitComparator {
- bool operator() (const Hit &lhs, const Hit& rhs) const {
- if (lhs.distance < rhs.distance) return false;
- if (lhs.distance > rhs.distance) return true;
- return (lhs.docid > rhs.docid);
- }
-};
-
-class BfHitHeap {
-private:
- size_t _size;
- vespalib::PriorityQueue<Hit, BfHitComparator> _priQ;
-public:
- explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() {
- _priQ.reserve(maxSize);
- }
- ~BfHitHeap() {}
- void maybe_use(const Hit &hit) {
- if (_priQ.size() < _size) {
- _priQ.push(hit);
- } else if (hit.distance < _priQ.front().distance) {
- _priQ.front() = hit;
- _priQ.adjust();
- }
- }
- std::vector<Hit> bestHits() {
- std::vector<Hit> result;
- size_t i = _priQ.size();
- result.resize(i);
- while (i-- > 0) {
- result[i] = _priQ.front();
- _priQ.pop_front();
- }
- return result;
- }
-};
-
-TopK bruteforce_nns(const PointVector &query) {
- TopK result;
- BfHitHeap heap(result.K);
- for (uint32_t docid = 0; docid < EFFECTIVE_DOCS; ++docid) {
- const PointVector &docvector = generatedDocs[docid];
- double d = l2distCalc.l2sq_dist(query, docvector, tmp_v);
- Hit h(docid, d);
- heap.maybe_use(h);
- }
- std::vector<Hit> best = heap.bestHits();
- for (size_t i = 0; i < result.K; ++i) {
- result.hits[i] = best[i];
- }
- return result;
-}
-
-void verifyBF(uint32_t qid) {
- const PointVector &query = generatedQueries[qid];
- TopK &result = bruteforceResults[qid];
- double min_distance = result.hits[0].distance;
- std::vector<double> all_c2;
- for (uint32_t i = 0; i < EFFECTIVE_DOCS; ++i) {
- double dist = computeDistance(query, i);
- if (dist < min_distance) {
- fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance);
- }
- EXPECT_FALSE(dist+0.000001 < min_distance);
- if (min_distance > 0.0) all_c2.push_back(dist / min_distance);
- }
- if (all_c2.size() != EFFECTIVE_DOCS) return;
- std::sort(all_c2.begin(), all_c2.end());
- for (uint32_t idx : { 1, 3, 10, 30, 100, 300, 1000, 3000, EFFECTIVE_DOCS/2, EFFECTIVE_DOCS-1}) {
- fprintf(stderr, "c2-factor[%u] = %.3f\n", idx, all_c2[idx]);
- }
-}
+#include "time-util.h"
+#include "point-vector.h"
+#include "read-vecs.h"
+#include "bruteforce-nns.h"
using NNS_API = NNS<float>;
@@ -386,17 +228,21 @@ TEST("require that HNSW wrapped api mostly works") {
*/
int main(int argc, char **argv) {
TEST_MASTER.init(__FILE__);
- std::string gist_dir = ".";
- if (argc > 1) {
- gist_dir = argv[1];
+ std::string data_set = "gist";
+ std::string data_dir = ".";
+ if (argc > 2) {
+ data_set = argv[1];
+ data_dir = argv[2];
+ } else if (argc > 1) {
+ data_dir = argv[1];
} else {
char *home = getenv("HOME");
if (home) {
- gist_dir = home;
- gist_dir += "/gist";
+ data_dir = home;
+ data_dir += "/" + data_set;
}
}
- read_data(gist_dir);
+ read_data(data_dir, data_set);
TEST_RUN_ALL();
return (TEST_MASTER.fini() ? 0 : 1);
}
diff --git a/eval/src/tests/ann/sift_benchmark.cpp b/eval/src/tests/ann/sift_benchmark.cpp
index 5f3c16e127d..b2fa66cd0f1 100644
--- a/eval/src/tests/ann/sift_benchmark.cpp
+++ b/eval/src/tests/ann/sift_benchmark.cpp
@@ -20,148 +20,10 @@
#include "for-sift-hit.h"
#include "for-sift-top-k.h"
#include "std-random.h"
-
-std::vector<TopK> bruteforceResults;
-
-struct PointVector {
- float v[NUM_DIMS];
- using ConstArr = vespalib::ConstArrayRef<float>;
- operator ConstArr() const { return ConstArr(v, NUM_DIMS); }
-};
-
-static PointVector *aligned_alloc(size_t num) {
- size_t num_bytes = num * sizeof(PointVector);
- double mega_bytes = num_bytes / (1024.0*1024.0);
- fprintf(stderr, "allocate %.2f MB of vectors\n", mega_bytes);
- char *mem = (char *)malloc(num_bytes + 512);
- mem += 512;
- size_t val = (size_t)mem;
- size_t unalign = val % 512;
- mem -= unalign;
- return reinterpret_cast<PointVector *>(mem);
-}
-
-static PointVector *generatedQueries = aligned_alloc(NUM_Q);
-static PointVector *generatedDocs = aligned_alloc(NUM_DOCS);
-
-struct DocVectorAdapter : public DocVectorAccess<float>
-{
- vespalib::ConstArrayRef<float> get(uint32_t docid) const override {
- ASSERT_TRUE(docid < NUM_DOCS);
- return generatedDocs[docid];
- }
-};
-
-double computeDistance(const PointVector &query, uint32_t docid) {
- const PointVector &docvector = generatedDocs[docid];
- return l2distCalc.l2sq_dist(query, docvector);
-}
-
-void read_queries(std::string fn) {
- int fd = open(fn.c_str(), O_RDONLY);
- ASSERT_TRUE(fd > 0);
- int d;
- size_t rv;
- fprintf(stderr, "reading %u queries from %s\n", NUM_Q, fn.c_str());
- for (uint32_t qid = 0; qid < NUM_Q; ++qid) {
- rv = read(fd, &d, 4);
- ASSERT_EQUAL(rv, 4u);
- ASSERT_EQUAL(d, NUM_DIMS);
- rv = read(fd, &generatedQueries[qid].v, NUM_DIMS*sizeof(float));
- ASSERT_EQUAL(rv, sizeof(PointVector));
- }
- close(fd);
-}
-
-void read_docs(std::string fn) {
- int fd = open(fn.c_str(), O_RDONLY);
- ASSERT_TRUE(fd > 0);
- int d;
- size_t rv;
- fprintf(stderr, "reading %u doc vectors from %s\n", NUM_DOCS, fn.c_str());
- for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
- rv = read(fd, &d, 4);
- ASSERT_EQUAL(rv, 4u);
- ASSERT_EQUAL(d, NUM_DIMS);
- rv = read(fd, &generatedDocs[docid].v, NUM_DIMS*sizeof(float));
- ASSERT_EQUAL(rv, sizeof(PointVector));
- }
- close(fd);
-}
-
-using TimePoint = std::chrono::steady_clock::time_point;
-using Duration = std::chrono::steady_clock::duration;
-
-double to_ms(Duration elapsed) {
- std::chrono::duration<double, std::milli> ms(elapsed);
- return ms.count();
-}
-
-void read_data(const std::string& dir, const std::string& data_set) {
- fprintf(stderr, "read data set '%s' from directory '%s'\n", data_set.c_str(), dir.c_str());
- TimePoint bef = std::chrono::steady_clock::now();
- read_queries(dir + "/" + data_set + "_query.fvecs");
- TimePoint aft = std::chrono::steady_clock::now();
- fprintf(stderr, "read queries: %.3f ms\n", to_ms(aft - bef));
- bef = std::chrono::steady_clock::now();
- read_docs(dir + "/" + data_set + "_base.fvecs");
- aft = std::chrono::steady_clock::now();
- fprintf(stderr, "read docs: %.3f ms\n", to_ms(aft - bef));
-}
-
-
-struct BfHitComparator {
- bool operator() (const Hit &lhs, const Hit& rhs) const {
- if (lhs.distance < rhs.distance) return false;
- if (lhs.distance > rhs.distance) return true;
- return (lhs.docid > rhs.docid);
- }
-};
-
-class BfHitHeap {
-private:
- size_t _size;
- vespalib::PriorityQueue<Hit, BfHitComparator> _priQ;
-public:
- explicit BfHitHeap(size_t maxSize) : _size(maxSize), _priQ() {
- _priQ.reserve(maxSize);
- }
- ~BfHitHeap() {}
- void maybe_use(const Hit &hit) {
- if (_priQ.size() < _size) {
- _priQ.push(hit);
- } else if (hit.distance < _priQ.front().distance) {
- _priQ.front() = hit;
- _priQ.adjust();
- }
- }
- std::vector<Hit> bestHits() {
- std::vector<Hit> result;
- size_t i = _priQ.size();
- result.resize(i);
- while (i-- > 0) {
- result[i] = _priQ.front();
- _priQ.pop_front();
- }
- return result;
- }
-};
-
-TopK bruteforce_nns(const PointVector &query) {
- TopK result;
- BfHitHeap heap(result.K);
- for (uint32_t docid = 0; docid < NUM_DOCS; ++docid) {
- const PointVector &docvector = generatedDocs[docid];
- double d = l2distCalc.l2sq_dist(query, docvector);
- Hit h(docid, d);
- heap.maybe_use(h);
- }
- std::vector<Hit> best = heap.bestHits();
- for (size_t i = 0; i < result.K; ++i) {
- result.hits[i] = best[i];
- }
- return result;
-}
+#include "time-util.h"
+#include "point-vector.h"
+#include "read-vecs.h"
+#include "bruteforce-nns.h"
TopK bruteforce_nns_filter(const PointVector &query, const BitVector &blacklist) {
TopK result;
@@ -181,20 +43,6 @@ TopK bruteforce_nns_filter(const PointVector &query, const BitVector &blacklist)
return result;
}
-
-void verifyBF(uint32_t qid) {
- const PointVector &query = generatedQueries[qid];
- TopK &result = bruteforceResults[qid];
- double min_distance = result.hits[0].distance;
- for (uint32_t i = 0; i < NUM_DOCS; ++i) {
- double dist = computeDistance(query, i);
- if (dist < min_distance) {
- fprintf(stderr, "WARN dist %.9g < mindist %.9g\n", dist, min_distance);
- }
- EXPECT_FALSE(dist+0.000001 < min_distance);
- }
-}
-
void timing_bf_filter(int percent)
{
BitVector blacklist(NUM_DOCS);
diff --git a/eval/src/tests/ann/time-util.h b/eval/src/tests/ann/time-util.h
new file mode 100644
index 00000000000..2f5c2bdd583
--- /dev/null
+++ b/eval/src/tests/ann/time-util.h
@@ -0,0 +1,9 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+using TimePoint = std::chrono::steady_clock::time_point; // a reading of the monotonic steady_clock
+using Duration = std::chrono::steady_clock::duration; // difference between two TimePoint values
+
+// Converts a steady_clock duration to milliseconds as a double, keeping the
+// fractional part.
+double to_ms(Duration elapsed) {
+    using FloatMillis = std::chrono::duration<double, std::milli>;
+    return FloatMillis(elapsed).count();
+}