about | summary | refs | log | tree | commit | diff | stats
path: root/ann_benchmark
diff options
context:
space:
mode:
author: Tor Egge <Tor.Egge@online.no> 2021-05-20 13:36:07 +0200
committer: Tor Egge <Tor.Egge@online.no> 2021-05-20 13:36:07 +0200
commit: ae3e9e3e953f6866650a21b2079e0aa0b62408f6 (patch)
tree: d084e5d423f780deb071f2c3f32581419fbc0587 /ann_benchmark
parent: 6c92e912eba3cfaa52cf359c02e1664d071fd18c (diff)
Updates based on review feedback:
Remove distance_threshold argument to find_top_k member function in HNSW index fixture class. Remove duplicate tests in unit test. Adjust comments and descriptions.
Diffstat (limited to 'ann_benchmark')
-rw-r--r-- ann_benchmark/src/tests/ann_benchmark/test_euclidean.py | 4
-rw-r--r-- ann_benchmark/src/vespa/ann_benchmark/setup.py | 4
-rw-r--r-- ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp | 16
3 files changed, 13 insertions, 11 deletions
diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
index 1fc883ef003..5bda19dbc0e 100644
--- a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
+++ b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
@@ -21,7 +21,7 @@ class Fixture:
return self.index.clear_vector(lid)
def find(self, k, value):
- return self.index.find_top_k(k, value, k + 200, 1e300)
+ return self.index.find_top_k(k, value, k + 200)
def test_set_value():
f = Fixture()
@@ -41,8 +41,6 @@ def test_find():
f = Fixture()
f.set(0, [0, 0])
f.set(1, [10, 10])
- assert f.get(0) == [0, 0]
- assert f.get(1) == [10, 10]
top = f.find(10, [1, 1])
assert [top[0][0], top[1][0]] == [0, 1]
# Allow some rounding errors
diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py b/ann_benchmark/src/vespa/ann_benchmark/setup.py
index e57b1d595dd..e19dada5fff 100644
--- a/ann_benchmark/src/vespa/ann_benchmark/setup.py
+++ b/ann_benchmark/src/vespa/ann_benchmark/setup.py
@@ -19,8 +19,8 @@ setup(
version="0.1.0",
author="Tor Egge",
author_email="Tor.Egge@verizonmedia.com",
- description="Python binding for an HNSW index fixture using tensor attribute",
- long_description="Python binding for an HNSW index fixture using tensor attribute -- long version",
+ description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search",
+ long_description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search used for low-level benchmarking",
ext_modules=[Extension("vespa_ann_benchmark", sources=[])],
cmdclass={"build_ext": PreBuiltExt},
zip_safe=False,
diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
index f0230f8e3a0..470dd1939f7 100644
--- a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
+++ b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
@@ -49,8 +49,13 @@ constexpr uint32_t lid_bias = 1; // lid 0 is reserved
}
/*
- * Wrapper class for a tensor attribute vector containing a nearest
- * neighbor index.
+ * Class exposing the Vespa implementation of an HNSW index for nearest neighbor search over data points in a high dimensional vector space.
+ *
+ * A tensor attribute field (https://docs.vespa.ai/en/reference/schema-reference.html#type:tensor) is used to store the vectors in memory.
+ * This class only supports single-threaded access (both for indexing and searching),
+ * and should only be used for low-level benchmarking.
+ * To use nearest neighbor search in a Vespa application,
+ * see https://docs.vespa.ai/en/approximate-nn-hnsw.html for more details.
*/
class HnswIndex
{
@@ -69,7 +74,7 @@ public:
void set_vector(uint32_t lid, const std::vector<float>& value);
std::vector<float> get_vector(uint32_t lid);
void clear_vector(uint32_t lid);
- TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold);
+ TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k);
};
HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params)
@@ -85,7 +90,6 @@ HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params
assert(_tensor_type.is_dense());
assert(_tensor_type.count_indexed_dimensions() == 1u);
_dim_size = _tensor_type.dimensions()[0].size;
- std::cout << "HnswIndex::HnswIndex Dimension size is " << _dim_size << std::endl;
cfg.setTensorType(_tensor_type);
cfg.set_distance_metric(hnsw_index_params.distance_metric());
cfg.set_hnsw_index_params(hnsw_index_params);
@@ -166,7 +170,7 @@ HnswIndex::clear_vector(uint32_t lid)
}
TopKResult
-HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold)
+HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k)
{
if (!check_value("find_top_k", value)) {
return {};
@@ -177,7 +181,7 @@ HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t expl
*/
TopKResult result;
TypedCells typed_cells(&value[0], CellType::FLOAT, value.size());
- auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, distance_threshold * distance_threshold);
+ auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, std::numeric_limits<double>::max());
result.reserve(raw_result.size());
switch (_hnsw_index_params.distance_metric()) {
case DistanceMetric::Euclidean: