Updates based on review feedback:

Remove the distance_threshold argument from the find_top_k member function of the HNSW index fixture class. Remove duplicate assertions from the unit test. Adjust comments and descriptions.
author: Tor Egge <Tor.Egge@online.no> 2021-05-20 13:36:07 +0200
committer: Tor Egge <Tor.Egge@online.no> 2021-05-20 13:36:07 +0200
commit: ae3e9e3e953f6866650a21b2079e0aa0b62408f6 (patch)
tree: d084e5d423f780deb071f2c3f32581419fbc0587 /ann_benchmark
parent: 6c92e912eba3cfaa52cf359c02e1664d071fd18c (diff)
3 files changed, 13 insertions, 11 deletions
diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
index 1fc883ef003..5bda19dbc0e 100644
--- a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
+++ b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
@@ -21,7 +21,7 @@ class Fixture:
         return self.index.clear_vector(lid)
 
     def find(self, k, value):
-        return self.index.find_top_k(k, value, k + 200, 1e300)
+        return self.index.find_top_k(k, value, k + 200)
 
 def test_set_value():
     f = Fixture()
@@ -41,8 +41,6 @@ def test_find():
     f = Fixture()
     f.set(0, [0, 0])
     f.set(1, [10, 10])
-    assert f.get(0) == [0, 0]
-    assert f.get(1) == [10, 10]
     top = f.find(10, [1, 1])
     assert [top[0][0], top[1][0]] == [0, 1]
     # Allow some rounding errors
diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py b/ann_benchmark/src/vespa/ann_benchmark/setup.py
index e57b1d595dd..e19dada5fff 100644
--- a/ann_benchmark/src/vespa/ann_benchmark/setup.py
+++ b/ann_benchmark/src/vespa/ann_benchmark/setup.py
@@ -19,8 +19,8 @@ setup(
   version="0.1.0",
   author="Tor Egge",
   author_email="Tor.Egge@verizonmedia.com",
-  description="Python binding for an HNSW index fixture using tensor attribute",
-  long_description="Python binding for an HNSW index fixture using tensor attribute -- long version",
+  description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search",
+  long_description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search used for low-level benchmarking",
   ext_modules=[Extension("vespa_ann_benchmark", sources=[])],
   cmdclass={"build_ext": PreBuiltExt},
   zip_safe=False,
diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
index f0230f8e3a0..470dd1939f7 100644
--- a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
+++ b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
@@ -49,8 +49,13 @@ constexpr uint32_t lid_bias = 1; // lid 0 is reserved
 }
 
 /*
- * Wrapper class for a tensor attribute vector containing a nearest
- * neighbor index.
+ * Class exposing the Vespa implementation of an HNSW index for nearest neighbor search over data points in a high dimensional vector space.
+ *
+ * A tensor attribute field (https://docs.vespa.ai/en/reference/schema-reference.html#type:tensor) is used to store the vectors in memory.
+ * This class only supports single-threaded access (both for indexing and searching),
+ * and should only be used for low-level benchmarking.
+ * To use nearest neighbor search in a Vespa application,
+ * see https://docs.vespa.ai/en/approximate-nn-hnsw.html for more details.
  */
 class HnswIndex
 {
@@ -69,7 +74,7 @@ public:
     void set_vector(uint32_t lid, const std::vector<float>& value);
     std::vector<float> get_vector(uint32_t lid);
     void clear_vector(uint32_t lid);
-    TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold);
+    TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k);
 };
 
 HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params)
@@ -85,7 +90,6 @@ HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params
     assert(_tensor_type.is_dense());
     assert(_tensor_type.count_indexed_dimensions() == 1u);
     _dim_size = _tensor_type.dimensions()[0].size;
-    std::cout << "HnswIndex::HnswIndex Dimension size is " << _dim_size << std::endl;
     cfg.setTensorType(_tensor_type);
     cfg.set_distance_metric(hnsw_index_params.distance_metric());
     cfg.set_hnsw_index_params(hnsw_index_params);
@@ -166,7 +170,7 @@ HnswIndex::clear_vector(uint32_t lid)
 }
 
 TopKResult
-HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold)
+HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k)
 {
     if (!check_value("find_top_k", value)) {
         return {};
@@ -177,7 +181,7 @@ HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t expl
      */
     TopKResult result;
     TypedCells typed_cells(&value[0], CellType::FLOAT, value.size());
-    auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, distance_threshold * distance_threshold);
+    auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, std::numeric_limits<double>::max());
     result.reserve(raw_result.size());
     switch (_hnsw_index_params.distance_metric()) {
     case DistanceMetric::Euclidean:
author	Tor Egge <Tor.Egge@online.no>	2021-05-20 13:36:07 +0200
committer	Tor Egge <Tor.Egge@online.no>	2021-05-20 13:36:07 +0200
commit	ae3e9e3e953f6866650a21b2079e0aa0b62408f6 (patch)
tree	d084e5d423f780deb071f2c3f32581419fbc0587 /ann_benchmark
parent	6c92e912eba3cfaa52cf359c02e1664d071fd18c (diff)