diff options
author | Tor Egge <Tor.Egge@online.no> | 2021-05-20 13:36:07 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2021-05-20 13:36:07 +0200 |
commit | ae3e9e3e953f6866650a21b2079e0aa0b62408f6 (patch) | |
tree | d084e5d423f780deb071f2c3f32581419fbc0587 /ann_benchmark/src | |
parent | 6c92e912eba3cfaa52cf359c02e1664d071fd18c (diff) |
Updates based on review feedback:
Remove the distance_threshold argument from the find_top_k member function
of the HNSW index fixture class.
Remove duplicate assertions from the unit test.
Adjust comments and descriptions.
Diffstat (limited to 'ann_benchmark/src')
-rw-r--r-- | ann_benchmark/src/tests/ann_benchmark/test_euclidean.py | 4 | ||||
-rw-r--r-- | ann_benchmark/src/vespa/ann_benchmark/setup.py | 4 | ||||
-rw-r--r-- | ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp | 16 |
3 files changed, 13 insertions, 11 deletions
diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py index 1fc883ef003..5bda19dbc0e 100644 --- a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py +++ b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py @@ -21,7 +21,7 @@ class Fixture: return self.index.clear_vector(lid) def find(self, k, value): - return self.index.find_top_k(k, value, k + 200, 1e300) + return self.index.find_top_k(k, value, k + 200) def test_set_value(): f = Fixture() @@ -41,8 +41,6 @@ def test_find(): f = Fixture() f.set(0, [0, 0]) f.set(1, [10, 10]) - assert f.get(0) == [0, 0] - assert f.get(1) == [10, 10] top = f.find(10, [1, 1]) assert [top[0][0], top[1][0]] == [0, 1] # Allow some rounding errors diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py b/ann_benchmark/src/vespa/ann_benchmark/setup.py index e57b1d595dd..e19dada5fff 100644 --- a/ann_benchmark/src/vespa/ann_benchmark/setup.py +++ b/ann_benchmark/src/vespa/ann_benchmark/setup.py @@ -19,8 +19,8 @@ setup( version="0.1.0", author="Tor Egge", author_email="Tor.Egge@verizonmedia.com", - description="Python binding for an HNSW index fixture using tensor attribute", - long_description="Python binding for an HNSW index fixture using tensor attribute -- long version", + description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search", + long_description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search used for low-level benchmarking", ext_modules=[Extension("vespa_ann_benchmark", sources=[])], cmdclass={"build_ext": PreBuiltExt}, zip_safe=False, diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp index f0230f8e3a0..470dd1939f7 100644 --- a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp +++ b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp @@ -49,8 +49,13 @@ 
constexpr uint32_t lid_bias = 1; // lid 0 is reserved } /* - * Wrapper class for a tensor attribute vector containing a nearest - * neighbor index. + * Class exposing the Vespa implementation of an HNSW index for nearest neighbor search over data points in a high dimensional vector space. + * + * A tensor attribute field (https://docs.vespa.ai/en/reference/schema-reference.html#type:tensor) is used to store the vectors in memory. + * This class only supports single-threaded access (both for indexing and searching), + * and should only be used for low-level benchmarking. + * To use nearest neighbor search in a Vespa application, + * see https://docs.vespa.ai/en/approximate-nn-hnsw.html for more details. */ class HnswIndex { @@ -69,7 +74,7 @@ public: void set_vector(uint32_t lid, const std::vector<float>& value); std::vector<float> get_vector(uint32_t lid); void clear_vector(uint32_t lid); - TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold); + TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k); }; HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params) @@ -85,7 +90,6 @@ HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params assert(_tensor_type.is_dense()); assert(_tensor_type.count_indexed_dimensions() == 1u); _dim_size = _tensor_type.dimensions()[0].size; - std::cout << "HnswIndex::HnswIndex Dimension size is " << _dim_size << std::endl; cfg.setTensorType(_tensor_type); cfg.set_distance_metric(hnsw_index_params.distance_metric()); cfg.set_hnsw_index_params(hnsw_index_params); @@ -166,7 +170,7 @@ HnswIndex::clear_vector(uint32_t lid) } TopKResult -HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold) +HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k) { if (!check_value("find_top_k", value)) { return {}; @@ -177,7 +181,7 @@ 
HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t expl */ TopKResult result; TypedCells typed_cells(&value[0], CellType::FLOAT, value.size()); - auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, distance_threshold * distance_threshold); + auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, std::numeric_limits<double>::max()); result.reserve(raw_result.size()); switch (_hnsw_index_params.distance_metric()) { case DistanceMetric::Euclidean: |