aboutsummaryrefslogtreecommitdiffstats
path: root/ann_benchmark
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2021-05-20 11:31:17 +0200
committerTor Egge <Tor.Egge@online.no>2021-05-20 11:31:17 +0200
commit6c92e912eba3cfaa52cf359c02e1664d071fd18c (patch)
treec64beebd8bf976c182aeab311a1adc925b183af1 /ann_benchmark
parentc8746c75e6ad4dad17ecd3daf2916fe45fd594f9 (diff)
Add python bindings for a HNSW index fixture using a tensor attribute vector containing a nearest neighbor index.
Diffstat (limited to 'ann_benchmark')
-rw-r--r--ann_benchmark/CMakeLists.txt13
-rw-r--r--ann_benchmark/src/tests/ann_benchmark/.gitignore1
-rw-r--r--ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt3
-rw-r--r--ann_benchmark/src/tests/ann_benchmark/test_euclidean.py63
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt28
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/setup.py27
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp217
7 files changed, 352 insertions, 0 deletions
diff --git a/ann_benchmark/CMakeLists.txt b/ann_benchmark/CMakeLists.txt
new file mode 100644
index 00000000000..d34329667e3
--- /dev/null
+++ b/ann_benchmark/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_define_module(
+ DEPENDS
+ searchlib
+
+ LIBS
+ src/vespa/ann_benchmark
+
+ APPS
+
+ TESTS
+ src/tests/ann_benchmark
+)
diff --git a/ann_benchmark/src/tests/ann_benchmark/.gitignore b/ann_benchmark/src/tests/ann_benchmark/.gitignore
new file mode 100644
index 00000000000..225fc6f6650
--- /dev/null
+++ b/ann_benchmark/src/tests/ann_benchmark/.gitignore
@@ -0,0 +1 @@
+/__pycache__
diff --git a/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt
new file mode 100644
index 00000000000..455ed24bcce
--- /dev/null
+++ b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+vespa_add_test(NAME ann_benchmark_test COMMAND ${PYTHON_EXECUTABLE} -m pytest WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS vespa_ann_benchmark)
diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
new file mode 100644
index 00000000000..1fc883ef003
--- /dev/null
+++ b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
@@ -0,0 +1,63 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+import pytest
+import sys
+import os
+import math
+sys.path.insert(0, os.path.abspath("../../vespa/ann_benchmark"))
+from vespa_ann_benchmark import DistanceMetric, HnswIndexParams, HnswIndex
+
+class Fixture:
+ def __init__(self):
+ self.index = HnswIndex(2, HnswIndexParams(16, 200, DistanceMetric.Euclidean, False))
+
+ def set(self, lid, value):
+ self.index.set_vector(lid, value)
+
+ def get(self, lid):
+ return self.index.get_vector(lid)
+
+ def clear(self, lid):
+ return self.index.clear_vector(lid)
+
+ def find(self, k, value):
+ return self.index.find_top_k(k, value, k + 200, 1e300)
+
+def test_set_value():
+ f = Fixture()
+ f.set(0, [1, 2])
+ f.set(1, [3, 4])
+ assert [1, 2] == f.get(0)
+ assert [3, 4] == f.get(1)
+
+def test_clear_value():
+ f = Fixture()
+ f.set(0, [1, 2])
+ assert [1, 2] == f.get(0)
+ f.clear(0)
+ assert [0, 0] == f.get(0)
+
+def test_find():
+ f = Fixture()
+ f.set(0, [0, 0])
+ f.set(1, [10, 10])
+ assert f.get(0) == [0, 0]
+ assert f.get(1) == [10, 10]
+ top = f.find(10, [1, 1])
+ assert [top[0][0], top[1][0]] == [0, 1]
+ # Allow some rounding errors
+ epsilon = 1e-20
+ assert abs(top[0][1] - math.sqrt(2)) < epsilon
+ assert abs(top[1][1] - math.sqrt(162)) < epsilon
+ top2 = f.find(10, [9, 9])
+ # Result is not sorted by distance
+ assert [top2[0][0], top2[1][0]] == [0, 1]
+ assert abs(top2[0][1] - math.sqrt(162)) < epsilon
+ assert abs(top2[1][1] - math.sqrt(2)) < epsilon
+ assert 0 == f.find(1, [1, 1])[0][0]
+ assert 1 == f.find(1, [9, 9])[0][0]
+ f.clear(1)
+ assert 0 == f.find(1, [9, 9])[0][0]
+ assert 0 == f.find(1, [0, 0])[0][0]
+ f.clear(0)
+ assert 0 == len(f.find(1, [9, 9]))
diff --git a/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt
new file mode 100644
index 00000000000..a9a8ceb3660
--- /dev/null
+++ b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt
@@ -0,0 +1,28 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+install(DIRECTORY DESTINATION libexec/vespa_ann_benchmark)
+
+vespa_add_library(vespa_ann_benchmark
+ ALLOW_UNRESOLVED_SYMBOLS
+ SOURCES
+ vespa_ann_benchmark.cpp
+
+ INSTALL libexec/vespa_ann_benchmark
+ DEPENDS
+ pybind11::pybind11
+)
+
+if (TARGET pybind11::lto)
+ target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module pybind11::lto)
+else()
+ target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module)
+endif()
+
+if (COMMAND pybind11_extension)
+ pybind11_extension(vespa_ann_benchmark)
+else()
+ set_target_properties(vespa_ann_benchmark PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}")
+ set_target_properties(vespa_ann_benchmark PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}")
+endif()
+
+set_target_properties(vespa_ann_benchmark PROPERTIES CXX_VISIBILITY_PRESET "hidden")
+vespa_install_script(setup.py libexec/vespa_ann_benchmark)
diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py b/ann_benchmark/src/vespa/ann_benchmark/setup.py
new file mode 100644
index 00000000000..e57b1d595dd
--- /dev/null
+++ b/ann_benchmark/src/vespa/ann_benchmark/setup.py
@@ -0,0 +1,27 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+import os
+import sys
+import platform
+import distutils.sysconfig
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext
+
+class PreBuiltExt(build_ext):
+ def build_extension(self, ext):
+ print("Using prebuilt extension library")
+ libdir="lib.%s-%s-%s" % (sys.platform, platform.machine(), distutils.sysconfig.get_python_version())
+ os.system("mkdir -p build/%s" % libdir)
+ os.system("cp -p vespa_ann_benchmark.*.so build/%s" % libdir)
+
+setup(
+ name="vespa_ann_benchmark",
+ version="0.1.0",
+ author="Tor Egge",
+ author_email="Tor.Egge@verizonmedia.com",
+ description="Python binding for an HNSW index fixture using tensor attribute",
+ long_description="Python binding for an HNSW index fixture using tensor attribute -- long version",
+ ext_modules=[Extension("vespa_ann_benchmark", sources=[])],
+ cmdclass={"build_ext": PreBuiltExt},
+ zip_safe=False,
+)
diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
new file mode 100644
index 00000000000..f0230f8e3a0
--- /dev/null
+++ b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
@@ -0,0 +1,217 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vespa/searchcommon/attribute/hnsw_index_params.h>
+#include <vespa/searchlib/attribute/attributevector.h>
+#include <vespa/searchlib/attribute/attributefactory.h>
+#include <vespa/searchlib/tensor/dense_tensor_attribute.h>
+#include <vespa/searchlib/tensor/nearest_neighbor_index.h>
+#include <vespa/eval/eval/value.h>
+#include <vespa/vespalib/test/insertion_operators.h>
+#include <ostream>
+#include <sstream>
+#include <limits>
+
+namespace py = pybind11;
+
+using search::AttributeFactory;
+using search::AttributeVector;
+using search::attribute::BasicType;
+using search::attribute::Config;
+using search::attribute::CollectionType;
+using search::attribute::DistanceMetric;
+using search::attribute::HnswIndexParams;
+using search::tensor::NearestNeighborIndex;
+using search::tensor::TensorAttribute;
+using vespalib::eval::CellType;
+using vespalib::eval::DenseValueView;
+using vespalib::eval::TypedCells;
+using vespalib::eval::ValueType;
+using vespalib::eval::Value;
+
+namespace vespa_ann_benchmark {
+
+using TopKResult = std::vector<std::pair<uint32_t, double>>;
+
+namespace {
+
+std::string
+make_tensor_spec(uint32_t dim_size)
+{
+ std::ostringstream os;
+ os << "tensor<float>(x[" << dim_size << "])";
+ return os.str();
+}
+
+constexpr uint32_t lid_bias = 1; // lid 0 is reserved
+
+}
+
+/*
+ * Wrapper class for a tensor attribute vector containing a nearest
+ * neighbor index.
+ */
+class HnswIndex
+{
+ ValueType _tensor_type;
+ HnswIndexParams _hnsw_index_params;
+ std::shared_ptr<AttributeVector> _attribute;
+ TensorAttribute* _tensor_attribute;
+ const NearestNeighborIndex* _nearest_neighbor_index;
+ size_t _dim_size;
+
+ bool check_lid(uint32_t lid);
+ bool check_value(const char *op, const std::vector<float>& value);
+public:
+ HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params);
+ virtual ~HnswIndex();
+ void set_vector(uint32_t lid, const std::vector<float>& value);
+ std::vector<float> get_vector(uint32_t lid);
+ void clear_vector(uint32_t lid);
+ TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold);
+};
+
+HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params)
+ : _tensor_type(ValueType::error_type()),
+ _hnsw_index_params(hnsw_index_params),
+ _attribute(),
+ _tensor_attribute(nullptr),
+ _nearest_neighbor_index(nullptr),
+ _dim_size(0u)
+{
+ Config cfg(BasicType::TENSOR, CollectionType::SINGLE);
+ _tensor_type = ValueType::from_spec(make_tensor_spec(dim_size));
+ assert(_tensor_type.is_dense());
+ assert(_tensor_type.count_indexed_dimensions() == 1u);
+ _dim_size = _tensor_type.dimensions()[0].size;
+ std::cout << "HnswIndex::HnswIndex Dimension size is " << _dim_size << std::endl;
+ cfg.setTensorType(_tensor_type);
+ cfg.set_distance_metric(hnsw_index_params.distance_metric());
+ cfg.set_hnsw_index_params(hnsw_index_params);
+ _attribute = AttributeFactory::createAttribute("tensor", cfg);
+ _tensor_attribute = dynamic_cast<TensorAttribute *>(_attribute.get());
+ assert(_tensor_attribute != nullptr);
+ _nearest_neighbor_index = _tensor_attribute->nearest_neighbor_index();
+ assert(_nearest_neighbor_index != nullptr);
+}
+
+HnswIndex::~HnswIndex() = default;
+
+bool
+HnswIndex::check_lid(uint32_t lid)
+{
+ if (lid >= std::numeric_limits<uint32_t>::max() - lid_bias) {
+ std::cerr << "lid is too high" << std::endl;
+ return false;
+ }
+ return true;
+}
+
+bool
+HnswIndex::check_value(const char *op, const std::vector<float>& value)
+{
+ if (value.size() != _dim_size) {
+ std::cerr << op << " failed, expected vector with size " << _dim_size << ", got vector with size " << value.size() << std::endl;
+ return false;
+ }
+ return true;
+}
+
+void
+HnswIndex::set_vector(uint32_t lid, const std::vector<float>& value)
+{
+ if (!check_lid(lid)) {
+ return;
+ }
+ if (!check_value("set_vector", value)) {
+ return;
+ }
+ /*
+ * Not thread safe against concurrent set_vector().
+ */
+ TypedCells typed_cells(&value[0], CellType::FLOAT, value.size());
+ DenseValueView tensor_view(_tensor_type, typed_cells);
+ while (size_t(lid + lid_bias) >= _attribute->getNumDocs()) {
+ uint32_t new_lid = 0;
+ _attribute->addDoc(new_lid);
+ }
+ _tensor_attribute->setTensor(lid + lid_bias, tensor_view); // lid 0 is special in vespa
+ _attribute->commit();
+}
+
+std::vector<float>
+HnswIndex::get_vector(uint32_t lid)
+{
+ if (!check_lid(lid)) {
+ return {};
+ }
+ TypedCells typed_cells = _tensor_attribute->extract_cells_ref(lid + lid_bias);
+ assert(typed_cells.size == _dim_size);
+ const float* data = static_cast<const float* >(typed_cells.data);
+ return {data, data + _dim_size};
+ return {};
+}
+
+void
+HnswIndex::clear_vector(uint32_t lid)
+{
+ if (!check_lid(lid)) {
+ return;
+ }
+ if (size_t(lid + lid_bias) < _attribute->getNumDocs()) {
+ _attribute->clearDoc(lid + lid_bias);
+ _attribute->commit();
+ }
+}
+
+TopKResult
+HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold)
+{
+ if (!check_value("find_top_k", value)) {
+ return {};
+ }
+ /*
+ * Not thread safe against concurrent set_vector() since attribute
+ * read guard is not taken here.
+ */
+ TopKResult result;
+ TypedCells typed_cells(&value[0], CellType::FLOAT, value.size());
+ auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, distance_threshold * distance_threshold);
+ result.reserve(raw_result.size());
+ switch (_hnsw_index_params.distance_metric()) {
+ case DistanceMetric::Euclidean:
+ for (auto &raw : raw_result) {
+ result.emplace_back(raw.docid - lid_bias, sqrt(raw.distance));
+ }
+ break;
+ default:
+ for (auto &raw : raw_result) {
+ result.emplace_back(raw.docid - lid_bias, raw.distance);
+ }
+ }
+ // Results are sorted by lid, not by distance
+ return result;
+}
+
+}
+
+using vespa_ann_benchmark::HnswIndex;
+
+PYBIND11_MODULE(vespa_ann_benchmark, m) {
+ m.doc() = "vespa_ann_benchmark plugin";
+
+ py::enum_<DistanceMetric>(m, "DistanceMetric")
+ .value("Euclidean", DistanceMetric::Euclidean)
+ .value("Angular", DistanceMetric::Angular);
+
+ py::class_<HnswIndexParams>(m, "HnswIndexParams")
+ .def(py::init<uint32_t, uint32_t, DistanceMetric, bool>());
+
+ py::class_<HnswIndex>(m, "HnswIndex")
+ .def(py::init<uint32_t, const HnswIndexParams&>())
+ .def("set_vector", &HnswIndex::set_vector)
+ .def("get_vector", &HnswIndex::get_vector)
+ .def("clear_vector", &HnswIndex::clear_vector)
+ .def("find_top_k", &HnswIndex::find_top_k);
+}