diff options
author | Tor Egge <Tor.Egge@online.no> | 2021-05-20 11:31:17 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2021-05-20 11:31:17 +0200 |
commit | 6c92e912eba3cfaa52cf359c02e1664d071fd18c (patch) | |
tree | c64beebd8bf976c182aeab311a1adc925b183af1 | |
parent | c8746c75e6ad4dad17ecd3daf2916fe45fd594f9 (diff) |
Add python bindings for a HNSW index fixture using a tensor attribute vector containing a nearest neighbor index.
-rw-r--r-- | CMakeLists.txt | 4 | ||||
-rw-r--r-- | ann_benchmark/CMakeLists.txt | 13 | ||||
-rw-r--r-- | ann_benchmark/src/tests/ann_benchmark/.gitignore | 1 | ||||
-rw-r--r-- | ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt | 3 | ||||
-rw-r--r-- | ann_benchmark/src/tests/ann_benchmark/test_euclidean.py | 63 | ||||
-rw-r--r-- | ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt | 28 | ||||
-rw-r--r-- | ann_benchmark/src/vespa/ann_benchmark/setup.py | 27 | ||||
-rw-r--r-- | ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp | 217 | ||||
-rw-r--r-- | dist/vespa.spec | 30 |
9 files changed, 386 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index c98994cf993..1045f6270c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,9 @@ find_package(JNI REQUIRED) find_package(GTest REQUIRED) +set(PYBIND11_FIND_PYTHON ON) +find_package(pybind11 CONFIG REQUIRED) + include(build_settings.cmake) # Enable CTest unit testing @@ -42,6 +45,7 @@ vespa_install_data(valgrind-suppressions.txt etc/vespa) # Include vespa config definitions in every target include_directories(BEFORE ${CMAKE_BINARY_DIR}/configdefinitions/src) +add_subdirectory(ann_benchmark) add_subdirectory(application-model) add_subdirectory(application-preprocessor) add_subdirectory(athenz-identity-provider-service) diff --git a/ann_benchmark/CMakeLists.txt b/ann_benchmark/CMakeLists.txt new file mode 100644 index 00000000000..d34329667e3 --- /dev/null +++ b/ann_benchmark/CMakeLists.txt @@ -0,0 +1,13 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_define_module( + DEPENDS + searchlib + + LIBS + src/vespa/ann_benchmark + + APPS + + TESTS + src/tests/ann_benchmark +) diff --git a/ann_benchmark/src/tests/ann_benchmark/.gitignore b/ann_benchmark/src/tests/ann_benchmark/.gitignore new file mode 100644 index 00000000000..225fc6f6650 --- /dev/null +++ b/ann_benchmark/src/tests/ann_benchmark/.gitignore @@ -0,0 +1 @@ +/__pycache__ diff --git a/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt new file mode 100644 index 00000000000..455ed24bcce --- /dev/null +++ b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt @@ -0,0 +1,3 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_test(NAME ann_benchmark_test COMMAND ${PYTHON_EXECUTABLE} -m pytest WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS vespa_ann_benchmark) diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py new file mode 100644 index 00000000000..1fc883ef003 --- /dev/null +++ b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py @@ -0,0 +1,63 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +import pytest +import sys +import os +import math +sys.path.insert(0, os.path.abspath("../../vespa/ann_benchmark")) +from vespa_ann_benchmark import DistanceMetric, HnswIndexParams, HnswIndex + +class Fixture: + def __init__(self): + self.index = HnswIndex(2, HnswIndexParams(16, 200, DistanceMetric.Euclidean, False)) + + def set(self, lid, value): + self.index.set_vector(lid, value) + + def get(self, lid): + return self.index.get_vector(lid) + + def clear(self, lid): + return self.index.clear_vector(lid) + + def find(self, k, value): + return self.index.find_top_k(k, value, k + 200, 1e300) + +def test_set_value(): + f = Fixture() + f.set(0, [1, 2]) + f.set(1, [3, 4]) + assert [1, 2] == f.get(0) + assert [3, 4] == f.get(1) + +def test_clear_value(): + f = Fixture() + f.set(0, [1, 2]) + assert [1, 2] == f.get(0) + f.clear(0) + assert [0, 0] == f.get(0) + +def test_find(): + f = Fixture() + f.set(0, [0, 0]) + f.set(1, [10, 10]) + assert f.get(0) == [0, 0] + assert f.get(1) == [10, 10] + top = f.find(10, [1, 1]) + assert [top[0][0], top[1][0]] == [0, 1] + # Allow some rounding errors + epsilon = 1e-20 + assert abs(top[0][1] - math.sqrt(2)) < epsilon + assert abs(top[1][1] - math.sqrt(162)) < epsilon + top2 = f.find(10, [9, 9]) + # Result is not sorted by distance + assert [top2[0][0], top2[1][0]] == [0, 1] + assert abs(top2[0][1] - math.sqrt(162)) < epsilon + assert abs(top2[1][1] - math.sqrt(2)) < epsilon + assert 0 == f.find(1, [1, 1])[0][0] + assert 1 == f.find(1, [9, 9])[0][0] + f.clear(1) + assert 0 == f.find(1, [9, 9])[0][0] + assert 0 == f.find(1, [0, 0])[0][0] + f.clear(0) + assert 0 == len(f.find(1, [9, 9])) diff --git a/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt new file mode 100644 index 00000000000..a9a8ceb3660 --- /dev/null +++ b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +install(DIRECTORY DESTINATION libexec/vespa_ann_benchmark) + +vespa_add_library(vespa_ann_benchmark + ALLOW_UNRESOLVED_SYMBOLS + SOURCES + vespa_ann_benchmark.cpp + + INSTALL libexec/vespa_ann_benchmark + DEPENDS + pybind11::pybind11 +) + +if (TARGET pybind11::lto) + target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module pybind11::lto) +else() + target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module) +endif() + +if (COMMAND pybind11_extension) + pybind11_extension(vespa_ann_benchmark) +else() + set_target_properties(vespa_ann_benchmark PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}") + set_target_properties(vespa_ann_benchmark PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}") +endif() + +set_target_properties(vespa_ann_benchmark PROPERTIES CXX_VISIBILITY_PRESET "hidden") +vespa_install_script(setup.py libexec/vespa_ann_benchmark) diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py b/ann_benchmark/src/vespa/ann_benchmark/setup.py new file mode 100644 index 00000000000..e57b1d595dd --- /dev/null +++ b/ann_benchmark/src/vespa/ann_benchmark/setup.py @@ -0,0 +1,27 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +import os +import sys +import platform +import distutils.sysconfig +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext + +class PreBuiltExt(build_ext): + def build_extension(self, ext): + print("Using prebuilt extension library") + libdir="lib.%s-%s-%s" % (sys.platform, platform.machine(), distutils.sysconfig.get_python_version()) + os.system("mkdir -p build/%s" % libdir) + os.system("cp -p vespa_ann_benchmark.*.so build/%s" % libdir) + +setup( + name="vespa_ann_benchmark", + version="0.1.0", + author="Tor Egge", + author_email="Tor.Egge@verizonmedia.com", + description="Python binding for an HNSW index fixture using tensor attribute", + long_description="Python binding for an HNSW index fixture using tensor attribute -- long version", + ext_modules=[Extension("vespa_ann_benchmark", sources=[])], + cmdclass={"build_ext": PreBuiltExt}, + zip_safe=False, +) diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp new file mode 100644 index 00000000000..f0230f8e3a0 --- /dev/null +++ b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp @@ -0,0 +1,217 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <vespa/searchcommon/attribute/hnsw_index_params.h> +#include <vespa/searchlib/attribute/attributevector.h> +#include <vespa/searchlib/attribute/attributefactory.h> +#include <vespa/searchlib/tensor/dense_tensor_attribute.h> +#include <vespa/searchlib/tensor/nearest_neighbor_index.h> +#include <vespa/eval/eval/value.h> +#include <vespa/vespalib/test/insertion_operators.h> +#include <ostream> +#include <sstream> +#include <limits> + +namespace py = pybind11; + +using search::AttributeFactory; +using search::AttributeVector; +using search::attribute::BasicType; +using search::attribute::Config; +using search::attribute::CollectionType; +using search::attribute::DistanceMetric; +using search::attribute::HnswIndexParams; +using search::tensor::NearestNeighborIndex; +using search::tensor::TensorAttribute; +using vespalib::eval::CellType; +using vespalib::eval::DenseValueView; +using vespalib::eval::TypedCells; +using vespalib::eval::ValueType; +using vespalib::eval::Value; + +namespace vespa_ann_benchmark { + +using TopKResult = std::vector<std::pair<uint32_t, double>>; + +namespace { + +std::string +make_tensor_spec(uint32_t dim_size) +{ + std::ostringstream os; + os << "tensor<float>(x[" << dim_size << "])"; + return os.str(); +} + +constexpr uint32_t lid_bias = 1; // lid 0 is reserved + +} + +/* + * Wrapper class for a tensor attribute vector containing a nearest + * neighbor index. + */ +class HnswIndex +{ + ValueType _tensor_type; + HnswIndexParams _hnsw_index_params; + std::shared_ptr<AttributeVector> _attribute; + TensorAttribute* _tensor_attribute; + const NearestNeighborIndex* _nearest_neighbor_index; + size_t _dim_size; + + bool check_lid(uint32_t lid); + bool check_value(const char *op, const std::vector<float>& value); +public: + HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params); + virtual ~HnswIndex(); + void set_vector(uint32_t lid, const std::vector<float>& value); + std::vector<float> get_vector(uint32_t lid); + void clear_vector(uint32_t lid); + TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold); +}; + +HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params) + : _tensor_type(ValueType::error_type()), + _hnsw_index_params(hnsw_index_params), + _attribute(), + _tensor_attribute(nullptr), + _nearest_neighbor_index(nullptr), + _dim_size(0u) +{ + Config cfg(BasicType::TENSOR, CollectionType::SINGLE); + _tensor_type = ValueType::from_spec(make_tensor_spec(dim_size)); + assert(_tensor_type.is_dense()); + assert(_tensor_type.count_indexed_dimensions() == 1u); + _dim_size = _tensor_type.dimensions()[0].size; + std::cout << "HnswIndex::HnswIndex Dimension size is " << _dim_size << std::endl; + cfg.setTensorType(_tensor_type); + cfg.set_distance_metric(hnsw_index_params.distance_metric()); + cfg.set_hnsw_index_params(hnsw_index_params); + _attribute = AttributeFactory::createAttribute("tensor", cfg); + _tensor_attribute = dynamic_cast<TensorAttribute *>(_attribute.get()); + assert(_tensor_attribute != nullptr); + _nearest_neighbor_index = _tensor_attribute->nearest_neighbor_index(); + assert(_nearest_neighbor_index != nullptr); +} + +HnswIndex::~HnswIndex() = default; + +bool +HnswIndex::check_lid(uint32_t lid) +{ + if (lid >= std::numeric_limits<uint32_t>::max() - lid_bias) { + std::cerr << "lid is too high" << std::endl; + return false; + } + return true; +} + +bool +HnswIndex::check_value(const char *op, const std::vector<float>& value) +{ + if (value.size() != _dim_size) { + std::cerr << op << " failed, expected vector with size " << _dim_size << ", got vector with size " << value.size() << std::endl; + return false; + } + return true; +} + +void +HnswIndex::set_vector(uint32_t lid, const std::vector<float>& value) +{ + if (!check_lid(lid)) { + return; + } + if (!check_value("set_vector", value)) { + return; + } + /* + * Not thread safe against concurrent set_vector(). + */ + TypedCells typed_cells(&value[0], CellType::FLOAT, value.size()); + DenseValueView tensor_view(_tensor_type, typed_cells); + while (size_t(lid + lid_bias) >= _attribute->getNumDocs()) { + uint32_t new_lid = 0; + _attribute->addDoc(new_lid); + } + _tensor_attribute->setTensor(lid + lid_bias, tensor_view); // lid 0 is special in vespa + _attribute->commit(); +} + +std::vector<float> +HnswIndex::get_vector(uint32_t lid) +{ + if (!check_lid(lid)) { + return {}; + } + TypedCells typed_cells = _tensor_attribute->extract_cells_ref(lid + lid_bias); + assert(typed_cells.size == _dim_size); + const float* data = static_cast<const float* >(typed_cells.data); + return {data, data + _dim_size}; + return {}; +} + +void +HnswIndex::clear_vector(uint32_t lid) +{ + if (!check_lid(lid)) { + return; + } + if (size_t(lid + lid_bias) < _attribute->getNumDocs()) { + _attribute->clearDoc(lid + lid_bias); + _attribute->commit(); + } +} + +TopKResult +HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold) +{ + if (!check_value("find_top_k", value)) { + return {}; + } + /* + * Not thread safe against concurrent set_vector() since attribute + * read guard is not taken here. + */ + TopKResult result; + TypedCells typed_cells(&value[0], CellType::FLOAT, value.size()); + auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, distance_threshold * distance_threshold); + result.reserve(raw_result.size()); + switch (_hnsw_index_params.distance_metric()) { + case DistanceMetric::Euclidean: + for (auto &raw : raw_result) { + result.emplace_back(raw.docid - lid_bias, sqrt(raw.distance)); + } + break; + default: + for (auto &raw : raw_result) { + result.emplace_back(raw.docid - lid_bias, raw.distance); + } + } + // Results are sorted by lid, not by distance + return result; +} + +} + +using vespa_ann_benchmark::HnswIndex; + +PYBIND11_MODULE(vespa_ann_benchmark, m) { + m.doc() = "vespa_ann_benchmark plugin"; + + py::enum_<DistanceMetric>(m, "DistanceMetric") + .value("Euclidean", DistanceMetric::Euclidean) + .value("Angular", DistanceMetric::Angular); + + py::class_<HnswIndexParams>(m, "HnswIndexParams") + .def(py::init<uint32_t, uint32_t, DistanceMetric, bool>()); + + py::class_<HnswIndex>(m, "HnswIndex") + .def(py::init<uint32_t, const HnswIndexParams&>()) + .def("set_vector", &HnswIndex::set_vector) + .def("get_vector", &HnswIndex::get_vector) + .def("clear_vector", &HnswIndex::clear_vector) + .def("find_top_k", &HnswIndex::find_top_k); +} diff --git a/dist/vespa.spec b/dist/vespa.spec index 2c75319da58..f1f439e0b41 100644 --- a/dist/vespa.spec +++ b/dist/vespa.spec @@ -437,6 +437,27 @@ Requires: %{name}-base-libs = %{version}-%{release} Vespa - The open big data serving engine - tools +%package ann-benchmark + +Summary: Vespa - The open big data serving engine - ann-benchmark + +Requires: %{name}-libs = %{version}-%{release} +%if 0%{?el7} +Requires: python3 +%endif +%if 0%{?el8} +Requires: python36 +%endif +%if 0%{?fedora} +Requires: python3 +%endif + +%description ann-benchmark + +Vespa - The open big data serving engine - ann-benchmark + +Python binding for an HNSW index fixture using tensor attribute. + %prep %if 0%{?installdir:1} %setup -c -D -T @@ -622,6 +643,7 @@ fi %{_prefix}/lib/jars/zookeeper-command-line-client-jar-with-dependencies.jar %{_prefix}/lib/perl5 %{_prefix}/libexec +%exclude %{_prefix}/libexec/vespa_ann_benchmark %exclude %{_prefix}/libexec/vespa/common-env.sh %exclude %{_prefix}/libexec/vespa/node-admin.sh %exclude %{_prefix}/libexec/vespa/standalone-container.sh @@ -817,4 +839,12 @@ fi %dir %{_prefix}/lib/jars %{_prefix}/lib/jars/vespaclient-java-jar-with-dependencies.jar +%files ann-benchmark +%if %{_defattr_is_vespa_vespa} +%defattr(-,%{_vespa_user},%{_vespa_group},-) +%endif +%dir %{_prefix} +%dir %{_prefix}/libexec +%{_prefix}/libexec/vespa_ann_benchmark + %changelog |