diff options
Diffstat (limited to 'ann_benchmark/src')
6 files changed, 339 insertions, 0 deletions
diff --git a/ann_benchmark/src/tests/ann_benchmark/.gitignore b/ann_benchmark/src/tests/ann_benchmark/.gitignore new file mode 100644 index 00000000000..225fc6f6650 --- /dev/null +++ b/ann_benchmark/src/tests/ann_benchmark/.gitignore @@ -0,0 +1 @@ +/__pycache__ diff --git a/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt new file mode 100644 index 00000000000..455ed24bcce --- /dev/null +++ b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt @@ -0,0 +1,3 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_test(NAME ann_benchmark_test COMMAND ${PYTHON_EXECUTABLE} -m pytest WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS vespa_ann_benchmark) diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py new file mode 100644 index 00000000000..1fc883ef003 --- /dev/null +++ b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py @@ -0,0 +1,63 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +import pytest +import sys +import os +import math +sys.path.insert(0, os.path.abspath("../../vespa/ann_benchmark")) +from vespa_ann_benchmark import DistanceMetric, HnswIndexParams, HnswIndex + +class Fixture: + def __init__(self): + self.index = HnswIndex(2, HnswIndexParams(16, 200, DistanceMetric.Euclidean, False)) + + def set(self, lid, value): + self.index.set_vector(lid, value) + + def get(self, lid): + return self.index.get_vector(lid) + + def clear(self, lid): + return self.index.clear_vector(lid) + + def find(self, k, value): + return self.index.find_top_k(k, value, k + 200, 1e300) + +def test_set_value(): + f = Fixture() + f.set(0, [1, 2]) + f.set(1, [3, 4]) + assert [1, 2] == f.get(0) + assert [3, 4] == f.get(1) + +def test_clear_value(): + f = Fixture() + f.set(0, [1, 2]) + assert [1, 2] == f.get(0) + f.clear(0) + assert [0, 0] == f.get(0) + +def test_find(): + f = Fixture() + f.set(0, [0, 0]) + f.set(1, [10, 10]) + assert f.get(0) == [0, 0] + assert f.get(1) == [10, 10] + top = f.find(10, [1, 1]) + assert [top[0][0], top[1][0]] == [0, 1] + # Allow some rounding errors + epsilon = 1e-20 + assert abs(top[0][1] - math.sqrt(2)) < epsilon + assert abs(top[1][1] - math.sqrt(162)) < epsilon + top2 = f.find(10, [9, 9]) + # Result is not sorted by distance + assert [top2[0][0], top2[1][0]] == [0, 1] + assert abs(top2[0][1] - math.sqrt(162)) < epsilon + assert abs(top2[1][1] - math.sqrt(2)) < epsilon + assert 0 == f.find(1, [1, 1])[0][0] + assert 1 == f.find(1, [9, 9])[0][0] + f.clear(1) + assert 0 == f.find(1, [9, 9])[0][0] + assert 0 == f.find(1, [0, 0])[0][0] + f.clear(0) + assert 0 == len(f.find(1, [9, 9])) diff --git a/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt new file mode 100644 index 00000000000..a9a8ceb3660 --- /dev/null +++ b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +install(DIRECTORY DESTINATION libexec/vespa_ann_benchmark) + +vespa_add_library(vespa_ann_benchmark + ALLOW_UNRESOLVED_SYMBOLS + SOURCES + vespa_ann_benchmark.cpp + + INSTALL libexec/vespa_ann_benchmark + DEPENDS + pybind11::pybind11 +) + +if (TARGET pybind11::lto) + target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module pybind11::lto) +else() + target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module) +endif() + +if (COMMAND pybind11_extension) + pybind11_extension(vespa_ann_benchmark) +else() + set_target_properties(vespa_ann_benchmark PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}") + set_target_properties(vespa_ann_benchmark PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}") +endif() + +set_target_properties(vespa_ann_benchmark PROPERTIES CXX_VISIBILITY_PRESET "hidden") +vespa_install_script(setup.py libexec/vespa_ann_benchmark) diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py b/ann_benchmark/src/vespa/ann_benchmark/setup.py new file mode 100644 index 00000000000..e57b1d595dd --- /dev/null +++ b/ann_benchmark/src/vespa/ann_benchmark/setup.py @@ -0,0 +1,27 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +import os +import sys +import platform +import distutils.sysconfig +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext + +class PreBuiltExt(build_ext): + def build_extension(self, ext): + print("Using prebuilt extension library") + libdir="lib.%s-%s-%s" % (sys.platform, platform.machine(), distutils.sysconfig.get_python_version()) + os.system("mkdir -p build/%s" % libdir) + os.system("cp -p vespa_ann_benchmark.*.so build/%s" % libdir) + +setup( + name="vespa_ann_benchmark", + version="0.1.0", + author="Tor Egge", + author_email="Tor.Egge@verizonmedia.com", + description="Python binding for an HNSW index fixture using tensor attribute", + long_description="Python binding for an HNSW index fixture using tensor attribute -- long version", + ext_modules=[Extension("vespa_ann_benchmark", sources=[])], + cmdclass={"build_ext": PreBuiltExt}, + zip_safe=False, +) diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp new file mode 100644 index 00000000000..f0230f8e3a0 --- /dev/null +++ b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp @@ -0,0 +1,217 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <vespa/searchcommon/attribute/hnsw_index_params.h> +#include <vespa/searchlib/attribute/attributevector.h> +#include <vespa/searchlib/attribute/attributefactory.h> +#include <vespa/searchlib/tensor/dense_tensor_attribute.h> +#include <vespa/searchlib/tensor/nearest_neighbor_index.h> +#include <vespa/eval/eval/value.h> +#include <vespa/vespalib/test/insertion_operators.h> +#include <ostream> +#include <sstream> +#include <limits> + +namespace py = pybind11; + +using search::AttributeFactory; +using search::AttributeVector; +using search::attribute::BasicType; +using search::attribute::Config; +using search::attribute::CollectionType; +using search::attribute::DistanceMetric; +using search::attribute::HnswIndexParams; +using search::tensor::NearestNeighborIndex; +using search::tensor::TensorAttribute; +using vespalib::eval::CellType; +using vespalib::eval::DenseValueView; +using vespalib::eval::TypedCells; +using vespalib::eval::ValueType; +using vespalib::eval::Value; + +namespace vespa_ann_benchmark { + +using TopKResult = std::vector<std::pair<uint32_t, double>>; + +namespace { + +std::string +make_tensor_spec(uint32_t dim_size) +{ + std::ostringstream os; + os << "tensor<float>(x[" << dim_size << "])"; + return os.str(); +} + +constexpr uint32_t lid_bias = 1; // lid 0 is reserved + +} + +/* + * Wrapper class for a tensor attribute vector containing a nearest + * neighbor index. + */ +class HnswIndex +{ + ValueType _tensor_type; + HnswIndexParams _hnsw_index_params; + std::shared_ptr<AttributeVector> _attribute; + TensorAttribute* _tensor_attribute; + const NearestNeighborIndex* _nearest_neighbor_index; + size_t _dim_size; + + bool check_lid(uint32_t lid); + bool check_value(const char *op, const std::vector<float>& value); +public: + HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params); + virtual ~HnswIndex(); + void set_vector(uint32_t lid, const std::vector<float>& value); + std::vector<float> get_vector(uint32_t lid); + void clear_vector(uint32_t lid); + TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold); +}; + +HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params) + : _tensor_type(ValueType::error_type()), + _hnsw_index_params(hnsw_index_params), + _attribute(), + _tensor_attribute(nullptr), + _nearest_neighbor_index(nullptr), + _dim_size(0u) +{ + Config cfg(BasicType::TENSOR, CollectionType::SINGLE); + _tensor_type = ValueType::from_spec(make_tensor_spec(dim_size)); + assert(_tensor_type.is_dense()); + assert(_tensor_type.count_indexed_dimensions() == 1u); + _dim_size = _tensor_type.dimensions()[0].size; + std::cout << "HnswIndex::HnswIndex Dimension size is " << _dim_size << std::endl; + cfg.setTensorType(_tensor_type); + cfg.set_distance_metric(hnsw_index_params.distance_metric()); + cfg.set_hnsw_index_params(hnsw_index_params); + _attribute = AttributeFactory::createAttribute("tensor", cfg); + _tensor_attribute = dynamic_cast<TensorAttribute *>(_attribute.get()); + assert(_tensor_attribute != nullptr); + _nearest_neighbor_index = _tensor_attribute->nearest_neighbor_index(); + assert(_nearest_neighbor_index != nullptr); +} + +HnswIndex::~HnswIndex() = default; + +bool +HnswIndex::check_lid(uint32_t lid) +{ + if (lid >= std::numeric_limits<uint32_t>::max() - lid_bias) { + std::cerr << "lid is too high" << std::endl; + return false; + } + return true; +} + +bool +HnswIndex::check_value(const char *op, const std::vector<float>& value) +{ + if (value.size() != _dim_size) { + std::cerr << op << " failed, expected vector with size " << _dim_size << ", got vector with size " << value.size() << std::endl; + return false; + } + return true; +} + +void +HnswIndex::set_vector(uint32_t lid, const std::vector<float>& value) +{ + if (!check_lid(lid)) { + return; + } + if (!check_value("set_vector", value)) { + return; + } + /* + * Not thread safe against concurrent set_vector(). + */ + TypedCells typed_cells(&value[0], CellType::FLOAT, value.size()); + DenseValueView tensor_view(_tensor_type, typed_cells); + while (size_t(lid + lid_bias) >= _attribute->getNumDocs()) { + uint32_t new_lid = 0; + _attribute->addDoc(new_lid); + } + _tensor_attribute->setTensor(lid + lid_bias, tensor_view); // lid 0 is special in vespa + _attribute->commit(); +} + +std::vector<float> +HnswIndex::get_vector(uint32_t lid) +{ + if (!check_lid(lid)) { + return {}; + } + TypedCells typed_cells = _tensor_attribute->extract_cells_ref(lid + lid_bias); + assert(typed_cells.size == _dim_size); + const float* data = static_cast<const float* >(typed_cells.data); + return {data, data + _dim_size}; + return {}; +} + +void +HnswIndex::clear_vector(uint32_t lid) +{ + if (!check_lid(lid)) { + return; + } + if (size_t(lid + lid_bias) < _attribute->getNumDocs()) { + _attribute->clearDoc(lid + lid_bias); + _attribute->commit(); + } +} + +TopKResult +HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k, double distance_threshold) +{ + if (!check_value("find_top_k", value)) { + return {}; + } + /* + * Not thread safe against concurrent set_vector() since attribute + * read guard is not taken here. + */ + TopKResult result; + TypedCells typed_cells(&value[0], CellType::FLOAT, value.size()); + auto raw_result = _nearest_neighbor_index->find_top_k(k, typed_cells, explore_k, distance_threshold * distance_threshold); + result.reserve(raw_result.size()); + switch (_hnsw_index_params.distance_metric()) { + case DistanceMetric::Euclidean: + for (auto &raw : raw_result) { + result.emplace_back(raw.docid - lid_bias, sqrt(raw.distance)); + } + break; + default: + for (auto &raw : raw_result) { + result.emplace_back(raw.docid - lid_bias, raw.distance); + } + } + // Results are sorted by lid, not by distance + return result; +} + +} + +using vespa_ann_benchmark::HnswIndex; + +PYBIND11_MODULE(vespa_ann_benchmark, m) { + m.doc() = "vespa_ann_benchmark plugin"; + + py::enum_<DistanceMetric>(m, "DistanceMetric") + .value("Euclidean", DistanceMetric::Euclidean) + .value("Angular", DistanceMetric::Angular); + + py::class_<HnswIndexParams>(m, "HnswIndexParams") + .def(py::init<uint32_t, uint32_t, DistanceMetric, bool>()); + + py::class_<HnswIndex>(m, "HnswIndex") + .def(py::init<uint32_t, const HnswIndexParams&>()) + .def("set_vector", &HnswIndex::set_vector) + .def("get_vector", &HnswIndex::get_vector) + .def("clear_vector", &HnswIndex::clear_vector) + .def("find_top_k", &HnswIndex::find_top_k); +} |