From c2a94a121ae034aebe8024371b0c5c02cffb8315 Mon Sep 17 00:00:00 2001 From: Arnstein Ressem Date: Tue, 17 Oct 2023 12:57:49 +0200 Subject: Remove the moved ann_bencmark. No more python dependencies. --- CMakeLists.txt | 1 - ann_benchmark/CMakeLists.txt | 13 -- ann_benchmark/src/tests/ann_benchmark/.gitignore | 1 - .../src/tests/ann_benchmark/CMakeLists.txt | 5 - .../src/tests/ann_benchmark/test_angular.py | 41 ---- .../src/tests/ann_benchmark/test_euclidean.py | 61 ----- ann_benchmark/src/vespa/ann_benchmark/.gitignore | 2 - .../src/vespa/ann_benchmark/CMakeLists.txt | 31 --- ann_benchmark/src/vespa/ann_benchmark/setup.py.in | 27 --- .../vespa/ann_benchmark/vespa_ann_benchmark.cpp | 252 --------------------- dist/vespa.spec | 36 --- 11 files changed, 470 deletions(-) delete mode 100644 ann_benchmark/CMakeLists.txt delete mode 100644 ann_benchmark/src/tests/ann_benchmark/.gitignore delete mode 100644 ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt delete mode 100644 ann_benchmark/src/tests/ann_benchmark/test_angular.py delete mode 100644 ann_benchmark/src/tests/ann_benchmark/test_euclidean.py delete mode 100644 ann_benchmark/src/vespa/ann_benchmark/.gitignore delete mode 100644 ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt delete mode 100644 ann_benchmark/src/vespa/ann_benchmark/setup.py.in delete mode 100644 ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a4e89cbdeb9..6a5d635964f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,6 @@ vespa_install_data(tsan-suppressions.txt etc/vespa) include_directories(BEFORE ${CMAKE_BINARY_DIR}/configdefinitions/src) add_subdirectory(airlift-zstd) -add_subdirectory(ann_benchmark) add_subdirectory(application-model) add_subdirectory(client) add_subdirectory(cloud-tenant-cd) diff --git a/ann_benchmark/CMakeLists.txt b/ann_benchmark/CMakeLists.txt deleted file mode 100644 index 06d742cf072..00000000000 --- a/ann_benchmark/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -vespa_define_module( - DEPENDS - searchlib - - LIBS - src/vespa/ann_benchmark - - APPS - - TESTS - src/tests/ann_benchmark -) diff --git a/ann_benchmark/src/tests/ann_benchmark/.gitignore b/ann_benchmark/src/tests/ann_benchmark/.gitignore deleted file mode 100644 index 225fc6f6650..00000000000 --- a/ann_benchmark/src/tests/ann_benchmark/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/__pycache__ diff --git a/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt deleted file mode 100644 index 03126ce1b47..00000000000 --- a/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -if(NOT DEFINED VESPA_USE_SANITIZER) - vespa_add_test(NAME ann_benchmark_test NO_VALGRIND COMMAND ${Python_EXECUTABLE} -m pytest WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS vespa_ann_benchmark) -endif() diff --git a/ann_benchmark/src/tests/ann_benchmark/test_angular.py b/ann_benchmark/src/tests/ann_benchmark/test_angular.py deleted file mode 100644 index ac7feb29d76..00000000000 --- a/ann_benchmark/src/tests/ann_benchmark/test_angular.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -import pytest -import sys -import os -import math -sys.path.insert(0, os.path.abspath("../../vespa/ann_benchmark")) -from vespa_ann_benchmark import DistanceMetric, HnswIndexParams, HnswIndex - -class Fixture: - def __init__(self, normalize): - metric = DistanceMetric.InnerProduct if normalize else DistanceMetric.Angular - self.index = HnswIndex(2, HnswIndexParams(16, 200, metric, False), normalize) - self.index.set_vector(0, [1, 0]) - self.index.set_vector(1, [10, 10]) - - def find(self, k, value): - return self.index.find_top_k(k, value, k + 200) - - def run_test(self): - top = self.find(10, [1, 1]) - assert [top[0][0], top[1][0]] == [0, 1] - # Allow some rounding errors - epsilon = 6e-8 - assert abs((1 - top[0][1]) - math.sqrt(0.5)) < epsilon - assert abs((1 - top[1][1]) - 1) < epsilon - top2 = self.find(10, [0, 2]) - # Result is not sorted by distance - assert [top2[0][0], top2[1][0]] == [0, 1] - assert abs((1 - top2[0][1]) - 0) < epsilon - assert abs((1 - top2[1][1]) - math.sqrt(0.5)) < epsilon - assert 1 == self.find(1, [1, 1])[0][0] - assert 0 == self.find(1, [1, -1])[0][0] - -def test_find_angular(): - f = Fixture(False) - f.run_test() - -def test_find_angular_normalized(): - f = Fixture(True) - f.run_test() diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py deleted file mode 100644 index ca4d5ecd6a1..00000000000 --- a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -import pytest -import sys -import os -import math -sys.path.insert(0, os.path.abspath("../../vespa/ann_benchmark")) -from vespa_ann_benchmark import DistanceMetric, HnswIndexParams, HnswIndex - -class Fixture: - def __init__(self): - self.index = HnswIndex(2, HnswIndexParams(16, 200, DistanceMetric.Euclidean, False), False) - - def set(self, lid, value): - self.index.set_vector(lid, value) - - def get(self, lid): - return self.index.get_vector(lid) - - def clear(self, lid): - return self.index.clear_vector(lid) - - def find(self, k, value): - return self.index.find_top_k(k, value, k + 200) - -def test_set_value(): - f = Fixture() - f.set(0, [1, 2]) - f.set(1, [3, 4]) - assert [1, 2] == f.get(0) - assert [3, 4] == f.get(1) - -def test_clear_value(): - f = Fixture() - f.set(0, [1, 2]) - assert [1, 2] == f.get(0) - f.clear(0) - assert [0, 0] == f.get(0) - -def test_find(): - f = Fixture() - f.set(0, [0, 0]) - f.set(1, [10, 10]) - top = f.find(10, [1, 1]) - assert [top[0][0], top[1][0]] == [0, 1] - # Allow some rounding errors - epsilon = 1e-20 - assert abs(top[0][1] - math.sqrt(2)) < epsilon - assert abs(top[1][1] - math.sqrt(162)) < epsilon - top2 = f.find(10, [9, 9]) - # Result is not sorted by distance - assert [top2[0][0], top2[1][0]] == [0, 1] - assert abs(top2[0][1] - math.sqrt(162)) < epsilon - assert abs(top2[1][1] - math.sqrt(2)) < epsilon - assert 0 == f.find(1, [1, 1])[0][0] - assert 1 == f.find(1, [9, 9])[0][0] - f.clear(1) - assert 0 == f.find(1, [9, 9])[0][0] - assert 0 == f.find(1, [0, 0])[0][0] - f.clear(0) - assert 0 == len(f.find(1, [9, 9])) diff --git a/ann_benchmark/src/vespa/ann_benchmark/.gitignore b/ann_benchmark/src/vespa/ann_benchmark/.gitignore deleted file mode 100644 index 3b4605aeee2..00000000000 --- a/ann_benchmark/src/vespa/ann_benchmark/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/vespa_ann_benchmark.cpython*.so -/setup.py diff --git a/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt deleted file mode 100644 index fa3b10b2269..00000000000 --- a/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -install(DIRECTORY DESTINATION libexec/vespa_ann_benchmark) - -vespa_add_library(vespa_ann_benchmark - ALLOW_UNRESOLVED_SYMBOLS - SOURCES - vespa_ann_benchmark.cpp - - INSTALL libexec/vespa_ann_benchmark - DEPENDS - pybind11::pybind11 -) - -if (TARGET pybind11::lto) - target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module pybind11::lto) -else() - target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module) -endif() - -if (COMMAND pybind11_extension) - pybind11_extension(vespa_ann_benchmark) -else() - set_target_properties(vespa_ann_benchmark PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}") - set_target_properties(vespa_ann_benchmark PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}") -endif() - -set_target_properties(vespa_ann_benchmark PROPERTIES CXX_VISIBILITY_PRESET "hidden") - -configure_file(setup.py.in setup.py @ONLY) - -vespa_install_script(setup.py libexec/vespa_ann_benchmark) diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py.in b/ann_benchmark/src/vespa/ann_benchmark/setup.py.in deleted file mode 100644 index 457d6e1b4b5..00000000000 --- a/ann_benchmark/src/vespa/ann_benchmark/setup.py.in +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -import subprocess -import sys -import platform -import distutils.sysconfig -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext - -class PreBuiltExt(build_ext): - def build_extension(self, ext): - print("Using prebuilt extension library") - libdir="lib.%s-%s-%s" % (sys.platform, platform.machine(), distutils.sysconfig.get_python_version()) - subprocess.run(["mkdir", "-p", "build/%s" % libdir]) - subprocess.run(["cp", "-p", "@PYTHON_MODULE_PREFIX@vespa_ann_benchmark@PYTHON_MODULE_EXTENSION@", "build/%s" % libdir]) - -setup( - name="vespa_ann_benchmark", - version="0.1.0", - author="Tor Egge", - author_email="Tor.Egge@yahooinc.com", - description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search", - long_description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search used for low-level benchmarking", - ext_modules=[Extension("vespa_ann_benchmark", sources=[])], - cmdclass={"build_ext": PreBuiltExt}, - zip_safe=False, -) diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp deleted file mode 100644 index ab00f997226..00000000000 --- a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; - -using search::AttributeFactory; -using search::AttributeVector; -using search::attribute::BasicType; -using search::attribute::Config; -using search::attribute::CollectionType; -using search::attribute::DistanceMetric; -using search::attribute::HnswIndexParams; -using search::tensor::NearestNeighborIndex; -using search::tensor::TensorAttribute; -using vespalib::eval::CellType; -using vespalib::eval::DenseValueView; -using vespalib::eval::TypedCells; -using vespalib::eval::ValueType; -using vespalib::eval::Value; - -namespace vespa_ann_benchmark { - -using TopKResult = std::vector>; - -namespace { - -std::string -make_tensor_spec(uint32_t dim_size) -{ - std::ostringstream os; - os << "tensor(x[" << dim_size << "])"; - return os.str(); -} - -constexpr uint32_t lid_bias = 1; // lid 0 is reserved - -} - -/* - * Class exposing the Vespa implementation of an HNSW index for nearest neighbor search over data points in a high dimensional vector space. - * - * A tensor attribute field (https://docs.vespa.ai/en/reference/schema-reference.html#type:tensor) is used to store the vectors in memory. - * This class only supports single-threaded access (both for indexing and searching), - * and should only be used for low-level benchmarking. - * To use nearest neighbor search in a Vespa application, - * see https://docs.vespa.ai/en/approximate-nn-hnsw.html for more details. - */ -class HnswIndex -{ - ValueType _tensor_type; - HnswIndexParams _hnsw_index_params; - std::shared_ptr _attribute; - TensorAttribute* _tensor_attribute; - const NearestNeighborIndex* _nearest_neighbor_index; - size_t _dim_size; - bool _normalize_vectors; - vespalib::FakeDoom _no_doom; - - bool check_lid(uint32_t lid); - bool check_value(const char *op, const std::vector& value); - TypedCells get_typed_cells(const std::vector& value, std::vector& normalized_value); -public: - HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params, bool normalize_vectors); - virtual ~HnswIndex(); - void set_vector(uint32_t lid, const std::vector& value); - std::vector get_vector(uint32_t lid); - void clear_vector(uint32_t lid); - TopKResult find_top_k(uint32_t k, const std::vector& value, uint32_t explore_k); -}; - -HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params, bool normalize_vectors) - : _tensor_type(ValueType::error_type()), - _hnsw_index_params(hnsw_index_params), - _attribute(), - _tensor_attribute(nullptr), - _nearest_neighbor_index(nullptr), - _dim_size(0u), - _normalize_vectors(normalize_vectors), - _no_doom() -{ - Config cfg(BasicType::TENSOR, CollectionType::SINGLE); - _tensor_type = ValueType::from_spec(make_tensor_spec(dim_size)); - assert(_tensor_type.is_dense()); - assert(_tensor_type.count_indexed_dimensions() == 1u); - _dim_size = _tensor_type.dimensions()[0].size; - cfg.setTensorType(_tensor_type); - cfg.set_distance_metric(hnsw_index_params.distance_metric()); - cfg.set_hnsw_index_params(hnsw_index_params); - _attribute = AttributeFactory::createAttribute("tensor", cfg); - _tensor_attribute = dynamic_cast(_attribute.get()); - assert(_tensor_attribute != nullptr); - _nearest_neighbor_index = _tensor_attribute->nearest_neighbor_index(); - assert(_nearest_neighbor_index != nullptr); -} - -HnswIndex::~HnswIndex() = default; - -bool -HnswIndex::check_lid(uint32_t lid) -{ - if (lid >= std::numeric_limits::max() - lid_bias) { - std::cerr << "lid is too high" << std::endl; - return false; - } - return true; -} - -bool -HnswIndex::check_value(const char *op, const std::vector& value) -{ - if (value.size() != _dim_size) { - std::cerr << op << " failed, expected vector with size " << _dim_size << ", got vector with size " << value.size() << std::endl; - return false; - } - return true; -} - -TypedCells -HnswIndex::get_typed_cells(const std::vector& value, std::vector& normalized_value) -{ - if (!_normalize_vectors) { - return {&value[0], CellType::FLOAT, value.size()}; - } - double sum_of_squared = 0.0; - for (auto elem : value) { - double delem = elem; - sum_of_squared += delem * delem; - } - double factor = 1.0 / (sqrt(sum_of_squared) + 1e-40); - normalized_value.reserve(value.size()); - normalized_value.clear(); - for (auto elem : value) { - normalized_value.emplace_back(elem * factor); - } - return {&normalized_value[0], CellType::FLOAT, normalized_value.size()}; -} - -void -HnswIndex::set_vector(uint32_t lid, const std::vector& value) -{ - if (!check_lid(lid)) { - return; - } - if (!check_value("set_vector", value)) { - return; - } - /* - * Not thread safe against concurrent set_vector(). - */ - std::vector normalized_value; - auto typed_cells = get_typed_cells(value, normalized_value); - DenseValueView tensor_view(_tensor_type, typed_cells); - while (size_t(lid + lid_bias) >= _attribute->getNumDocs()) { - uint32_t new_lid = 0; - _attribute->addDoc(new_lid); - } - _tensor_attribute->setTensor(lid + lid_bias, tensor_view); // lid 0 is special in vespa - _attribute->commit(); -} - -std::vector -HnswIndex::get_vector(uint32_t lid) -{ - if (!check_lid(lid)) { - return {}; - } - TypedCells typed_cells = _tensor_attribute->extract_cells_ref(lid + lid_bias); - assert(typed_cells.size == _dim_size); - const float* data = static_cast(typed_cells.data); - return {data, data + _dim_size}; - return {}; -} - -void -HnswIndex::clear_vector(uint32_t lid) -{ - if (!check_lid(lid)) { - return; - } - if (size_t(lid + lid_bias) < _attribute->getNumDocs()) { - _attribute->clearDoc(lid + lid_bias); - _attribute->commit(); - } -} - -TopKResult -HnswIndex::find_top_k(uint32_t k, const std::vector& value, uint32_t explore_k) -{ - if (!check_value("find_top_k", value)) { - return {}; - } - /* - * Not thread safe against concurrent set_vector() since attribute - * read guard is not taken here. - */ - TopKResult result; - std::vector normalized_value; - auto typed_cells = get_typed_cells(value, normalized_value); - auto df = _nearest_neighbor_index->distance_function_factory().for_query_vector(typed_cells); - auto raw_result = _nearest_neighbor_index->find_top_k(k, *df, explore_k, _no_doom.get_doom(), std::numeric_limits::max()); - result.reserve(raw_result.size()); - switch (_hnsw_index_params.distance_metric()) { - case DistanceMetric::Euclidean: - for (auto &raw : raw_result) { - result.emplace_back(raw.docid - lid_bias, sqrt(raw.distance)); - } - break; - default: - for (auto &raw : raw_result) { - result.emplace_back(raw.docid - lid_bias, raw.distance); - } - } - // Results are sorted by lid, not by distance - return result; -} - -} - -using vespa_ann_benchmark::HnswIndex; - -PYBIND11_MODULE(vespa_ann_benchmark, m) { - m.doc() = "vespa_ann_benchmark plugin"; - - py::enum_(m, "DistanceMetric") - .value("Euclidean", DistanceMetric::Euclidean) - .value("Angular", DistanceMetric::Angular) - .value("InnerProduct", DistanceMetric::InnerProduct); - - py::class_(m, "HnswIndexParams") - .def(py::init()); - - py::class_(m, "HnswIndex") - .def(py::init()) - .def("set_vector", &HnswIndex::set_vector) - .def("get_vector", &HnswIndex::get_vector) - .def("clear_vector", &HnswIndex::clear_vector) - .def("find_top_k", &HnswIndex::find_top_k); -} diff --git a/dist/vespa.spec b/dist/vespa.spec index ce302ae0338..cd1381fd48d 100644 --- a/dist/vespa.spec +++ b/dist/vespa.spec @@ -280,29 +280,6 @@ Requires: %{name}-base-libs = %{version}-%{release} Vespa - The open big data serving engine - devel package -%package ann-benchmark - -Summary: Vespa - The open big data serving engine - ann-benchmark - -Requires: %{name}-base-libs = %{version}-%{release} -Requires: %{name}-libs = %{version}-%{release} -%if 0%{?el8} -Requires: python39 -%endif -%if 0%{?el9} -Requires: python3 -%endif -%if 0%{?fedora} -Requires: python3 -%endif - -%description ann-benchmark - -Vespa - The open big data serving engine - ann-benchmark - -Python binding for the Vespa implementation of an HNSW index for -nearest neighbor search used for low-level benchmarking. - %prep %if 0%{?installdir:1} %if 0%{?source_base:1} @@ -383,10 +360,6 @@ export JAVA_HOME=%{?_java_home} export JAVA_HOME=/usr/lib/jvm/java-17-openjdk %endif export PATH="$JAVA_HOME/bin:$PATH" -%if 0%{?el8} -python3.9 -m pip install --user pytest -%endif -export PYTHONPATH="$PYTHONPATH:/usr/local/lib/$(basename $(readlink -f $(which python3)))/site-packages" #%{?_use_mvn_wrapper:./mvnw}%{!?_use_mvn_wrapper:mvn} --batch-mode -nsu -T 1C -Dmaven.javadoc.skip=true test make test ARGS="--output-on-failure %{_smp_mflags}" %endif @@ -537,7 +510,6 @@ fi %{_prefix}/lib/jars/zookeeper-command-line-client-jar-with-dependencies.jar %{_prefix}/lib/perl5 %{_prefix}/libexec -%exclude %{_prefix}/libexec/vespa_ann_benchmark %exclude %{_prefix}/libexec/vespa/common-env.sh %exclude %{_prefix}/libexec/vespa/vespa-wrapper %exclude %{_prefix}/libexec/vespa/find-pid @@ -763,12 +735,4 @@ fi %{_prefix}/include %{_prefix}/share/cmake -%files ann-benchmark -%if %{_defattr_is_vespa_vespa} -%defattr(-,%{_vespa_user},%{_vespa_group},-) -%endif -%dir %{_prefix} -%dir %{_prefix}/libexec -%{_prefix}/libexec/vespa_ann_benchmark - %changelog -- cgit v1.2.3 From 379e8478f29143ab53f427a6386459cc7a10b519 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Wed, 18 Oct 2023 07:41:25 +0000 Subject: Do not require pybind11 anymore. --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a5d635964f..b975f3adf2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,6 @@ find_package(JNI REQUIRED) find_package(GTest REQUIRED) find_package(Python 3.6 COMPONENTS Interpreter Development REQUIRED) -find_package(pybind11 CONFIG REQUIRED) find_package(Protobuf REQUIRED) -- cgit v1.2.3