aboutsummaryrefslogtreecommitdiffstats
path: root/ann_benchmark/src
diff options
context:
space:
mode:
Diffstat (limited to 'ann_benchmark/src')
-rw-r--r--ann_benchmark/src/tests/ann_benchmark/.gitignore1
-rw-r--r--ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt5
-rw-r--r--ann_benchmark/src/tests/ann_benchmark/test_angular.py41
-rw-r--r--ann_benchmark/src/tests/ann_benchmark/test_euclidean.py61
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/.gitignore2
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt31
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/setup.py.in27
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp252
8 files changed, 0 insertions, 420 deletions
diff --git a/ann_benchmark/src/tests/ann_benchmark/.gitignore b/ann_benchmark/src/tests/ann_benchmark/.gitignore
deleted file mode 100644
index 225fc6f6650..00000000000
--- a/ann_benchmark/src/tests/ann_benchmark/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/__pycache__
diff --git a/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt
deleted file mode 100644
index 30ecb155a5b..00000000000
--- a/ann_benchmark/src/tests/ann_benchmark/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-if(NOT DEFINED VESPA_USE_SANITIZER)
- vespa_add_test(NAME ann_benchmark_test NO_VALGRIND COMMAND ${Python_EXECUTABLE} -m pytest WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS vespa_ann_benchmark)
-endif()
diff --git a/ann_benchmark/src/tests/ann_benchmark/test_angular.py b/ann_benchmark/src/tests/ann_benchmark/test_angular.py
deleted file mode 100644
index 15e718906d6..00000000000
--- a/ann_benchmark/src/tests/ann_benchmark/test_angular.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-import pytest
-import sys
-import os
-import math
-sys.path.insert(0, os.path.abspath("../../vespa/ann_benchmark"))
-from vespa_ann_benchmark import DistanceMetric, HnswIndexParams, HnswIndex
-
-class Fixture:
- def __init__(self, normalize):
- metric = DistanceMetric.InnerProduct if normalize else DistanceMetric.Angular
- self.index = HnswIndex(2, HnswIndexParams(16, 200, metric, False), normalize)
- self.index.set_vector(0, [1, 0])
- self.index.set_vector(1, [10, 10])
-
- def find(self, k, value):
- return self.index.find_top_k(k, value, k + 200)
-
- def run_test(self):
- top = self.find(10, [1, 1])
- assert [top[0][0], top[1][0]] == [0, 1]
- # Allow some rounding errors
- epsilon = 6e-8
- assert abs((1 - top[0][1]) - math.sqrt(0.5)) < epsilon
- assert abs((1 - top[1][1]) - 1) < epsilon
- top2 = self.find(10, [0, 2])
- # Result is not sorted by distance
- assert [top2[0][0], top2[1][0]] == [0, 1]
- assert abs((1 - top2[0][1]) - 0) < epsilon
- assert abs((1 - top2[1][1]) - math.sqrt(0.5)) < epsilon
- assert 1 == self.find(1, [1, 1])[0][0]
- assert 0 == self.find(1, [1, -1])[0][0]
-
-def test_find_angular():
- f = Fixture(False)
- f.run_test()
-
-def test_find_angular_normalized():
- f = Fixture(True)
- f.run_test()
diff --git a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py b/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
deleted file mode 100644
index 6663e1929ec..00000000000
--- a/ann_benchmark/src/tests/ann_benchmark/test_euclidean.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-import pytest
-import sys
-import os
-import math
-sys.path.insert(0, os.path.abspath("../../vespa/ann_benchmark"))
-from vespa_ann_benchmark import DistanceMetric, HnswIndexParams, HnswIndex
-
-class Fixture:
- def __init__(self):
- self.index = HnswIndex(2, HnswIndexParams(16, 200, DistanceMetric.Euclidean, False), False)
-
- def set(self, lid, value):
- self.index.set_vector(lid, value)
-
- def get(self, lid):
- return self.index.get_vector(lid)
-
- def clear(self, lid):
- return self.index.clear_vector(lid)
-
- def find(self, k, value):
- return self.index.find_top_k(k, value, k + 200)
-
-def test_set_value():
- f = Fixture()
- f.set(0, [1, 2])
- f.set(1, [3, 4])
- assert [1, 2] == f.get(0)
- assert [3, 4] == f.get(1)
-
-def test_clear_value():
- f = Fixture()
- f.set(0, [1, 2])
- assert [1, 2] == f.get(0)
- f.clear(0)
- assert [0, 0] == f.get(0)
-
-def test_find():
- f = Fixture()
- f.set(0, [0, 0])
- f.set(1, [10, 10])
- top = f.find(10, [1, 1])
- assert [top[0][0], top[1][0]] == [0, 1]
- # Allow some rounding errors
- epsilon = 1e-20
- assert abs(top[0][1] - math.sqrt(2)) < epsilon
- assert abs(top[1][1] - math.sqrt(162)) < epsilon
- top2 = f.find(10, [9, 9])
- # Result is not sorted by distance
- assert [top2[0][0], top2[1][0]] == [0, 1]
- assert abs(top2[0][1] - math.sqrt(162)) < epsilon
- assert abs(top2[1][1] - math.sqrt(2)) < epsilon
- assert 0 == f.find(1, [1, 1])[0][0]
- assert 1 == f.find(1, [9, 9])[0][0]
- f.clear(1)
- assert 0 == f.find(1, [9, 9])[0][0]
- assert 0 == f.find(1, [0, 0])[0][0]
- f.clear(0)
- assert 0 == len(f.find(1, [9, 9]))
diff --git a/ann_benchmark/src/vespa/ann_benchmark/.gitignore b/ann_benchmark/src/vespa/ann_benchmark/.gitignore
deleted file mode 100644
index 3b4605aeee2..00000000000
--- a/ann_benchmark/src/vespa/ann_benchmark/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-/vespa_ann_benchmark.cpython*.so
-/setup.py
diff --git a/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt b/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt
deleted file mode 100644
index da27365113a..00000000000
--- a/ann_benchmark/src/vespa/ann_benchmark/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-install(DIRECTORY DESTINATION libexec/vespa_ann_benchmark)
-
-vespa_add_library(vespa_ann_benchmark
- ALLOW_UNRESOLVED_SYMBOLS
- SOURCES
- vespa_ann_benchmark.cpp
-
- INSTALL libexec/vespa_ann_benchmark
- DEPENDS
- pybind11::pybind11
-)
-
-if (TARGET pybind11::lto)
- target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module pybind11::lto)
-else()
- target_link_libraries(vespa_ann_benchmark PRIVATE pybind11::module)
-endif()
-
-if (COMMAND pybind11_extension)
- pybind11_extension(vespa_ann_benchmark)
-else()
- set_target_properties(vespa_ann_benchmark PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}")
- set_target_properties(vespa_ann_benchmark PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}")
-endif()
-
-set_target_properties(vespa_ann_benchmark PROPERTIES CXX_VISIBILITY_PRESET "hidden")
-
-configure_file(setup.py.in setup.py @ONLY)
-
-vespa_install_script(setup.py libexec/vespa_ann_benchmark)
diff --git a/ann_benchmark/src/vespa/ann_benchmark/setup.py.in b/ann_benchmark/src/vespa/ann_benchmark/setup.py.in
deleted file mode 100644
index ee107076410..00000000000
--- a/ann_benchmark/src/vespa/ann_benchmark/setup.py.in
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-import subprocess
-import sys
-import platform
-import distutils.sysconfig
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
-
-class PreBuiltExt(build_ext):
- def build_extension(self, ext):
- print("Using prebuilt extension library")
- libdir="lib.%s-%s-%s" % (sys.platform, platform.machine(), distutils.sysconfig.get_python_version())
- subprocess.run(["mkdir", "-p", "build/%s" % libdir])
- subprocess.run(["cp", "-p", "@PYTHON_MODULE_PREFIX@vespa_ann_benchmark@PYTHON_MODULE_EXTENSION@", "build/%s" % libdir])
-
-setup(
- name="vespa_ann_benchmark",
- version="0.1.0",
- author="Tor Egge",
- author_email="Tor.Egge@yahooinc.com",
- description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search",
- long_description="Python binding for the Vespa implementation of an HNSW index for nearest neighbor search used for low-level benchmarking",
- ext_modules=[Extension("vespa_ann_benchmark", sources=[])],
- cmdclass={"build_ext": PreBuiltExt},
- zip_safe=False,
-)
diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
deleted file mode 100644
index 730ee141f83..00000000000
--- a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <vespa/searchcommon/attribute/hnsw_index_params.h>
-#include <vespa/searchlib/attribute/attributevector.h>
-#include <vespa/searchlib/attribute/attributefactory.h>
-#include <vespa/searchlib/tensor/dense_tensor_attribute.h>
-#include <vespa/searchlib/tensor/nearest_neighbor_index.h>
-#include <vespa/searchcommon/attribute/config.h>
-#include <vespa/eval/eval/value.h>
-#include <vespa/vespalib/test/insertion_operators.h>
-#include <vespa/vespalib/util/fake_doom.h>
-#include <iostream>
-#include <sstream>
-#include <limits>
-
-namespace py = pybind11;
-
-using search::AttributeFactory;
-using search::AttributeVector;
-using search::attribute::BasicType;
-using search::attribute::Config;
-using search::attribute::CollectionType;
-using search::attribute::DistanceMetric;
-using search::attribute::HnswIndexParams;
-using search::tensor::NearestNeighborIndex;
-using search::tensor::TensorAttribute;
-using vespalib::eval::CellType;
-using vespalib::eval::DenseValueView;
-using vespalib::eval::TypedCells;
-using vespalib::eval::ValueType;
-using vespalib::eval::Value;
-
-namespace vespa_ann_benchmark {
-
-using TopKResult = std::vector<std::pair<uint32_t, double>>;
-
-namespace {
-
-std::string
-make_tensor_spec(uint32_t dim_size)
-{
- std::ostringstream os;
- os << "tensor<float>(x[" << dim_size << "])";
- return os.str();
-}
-
-constexpr uint32_t lid_bias = 1; // lid 0 is reserved
-
-}
-
-/*
- * Class exposing the Vespa implementation of an HNSW index for nearest neighbor search over data points in a high dimensional vector space.
- *
- * A tensor attribute field (https://docs.vespa.ai/en/reference/schema-reference.html#type:tensor) is used to store the vectors in memory.
- * This class only supports single-threaded access (both for indexing and searching),
- * and should only be used for low-level benchmarking.
- * To use nearest neighbor search in a Vespa application,
- * see https://docs.vespa.ai/en/approximate-nn-hnsw.html for more details.
- */
-class HnswIndex
-{
- ValueType _tensor_type;
- HnswIndexParams _hnsw_index_params;
- std::shared_ptr<AttributeVector> _attribute;
- TensorAttribute* _tensor_attribute;
- const NearestNeighborIndex* _nearest_neighbor_index;
- size_t _dim_size;
- bool _normalize_vectors;
- vespalib::FakeDoom _no_doom;
-
- bool check_lid(uint32_t lid);
- bool check_value(const char *op, const std::vector<float>& value);
- TypedCells get_typed_cells(const std::vector<float>& value, std::vector<float>& normalized_value);
-public:
- HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params, bool normalize_vectors);
- virtual ~HnswIndex();
- void set_vector(uint32_t lid, const std::vector<float>& value);
- std::vector<float> get_vector(uint32_t lid);
- void clear_vector(uint32_t lid);
- TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k);
-};
-
-HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params, bool normalize_vectors)
- : _tensor_type(ValueType::error_type()),
- _hnsw_index_params(hnsw_index_params),
- _attribute(),
- _tensor_attribute(nullptr),
- _nearest_neighbor_index(nullptr),
- _dim_size(0u),
- _normalize_vectors(normalize_vectors),
- _no_doom()
-{
- Config cfg(BasicType::TENSOR, CollectionType::SINGLE);
- _tensor_type = ValueType::from_spec(make_tensor_spec(dim_size));
- assert(_tensor_type.is_dense());
- assert(_tensor_type.count_indexed_dimensions() == 1u);
- _dim_size = _tensor_type.dimensions()[0].size;
- cfg.setTensorType(_tensor_type);
- cfg.set_distance_metric(hnsw_index_params.distance_metric());
- cfg.set_hnsw_index_params(hnsw_index_params);
- _attribute = AttributeFactory::createAttribute("tensor", cfg);
- _tensor_attribute = dynamic_cast<TensorAttribute *>(_attribute.get());
- assert(_tensor_attribute != nullptr);
- _nearest_neighbor_index = _tensor_attribute->nearest_neighbor_index();
- assert(_nearest_neighbor_index != nullptr);
-}
-
-HnswIndex::~HnswIndex() = default;
-
-bool
-HnswIndex::check_lid(uint32_t lid)
-{
- if (lid >= std::numeric_limits<uint32_t>::max() - lid_bias) {
- std::cerr << "lid is too high" << std::endl;
- return false;
- }
- return true;
-}
-
-bool
-HnswIndex::check_value(const char *op, const std::vector<float>& value)
-{
- if (value.size() != _dim_size) {
- std::cerr << op << " failed, expected vector with size " << _dim_size << ", got vector with size " << value.size() << std::endl;
- return false;
- }
- return true;
-}
-
-TypedCells
-HnswIndex::get_typed_cells(const std::vector<float>& value, std::vector<float>& normalized_value)
-{
- if (!_normalize_vectors) {
- return {&value[0], CellType::FLOAT, value.size()};
- }
- double sum_of_squared = 0.0;
- for (auto elem : value) {
- double delem = elem;
- sum_of_squared += delem * delem;
- }
- double factor = 1.0 / (sqrt(sum_of_squared) + 1e-40);
- normalized_value.reserve(value.size());
- normalized_value.clear();
- for (auto elem : value) {
- normalized_value.emplace_back(elem * factor);
- }
- return {&normalized_value[0], CellType::FLOAT, normalized_value.size()};
-}
-
-void
-HnswIndex::set_vector(uint32_t lid, const std::vector<float>& value)
-{
- if (!check_lid(lid)) {
- return;
- }
- if (!check_value("set_vector", value)) {
- return;
- }
- /*
- * Not thread safe against concurrent set_vector().
- */
- std::vector<float> normalized_value;
- auto typed_cells = get_typed_cells(value, normalized_value);
- DenseValueView tensor_view(_tensor_type, typed_cells);
- while (size_t(lid + lid_bias) >= _attribute->getNumDocs()) {
- uint32_t new_lid = 0;
- _attribute->addDoc(new_lid);
- }
- _tensor_attribute->setTensor(lid + lid_bias, tensor_view); // lid 0 is special in vespa
- _attribute->commit();
-}
-
-std::vector<float>
-HnswIndex::get_vector(uint32_t lid)
-{
- if (!check_lid(lid)) {
- return {};
- }
- TypedCells typed_cells = _tensor_attribute->extract_cells_ref(lid + lid_bias);
- assert(typed_cells.size == _dim_size);
- const float* data = static_cast<const float* >(typed_cells.data);
- return {data, data + _dim_size};
- return {};
-}
-
-void
-HnswIndex::clear_vector(uint32_t lid)
-{
- if (!check_lid(lid)) {
- return;
- }
- if (size_t(lid + lid_bias) < _attribute->getNumDocs()) {
- _attribute->clearDoc(lid + lid_bias);
- _attribute->commit();
- }
-}
-
-TopKResult
-HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k)
-{
- if (!check_value("find_top_k", value)) {
- return {};
- }
- /*
- * Not thread safe against concurrent set_vector() since attribute
- * read guard is not taken here.
- */
- TopKResult result;
- std::vector<float> normalized_value;
- auto typed_cells = get_typed_cells(value, normalized_value);
- auto df = _nearest_neighbor_index->distance_function_factory().for_query_vector(typed_cells);
- auto raw_result = _nearest_neighbor_index->find_top_k(k, *df, explore_k, _no_doom.get_doom(), std::numeric_limits<double>::max());
- result.reserve(raw_result.size());
- switch (_hnsw_index_params.distance_metric()) {
- case DistanceMetric::Euclidean:
- for (auto &raw : raw_result) {
- result.emplace_back(raw.docid - lid_bias, sqrt(raw.distance));
- }
- break;
- default:
- for (auto &raw : raw_result) {
- result.emplace_back(raw.docid - lid_bias, raw.distance);
- }
- }
- // Results are sorted by lid, not by distance
- return result;
-}
-
-}
-
-using vespa_ann_benchmark::HnswIndex;
-
-PYBIND11_MODULE(vespa_ann_benchmark, m) {
- m.doc() = "vespa_ann_benchmark plugin";
-
- py::enum_<DistanceMetric>(m, "DistanceMetric")
- .value("Euclidean", DistanceMetric::Euclidean)
- .value("Angular", DistanceMetric::Angular)
- .value("InnerProduct", DistanceMetric::InnerProduct);
-
- py::class_<HnswIndexParams>(m, "HnswIndexParams")
- .def(py::init<uint32_t, uint32_t, DistanceMetric, bool>());
-
- py::class_<HnswIndex>(m, "HnswIndex")
- .def(py::init<uint32_t, const HnswIndexParams&, bool>())
- .def("set_vector", &HnswIndex::set_vector)
- .def("get_vector", &HnswIndex::get_vector)
- .def("clear_vector", &HnswIndex::clear_vector)
- .def("find_top_k", &HnswIndex::find_top_k);
-}