aboutsummaryrefslogtreecommitdiffstats
path: root/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp')
-rw-r--r--ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp252
1 files changed, 0 insertions, 252 deletions
diff --git a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp b/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
deleted file mode 100644
index 730ee141f83..00000000000
--- a/ann_benchmark/src/vespa/ann_benchmark/vespa_ann_benchmark.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <vespa/searchcommon/attribute/hnsw_index_params.h>
-#include <vespa/searchlib/attribute/attributevector.h>
-#include <vespa/searchlib/attribute/attributefactory.h>
-#include <vespa/searchlib/tensor/dense_tensor_attribute.h>
-#include <vespa/searchlib/tensor/nearest_neighbor_index.h>
-#include <vespa/searchcommon/attribute/config.h>
-#include <vespa/eval/eval/value.h>
-#include <vespa/vespalib/test/insertion_operators.h>
-#include <vespa/vespalib/util/fake_doom.h>
-#include <iostream>
-#include <sstream>
-#include <limits>
-
-namespace py = pybind11;
-
-using search::AttributeFactory;
-using search::AttributeVector;
-using search::attribute::BasicType;
-using search::attribute::Config;
-using search::attribute::CollectionType;
-using search::attribute::DistanceMetric;
-using search::attribute::HnswIndexParams;
-using search::tensor::NearestNeighborIndex;
-using search::tensor::TensorAttribute;
-using vespalib::eval::CellType;
-using vespalib::eval::DenseValueView;
-using vespalib::eval::TypedCells;
-using vespalib::eval::ValueType;
-using vespalib::eval::Value;
-
-namespace vespa_ann_benchmark {
-
-using TopKResult = std::vector<std::pair<uint32_t, double>>;
-
-namespace {
-
-std::string
-make_tensor_spec(uint32_t dim_size)
-{
- std::ostringstream os;
- os << "tensor<float>(x[" << dim_size << "])";
- return os.str();
-}
-
-constexpr uint32_t lid_bias = 1; // lid 0 is reserved
-
-}
-
-/*
- * Class exposing the Vespa implementation of an HNSW index for nearest neighbor search over data points in a high dimensional vector space.
- *
- * A tensor attribute field (https://docs.vespa.ai/en/reference/schema-reference.html#type:tensor) is used to store the vectors in memory.
- * This class only supports single-threaded access (both for indexing and searching),
- * and should only be used for low-level benchmarking.
- * To use nearest neighbor search in a Vespa application,
- * see https://docs.vespa.ai/en/approximate-nn-hnsw.html for more details.
- */
-class HnswIndex
-{
- ValueType _tensor_type;
- HnswIndexParams _hnsw_index_params;
- std::shared_ptr<AttributeVector> _attribute;
- TensorAttribute* _tensor_attribute;
- const NearestNeighborIndex* _nearest_neighbor_index;
- size_t _dim_size;
- bool _normalize_vectors;
- vespalib::FakeDoom _no_doom;
-
- bool check_lid(uint32_t lid);
- bool check_value(const char *op, const std::vector<float>& value);
- TypedCells get_typed_cells(const std::vector<float>& value, std::vector<float>& normalized_value);
-public:
- HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params, bool normalize_vectors);
- virtual ~HnswIndex();
- void set_vector(uint32_t lid, const std::vector<float>& value);
- std::vector<float> get_vector(uint32_t lid);
- void clear_vector(uint32_t lid);
- TopKResult find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k);
-};
-
-HnswIndex::HnswIndex(uint32_t dim_size, const HnswIndexParams &hnsw_index_params, bool normalize_vectors)
- : _tensor_type(ValueType::error_type()),
- _hnsw_index_params(hnsw_index_params),
- _attribute(),
- _tensor_attribute(nullptr),
- _nearest_neighbor_index(nullptr),
- _dim_size(0u),
- _normalize_vectors(normalize_vectors),
- _no_doom()
-{
- Config cfg(BasicType::TENSOR, CollectionType::SINGLE);
- _tensor_type = ValueType::from_spec(make_tensor_spec(dim_size));
- assert(_tensor_type.is_dense());
- assert(_tensor_type.count_indexed_dimensions() == 1u);
- _dim_size = _tensor_type.dimensions()[0].size;
- cfg.setTensorType(_tensor_type);
- cfg.set_distance_metric(hnsw_index_params.distance_metric());
- cfg.set_hnsw_index_params(hnsw_index_params);
- _attribute = AttributeFactory::createAttribute("tensor", cfg);
- _tensor_attribute = dynamic_cast<TensorAttribute *>(_attribute.get());
- assert(_tensor_attribute != nullptr);
- _nearest_neighbor_index = _tensor_attribute->nearest_neighbor_index();
- assert(_nearest_neighbor_index != nullptr);
-}
-
-HnswIndex::~HnswIndex() = default;
-
-bool
-HnswIndex::check_lid(uint32_t lid)
-{
- if (lid >= std::numeric_limits<uint32_t>::max() - lid_bias) {
- std::cerr << "lid is too high" << std::endl;
- return false;
- }
- return true;
-}
-
-bool
-HnswIndex::check_value(const char *op, const std::vector<float>& value)
-{
- if (value.size() != _dim_size) {
- std::cerr << op << " failed, expected vector with size " << _dim_size << ", got vector with size " << value.size() << std::endl;
- return false;
- }
- return true;
-}
-
-TypedCells
-HnswIndex::get_typed_cells(const std::vector<float>& value, std::vector<float>& normalized_value)
-{
- if (!_normalize_vectors) {
- return {&value[0], CellType::FLOAT, value.size()};
- }
- double sum_of_squared = 0.0;
- for (auto elem : value) {
- double delem = elem;
- sum_of_squared += delem * delem;
- }
- double factor = 1.0 / (sqrt(sum_of_squared) + 1e-40);
- normalized_value.reserve(value.size());
- normalized_value.clear();
- for (auto elem : value) {
- normalized_value.emplace_back(elem * factor);
- }
- return {&normalized_value[0], CellType::FLOAT, normalized_value.size()};
-}
-
-void
-HnswIndex::set_vector(uint32_t lid, const std::vector<float>& value)
-{
- if (!check_lid(lid)) {
- return;
- }
- if (!check_value("set_vector", value)) {
- return;
- }
- /*
- * Not thread safe against concurrent set_vector().
- */
- std::vector<float> normalized_value;
- auto typed_cells = get_typed_cells(value, normalized_value);
- DenseValueView tensor_view(_tensor_type, typed_cells);
- while (size_t(lid + lid_bias) >= _attribute->getNumDocs()) {
- uint32_t new_lid = 0;
- _attribute->addDoc(new_lid);
- }
- _tensor_attribute->setTensor(lid + lid_bias, tensor_view); // lid 0 is special in vespa
- _attribute->commit();
-}
-
-std::vector<float>
-HnswIndex::get_vector(uint32_t lid)
-{
- if (!check_lid(lid)) {
- return {};
- }
- TypedCells typed_cells = _tensor_attribute->extract_cells_ref(lid + lid_bias);
- assert(typed_cells.size == _dim_size);
- const float* data = static_cast<const float* >(typed_cells.data);
- return {data, data + _dim_size};
- return {};
-}
-
-void
-HnswIndex::clear_vector(uint32_t lid)
-{
- if (!check_lid(lid)) {
- return;
- }
- if (size_t(lid + lid_bias) < _attribute->getNumDocs()) {
- _attribute->clearDoc(lid + lid_bias);
- _attribute->commit();
- }
-}
-
-TopKResult
-HnswIndex::find_top_k(uint32_t k, const std::vector<float>& value, uint32_t explore_k)
-{
- if (!check_value("find_top_k", value)) {
- return {};
- }
- /*
- * Not thread safe against concurrent set_vector() since attribute
- * read guard is not taken here.
- */
- TopKResult result;
- std::vector<float> normalized_value;
- auto typed_cells = get_typed_cells(value, normalized_value);
- auto df = _nearest_neighbor_index->distance_function_factory().for_query_vector(typed_cells);
- auto raw_result = _nearest_neighbor_index->find_top_k(k, *df, explore_k, _no_doom.get_doom(), std::numeric_limits<double>::max());
- result.reserve(raw_result.size());
- switch (_hnsw_index_params.distance_metric()) {
- case DistanceMetric::Euclidean:
- for (auto &raw : raw_result) {
- result.emplace_back(raw.docid - lid_bias, sqrt(raw.distance));
- }
- break;
- default:
- for (auto &raw : raw_result) {
- result.emplace_back(raw.docid - lid_bias, raw.distance);
- }
- }
- // Results are sorted by lid, not by distance
- return result;
-}
-
-}
-
-using vespa_ann_benchmark::HnswIndex;
-
-PYBIND11_MODULE(vespa_ann_benchmark, m) {
- m.doc() = "vespa_ann_benchmark plugin";
-
- py::enum_<DistanceMetric>(m, "DistanceMetric")
- .value("Euclidean", DistanceMetric::Euclidean)
- .value("Angular", DistanceMetric::Angular)
- .value("InnerProduct", DistanceMetric::InnerProduct);
-
- py::class_<HnswIndexParams>(m, "HnswIndexParams")
- .def(py::init<uint32_t, uint32_t, DistanceMetric, bool>());
-
- py::class_<HnswIndex>(m, "HnswIndex")
- .def(py::init<uint32_t, const HnswIndexParams&, bool>())
- .def("set_vector", &HnswIndex::set_vector)
- .def("get_vector", &HnswIndex::get_vector)
- .def("clear_vector", &HnswIndex::clear_vector)
- .def("find_top_k", &HnswIndex::find_top_k);
-}