summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2020-02-10 11:59:23 +0000
committerGeir Storli <geirst@verizonmedia.com>2020-02-10 12:12:08 +0000
commit8164e4a5c4910a878e49663730de5c2b310d5940 (patch)
tree283898fd957bdac97afbd99cfb10bf021e70f798
parent2fb3b11480e049cea184c96824f9ac6ab11e4c46 (diff)
Add interface used to calculate the distance between two n-dim vectors.
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp5
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function.h21
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.h34
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp22
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h16
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h5
7 files changed, 83 insertions, 24 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index 634569cbc8c..0c5ddb93e92 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -1,6 +1,7 @@
// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/eval/tensor/dense/typed_cells.h>
+#include <vespa/searchlib/tensor/distance_functions.h>
#include <vespa/searchlib/tensor/doc_vector_access.h>
#include <vespa/searchlib/tensor/hnsw_index.h>
#include <vespa/searchlib/tensor/random_level_generator.h>
@@ -41,12 +42,14 @@ struct LevelGenerator : public RandomLevelGenerator {
};
using FloatVectors = MyDocVectorAccess<float>;
+using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>;
using FloatIndex = HnswIndex<float>;
using FloatIndexUP = std::unique_ptr<FloatIndex>;
class HnswIndexTest : public ::testing::Test {
public:
FloatVectors vectors;
+ FloatSqEuclideanDistance distance_func;
LevelGenerator level_generator;
FloatIndexUP index;
@@ -60,7 +63,7 @@ public:
.set(7, {3, 5});
}
void init(bool heuristic_select_neighbors) {
- index = std::make_unique<FloatIndex>(vectors, level_generator,
+ index = std::make_unique<FloatIndex>(vectors, distance_func, level_generator,
HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors));
}
void add_document(uint32_t docid, uint32_t max_level = 0) {
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h
new file mode 100644
index 00000000000..8dfb77ddccb
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+namespace vespalib::tensor { struct TypedCells; }
+
+namespace search::tensor {
+
+/**
+ * Interface used to calculate the distance between two n-dimensional vectors.
+ *
+ * The vectors must be of same size and same type (float or double).
+ * The actual implementation must know which type the vectors are.
+ */
+class DistanceFunction {
+public:
+ virtual ~DistanceFunction() {}
+ virtual double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const = 0;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
new file mode 100644
index 00000000000..dabab55dc7a
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -0,0 +1,34 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+
+namespace search::tensor {
+
+// TODO: Make distance functions hardware optimized.
+
+/**
+ * Calculates the square of the standard Euclidean distance.
+ */
+template <typename FloatType>
+class SquaredEuclideanDistance : public DistanceFunction {
+public:
+ double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override {
+ auto lhs_vector = lhs.template typify<FloatType>();
+ auto rhs_vector = rhs.template typify<FloatType>();
+ double result = 0.0;
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ for (size_t i = 0; i < sz; ++i) {
+ double diff = lhs_vector[i] - rhs_vector[i];
+ result += diff * diff;
+ }
+ return result;
+ }
+};
+
+template class SquaredEuclideanDistance<float>;
+template class SquaredEuclideanDistance<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 345f7f551a6..cb617246066 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -1,5 +1,6 @@
// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "distance_function.h"
#include "hnsw_index.h"
#include <vespa/vespalib/util/rcuvector.hpp>
@@ -15,23 +16,15 @@ HnswIndex<FloatType>::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) cons
template <typename FloatType>
double
-HnswIndex<FloatType>::calc_distance(const Vector& lhs, uint32_t rhs_docid) const
+HnswIndex<FloatType>::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const
{
- // TODO: Make it possible to specify the distance function from the outside and make it hardware optimized.
auto rhs = get_vector(rhs_docid);
- double result = 0.0;
- size_t sz = lhs.size();
- assert(sz == rhs.size());
- for (size_t i = 0; i < sz; ++i) {
- double diff = lhs[i] - rhs[i];
- result += diff * diff;
- }
- return result;
+ return _distance_func.calc(lhs, rhs);
}
template <typename FloatType>
HnswCandidate
-HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level)
+HnswIndex<FloatType>::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level)
{
HnswCandidate nearest = entry_point;
bool keep_searching = true;
@@ -50,7 +43,7 @@ HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandi
template <typename FloatType>
void
-HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level)
+HnswIndex<FloatType>::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level)
{
NearestPriQ candidates;
// TODO: Add proper handling of visited set.
@@ -86,8 +79,9 @@ HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_fi
}
template <typename FloatType>
-HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg)
- : HnswIndexBase(vectors, level_generator, cfg)
+HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
+ RandomLevelGenerator& level_generator, const Config& cfg)
+ : HnswIndexBase(vectors, distance_func, level_generator, cfg)
{
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index be6f507dbe1..a93062cf200 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -11,6 +11,7 @@
namespace search::tensor {
+class DistanceFunction;
class DocVectorAccess;
class RandomLevelGenerator;
@@ -25,22 +26,23 @@ class RandomLevelGenerator;
template <typename FloatType = float>
class HnswIndex : public HnswIndexBase {
private:
- using Vector = vespalib::ConstArrayRef<FloatType>;
+ using TypedCells = vespalib::tensor::TypedCells;
- inline Vector get_vector(uint32_t docid) const {
- return _vectors.get_vector(docid).template typify<FloatType>();
+ inline TypedCells get_vector(uint32_t docid) const {
+ return _vectors.get_vector(docid);
}
double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override;
- double calc_distance(const Vector& lhs, uint32_t rhs_docid) const;
+ double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const;
/**
* Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
*/
- HnswCandidate find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level);
- void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
+ HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level);
+ void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
public:
- HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg);
+ HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
+ RandomLevelGenerator& level_generator, const Config& cfg);
~HnswIndex() override;
void add_document(uint32_t docid) override;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
index a7b9c1dba79..e0ecdf87399 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
@@ -156,8 +156,10 @@ HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t
set_link_array(remove_from, level, new_links);
}
-HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg)
+HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
+ RandomLevelGenerator& level_generator, const Config& cfg)
: _vectors(vectors),
+ _distance_func(distance_func),
_level_generator(level_generator),
_cfg(cfg),
_node_refs(),
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
index 9987b61c157..ab59ad02f14 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
@@ -13,6 +13,7 @@
namespace search::tensor {
+class DistanceFunction;
class DocVectorAccess;
class RandomLevelGenerator;
@@ -78,6 +79,7 @@ protected:
using LinkArray = vespalib::Array<uint32_t>;
const DocVectorAccess& _vectors;
+ const DistanceFunction& _distance_func;
RandomLevelGenerator& _level_generator;
Config _cfg;
NodeRefVector _node_refs;
@@ -112,7 +114,8 @@ protected:
void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level);
public:
- HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg);
+ HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
+ RandomLevelGenerator& level_generator, const Config& cfg);
~HnswIndexBase() override;
// TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists)