summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2020-09-07 16:27:42 +0200
committerGitHub <noreply@github.com>2020-09-07 16:27:42 +0200
commit5c0ae1c193e407cf81528699fd841b49bb0c4e81 (patch)
tree47e878e46c90b08b7073ed6b888dfbcd9e5b2bcf /searchlib
parent2642c56c07cb25fd28838d15ff62ba661c8affe9 (diff)
parent27ea2e6df45211dd23c1fd416247c86ce7c73af0 (diff)
Merge pull request #14299 from vespa-engine/arnej/add-hamming-distance-metric
add "Hamming" distance metric for ANN
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp40
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_header.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/configconverter.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp7
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.h40
6 files changed, 97 insertions, 0 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index 9d240bc688a..f11a28b8716 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -142,6 +142,46 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
EXPECT_LT(i44, 0.000001);
}
+TEST(DistanceFunctionsTest, hamming_gives_expected_score)
+{
+ auto ct = vespalib::eval::ValueType::CellType::DOUBLE;
+
+ auto hamming = make_distance_function(DistanceMetric::Hamming, ct);
+
+ std::vector<std::vector<double>>
+ points{{0.0, 0.0, 0.0},
+ {1.0, 0.0, 0.0},
+ {0.0, 1.0, 1.0},
+ {2.0, 2.0, 2.0},
+ {0.5, 0.5, 0.5},
+ {0.0,-1.0, 1.0},
+ {1.0, 1.0, 1.0}};
+ for (const auto & p : points) {
+ double h0 = hamming->calc(t(p), t(p));
+ EXPECT_EQ(h0, 0.0);
+ EXPECT_EQ(hamming->to_rawscore(h0), 1.0);
+ }
+ double d12 = hamming->calc(t(points[1]), t(points[2]));
+ EXPECT_EQ(d12, 3.0);
+ EXPECT_DOUBLE_EQ(hamming->to_rawscore(d12), 1.0/(1.0 + 3.0));
+
+ double d16 = hamming->calc(t(points[1]), t(points[6]));
+ EXPECT_EQ(d16, 2.0);
+ EXPECT_DOUBLE_EQ(hamming->to_rawscore(d16), 1.0/(1.0 + 2.0));
+
+ double d23 = hamming->calc(t(points[2]), t(points[3]));
+ EXPECT_EQ(d23, 3.0);
+ EXPECT_DOUBLE_EQ(hamming->to_rawscore(d23), 1.0/(1.0 + 3.0));
+
+ double d24 = hamming->calc(t(points[2]), t(points[4]));
+ EXPECT_EQ(d24, 3.0);
+ EXPECT_DOUBLE_EQ(hamming->to_rawscore(d24), 1.0/(1.0 + 3.0));
+
+ double d25 = hamming->calc(t(points[2]), t(points[5]));
+ EXPECT_EQ(d25, 1.0);
+ EXPECT_DOUBLE_EQ(hamming->to_rawscore(d25), 1.0/(1.0 + 1.0));
+}
+
TEST(GeoDegreesTest, gives_expected_score)
{
auto ct = vespalib::eval::ValueType::CellType::DOUBLE;
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
index acf0d3d2fd6..430f2eaa560 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
@@ -28,6 +28,7 @@ const vespalib::string euclidean = "euclidean";
const vespalib::string angular = "angular";
const vespalib::string geodegrees = "geodegrees";
const vespalib::string innerproduct = "innerproduct";
+const vespalib::string hamming = "hamming";
const vespalib::string doc_id_limit_tag = "docIdLimit";
const vespalib::string enumerated_tag = "enumerated";
const vespalib::string unique_value_count_tag = "uniqueValueCount";
@@ -99,6 +100,7 @@ to_string(DistanceMetric metric)
case DistanceMetric::Angular: return angular;
case DistanceMetric::GeoDegrees: return geodegrees;
case DistanceMetric::InnerProduct: return innerproduct;
+ case DistanceMetric::Hamming: return hamming;
}
throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric)));
}
@@ -112,6 +114,8 @@ to_distance_metric(const vespalib::string& metric)
return DistanceMetric::Angular;
} else if (metric == geodegrees) {
return DistanceMetric::GeoDegrees;
+ } else if (metric == hamming) {
+ return DistanceMetric::Hamming;
} else {
throw vespalib::IllegalStateException("Unknown distance metric '" + metric + "'");
}
diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
index 5a8b32ec01b..f2e2f8271de 100644
--- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
@@ -88,6 +88,9 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg)
case CfgDm::INNERPRODUCT:
dm = DistanceMetric::InnerProduct;
break;
+ case CfgDm::HAMMING:
+ dm = DistanceMetric::Hamming;
+ break;
}
retval.set_distance_metric(dm);
if (cfg.index.hnsw.enabled) {
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index b76994d6092..a868dfe191b 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -40,6 +40,13 @@ make_distance_function(DistanceMetric variant, ValueType::CellType cell_type)
return std::make_unique<InnerProductDistance<double>>();
}
break;
+ case DistanceMetric::Hamming:
+ if (cell_type == ValueType::CellType::FLOAT) {
+ return std::make_unique<HammingDistance<float>>();
+ } else {
+ return std::make_unique<HammingDistance<double>>();
+ }
+ break;
}
// not reached:
return DistanceFunction::UP();
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
index 9017628d42c..8cf3a95db19 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
@@ -16,4 +16,7 @@ template class InnerProductDistance<double>;
template class GeoDegreesDistance<float>;
template class GeoDegreesDistance<double>;
+template class HammingDistance<float>;
+template class HammingDistance<double>;
+
}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
index f88b239885f..73716395369 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -178,4 +178,44 @@ public:
};
+/**
+ * Calculates the Hamming distance defined as
+ * "number of cells where the values are different"
+ */
+template <typename FloatType>
+class HammingDistance : public DistanceFunction {
+public:
+ HammingDistance() {}
+ double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override {
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ size_t sum = 0;
+ for (size_t i = 0; i < sz; ++i) {
+ sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1;
+ }
+ return (double)sum;
+ }
+ double to_rawscore(double distance) const override {
+ double score = 1.0 / (1.0 + distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::tensor::TypedCells& lhs,
+ const vespalib::tensor::TypedCells& rhs,
+ double limit) const override
+ {
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ size_t sum = 0;
+ for (size_t i = 0; i < sz && sum <= limit; ++i) {
+ sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1;
+ }
+ return (double)sum;
+ }
+};
+
+
}