diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-09-07 16:27:42 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-09-07 16:27:42 +0200 |
commit | 5c0ae1c193e407cf81528699fd841b49bb0c4e81 (patch) | |
tree | 47e878e46c90b08b7073ed6b888dfbcd9e5b2bcf /searchlib | |
parent | 2642c56c07cb25fd28838d15ff62ba661c8affe9 (diff) | |
parent | 27ea2e6df45211dd23c1fd416247c86ce7c73af0 (diff) |
Merge pull request #14299 from vespa-engine/arnej/add-hamming-distance-metric
add "Hamming" distance metric for ANN
Diffstat (limited to 'searchlib')
6 files changed, 97 insertions, 0 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp index 9d240bc688a..f11a28b8716 100644 --- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp @@ -142,6 +142,46 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score) EXPECT_LT(i44, 0.000001); } +TEST(DistanceFunctionsTest, hamming_gives_expected_score) +{ + auto ct = vespalib::eval::ValueType::CellType::DOUBLE; + + auto hamming = make_distance_function(DistanceMetric::Hamming, ct); + + std::vector<std::vector<double>> + points{{0.0, 0.0, 0.0}, + {1.0, 0.0, 0.0}, + {0.0, 1.0, 1.0}, + {2.0, 2.0, 2.0}, + {0.5, 0.5, 0.5}, + {0.0,-1.0, 1.0}, + {1.0, 1.0, 1.0}}; + for (const auto & p : points) { + double h0 = hamming->calc(t(p), t(p)); + EXPECT_EQ(h0, 0.0); + EXPECT_EQ(hamming->to_rawscore(h0), 1.0); + } + double d12 = hamming->calc(t(points[1]), t(points[2])); + EXPECT_EQ(d12, 3.0); + EXPECT_DOUBLE_EQ(hamming->to_rawscore(d12), 1.0/(1.0 + 3.0)); + + double d16 = hamming->calc(t(points[1]), t(points[6])); + EXPECT_EQ(d16, 2.0); + EXPECT_DOUBLE_EQ(hamming->to_rawscore(d16), 1.0/(1.0 + 2.0)); + + double d23 = hamming->calc(t(points[2]), t(points[3])); + EXPECT_EQ(d23, 3.0); + EXPECT_DOUBLE_EQ(hamming->to_rawscore(d23), 1.0/(1.0 + 3.0)); + + double d24 = hamming->calc(t(points[2]), t(points[4])); + EXPECT_EQ(d24, 3.0); + EXPECT_DOUBLE_EQ(hamming->to_rawscore(d24), 1.0/(1.0 + 3.0)); + + double d25 = hamming->calc(t(points[2]), t(points[5])); + EXPECT_EQ(d25, 1.0); + EXPECT_DOUBLE_EQ(hamming->to_rawscore(d25), 1.0/(1.0 + 1.0)); +} + TEST(GeoDegreesTest, gives_expected_score) { auto ct = vespalib::eval::ValueType::CellType::DOUBLE; diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp index acf0d3d2fd6..430f2eaa560 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp @@ -28,6 +28,7 @@ const vespalib::string euclidean = "euclidean"; const vespalib::string angular = "angular"; const vespalib::string geodegrees = "geodegrees"; const vespalib::string innerproduct = "innerproduct"; +const vespalib::string hamming = "hamming"; const vespalib::string doc_id_limit_tag = "docIdLimit"; const vespalib::string enumerated_tag = "enumerated"; const vespalib::string unique_value_count_tag = "uniqueValueCount"; @@ -99,6 +100,7 @@ to_string(DistanceMetric metric) case DistanceMetric::Angular: return angular; case DistanceMetric::GeoDegrees: return geodegrees; case DistanceMetric::InnerProduct: return innerproduct; + case DistanceMetric::Hamming: return hamming; } throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric))); } @@ -112,6 +114,8 @@ to_distance_metric(const vespalib::string& metric) return DistanceMetric::Angular; } else if (metric == geodegrees) { return DistanceMetric::GeoDegrees; + } else if (metric == hamming) { + return DistanceMetric::Hamming; } else { throw vespalib::IllegalStateException("Unknown distance metric '" + metric + "'"); } diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp index 5a8b32ec01b..f2e2f8271de 100644 --- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp +++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp @@ -88,6 +88,9 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg) case CfgDm::INNERPRODUCT: dm = DistanceMetric::InnerProduct; break; + case CfgDm::HAMMING: + dm = DistanceMetric::Hamming; + break; } retval.set_distance_metric(dm); if (cfg.index.hnsw.enabled) { diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index b76994d6092..a868dfe191b 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -40,6 +40,13 @@ make_distance_function(DistanceMetric variant, ValueType::CellType cell_type) return std::make_unique<InnerProductDistance<double>>(); } break; + case DistanceMetric::Hamming: + if (cell_type == ValueType::CellType::FLOAT) { + return std::make_unique<HammingDistance<float>>(); + } else { + return std::make_unique<HammingDistance<double>>(); + } + break; } // not reached: return DistanceFunction::UP(); diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp index 9017628d42c..8cf3a95db19 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp @@ -16,4 +16,7 @@ template class InnerProductDistance<double>; template class GeoDegreesDistance<float>; template class GeoDegreesDistance<double>; +template class HammingDistance<float>; +template class HammingDistance<double>; + } diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index f88b239885f..73716395369 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -178,4 +178,44 @@ public: }; +/** + * Calculates the Hamming distance defined as + * "number of cells where the values are different" + */ +template <typename FloatType> +class HammingDistance : public DistanceFunction { +public: + HammingDistance() {} + double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::tensor::TypedCells& lhs, + const vespalib::tensor::TypedCells& rhs, + double limit) const override + { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz && sum <= limit; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } +}; + + } |