diff options
8 files changed, 61 insertions, 4 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java index f7f2259bd58..8cf862a72af 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java @@ -39,7 +39,7 @@ import java.util.Set; */ public final class Attribute implements Cloneable, Serializable { - public enum DistanceMetric { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT } + public enum DistanceMetric { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT, HAMMING } // Remember to change hashCode and equals when you add new fields diff --git a/configdefinitions/src/vespa/attributes.def b/configdefinitions/src/vespa/attributes.def index 53f737d2b33..6c69d71fdf6 100644 --- a/configdefinitions/src/vespa/attributes.def +++ b/configdefinitions/src/vespa/attributes.def @@ -33,13 +33,13 @@ attribute[].imported bool default=false # The distance metric to use for nearest neighbor search. # Is only used when the attribute is a 1-dimensional indexed tensor. -attribute[].distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT } default=EUCLIDEAN +attribute[].distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT, HAMMING } default=EUCLIDEAN # Configuration parameters for a hnsw index used together with a 1-dimensional indexed tensor for approximate nearest neighbor search. attribute[].index.hnsw.enabled bool default=false attribute[].index.hnsw.maxlinkspernode int default=16 attribute[].index.hnsw.neighborstoexploreatinsert int default=200 # Deprecated: Remove on Vespa 8 or before when possible. -attribute[].index.hnsw.distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES } default=EUCLIDEAN +attribute[].index.hnsw.distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES, HAMMING } default=EUCLIDEAN # Whether multi-threaded indexing is enabled for this hnsw index. attribute[].index.hnsw.multithreadedindexing bool default=true diff --git a/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h b/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h index aa4ff22cdf3..6397eaf0c56 100644 --- a/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h +++ b/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h @@ -4,6 +4,6 @@ namespace search::attribute { -enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct }; +enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct, Hamming }; } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp index acf0d3d2fd6..430f2eaa560 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp @@ -28,6 +28,7 @@ const vespalib::string euclidean = "euclidean"; const vespalib::string angular = "angular"; const vespalib::string geodegrees = "geodegrees"; const vespalib::string innerproduct = "innerproduct"; +const vespalib::string hamming = "hamming"; const vespalib::string doc_id_limit_tag = "docIdLimit"; const vespalib::string enumerated_tag = "enumerated"; const vespalib::string unique_value_count_tag = "uniqueValueCount"; @@ -99,6 +100,7 @@ to_string(DistanceMetric metric) case DistanceMetric::Angular: return angular; case DistanceMetric::GeoDegrees: return geodegrees; case DistanceMetric::InnerProduct: return innerproduct; + case DistanceMetric::Hamming: return hamming; } throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric))); } @@ -112,6 +114,8 @@ to_distance_metric(const vespalib::string& metric) return DistanceMetric::Angular; } else if (metric == geodegrees) { return DistanceMetric::GeoDegrees; + } else if (metric == hamming) { + return DistanceMetric::Hamming; } else { throw vespalib::IllegalStateException("Unknown distance metric '" + metric + "'"); } diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp index 5a8b32ec01b..f2e2f8271de 100644 --- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp +++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp @@ -88,6 +88,9 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg) case CfgDm::INNERPRODUCT: dm = DistanceMetric::InnerProduct; break; + case CfgDm::HAMMING: + dm = DistanceMetric::Hamming; + break; } retval.set_distance_metric(dm); if (cfg.index.hnsw.enabled) { diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index b76994d6092..a868dfe191b 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -40,6 +40,13 @@ make_distance_function(DistanceMetric variant, ValueType::CellType cell_type) return std::make_unique<InnerProductDistance<double>>(); } break; + case DistanceMetric::Hamming: + if (cell_type == ValueType::CellType::FLOAT) { + return std::make_unique<HammingDistance<float>>(); + } else { + return std::make_unique<HammingDistance<double>>(); + } + break; } // not reached: return DistanceFunction::UP(); diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp index 9017628d42c..8cf3a95db19 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp @@ -16,4 +16,7 @@ template class InnerProductDistance<double>; template class GeoDegreesDistance<float>; template class GeoDegreesDistance<double>; +template class HammingDistance<float>; +template class HammingDistance<double>; + } diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index f88b239885f..73716395369 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -178,4 +178,44 @@ public: }; +/** + * Calculates the Hamming distance defined as + * "number of cells where the values are different" + */ +template <typename FloatType> +class HammingDistance : public DistanceFunction { +public: + HammingDistance() {} + double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::tensor::TypedCells& lhs, + const vespalib::tensor::TypedCells& rhs, + double limit) const override + { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz && sum <= limit; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } +}; + + } |