aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArne Juul <arnej@verizonmedia.com>2020-09-04 08:13:47 +0000
committerArne Juul <arnej@verizonmedia.com>2020-09-04 13:12:35 +0000
commit1bf14be27705a251adaac48143187d06922ad720 (patch)
treeac15b1ec7ac50cab51291cf88726431a65b37199
parent36798cb5069419c4a45172c1629406089efcee90 (diff)
add "Hamming" distance metric for ANN
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java2
-rw-r--r--configdefinitions/src/vespa/attributes.def4
-rw-r--r--searchcommon/src/vespa/searchcommon/attribute/distance_metric.h2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_header.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/configconverter.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp7
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.h40
8 files changed, 61 insertions, 4 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java
index f7f2259bd58..8cf862a72af 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/Attribute.java
@@ -39,7 +39,7 @@ import java.util.Set;
*/
public final class Attribute implements Cloneable, Serializable {
- public enum DistanceMetric { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT }
+ public enum DistanceMetric { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT, HAMMING }
// Remember to change hashCode and equals when you add new fields
diff --git a/configdefinitions/src/vespa/attributes.def b/configdefinitions/src/vespa/attributes.def
index 53f737d2b33..6c69d71fdf6 100644
--- a/configdefinitions/src/vespa/attributes.def
+++ b/configdefinitions/src/vespa/attributes.def
@@ -33,13 +33,13 @@ attribute[].imported bool default=false
# The distance metric to use for nearest neighbor search.
# Is only used when the attribute is a 1-dimensional indexed tensor.
-attribute[].distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT } default=EUCLIDEAN
+attribute[].distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES, INNERPRODUCT, HAMMING } default=EUCLIDEAN
# Configuration parameters for a hnsw index used together with a 1-dimensional indexed tensor for approximate nearest neighbor search.
attribute[].index.hnsw.enabled bool default=false
attribute[].index.hnsw.maxlinkspernode int default=16
attribute[].index.hnsw.neighborstoexploreatinsert int default=200
# Deprecated: Remove on Vespa 8 or before when possible.
-attribute[].index.hnsw.distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES } default=EUCLIDEAN
+attribute[].index.hnsw.distancemetric enum { EUCLIDEAN, ANGULAR, GEODEGREES, HAMMING } default=EUCLIDEAN
# Whether multi-threaded indexing is enabled for this hnsw index.
attribute[].index.hnsw.multithreadedindexing bool default=true
diff --git a/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h b/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h
index aa4ff22cdf3..6397eaf0c56 100644
--- a/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h
+++ b/searchcommon/src/vespa/searchcommon/attribute/distance_metric.h
@@ -4,6 +4,6 @@
namespace search::attribute {
-enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct };
+enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct, Hamming };
}
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
index acf0d3d2fd6..430f2eaa560 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
@@ -28,6 +28,7 @@ const vespalib::string euclidean = "euclidean";
const vespalib::string angular = "angular";
const vespalib::string geodegrees = "geodegrees";
const vespalib::string innerproduct = "innerproduct";
+const vespalib::string hamming = "hamming";
const vespalib::string doc_id_limit_tag = "docIdLimit";
const vespalib::string enumerated_tag = "enumerated";
const vespalib::string unique_value_count_tag = "uniqueValueCount";
@@ -99,6 +100,7 @@ to_string(DistanceMetric metric)
case DistanceMetric::Angular: return angular;
case DistanceMetric::GeoDegrees: return geodegrees;
case DistanceMetric::InnerProduct: return innerproduct;
+ case DistanceMetric::Hamming: return hamming;
}
throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric)));
}
@@ -112,6 +114,8 @@ to_distance_metric(const vespalib::string& metric)
return DistanceMetric::Angular;
} else if (metric == geodegrees) {
return DistanceMetric::GeoDegrees;
+ } else if (metric == hamming) {
+ return DistanceMetric::Hamming;
} else {
throw vespalib::IllegalStateException("Unknown distance metric '" + metric + "'");
}
diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
index 5a8b32ec01b..f2e2f8271de 100644
--- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
@@ -88,6 +88,9 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg)
case CfgDm::INNERPRODUCT:
dm = DistanceMetric::InnerProduct;
break;
+ case CfgDm::HAMMING:
+ dm = DistanceMetric::Hamming;
+ break;
}
retval.set_distance_metric(dm);
if (cfg.index.hnsw.enabled) {
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index b76994d6092..a868dfe191b 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -40,6 +40,13 @@ make_distance_function(DistanceMetric variant, ValueType::CellType cell_type)
return std::make_unique<InnerProductDistance<double>>();
}
break;
+ case DistanceMetric::Hamming:
+ if (cell_type == ValueType::CellType::FLOAT) {
+ return std::make_unique<HammingDistance<float>>();
+ } else {
+ return std::make_unique<HammingDistance<double>>();
+ }
+ break;
}
// not reached:
return DistanceFunction::UP();
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
index 9017628d42c..8cf3a95db19 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
@@ -16,4 +16,7 @@ template class InnerProductDistance<double>;
template class GeoDegreesDistance<float>;
template class GeoDegreesDistance<double>;
+template class HammingDistance<float>;
+template class HammingDistance<double>;
+
}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
index f88b239885f..73716395369 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -178,4 +178,44 @@ public:
};
+/**
+ * Calculates the Hamming distance defined as
+ * "number of cells where the values are different"
+ */
+template <typename FloatType>
+class HammingDistance : public DistanceFunction {
+public:
+ HammingDistance() {}
+ double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override {
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ size_t sum = 0;
+ for (size_t i = 0; i < sz; ++i) {
+ sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1;
+ }
+ return (double)sum;
+ }
+ double to_rawscore(double distance) const override {
+ double score = 1.0 / (1.0 + distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::tensor::TypedCells& lhs,
+ const vespalib::tensor::TypedCells& rhs,
+ double limit) const override
+ {
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ size_t sum = 0;
+ for (size_t i = 0; i < sz && sum <= limit; ++i) {
+ sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1;
+ }
+ return (double)sum;
+ }
+};
+
+
}