about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2023-04-26 15:44:12 +0200
committer Tor Egge <Tor.Egge@online.no> 2023-04-26 15:53:51 +0200
commit 9792400f4465d839de3987c70f0cc88cd74fdf49 (patch)
tree 742e09b0a68446f0a7634a00227bd0f0703b1c98
parent c4dacaddf9bd2aff43db43180b389488be7b9fc0 (diff)
Consolidate feature extraction between indexed and streaming search.
-rw-r--r-- searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp 42
-rw-r--r-- searchlib/src/vespa/searchlib/fef/utils.cpp 38
-rw-r--r-- searchlib/src/vespa/searchlib/fef/utils.h 13
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp 35
4 files changed, 65 insertions, 63 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp
index 30958214b72..4f9e1f6d1f4 100644
--- a/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp
@@ -9,6 +9,7 @@
#include <vespa/vespalib/util/thread_bundle.h>
#include <vespa/searchlib/fef/feature_resolver.h>
#include <vespa/searchlib/fef/rank_program.h>
+#include <vespa/searchlib/fef/utils.h>
#include <vespa/searchlib/queryeval/searchiterator.h>
using vespalib::Doom;
@@ -25,38 +26,9 @@ namespace proton::matching {
using OrderedDocs = ExtractFeatures::OrderedDocs;
using search::StringStringMap;
-namespace {
-
-auto extract_names(const FeatureResolver &resolver, const StringStringMap &renames) {
- std::vector<vespalib::string> result;
- result.reserve(resolver.num_features());
- for (size_t i = 0; i < resolver.num_features(); ++i) {
- vespalib::string name = resolver.name_of(i);
- auto iter = renames.find(name);
- if (iter != renames.end()) {
- name = iter->second;
- }
- result.emplace_back(name);
- }
- return result;
-}
+using FefUtils = search::fef::Utils;
-void extract_values(const FeatureResolver &resolver, uint32_t docid, FeatureSet::Value *dst) {
- for (uint32_t i = 0; i < resolver.num_features(); ++i) {
- if (resolver.is_object(i)) {
- auto obj = resolver.resolve(i).as_object(docid);
- if (!obj.get().type().is_double()) {
- vespalib::nbostream buf;
- encode_value(obj.get(), buf);
- dst[i].set_data(vespalib::Memory(buf.peek(), buf.size()));
- } else {
- dst[i].set_double(obj.get().as_double());
- }
- } else {
- dst[i].set_double(resolver.resolve(i).as_number(docid));
- }
- }
-}
+namespace {
struct MyChunk : Runnable {
const std::pair<uint32_t,uint32_t> *begin;
@@ -77,7 +49,7 @@ struct MyChunk : Runnable {
}
search.unpack(pos->first);
auto *dst = &result.values[pos->second * resolver.num_features()];
- extract_values(resolver, pos->first, dst);
+ FefUtils::extract_feature_values(resolver, pos->first, dst);
}
}
};
@@ -121,7 +93,7 @@ ExtractFeatures::get_feature_set(SearchIterator &search, RankProgram &rank_progr
const Doom &doom, const StringStringMap &renames)
{
FeatureResolver resolver(rank_program.get_seeds(false));
- auto result = std::make_unique<FeatureSet>(extract_names(resolver, renames), docs.size());
+ auto result = std::make_unique<FeatureSet>(FefUtils::extract_feature_names(resolver, renames), docs.size());
if (!docs.empty()) {
search.initRange(docs.front(), docs.back()+1);
for (uint32_t docid: docs) {
@@ -130,7 +102,7 @@ ExtractFeatures::get_feature_set(SearchIterator &search, RankProgram &rank_progr
}
search.unpack(docid);
auto *dst = result->getFeaturesByIndex(result->addDocId(docid));
- extract_values(resolver, docid, dst);
+ FefUtils::extract_feature_values(resolver, docid, dst);
}
}
return result;
@@ -143,7 +115,7 @@ ExtractFeatures::get_match_features(const MatchToolsFactory &mtf, const OrderedD
auto tools = mtf.createMatchTools();
tools->setup_match_features();
FeatureResolver resolver(tools->rank_program().get_seeds(false));
- result.names = extract_names(resolver, mtf.get_feature_rename_map());
+ result.names = FefUtils::extract_feature_names(resolver, mtf.get_feature_rename_map());
result.values.resize(result.names.size() * docs.size());
size_t num_threads = thread_bundle.size();
std::vector<Runnable::UP> chunks;
diff --git a/searchlib/src/vespa/searchlib/fef/utils.cpp b/searchlib/src/vespa/searchlib/fef/utils.cpp
index 5a3444a44ee..1eddee88d2b 100644
--- a/searchlib/src/vespa/searchlib/fef/utils.cpp
+++ b/searchlib/src/vespa/searchlib/fef/utils.cpp
@@ -2,9 +2,12 @@
#include "utils.h"
#include "rank_program.h"
+#include <vespa/eval/eval/value_codec.h>
#include <vector>
#include <cassert>
+using vespalib::FeatureSet;
+
namespace search::fef {
feature_t
@@ -56,4 +59,39 @@ Utils::getAllFeatures(const RankProgram &rankProgram, uint32_t docid)
return resolveFeatures(resolver, docid);
}
+std::vector<vespalib::string>
+Utils::extract_feature_names(const FeatureResolver& resolver, const StringStringMap& renames)
+{
+ std::vector<vespalib::string> result;
+ result.reserve(resolver.num_features());
+ for (size_t i = 0; i < resolver.num_features(); ++i) {
+ vespalib::string name = resolver.name_of(i);
+ auto iter = renames.find(name);
+ if (iter != renames.end()) {
+ name = iter->second;
+ }
+ result.emplace_back(name);
+ }
+ return result;
+}
+
+void
+Utils::extract_feature_values(const FeatureResolver& resolver, uint32_t docid, FeatureSet::Value* dst)
+{
+ for (uint32_t i = 0; i < resolver.num_features(); ++i) {
+ if (resolver.is_object(i)) {
+ auto obj = resolver.resolve(i).as_object(docid);
+ if (!obj.get().type().is_double()) {
+ vespalib::nbostream buf;
+ encode_value(obj.get(), buf);
+ dst[i].set_data(vespalib::Memory(buf.peek(), buf.size()));
+ } else {
+ dst[i].set_double(obj.get().as_double());
+ }
+ } else {
+ dst[i].set_double(resolver.resolve(i).as_number(docid));
+ }
+ }
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/fef/utils.h b/searchlib/src/vespa/searchlib/fef/utils.h
index 0db2becde39..868f7ef42d7 100644
--- a/searchlib/src/vespa/searchlib/fef/utils.h
+++ b/searchlib/src/vespa/searchlib/fef/utils.h
@@ -3,11 +3,14 @@
#pragma once
#include <vespa/searchlib/common/feature.h>
+#include <vespa/searchlib/common/stringmap.h>
#include <vespa/eval/eval/value.h>
+#include <vespa/vespalib/util/featureset.h>
#include <map>
namespace search::fef {
+class FeatureResolver;
class RankProgram;
struct Utils
@@ -32,6 +35,16 @@ struct Utils
**/
static std::map<vespalib::string, feature_t> getAllFeatures(const RankProgram &rankProgram, uint32_t docid);
+ /*
+ * Extract feature names for the given feature resolver.
+ */
+ std::vector<vespalib::string>
+ static extract_feature_names(const FeatureResolver& resolver, const search::StringStringMap& renames);
+
+ /*
+ * Extract feature values for the given feature resolver.
+ */
+ static void extract_feature_values(const FeatureResolver& resolver, uint32_t docid, vespalib::FeatureSet::Value* dst);
};
}
diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp
index 7b4e3cb0208..362d5a26611 100644
--- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp
@@ -2,6 +2,7 @@
#include "hitcollector.h"
#include <vespa/searchlib/fef/feature_resolver.h>
+#include <vespa/searchlib/fef/utils.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <algorithm>
#include <vespa/eval/eval/value_codec.h>
@@ -14,6 +15,8 @@ using search::fef::MatchData;
using vespalib::FeatureSet;
using vdslib::SearchResult;
+using FefUtils = search::fef::Utils;
+
namespace streaming {
HitCollector::Hit::Hit(const vsm::StorageDocument * doc, uint32_t docId, const search::fef::MatchData & matchData,
@@ -147,40 +150,16 @@ HitCollector::getFeatureSet(IRankProgram &rankProgram,
const search::StringStringMap &feature_rename_map)
{
if (resolver.num_features() == 0 || _hits.empty()) {
- return FeatureSet::SP(new FeatureSet());
+ return std::make_shared<FeatureSet>();
}
sortByDocId();
- std::vector<vespalib::string> names;
- names.reserve(resolver.num_features());
- for (size_t i = 0; i < resolver.num_features(); ++i) {
- vespalib::string name = resolver.name_of(i);
- auto iter = feature_rename_map.find(name);
- if (iter != feature_rename_map.end()) {
- name = iter->second;
- }
- names.emplace_back(name);
- }
- FeatureSet::SP retval = FeatureSet::SP(new FeatureSet(names, _hits.size()));
+ auto names = FefUtils::extract_feature_names(resolver, feature_rename_map);
+ FeatureSet::SP retval = std::make_shared<FeatureSet>(names, _hits.size());
for (const Hit & hit : _hits) {
rankProgram.run(hit.getDocId(), hit.getMatchData());
uint32_t docId = hit.getDocId();
auto * f = retval->getFeaturesByIndex(retval->addDocId(docId));
- for (uint32_t j = 0; j < names.size(); ++j) {
- if (resolver.is_object(j)) {
- auto obj = resolver.resolve(j).as_object(docId);
- if (! obj.get().type().is_double()) {
- vespalib::nbostream buf;
- encode_value(obj.get(), buf);
- f[j].set_data(vespalib::Memory(buf.peek(), buf.size()));
- } else {
- f[j].set_double(obj.get().as_double());
- }
- } else {
- f[j].set_double(resolver.resolve(j).as_number(docId));
- }
- LOG(debug, "getFeatureSet: lDocId(%u), '%s': %f %s", docId, names[j].c_str(), f[j].as_double(),
- f[j].is_data() ? "[tensor]" : "");
- }
+ FefUtils::extract_feature_values(resolver, docId, f);
}
return retval;
}