diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-04-26 16:34:11 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-04-26 16:34:11 +0200 |
commit | f0ed62d462143f06df778c824a43821ec41da5af (patch) | |
tree | 2dfbd434956c53f99b42119ce15cbdd4cc18c306 | |
parent | 3eda03d6d047425681cba14caca4f1fced8168dc (diff) | |
parent | 9792400f4465d839de3987c70f0cc88cd74fdf49 (diff) |
Merge pull request #26872 from vespa-engine/toregge/consolidate-feature-extraction
Consolidate feature extraction between indexed and streaming search.
4 files changed, 65 insertions, 63 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp index 30958214b72..4f9e1f6d1f4 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/extract_features.cpp @@ -9,6 +9,7 @@ #include <vespa/vespalib/util/thread_bundle.h> #include <vespa/searchlib/fef/feature_resolver.h> #include <vespa/searchlib/fef/rank_program.h> +#include <vespa/searchlib/fef/utils.h> #include <vespa/searchlib/queryeval/searchiterator.h> using vespalib::Doom; @@ -25,38 +26,9 @@ namespace proton::matching { using OrderedDocs = ExtractFeatures::OrderedDocs; using search::StringStringMap; -namespace { - -auto extract_names(const FeatureResolver &resolver, const StringStringMap &renames) { - std::vector<vespalib::string> result; - result.reserve(resolver.num_features()); - for (size_t i = 0; i < resolver.num_features(); ++i) { - vespalib::string name = resolver.name_of(i); - auto iter = renames.find(name); - if (iter != renames.end()) { - name = iter->second; - } - result.emplace_back(name); - } - return result; -} +using FefUtils = search::fef::Utils; -void extract_values(const FeatureResolver &resolver, uint32_t docid, FeatureSet::Value *dst) { - for (uint32_t i = 0; i < resolver.num_features(); ++i) { - if (resolver.is_object(i)) { - auto obj = resolver.resolve(i).as_object(docid); - if (!obj.get().type().is_double()) { - vespalib::nbostream buf; - encode_value(obj.get(), buf); - dst[i].set_data(vespalib::Memory(buf.peek(), buf.size())); - } else { - dst[i].set_double(obj.get().as_double()); - } - } else { - dst[i].set_double(resolver.resolve(i).as_number(docid)); - } - } -} +namespace { struct MyChunk : Runnable { const std::pair<uint32_t,uint32_t> *begin; @@ -77,7 +49,7 @@ struct MyChunk : Runnable { } search.unpack(pos->first); auto *dst = &result.values[pos->second * resolver.num_features()]; - extract_values(resolver, pos->first, dst); + FefUtils::extract_feature_values(resolver, pos->first, dst); } } }; @@ -121,7 +93,7 @@ ExtractFeatures::get_feature_set(SearchIterator &search, RankProgram &rank_progr const Doom &doom, const StringStringMap &renames) { FeatureResolver resolver(rank_program.get_seeds(false)); - auto result = std::make_unique<FeatureSet>(extract_names(resolver, renames), docs.size()); + auto result = std::make_unique<FeatureSet>(FefUtils::extract_feature_names(resolver, renames), docs.size()); if (!docs.empty()) { search.initRange(docs.front(), docs.back()+1); for (uint32_t docid: docs) { @@ -130,7 +102,7 @@ ExtractFeatures::get_feature_set(SearchIterator &search, RankProgram &rank_progr } search.unpack(docid); auto *dst = result->getFeaturesByIndex(result->addDocId(docid)); - extract_values(resolver, docid, dst); + FefUtils::extract_feature_values(resolver, docid, dst); } } return result; @@ -143,7 +115,7 @@ ExtractFeatures::get_match_features(const MatchToolsFactory &mtf, const OrderedD auto tools = mtf.createMatchTools(); tools->setup_match_features(); FeatureResolver resolver(tools->rank_program().get_seeds(false)); - result.names = extract_names(resolver, mtf.get_feature_rename_map()); + result.names = FefUtils::extract_feature_names(resolver, mtf.get_feature_rename_map()); result.values.resize(result.names.size() * docs.size()); size_t num_threads = thread_bundle.size(); std::vector<Runnable::UP> chunks; diff --git a/searchlib/src/vespa/searchlib/fef/utils.cpp b/searchlib/src/vespa/searchlib/fef/utils.cpp index 5a3444a44ee..1eddee88d2b 100644 --- a/searchlib/src/vespa/searchlib/fef/utils.cpp +++ b/searchlib/src/vespa/searchlib/fef/utils.cpp @@ -2,9 +2,12 @@ #include "utils.h" #include "rank_program.h" +#include <vespa/eval/eval/value_codec.h> #include <vector> #include <cassert> +using vespalib::FeatureSet; + namespace search::fef { feature_t @@ -56,4 +59,39 @@ Utils::getAllFeatures(const RankProgram &rankProgram, uint32_t docid) return resolveFeatures(resolver, docid); } +std::vector<vespalib::string> +Utils::extract_feature_names(const FeatureResolver& resolver, const StringStringMap& renames) +{ + std::vector<vespalib::string> result; + result.reserve(resolver.num_features()); + for (size_t i = 0; i < resolver.num_features(); ++i) { + vespalib::string name = resolver.name_of(i); + auto iter = renames.find(name); + if (iter != renames.end()) { + name = iter->second; + } + result.emplace_back(name); + } + return result; +} + +void +Utils::extract_feature_values(const FeatureResolver& resolver, uint32_t docid, FeatureSet::Value* dst) +{ + for (uint32_t i = 0; i < resolver.num_features(); ++i) { + if (resolver.is_object(i)) { + auto obj = resolver.resolve(i).as_object(docid); + if (!obj.get().type().is_double()) { + vespalib::nbostream buf; + encode_value(obj.get(), buf); + dst[i].set_data(vespalib::Memory(buf.peek(), buf.size())); + } else { + dst[i].set_double(obj.get().as_double()); + } + } else { + dst[i].set_double(resolver.resolve(i).as_number(docid)); + } + } +} + } diff --git a/searchlib/src/vespa/searchlib/fef/utils.h b/searchlib/src/vespa/searchlib/fef/utils.h index 0db2becde39..868f7ef42d7 100644 --- a/searchlib/src/vespa/searchlib/fef/utils.h +++ b/searchlib/src/vespa/searchlib/fef/utils.h @@ -3,11 +3,14 @@ #pragma once #include <vespa/searchlib/common/feature.h> +#include <vespa/searchlib/common/stringmap.h> #include <vespa/eval/eval/value.h> +#include <vespa/vespalib/util/featureset.h> #include <map> namespace search::fef { +class FeatureResolver; class RankProgram; struct Utils @@ -32,6 +35,16 @@ struct Utils **/ static std::map<vespalib::string, feature_t> getAllFeatures(const RankProgram &rankProgram, uint32_t docid); + /* + * Extract features names for the given feature resolver. + */ + std::vector<vespalib::string> + static extract_feature_names(const FeatureResolver& resolver, const search::StringStringMap& renames); + + /* + * Extract feature values for the given feature resolver. + */ + static void extract_feature_values(const FeatureResolver& resolver, uint32_t docid, vespalib::FeatureSet::Value* dst); }; } diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp index 7b4e3cb0208..362d5a26611 100644 --- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp @@ -2,6 +2,7 @@ #include "hitcollector.h" #include <vespa/searchlib/fef/feature_resolver.h> +#include <vespa/searchlib/fef/utils.h> #include <vespa/vespalib/util/stringfmt.h> #include <algorithm> #include <vespa/eval/eval/value_codec.h> @@ -14,6 +15,8 @@ using search::fef::MatchData; using vespalib::FeatureSet; using vdslib::SearchResult; +using FefUtils = search::fef::Utils; + namespace streaming { HitCollector::Hit::Hit(const vsm::StorageDocument * doc, uint32_t docId, const search::fef::MatchData & matchData, @@ -147,40 +150,16 @@ HitCollector::getFeatureSet(IRankProgram &rankProgram, const search::StringStringMap &feature_rename_map) { if (resolver.num_features() == 0 || _hits.empty()) { - return FeatureSet::SP(new FeatureSet()); + return std::make_shared<FeatureSet>(); } sortByDocId(); - std::vector<vespalib::string> names; - names.reserve(resolver.num_features()); - for (size_t i = 0; i < resolver.num_features(); ++i) { - vespalib::string name = resolver.name_of(i); - auto iter = feature_rename_map.find(name); - if (iter != feature_rename_map.end()) { - name = iter->second; - } - names.emplace_back(name); - } - FeatureSet::SP retval = FeatureSet::SP(new FeatureSet(names, _hits.size())); + auto names = FefUtils::extract_feature_names(resolver, feature_rename_map); + FeatureSet::SP retval = std::make_shared<FeatureSet>(names, _hits.size()); for (const Hit & hit : _hits) { rankProgram.run(hit.getDocId(), hit.getMatchData()); uint32_t docId = hit.getDocId(); auto * f = retval->getFeaturesByIndex(retval->addDocId(docId)); - for (uint32_t j = 0; j < names.size(); ++j) { - if (resolver.is_object(j)) { - auto obj = resolver.resolve(j).as_object(docId); - if (! obj.get().type().is_double()) { - vespalib::nbostream buf; - encode_value(obj.get(), buf); - f[j].set_data(vespalib::Memory(buf.peek(), buf.size())); - } else { - f[j].set_double(obj.get().as_double()); - } - } else { - f[j].set_double(resolver.resolve(j).as_number(docId)); - } - LOG(debug, "getFeatureSet: lDocId(%u), '%s': %f %s", docId, names[j].c_str(), f[j].as_double(), - f[j].is_data() ? "[tensor]" : ""); - } + FefUtils::extract_feature_values(resolver, docId, f); } return retval; } |