diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-04-25 11:09:40 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-04-25 11:09:40 +0200 |
commit | fbe9c1f4dddb9f7ca25964691c669f037d791df0 (patch) | |
tree | 1c613add846918a04a471164493779b0e91a5ffa /vespalib | |
parent | f3ac0e360e47778eb51e3619825f09e52d3b6082 (diff) |
Move search::FeatureValues to vespalib::FeatureValues in preparation for
extending vdslib::SearchResult.
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/src/vespa/vespalib/util/CMakeLists.txt | 1 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/util/featureset.cpp | 90 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/util/featureset.h | 155 |
3 files changed, 246 insertions, 0 deletions
diff --git a/vespalib/src/vespa/vespalib/util/CMakeLists.txt b/vespalib/src/vespa/vespalib/util/CMakeLists.txt
index 8ee3957af32..91365d446c1 100644
--- a/vespalib/src/vespa/vespalib/util/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/util/CMakeLists.txt
@@ -31,6 +31,7 @@ vespa_add_library(vespalib_vespalib_util OBJECT
     exceptions.cpp
     execution_profiler.cpp
     executor_idle_tracking.cpp
+    featureset.cpp
     file_area_freelist.cpp
     foregroundtaskexecutor.cpp
     gate.cpp
diff --git a/vespalib/src/vespa/vespalib/util/featureset.cpp b/vespalib/src/vespa/vespalib/util/featureset.cpp
new file mode 100644
index 00000000000..6ac90461cfb
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/util/featureset.cpp
@@ -0,0 +1,90 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "featureset.h"
+
+namespace vespalib {
+
+FeatureSet::FeatureSet()
+    : _names(),
+      _docIds(),
+      _values()
+{
+}
+
+FeatureSet::~FeatureSet() {}
+
+FeatureSet::FeatureSet(const StringVector &names, uint32_t expectDocs)
+    : _names(names),
+      _docIds(),
+      _values()
+{
+    _docIds.reserve(expectDocs);
+    _values.reserve(expectDocs * names.size());
+}
+
+bool
+FeatureSet::equals(const FeatureSet &rhs) const
+{
+    return ((_docIds == rhs._docIds) &&
+            (_values == rhs._values) &&
+            (_names == rhs._names)); // do names last, as they are most likely to match
+}
+
+uint32_t
+FeatureSet::addDocId(uint32_t docId)
+{
+    _docIds.push_back(docId);
+    _values.resize(_names.size() * _docIds.size());
+    return (_docIds.size() - 1);
+}
+
+bool
+FeatureSet::contains(const std::vector<uint32_t> &docIds) const
+{
+    using ITR = std::vector<uint32_t>::const_iterator;
+    ITR myPos = _docIds.begin();
+    ITR myEnd = _docIds.end();
+    ITR pos = docIds.begin();
+    ITR end = docIds.end();
+
+    for (; pos != end; ++pos) {
+        while (myPos != myEnd && *myPos < *pos) {
+            ++myPos;
+        }
+        if (myPos == myEnd || *myPos != *pos) {
+            return false;
+        }
+        ++myPos;
+    }
+    return true;
+}
+
+FeatureSet::Value *
+FeatureSet::getFeaturesByIndex(uint32_t idx)
+{
+    if (idx >= _docIds.size()) {
+        return 0;
+    }
+    return &(_values[idx * _names.size()]);
+}
+
+const FeatureSet::Value *
+FeatureSet::getFeaturesByDocId(uint32_t docId) const
+{
+    uint32_t low = 0;
+    uint32_t hi = _docIds.size();
+    while (low < hi) {
+        uint32_t pos = (low + hi) >> 1;
+        uint32_t val = _docIds[pos];
+        if (val < docId) {
+            low = pos + 1;
+        } else if (val > docId) {
+            hi = pos;
+        } else {
+            return &(_values[pos * _names.size()]);
+        }
+    }
+    return 0;
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/util/featureset.h b/vespalib/src/vespa/vespalib/util/featureset.h
new file mode 100644
index 00000000000..ae7a0c6932f
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/util/featureset.h
@@ -0,0 +1,155 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+#include <vespa/vespalib/data/memory.h>
+#include <map>
+#include <vector>
+#include <memory>
+
+namespace vespalib {
+
+/**
+ * This class holds information about a set of features for a set of
+ * documents.
+ **/
+class FeatureSet
+{
+public:
+    class Value {
+    private:
+        std::vector<char> _data;
+        double _value;
+    public:
+        bool operator==(const Value &rhs) const {
+            return ((_data == rhs._data) && (_value == rhs._value));
+        }
+        bool is_double() const { return _data.empty(); }
+        bool is_data() const { return !_data.empty(); }
+        double as_double() const { return _value; }
+        vespalib::Memory as_data() const { return vespalib::Memory(&_data[0], _data.size()); }
+        void set_double(double value) {
+            _data.clear();
+            _value = value;
+        }
+        void set_data(vespalib::Memory data) {
+            _data.assign(data.data, data.data + data.size);
+            _value = 0.0;
+        }
+    };
+
+    using string = vespalib::string;
+    using StringVector = std::vector<string>;
+private:
+    StringVector _names;
+    std::vector<uint32_t> _docIds;
+    std::vector<Value> _values;
+
+    FeatureSet(const FeatureSet &);
+    FeatureSet & operator=(const FeatureSet &);
+
+public:
+    using SP = std::shared_ptr<FeatureSet>;
+    using UP = std::unique_ptr<FeatureSet>;
+
+    /**
+     * Create a new object without any feature information.
+     **/
+    FeatureSet();
+    ~FeatureSet();
+
+    /**
+     * Create a new object that will contain information about the
+     * given features.
+     *
+     * @param names names of all features
+     * @param expectDocs the number of documents we expect to store information about
+     **/
+    FeatureSet(const StringVector &names, uint32_t expectDocs);
+
+    /**
+     * Check whether this object is equal to the given object.
+     *
+     * @return true if the objects are equal.
+     **/
+    bool equals(const FeatureSet &rhs) const;
+
+    /**
+     * Obtain the names of all the features tracked by this object.
+     *
+     * @return feature names
+     **/
+    const StringVector &getNames() const { return _names; }
+
+    /**
+     * Obtain the number of features this object contains information
+     * about.
+     *
+     * @return number of features
+     **/
+    uint32_t numFeatures() const { return _names.size(); }
+
+    /**
+     * Obtain the number of documents this object contains information
+     * about.
+     *
+     * @return number of documents.
+     **/
+    uint32_t numDocs() const { return _docIds.size(); }
+
+    /**
+     * Add a document to the set of documents this object contains
+     * information about. Documents must be added in ascending
+     * order. When a new document is added, all features are
+     * initialized to 0.0. The return value from this method can be
+     * used together with the @ref getFeaturesByIndex method to set
+     * the actual feature values. The ordering among features are
+     * assumed to be the same as in the name vector passed to the
+     * constructor.
+     *
+     * @return the index of the document just added
+     * @param docid the id of the document to add
+     **/
+    uint32_t addDocId(uint32_t docid);
+
+    /**
+     * Check whether this object contains information about the given
+     * set of documents. The given set of documents must be sorted on
+     * document id; lowest first.
+     *
+     * @return true if this object contains information about all the given documents
+     * @param docIds the documents we want information about
+     **/
+    bool contains(const std::vector<uint32_t> &docIds) const;
+
+    /**
+     * Obtain the feature values belonging to a document based on the
+     * index into the internal docid array. This method is intended
+     * for use only when filling in the feature values during object
+     * initialization.
+     *
+     * @return pointer to features
+     * @param idx index into docid array
+     **/
+    Value *getFeaturesByIndex(uint32_t idx);
+
+    /**
+     * Obtain the feature values belonging to a document based on the
+     * docid value. This method is intended for lookup when generating
+     * the summary features or rank features docsum field.
+     *
+     * @return pointer to features
+     * @param docId docid value
+     **/
+    const Value *getFeaturesByDocId(uint32_t docId) const;
+};
+
+// An even simpler feature container. Used to pass match features around.
+struct FeatureValues {
+    using Value = FeatureSet::Value;
+    std::vector<vespalib::string> names;
+    std::vector<Value> values; // values.size() == names.size() * N
+};
+
+}