summary | refs | log | tree | commit | diff | stats
path: root/vespalib
diff options
context:
space:
mode:
author: Tor Egge <Tor.Egge@online.no> 2023-04-25 11:09:40 +0200
committer: Tor Egge <Tor.Egge@online.no> 2023-04-25 11:09:40 +0200
commit: fbe9c1f4dddb9f7ca25964691c669f037d791df0 (patch)
tree: 1c613add846918a04a471164493779b0e91a5ffa /vespalib
parent: f3ac0e360e47778eb51e3619825f09e52d3b6082 (diff)
Move search::FeatureValues to vespalib::FeatureValues in preparation for
extending vdslib::SearchResult.
Diffstat (limited to 'vespalib')
-rw-r--r-- vespalib/src/vespa/vespalib/util/CMakeLists.txt | 1
-rw-r--r-- vespalib/src/vespa/vespalib/util/featureset.cpp | 90
-rw-r--r-- vespalib/src/vespa/vespalib/util/featureset.h | 155
3 files changed, 246 insertions, 0 deletions
diff --git a/vespalib/src/vespa/vespalib/util/CMakeLists.txt b/vespalib/src/vespa/vespalib/util/CMakeLists.txt
index 8ee3957af32..91365d446c1 100644
--- a/vespalib/src/vespa/vespalib/util/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/util/CMakeLists.txt
@@ -31,6 +31,7 @@ vespa_add_library(vespalib_vespalib_util OBJECT
exceptions.cpp
execution_profiler.cpp
executor_idle_tracking.cpp
+ featureset.cpp
file_area_freelist.cpp
foregroundtaskexecutor.cpp
gate.cpp
diff --git a/vespalib/src/vespa/vespalib/util/featureset.cpp b/vespalib/src/vespa/vespalib/util/featureset.cpp
new file mode 100644
index 00000000000..6ac90461cfb
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/util/featureset.cpp
@@ -0,0 +1,90 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "featureset.h"
+
+namespace vespalib {
+
+FeatureSet::FeatureSet()
+ : _names(),
+ _docIds(),
+ _values()
+{
+}
+
+FeatureSet::~FeatureSet() {}
+
+FeatureSet::FeatureSet(const StringVector &names, uint32_t expectDocs)
+ : _names(names),
+ _docIds(),
+ _values()
+{
+ _docIds.reserve(expectDocs);
+ _values.reserve(expectDocs * names.size());
+}
+
+bool
+FeatureSet::equals(const FeatureSet &rhs) const
+{
+ return ((_docIds == rhs._docIds) &&
+ (_values == rhs._values) &&
+ (_names == rhs._names)); // do names last, as they are most likely to match
+}
+
+uint32_t
+FeatureSet::addDocId(uint32_t docId)
+{
+ _docIds.push_back(docId);
+ _values.resize(_names.size() * _docIds.size());
+ return (_docIds.size() - 1);
+}
+
+bool
+FeatureSet::contains(const std::vector<uint32_t> &docIds) const
+{
+ using ITR = std::vector<uint32_t>::const_iterator;
+ ITR myPos = _docIds.begin();
+ ITR myEnd = _docIds.end();
+ ITR pos = docIds.begin();
+ ITR end = docIds.end();
+
+ for (; pos != end; ++pos) {
+ while (myPos != myEnd && *myPos < *pos) {
+ ++myPos;
+ }
+ if (myPos == myEnd || *myPos != *pos) {
+ return false;
+ }
+ ++myPos;
+ }
+ return true;
+}
+
+FeatureSet::Value *
+FeatureSet::getFeaturesByIndex(uint32_t idx)
+{
+ if (idx >= _docIds.size()) {
+ return 0;
+ }
+ return &(_values[idx * _names.size()]);
+}
+
+const FeatureSet::Value *
+FeatureSet::getFeaturesByDocId(uint32_t docId) const
+{
+ uint32_t low = 0;
+ uint32_t hi = _docIds.size();
+ while (low < hi) {
+ uint32_t pos = (low + hi) >> 1;
+ uint32_t val = _docIds[pos];
+ if (val < docId) {
+ low = pos + 1;
+ } else if (val > docId) {
+ hi = pos;
+ } else {
+ return &(_values[pos * _names.size()]);
+ }
+ }
+ return 0;
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/util/featureset.h b/vespalib/src/vespa/vespalib/util/featureset.h
new file mode 100644
index 00000000000..ae7a0c6932f
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/util/featureset.h
@@ -0,0 +1,155 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+#include <vespa/vespalib/data/memory.h>
+#include <map>
+#include <vector>
+#include <memory>
+
+namespace vespalib {
+
+/**
+ * This class holds information about a set of features for a set of
+ * documents.
+ **/
+class FeatureSet
+{
+public:
+ class Value {
+ private:
+ std::vector<char> _data;
+ double _value;
+ public:
+ bool operator==(const Value &rhs) const {
+ return ((_data == rhs._data) && (_value == rhs._value));
+ }
+ bool is_double() const { return _data.empty(); }
+ bool is_data() const { return !_data.empty(); }
+ double as_double() const { return _value; }
+ vespalib::Memory as_data() const { return vespalib::Memory(&_data[0], _data.size()); }
+ void set_double(double value) {
+ _data.clear();
+ _value = value;
+ }
+ void set_data(vespalib::Memory data) {
+ _data.assign(data.data, data.data + data.size);
+ _value = 0.0;
+ }
+ };
+
+ using string = vespalib::string;
+ using StringVector = std::vector<string>;
+private:
+ StringVector _names;
+ std::vector<uint32_t> _docIds;
+ std::vector<Value> _values;
+
+ FeatureSet(const FeatureSet &);
+ FeatureSet & operator=(const FeatureSet &);
+
+public:
+ using SP = std::shared_ptr<FeatureSet>;
+ using UP = std::unique_ptr<FeatureSet>;
+
+ /**
+ * Create a new object without any feature information.
+ **/
+ FeatureSet();
+ ~FeatureSet();
+
+ /**
+ * Create a new object that will contain information about the
+ * given features.
+ *
+ * @param names names of all features
+ * @param expectDocs the number of documents we expect to store information about
+ **/
+ FeatureSet(const StringVector &names, uint32_t expectDocs);
+
+ /**
+ * Check whether this object is equal to the given object.
+ *
+ * @return true if the objects are equal.
+ **/
+ bool equals(const FeatureSet &rhs) const;
+
+ /**
+ * Obtain the names of all the features tracked by this object.
+ *
+ * @return feature names
+ **/
+ const StringVector &getNames() const { return _names; }
+
+ /**
+ * Obtain the number of features this object contains information
+ * about.
+ *
+ * @return number of features
+ **/
+ uint32_t numFeatures() const { return _names.size(); }
+
+ /**
+ * Obtain the number of documents this object contains information
+ * about.
+ *
+ * @return number of documents.
+ **/
+ uint32_t numDocs() const { return _docIds.size(); }
+
+ /**
+ * Add a document to the set of documents this object contains
+ * information about. Documents must be added in ascending
+ * order. When a new document is added, all features are
+ * initialized to 0.0. The return value from this method can be
+ * used together with the @ref getFeaturesByIndex method to set
+ * the actual feature values. The ordering among features are
+ * assumed to be the same as in the name vector passed to the
+ * constructor.
+ *
+ * @return the index of the document just added
+ * @param docid the id of the document to add
+ **/
+ uint32_t addDocId(uint32_t docid);
+
+ /**
+ * Check whether this object contains information about the given
+ * set of documents. The given set of documents must be sorted on
+ * document id; lowest first.
+ *
+ * @return true if this object contains information about all the given documents
+ * @param docIds the documents we want information about
+ **/
+ bool contains(const std::vector<uint32_t> &docIds) const;
+
+ /**
+ * Obtain the feature values belonging to a document based on the
+ * index into the internal docid array. This method is intended
+ * for use only when filling in the feature values during object
+ * initialization.
+ *
+ * @return pointer to features
+ * @param idx index into docid array
+ **/
+ Value *getFeaturesByIndex(uint32_t idx);
+
+ /**
+ * Obtain the feature values belonging to a document based on the
+ * docid value. This method is intended for lookup when generating
+ * the summary features or rank features docsum field.
+ *
+ * @return pointer to features
+ * @param docId docid value
+ **/
+ const Value *getFeaturesByDocId(uint32_t docId) const;
+};
+
+// An even simpler feature container. Used to pass match features around.
+struct FeatureValues {
+ using Value = FeatureSet::Value;
+ std::vector<vespalib::string> names;
+ std::vector<Value> values; // values.size() == names.size() * N
+};
+
+}