diff options
author | Harald Musum <musum@verizonmedia.com> | 2020-06-08 08:03:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-08 08:03:16 +0200 |
commit | 8a1262dcfb1698ca3bf06f2734dd364f25cc1f70 (patch) | |
tree | 1da3189429e5c4fd816b7261031ba1000c259a96 | |
parent | bda9b2e28e416daaefffa181d4dc8fe8566e8ca0 (diff) |
Revert "When we pull in a cacheline, we should use it too."
16 files changed, 68 insertions, 285 deletions
diff --git a/eval/src/tests/ann/nns-l2.h b/eval/src/tests/ann/nns-l2.h index de24df50b6c..82a95741200 100644 --- a/eval/src/tests/ann/nns-l2.h +++ b/eval/src/tests/ann/nns-l2.h @@ -36,7 +36,7 @@ template <typename FltType = float> struct L2DistCalc { const vespalib::hwaccelrated::IAccelrated & _hw; - L2DistCalc() : _hw(vespalib::hwaccelrated::IAccelrated::getAccelerator()) {} + L2DistCalc() : _hw(vespalib::hwaccelrated::IAccelrated::getAccelrator()) {} using Arr = vespalib::ArrayRef<FltType>; using ConstArr = vespalib::ConstArrayRef<FltType>; diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp index 0a33e23de72..96234e373dc 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.cpp +++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp @@ -167,7 +167,7 @@ BitVector::countInterval(Range range_in) const ++endw; } if (startw < endw) { - res += IAccelrated::getAccelerator().populationCount(bitValues + startw, endw - startw); + res += IAccelrated::getAccelrator().populationCount(bitValues + startw, endw - startw); } if (partialEnd) { res += Optimized::popCount(bitValues[endw] & ~endBits(last)); @@ -185,13 +185,13 @@ BitVector::orWith(const BitVector & right) if (right.size() > 0) { ssize_t commonBytes = numActiveBytes(getStartIndex(), right.size()) - sizeof(Word); if (commonBytes > 0) { - IAccelrated::getAccelerator().orBit(getActiveStart(), right.getWordIndex(getStartIndex()), commonBytes); + IAccelrated::getAccelrator().orBit(getActiveStart(), right.getWordIndex(getStartIndex()), commonBytes); } Index last(right.size() - 1); getWordIndex(last)[0] |= (right.getWordIndex(last)[0] & ~endBits(last)); } } else { - IAccelrated::getAccelerator().orBit(getActiveStart(), right.getWordIndex(getStartIndex()), getActiveBytes()); + IAccelrated::getAccelrator().orBit(getActiveStart(), right.getWordIndex(getStartIndex()), getActiveBytes()); } repairEnds(); invalidateCachedCount(); @@ -216,7 +216,7 @@ BitVector::andWith(const BitVector & right) verifyInclusiveStart(*this, right); uint32_t commonBytes = std::min(getActiveBytes(), numActiveBytes(getStartIndex(), right.size())); - IAccelrated::getAccelerator().andBit(getActiveStart(), right.getWordIndex(getStartIndex()), commonBytes); + IAccelrated::getAccelrator().andBit(getActiveStart(), right.getWordIndex(getStartIndex()), commonBytes); if (right.size() < size()) { clearInterval(right.size(), size()); } @@ -235,13 +235,13 @@ BitVector::andNotWith(const BitVector& right) if (right.size() > 0) { ssize_t commonBytes = numActiveBytes(getStartIndex(), right.size()) - sizeof(Word); if (commonBytes > 0) { - IAccelrated::getAccelerator().andNotBit(getActiveStart(), right.getWordIndex(getStartIndex()), commonBytes); + IAccelrated::getAccelrator().andNotBit(getActiveStart(), right.getWordIndex(getStartIndex()), commonBytes); } Index last(right.size() - 1); getWordIndex(last)[0] &= ~(right.getWordIndex(last)[0] & ~endBits(last)); } } else { - IAccelrated::getAccelerator().andNotBit(getActiveStart(), right.getWordIndex(getStartIndex()), getActiveBytes()); + IAccelrated::getAccelrator().andNotBit(getActiveStart(), right.getWordIndex(getStartIndex()), getActiveBytes()); } repairEnds(); @@ -250,7 +250,7 @@ BitVector::andNotWith(const BitVector& right) void BitVector::notSelf() { - IAccelrated::getAccelerator().notBit(getActiveStart(), getActiveBytes()); + IAccelrated::getAccelrator().notBit(getActiveStart(), getActiveBytes()); setGuardBit(); invalidateCachedCount(); } diff --git a/searchlib/src/vespa/searchlib/features/dotproductfeature.cpp b/searchlib/src/vespa/searchlib/features/dotproductfeature.cpp index 37fd98c9f20..a8737a19eec 100644 --- a/searchlib/src/vespa/searchlib/features/dotproductfeature.cpp +++ b/searchlib/src/vespa/searchlib/features/dotproductfeature.cpp @@ -256,7 +256,7 @@ namespace dotproduct::array { template <typename BaseType> DotProductExecutorBase<BaseType>::DotProductExecutorBase(const V & queryVector) : FeatureExecutor(), - _multiplier(IAccelrated::getAccelerator()), + _multiplier(IAccelrated::getAccelrator()), _queryVector(queryVector) { } diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp index d36d16a679a..105d57b22b1 100644 --- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp @@ -1,19 +1,19 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "multibitvectoriterator.h" -#include "andsearch.h" -#include "andnotsearch.h" -#include "sourceblendersearch.h" +#include <vespa/searchlib/queryeval/multibitvectoriterator.h> +#include <vespa/searchlib/queryeval/andsearch.h> +#include <vespa/searchlib/queryeval/andnotsearch.h> +#include <vespa/searchlib/queryeval/sourceblendersearch.h> +#include <vespa/searchlib/queryeval/orsearch.h> #include <vespa/searchlib/common/bitvectoriterator.h> +#include <vespa/searchlib/attribute/attributeiterators.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> #include <vespa/searchlib/fef/termfieldmatchdataarray.h> #include <vespa/vespalib/util/optimized.h> -#include <vespa/vespalib/hwaccelrated/iaccelrated.h> namespace search::queryeval { using vespalib::Trinary; -using vespalib::hwaccelrated::IAccelrated; namespace { @@ -21,15 +21,7 @@ template<typename Update> class MultiBitVectorIterator : public MultiBitVectorIteratorBase { public: - explicit MultiBitVectorIterator(Children children) - : MultiBitVectorIteratorBase(std::move(children)), - _update(), - _accel(IAccelrated::getAccelerator()), - _lastWords() - { - static_assert(sizeof(_lastWords) == 64, "Latswords should have 64 byte size"); - memset(&_lastWords, 0, sizeof(_lastWords)); - } + MultiBitVectorIterator(Children children) : MultiBitVectorIteratorBase(std::move(children)) { } protected: void updateLastValue(uint32_t docId); void strictSeek(uint32_t docId); @@ -37,55 +29,33 @@ private: void doSeek(uint32_t docId) override; Trinary is_strict() const override { return Trinary::False; } bool acceptExtraFilter() const override { return Update::isAnd(); } - Update _update; - const IAccelrated & _accel; - alignas(64) Word _lastWords[8]; + Update _update; }; template<typename Update> class MultiBitVectorIteratorStrict : public MultiBitVectorIterator<Update> { public: - explicit MultiBitVectorIteratorStrict(MultiSearch::Children children) - : MultiBitVectorIterator<Update>(std::move(children)) - { } + MultiBitVectorIteratorStrict(MultiSearch::Children children) : MultiBitVectorIterator<Update>(std::move(children)) { } private: void doSeek(uint32_t docId) override { this->strictSeek(docId); } Trinary is_strict() const override { return Trinary::True; } }; -struct And { - using Word = BitWord::Word; - void operator () (const IAccelrated & accel, size_t offset, const std::vector<std::pair<const void *, bool>> & src, Word *dest) { - accel.and64(offset*sizeof(uint64_t), src, dest); - } - static bool isAnd() { return true; } -}; - -struct Or { - using Word = BitWord::Word; - void operator () (const IAccelrated & accel, size_t offset, const std::vector<std::pair<const void *, bool>> & src, Word *dest) { - accel.or64(offset*sizeof(uint64_t), src, dest); - } - static bool isAnd() { return false; } -}; - template<typename Update> void MultiBitVectorIterator<Update>::updateLastValue(uint32_t docId) { if (docId >= _lastMaxDocIdLimit) { - if (__builtin_expect(docId >= _numDocs, false)) { + if (__builtin_expect(docId < _numDocs, true)) { + const uint32_t index(wordNum(docId)); + _lastValue = _bvs[0][index]; + for(uint32_t i(1); i < _bvs.size(); i++) { + _lastValue = _update(_lastValue, _bvs[i][index]); + } + _lastMaxDocIdLimit = (index + 1) * WordLen; + } else { setAtEnd(); - return; - } - const uint32_t index(wordNum(docId)); - if (docId >= _lastMaxDocIdLimitRequireFetch) { - uint32_t baseIndex = index & ~(sizeof(_lastWords)/sizeof(Word) - 1); - _update(_accel, baseIndex, _bvs, _lastWords); - _lastMaxDocIdLimitRequireFetch = (baseIndex + (sizeof(_lastWords)/sizeof(Word))) * WordLen; } - _lastValue = _lastWords[index % (sizeof(_lastWords)/sizeof(Word))]; - _lastMaxDocIdLimit = (index + 1) * WordLen; } } @@ -105,7 +75,7 @@ template<typename Update> void MultiBitVectorIterator<Update>::strictSeek(uint32_t docId) { - for (updateLastValue(docId), _lastValue = _lastValue & checkTab(docId); + for (updateLastValue(docId), _lastValue=_lastValue & checkTab(docId); (_lastValue == 0) && __builtin_expect(! isAtEnd(), true); updateLastValue(_lastMaxDocIdLimit)); if (__builtin_expect(!isAtEnd(), true)) { @@ -118,6 +88,21 @@ MultiBitVectorIterator<Update>::strictSeek(uint32_t docId) } } +struct And { + typedef BitWord::Word Word; + Word operator () (const Word a, const Word b) { + return a & b; + } + static bool isAnd() { return true; } +}; + +struct Or { + typedef BitWord::Word Word; + Word operator () (const Word a, const Word b) { + return a | b; + } + static bool isAnd() { return false; } +}; typedef MultiBitVectorIterator<And> AndBVIterator; typedef MultiBitVectorIteratorStrict<And> AndBVIteratorStrict; @@ -151,15 +136,14 @@ bool canOptimize(const MultiSearch & s) { MultiBitVectorIteratorBase::MultiBitVectorIteratorBase(Children children) : MultiSearch(std::move(children)), _numDocs(std::numeric_limits<unsigned int>::max()), - _lastMaxDocIdLimit(0), - _lastMaxDocIdLimitRequireFetch(0), _lastValue(0), + _lastMaxDocIdLimit(0), _bvs() { _bvs.reserve(getChildren().size()); - for (const auto & child : getChildren()) { - const auto * bv = static_cast<const BitVectorIterator *>(child.get()); - _bvs.emplace_back(bv->getBitValues(), bv->isInverted()); + for (size_t i(0); i < getChildren().size(); i++) { + const auto * bv = static_cast<const BitVectorIterator *>(getChildren()[i].get()); + _bvs.emplace_back(reinterpret_cast<const Word *>(bv->getBitValues()), bv->isInverted()); _numDocs = std::min(_numDocs, bv->getDocIdLimit()); } } @@ -171,7 +155,6 @@ MultiBitVectorIteratorBase::initRange(uint32_t beginId, uint32_t endId) { MultiSearch::initRange(beginId, endId); _lastMaxDocIdLimit = 0; - _lastMaxDocIdLimitRequireFetch = 0; } SearchIterator::UP @@ -180,10 +163,9 @@ MultiBitVectorIteratorBase::andWith(UP filter, uint32_t estimate) (void) estimate; if (filter->isBitVector() && acceptExtraFilter()) { const auto & bv = static_cast<const BitVectorIterator &>(*filter); - _bvs.emplace_back(bv.getBitValues(), bv.isInverted()); + _bvs.emplace_back(reinterpret_cast<const Word *>(bv.getBitValues()), bv.isInverted()); insert(getChildren().size(), std::move(filter)); _lastMaxDocIdLimit = 0; // force reload - _lastMaxDocIdLimitRequireFetch = 0; } return filter; } diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h index 29e92584ffe..cde9ffcbfe5 100644 --- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h +++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h @@ -11,7 +11,7 @@ namespace search::queryeval { class MultiBitVectorIteratorBase : public MultiSearch, protected BitWord { public: - ~MultiBitVectorIteratorBase() override; + ~MultiBitVectorIteratorBase(); void initRange(uint32_t beginId, uint32_t endId) override; void addUnpackIndex(size_t index) { _unpackInfo.add(index); } /** @@ -20,21 +20,26 @@ public: */ static SearchIterator::UP optimize(SearchIterator::UP parent); protected: - MultiBitVectorIteratorBase(Children hildren); - using MetaWord = std::pair<const void *, bool>; + MultiBitVectorIteratorBase(Children children); + class MetaWord { + public: + MetaWord(const Word * words, bool inverted) : _words(words), _inverted(inverted) { } + Word operator [] (uint32_t index) const { return _inverted ? ~_words[index] : _words[index]; } + private: + const Word * _words; + bool _inverted; + }; uint32_t _numDocs; - uint32_t _lastMaxDocIdLimit; // next documentid requiring recomputation. - uint32_t _lastMaxDocIdLimitRequireFetch; Word _lastValue; // Last value computed + uint32_t _lastMaxDocIdLimit; // next documentid requiring recomputation. std::vector<MetaWord> _bvs; private: virtual bool acceptExtraFilter() const = 0; UP andWith(UP filter, uint32_t estimate) override; void doUnpack(uint32_t docid) override; + UnpackInfo _unpackInfo; static SearchIterator::UP optimizeMultiSearch(SearchIterator::UP parent); - - UnpackInfo _unpackInfo; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index d37495e85da..79f987c740c 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -17,7 +17,7 @@ template <typename FloatType> class SquaredEuclideanDistance : public DistanceFunction { public: SquaredEuclideanDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelrator()) {} double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { auto lhs_vector = lhs.typify<FloatType>(); @@ -60,7 +60,7 @@ template <typename FloatType> class AngularDistance : public DistanceFunction { public: AngularDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelrator()) {} double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { auto lhs_vector = lhs.typify<FloatType>(); diff --git a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp index e95e8a5c58b..d6e1aef9394 100644 --- a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp +++ b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp @@ -60,7 +60,7 @@ template <typename T> FullBenchmark<T>::FullBenchmark(size_t numDocs, size_t numValues) : _values(numDocs*numValues), _query(numValues), - _dp(IAccelrated::getAccelerator()) + _dp(IAccelrated::getAccelrator()) { for (size_t i(0); i < numDocs; i++) { for (size_t j(0); j < numValues; j++) { diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index 8588a5510f7..7ff393c87f8 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -20,14 +20,4 @@ Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, siz return avx::euclideanDistanceSelectAlignment<double, 32>(a, b, sz); } -void -Avx2Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const { - helper::andChunks<32u, 2u>(offset, src, dest); -} - -void -Avx2Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const { - helper::orChunks<32u, 2u>(offset, src, dest); -} - } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index b6f3d299748..3e0dbb28110 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -15,8 +15,6 @@ public: size_t populationCount(const uint64_t *a, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; - void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; - void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 4dade08e77a..0941e6d6ad8 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -32,14 +32,4 @@ Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, s return avx::euclideanDistanceSelectAlignment<double, 64>(a, b, sz); } -void -Avx512Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const { - helper::andChunks<64, 1>(offset, src, dest); -} - -void -Avx512Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const { - helper::orChunks<64, 1>(offset, src, dest); -} - } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index a54d57407b2..209ec06c857 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -17,8 +17,6 @@ public: size_t populationCount(const uint64_t *a, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; - void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; - void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index f9dfaacf626..f9684e88c63 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -165,14 +165,4 @@ GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, return euclideanDistanceT<double, 4>(a, b, sz); } -void -GenericAccelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const { - helper::andChunks<16, 4>(offset, src, dest); -} - -void -GenericAccelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const { - helper::orChunks<16,4>(offset, src, dest); -} - } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index 2335b40fe85..50a3d59d49d 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -25,8 +25,6 @@ public: size_t populationCount(const uint64_t *a, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; - void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; - void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index de917c5f065..bb132165e53 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -46,8 +46,7 @@ std::vector<T> createAndFill(size_t sz) { } template<typename T> -void -verifyDotproduct(const IAccelrated & accel) +void verifyDotproduct(const IAccelrated & accel) { const size_t testLength(255); srand(1); @@ -67,8 +66,7 @@ verifyDotproduct(const IAccelrated & accel) } template<typename T> -void -verifyEuclideanDistance(const IAccelrated & accel) { +void verifyEuclideanDistance(const IAccelrated & accel) { const size_t testLength(255); srand(1); std::vector<T> a = createAndFill<T>(testLength); @@ -86,8 +84,7 @@ verifyEuclideanDistance(const IAccelrated & accel) { } } -void -verifyPopulationCount(const IAccelrated & accel) +void verifyPopulationCount(const IAccelrated & accel) { const uint64_t words[7] = {0x123456789abcdef0L, // 32 0x0000000000000000L, // 0 @@ -104,118 +101,6 @@ verifyPopulationCount(const IAccelrated & accel) } } -void -fill(std::vector<uint64_t> & v, size_t n) { - v.reserve(n); - for (size_t i(0); i < n; i++) { - v.emplace_back(random()); - } -} - -void -simpleAndWith(std::vector<uint64_t> & dest, const std::vector<uint64_t> & src) { - for (size_t i(0); i < dest.size(); i++) { - dest[i] &= src[i]; - } -} - -void -simpleOrWith(std::vector<uint64_t> & dest, const std::vector<uint64_t> & src) { - for (size_t i(0); i < dest.size(); i++) { - dest[i] |= src[i]; - } -} - -std::vector<uint64_t> -simpleInvert(const std::vector<uint64_t> & src) { - std::vector<uint64_t> inverted; - inverted.reserve(src.size()); - for (size_t i(0); i < src.size(); i++) { - inverted.push_back(~src[i]); - } - return inverted; -} - -std::vector<uint64_t> -optionallyInvert(bool invert, std::vector<uint64_t> v) { - return invert ? simpleInvert(std::move(v)) : std::move(v); -} - -bool shouldInvert(bool invertSome) { - return invertSome ? (random() & 1) : false; -} - -void -verifyOr64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>> & vectors, - size_t offset, size_t num_vectors, bool invertSome) -{ - std::vector<std::pair<const void *, bool>> vRefs; - for (size_t j(0); j < num_vectors; j++) { - vRefs.emplace_back(&vectors[j][0], shouldInvert(invertSome)); - } - - std::vector<uint64_t> expected = optionallyInvert(vRefs[0].second, vectors[0]); - for (size_t j = 1; j < num_vectors; j++) { - simpleOrWith(expected, optionallyInvert(vRefs[j].second, vectors[j])); - } - - uint64_t dest[8] __attribute((aligned(64))); - accel.or64(offset*sizeof(uint64_t), vRefs, dest); - int diff = memcmp(&expected[offset], dest, sizeof(dest)); - if (diff != 0) { - LOG_ABORT("Accelerator fails to compute correct 64 bytes OR"); - } -} - -void -verifyAnd64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>> & vectors, - size_t offset, size_t num_vectors, bool invertSome) -{ - std::vector<std::pair<const void *, bool>> vRefs; - for (size_t j(0); j < num_vectors; j++) { - vRefs.emplace_back(&vectors[j][0], shouldInvert(invertSome)); - } - std::vector<uint64_t> expected = optionallyInvert(vRefs[0].second, vectors[0]); - for (size_t j = 1; j < num_vectors; j++) { - simpleAndWith(expected, optionallyInvert(vRefs[j].second, vectors[j])); - } - - uint64_t dest[8] __attribute((aligned(64))); - accel.and64(offset*sizeof(uint64_t), vRefs, dest); - int diff = memcmp(&expected[offset], dest, sizeof(dest)); - if (diff != 0) { - LOG_ABORT("Accelerator fails to compute correct 64 bytes AND"); - } -} - -void -verifyOr64(const IAccelrated & accel) { - std::vector<std::vector<uint64_t>> vectors(3) ; - for (auto & v : vectors) { - fill(v, 16); - } - for (size_t offset = 0; offset < 8; offset++) { - for (size_t i = 1; i < vectors.size(); i++) { - verifyOr64(accel, vectors, offset, i, false); - verifyOr64(accel, vectors, offset, i, true); - } - } -} - -void -verifyAnd64(const IAccelrated & accel) { - std::vector<std::vector<uint64_t>> vectors(3); - for (auto & v : vectors) { - fill(v, 16); - } - for (size_t offset = 0; offset < 8; offset++) { - for (size_t i = 1; i < vectors.size(); i++) { - verifyAnd64(accel, vectors, offset, i, false); - verifyAnd64(accel, vectors, offset, i, true); - } - } -} - class RuntimeVerificator { public: @@ -229,8 +114,6 @@ private: verifyEuclideanDistance<float>(accelrated); verifyEuclideanDistance<double>(accelrated); verifyPopulationCount(accelrated); - verifyAnd64(accelrated); - verifyOr64(accelrated); } }; @@ -239,7 +122,7 @@ RuntimeVerificator::RuntimeVerificator() GenericAccelrator generic; verify(generic); - const IAccelrated & thisCpu(IAccelrated::getAccelerator()); + const IAccelrated & thisCpu(IAccelrated::getAccelrator()); verify(thisCpu); } @@ -272,7 +155,7 @@ static Selector _G_selector; RuntimeVerificator _G_verifyAccelrator; const IAccelrated & -IAccelrated::getAccelerator() +IAccelrated::getAccelrator() { static IAccelrated::UP accelrator = _G_selector.create(); return *accelrator; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index 2594a48dd33..0292ad14643 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -4,7 +4,6 @@ #include <memory> #include <cstdint> -#include <vector> namespace vespalib::hwaccelrated { @@ -30,12 +29,8 @@ public: virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0; virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const = 0; virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const = 0; - // AND 64 bytes from multiple, optionally inverted sources - virtual void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const = 0; - // OR 64 bytes from multiple, optionally inverted sources - virtual void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const = 0; - static const IAccelrated & getAccelerator() __attribute__((noinline)); + static const IAccelrated & getAccelrator() __attribute__((noinline)); }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp index 6fc49f969f2..f5daf2b9081 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -24,51 +24,5 @@ populationCount(const uint64_t *a, size_t sz) { return count; } -template<typename T> -T get(const void * base, bool invert) { - T v; - memcpy(&v, base, sizeof(T)); - return __builtin_expect(invert, false) ? ~v : v; -} - -template <typename T> -const T * cast(const void * ptr, size_t offsetBytes) { - return static_cast<const T *>(static_cast<const void *>(static_cast<const char *>(ptr) + offsetBytes)); -} - -template<unsigned ChunkSize, unsigned Chunks> -void -andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) { - typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); - Chunk * chunk = static_cast<Chunk *>(dest); - const Chunk * tmp = cast<Chunk>(src[0].first, offset); - for (size_t n=0; n < Chunks; n++) { - chunk[n] = get<Chunk>(tmp+n, src[0].second); - } - for (size_t i(1); i < src.size(); i++) { - tmp = cast<Chunk>(src[i].first, offset); - for (size_t n=0; n < Chunks; n++) { - chunk[n] &= get<Chunk>(tmp+n, src[i].second); - } - } -} - -template<unsigned ChunkSize, unsigned Chunks> -void -orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) { - typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); - Chunk * chunk = static_cast<Chunk *>(dest); - const Chunk * tmp = cast<Chunk>(src[0].first, offset); - for (size_t n=0; n < Chunks; n++) { - chunk[n] = get<Chunk>(tmp+n, src[0].second); - } - for (size_t i(1); i < src.size(); i++) { - tmp = cast<Chunk>(src[i].first, offset); - for (size_t n=0; n < Chunks; n++) { - chunk[n] |= get<Chunk>(tmp+n, src[i].second); - } - } -} - } } |