From aaf479240531f08548bb236ade6f99b272481051 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Mon, 1 Apr 2019 07:36:04 +0000 Subject: Use main thread when only 1 thread. --- vespalib/src/tests/dotproduct/dotproductbenchmark.cpp | 18 +++++++++++------- vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp | 5 +++++ vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp | 12 ++++++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp index afa06907b02..3588e0ce239 100644 --- a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp +++ b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp @@ -27,13 +27,17 @@ runThread(size_t count, size_t docs, const Benchmark * benchmark, size_t stride) void runBenchmark(size_t numThreads, size_t count, size_t docs, const Benchmark & benchmark, size_t stride) { - std::vector threads; - threads.reserve(numThreads); - for (size_t i(0); i < numThreads; i++) { - threads.emplace_back(runThread, count, docs, &benchmark, stride); - } - for (auto & thread : threads) { - thread.join(); + if (numThreads > 1) { + std::vector threads; + threads.reserve(numThreads); + for (size_t i(0); i < numThreads; i++) { + threads.emplace_back(runThread, count, docs, &benchmark, stride); + } + for (auto & thread : threads) { + thread.join(); + } + } else { + runThread(count, docs, &benchmark, stride); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index 1bf7ea1c44c..4295f9850c5 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -10,6 +10,11 @@ template ACCUM multiplyAdd(const T * a, const T * b, size_t sz) { +#if 1 + for (int i(0); i < 16; i++) { + __builtin_prefetch(&b[(4+i)*(64/sizeof(T))], 0, 0); + } +#endif ACCUM partial[UNROLL]; for (size_t i(0); i < UNROLL; i++) { partial[i] = 0; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp index f135de52e5a..209de4666a2 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp @@ -16,12 +16,20 @@ bool validAlignment16(const void * a, const void * b) { } +#define PREFETCH_DISTANCE 16 +#define L1_DIST 4 float Sse2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const { if ( ! validAlignment16(af, bf)) { return GenericAccelrator::dotProduct(af, bf, sz); } + +#if PREFETCH_DISTANCE > 0 + for (int i(0); i < PREFETCH_DISTANCE; i++) { + __builtin_prefetch(&bf[i*16], 0, 0); + } +#endif typedef float v4sf __attribute__ ((vector_size (16))); const size_t ChunkSize(16); const size_t VectorsPerChunk(ChunkSize/4); @@ -31,6 +39,10 @@ Sse2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const const size_t numChunks(sz/ChunkSize); for (size_t i(0); i < numChunks; i++) { +#if PREFETCH_DISTANCE > 0 + __builtin_prefetch(&bf[(i+PREFETCH_DISTANCE)*16], 0, 0); + __builtin_prefetch(&af[((i+L1_DIST)&0xf)*16], 0, 3); +#endif for (size_t j(0); j < VectorsPerChunk; j++) { partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; } -- cgit v1.2.3