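Use the main thread when only one thread is requested.

Also add software prefetch hints (__builtin_prefetch) to the generic multiplyAdd and to Sse2Accelrator::dotProduct.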

author: Henning Baldersheim <balder@yahoo-inc.com> 2019-04-01 07:36:04 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2019-04-01 07:36:04 +0000
commit: aaf479240531f08548bb236ade6f99b272481051 (patch)
tree: a7761f1820303edf408250876978b62c8869d8ff /vespalib
parent: 58940f65f1d3d9df2ff1c3d1225688050d36c212 (diff)
3 files changed, 28 insertions, 7 deletions
diff --git a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp
index afa06907b02..3588e0ce239 100644
--- a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp
+++ b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp
@@ -27,13 +27,17 @@ runThread(size_t count, size_t docs, const Benchmark * benchmark, size_t stride)
 
 void
 runBenchmark(size_t numThreads, size_t count, size_t docs, const Benchmark & benchmark, size_t stride) {
-    std::vector<std::thread> threads;
-    threads.reserve(numThreads);
-    for (size_t i(0); i < numThreads; i++) {
-        threads.emplace_back(runThread, count, docs, &benchmark, stride);
-    }
-    for (auto & thread : threads) {
-        thread.join();
+    if (numThreads > 1) {
+        std::vector<std::thread> threads;
+        threads.reserve(numThreads);
+        for (size_t i(0); i < numThreads; i++) {
+            threads.emplace_back(runThread, count, docs, &benchmark, stride);
+        }
+        for (auto & thread : threads) {
+            thread.join();
+        }
+    } else {
+        runThread(count, docs, &benchmark, stride);
     }
 }
 
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index 1bf7ea1c44c..4295f9850c5 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -10,6 +10,11 @@ template <typename ACCUM, typename T, size_t UNROLL>
 ACCUM
 multiplyAdd(const T * a, const T * b, size_t sz)
 {
+#if 1
+    for (int i(0); i < 16; i++) {
+        __builtin_prefetch(&b[(4+i)*(64/sizeof(T))], 0, 0);
+    }
+#endif
     ACCUM partial[UNROLL];
     for (size_t i(0); i < UNROLL; i++) {
         partial[i] = 0;
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp
index f135de52e5a..209de4666a2 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp
@@ -16,12 +16,20 @@ bool validAlignment16(const void * a, const void * b) {
 
 }
 
+#define PREFETCH_DISTANCE 16
+#define L1_DIST 4
 float
 Sse2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
 {
     if ( ! validAlignment16(af, bf)) {
         return GenericAccelrator::dotProduct(af, bf, sz);
     }
+
+#if PREFETCH_DISTANCE > 0
+    for (int i(0); i < PREFETCH_DISTANCE; i++) {
+        __builtin_prefetch(&bf[i*16], 0, 0);
+    }
+#endif
     typedef float v4sf __attribute__ ((vector_size (16)));
     const size_t ChunkSize(16);
     const size_t VectorsPerChunk(ChunkSize/4);
@@ -31,6 +39,10 @@ Sse2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
 
     const size_t numChunks(sz/ChunkSize);
     for (size_t i(0); i < numChunks; i++) {
+#if PREFETCH_DISTANCE > 0
+        __builtin_prefetch(&bf[(i+PREFETCH_DISTANCE)*16], 0, 0);
+        __builtin_prefetch(&af[((i+L1_DIST)&0xf)*16], 0, 3);
+#endif
         for (size_t j(0); j < VectorsPerChunk; j++) {
             partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
         }
author	Henning Baldersheim <balder@yahoo-inc.com>	2019-04-01 07:36:04 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2019-04-01 07:36:04 +0000
commit	aaf479240531f08548bb236ade6f99b272481051 (patch)
tree	a7761f1820303edf408250876978b62c8869d8ff /vespalib
parent	58940f65f1d3d9df2ff1c3d1225688050d36c212 (diff)