26 files changed, 222 insertions, 147 deletions
diff --git a/vespamalloc/CMakeLists.txt b/vespamalloc/CMakeLists.txt
index 11bc1b60983..ef629698590 100644
--- a/vespamalloc/CMakeLists.txt
+++ b/vespamalloc/CMakeLists.txt
@@ -3,7 +3,7 @@ add_compile_options(-fvisibility=hidden)
 add_definitions(-DPARANOID_LEVEL=0)
 
 vespa_define_module(
-    TEST_DEPENDS
+    DEPENDS
     fastos
     vespalib
     vespalog
diff --git a/vespamalloc/src/tests/allocfree/allocfree.cpp b/vespamalloc/src/tests/allocfree/allocfree.cpp
index 7e81aaa9c1d..f1ecb74754b 100644
--- a/vespamalloc/src/tests/allocfree/allocfree.cpp
+++ b/vespamalloc/src/tests/allocfree/allocfree.cpp
@@ -1,15 +1,16 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include "producerconsumer.h"
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
 #include <vespa/vespalib/testkit/testapp.h>
+#include "producerconsumer.h"
 #include <map>
 
-#include <vespa/log/log.h>
-LOG_SETUP("allocfree_test");
-
 using vespalib::Consumer;
 using vespalib::Producer;
 using vespalib::ProducerConsumer;
 
+LOG_SETUP("allocfree_test");
+
 TEST_SETUP(Test);
 
 //-----------------------------------------------------------------------------
diff --git a/vespamalloc/src/tests/allocfree/creatingmanythreads.cpp b/vespamalloc/src/tests/allocfree/creatingmanythreads.cpp
index 86da05311c6..53de3f274cc 100644
--- a/vespamalloc/src/tests/allocfree/creatingmanythreads.cpp
+++ b/vespamalloc/src/tests/allocfree/creatingmanythreads.cpp
@@ -1,7 +1,8 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/log/log.h>
+#include <vespa/fastos/fastos.h>
 #include <vespa/vespalib/testkit/testapp.h>
 
-#include <vespa/log/log.h>
 LOG_SETUP("creatingmanythreads_test");
 
 TEST_SETUP(Test);
diff --git a/vespamalloc/src/tests/allocfree/linklist.cpp b/vespamalloc/src/tests/allocfree/linklist.cpp
index 9642c987899..39cd237420b 100644
--- a/vespamalloc/src/tests/allocfree/linklist.cpp
+++ b/vespamalloc/src/tests/allocfree/linklist.cpp
@@ -1,15 +1,17 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include "producerconsumer.h"
+#include <vespa/fastos/fastos.h>
 #include <vespa/vespalib/testkit/testapp.h>
+#include "producerconsumer.h"
 #include <vespamalloc/malloc/allocchunk.h>
 #include <vespamalloc/util/callstack.h>
 #include <vespa/log/log.h>
-LOG_SETUP("linklist_test");
 
 using vespalib::Consumer;
 using vespalib::Producer;
 using vespalib::ProducerConsumer;
 
+LOG_SETUP("linklist_test");
+
 TEST_SETUP(Test);
 
 //-----------------------------------------------------------------------------
@@ -58,9 +60,9 @@ List globalList[NumBlocks];
 
 class LinkIn : public Consumer {
 public:
-    LinkIn(List::AtomicHeadPtr & list, uint32_t maxQueue, bool inverse);
+    LinkIn(List::HeadPtr & list, uint32_t maxQueue, bool inverse);
 private:
-    List::AtomicHeadPtr & _head;
+    List::HeadPtr & _head;
     virtual void consume(void * p) {
         List * l((List *) p);
         if ( ! ((l >= &globalList[0]) && (l < &globalList[NumBlocks]))) { abort(); }
@@ -68,7 +70,7 @@ private:
     }
 };
 
-LinkIn::LinkIn(List::AtomicHeadPtr & list, uint32_t maxQueue, bool inverse) :
+LinkIn::LinkIn(List::HeadPtr & list, uint32_t maxQueue, bool inverse) :
     Consumer (maxQueue, inverse),
     _head(list)
 {
@@ -78,10 +80,10 @@ LinkIn::LinkIn(List::AtomicHeadPtr & list, uint32_t maxQueue, bool inverse) :
 
 class LinkOut : public Producer {
 public:
-    LinkOut(List::AtomicHeadPtr & list, uint32_t cnt, LinkIn &target)
+    LinkOut(List::HeadPtr & list, uint32_t cnt, LinkIn &target)
         : Producer(cnt, target), _head(list) {}
 private:
-    List::AtomicHeadPtr & _head;
+    List::HeadPtr & _head;
     virtual void * produce()       {
         void *p = List::linkOut(_head);
         List *l((List *)p);
@@ -94,10 +96,10 @@ private:
 
 class LinkInOutAndIn : public ProducerConsumer {
 public:
-    LinkInOutAndIn(List::AtomicHeadPtr & list, uint32_t cnt, bool inverse)
+    LinkInOutAndIn(List::HeadPtr & list, uint32_t cnt, bool inverse)
         : ProducerConsumer(cnt, inverse), _head(list) { }
 private:
-    List::AtomicHeadPtr & _head;
+    List::HeadPtr & _head;
     virtual void * produce()       {
         void *p = List::linkOut(_head);
         List *l((List *)p);
@@ -123,7 +125,10 @@ int Test::Main() {
     ASSERT_EQUAL(1024ul, sizeof(List));
 
     FastOS_ThreadPool      pool(128000);
-    List::AtomicHeadPtr    sharedList(List::HeadPtr(nullptr, 1));
+    List::HeadPtr    sharedList;
+    sharedList._tag = 1;
+    List::init();
+    List::enableThreadSupport();
     fprintf(stderr, "Start populating list\n");
     for (size_t i=0; i < NumBlocks; i++) {
         List * l(&globalList[i]);
@@ -138,9 +143,7 @@ int Test::Main() {
     List *n =  List::linkOut(sharedList);
     ASSERT_TRUE(n == NULL);
 
-    List::HeadPtr tmp(sharedList.load());
-    tmp._tag = 1;
-    sharedList.store(tmp);
+    sharedList._tag = 1;
     fprintf(stderr, "Start populating list\n");
     for (size_t i=0; i < NumBlocks; i++) {
         List * l(&globalList[i]);
diff --git a/vespamalloc/src/tests/allocfree/realloc.cpp b/vespamalloc/src/tests/allocfree/realloc.cpp
index efaf89f7e1b..8cfd50d0132 100644
--- a/vespamalloc/src/tests/allocfree/realloc.cpp
+++ b/vespamalloc/src/tests/allocfree/realloc.cpp
@@ -1,7 +1,10 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
+#include <vespa/log/log.h>
+#include <vespa/fastos/fastos.h>
 #include <vespa/vespalib/testkit/testapp.h>
 
+LOG_SETUP("realloc_test");
+
 TEST_SETUP(Test);
 
 int Test::Main() {
diff --git a/vespamalloc/src/tests/doubledelete/expectsignal.cpp b/vespamalloc/src/tests/doubledelete/expectsignal.cpp
index f1fb6eb5694..0b2d5e154c4 100644
--- a/vespamalloc/src/tests/doubledelete/expectsignal.cpp
+++ b/vespamalloc/src/tests/doubledelete/expectsignal.cpp
@@ -1,7 +1,11 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/log/log.h>
+#include <vespa/fastos/fastos.h>
 #include <vespa/vespalib/testkit/testapp.h>
 #include <vespa/vespalib/util/slaveproc.h>
 
+LOG_SETUP("expectsignal_test");
+
 using namespace vespalib;
 
 class Test : public TestApp
diff --git a/vespamalloc/src/tests/overwrite/expectsignal.cpp b/vespamalloc/src/tests/overwrite/expectsignal.cpp
index f1fb6eb5694..0b2d5e154c4 100644
--- a/vespamalloc/src/tests/overwrite/expectsignal.cpp
+++ b/vespamalloc/src/tests/overwrite/expectsignal.cpp
@@ -1,7 +1,11 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/log/log.h>
+#include <vespa/fastos/fastos.h>
 #include <vespa/vespalib/testkit/testapp.h>
 #include <vespa/vespalib/util/slaveproc.h>
 
+LOG_SETUP("expectsignal_test");
+
 using namespace vespalib;
 
 class Test : public TestApp
diff --git a/vespamalloc/src/tests/overwrite/overwrite.cpp b/vespamalloc/src/tests/overwrite/overwrite.cpp
index 8c35fe841fe..d7057444505 100644
--- a/vespamalloc/src/tests/overwrite/overwrite.cpp
+++ b/vespamalloc/src/tests/overwrite/overwrite.cpp
@@ -1,6 +1,10 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/log/log.h>
+#include <vespa/fastos/fastos.h>
 #include <vespa/vespalib/testkit/testapp.h>
 
+LOG_SETUP("overwrite_test");
+
 using namespace vespalib;
 
 class Test : public TestApp
diff --git a/vespamalloc/src/tests/test.cpp b/vespamalloc/src/tests/test.cpp
index d6208fdc240..24acb3368d8 100644
--- a/vespamalloc/src/tests/test.cpp
+++ b/vespamalloc/src/tests/test.cpp
@@ -1,7 +1,7 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include <stdio.h>
 #include <stdlib.h>
-#include <vespa/fastos/thread.h>
+#include <vespa/fastos/fastos.h>
 
 namespace vespamalloc {
 void info();
diff --git a/vespamalloc/src/tests/test1/CMakeLists.txt b/vespamalloc/src/tests/test1/CMakeLists.txt
index 3120f1d39d0..205de0f96fc 100644
--- a/vespamalloc/src/tests/test1/CMakeLists.txt
+++ b/vespamalloc/src/tests/test1/CMakeLists.txt
@@ -3,6 +3,5 @@ vespa_add_executable(vespamalloc_testatomic_app TEST
     SOURCES
     testatomic.cpp
     DEPENDS
-    atomic
 )
 vespa_add_test(NAME vespamalloc_testatomic_app NO_VALGRIND COMMAND vespamalloc_testatomic_app)
diff --git a/vespamalloc/src/tests/test1/testatomic.cpp b/vespamalloc/src/tests/test1/testatomic.cpp
index 78d94429a3f..1222493446c 100644
--- a/vespamalloc/src/tests/test1/testatomic.cpp
+++ b/vespamalloc/src/tests/test1/testatomic.cpp
@@ -1,8 +1,10 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/fastos/thread.h>
+#include <vespa/fastos/fastos.h>
 #include <vespa/vespalib/testkit/testapp.h>
 #include <vespa/vespalib/util/atomic.h>
-#include <vespamalloc/malloc/allocchunk.h>
+#include <vector>
+
+using vespalib::Atomic;
 
 class Test : public vespalib::TestApp
 {
@@ -37,10 +39,10 @@ void Test::testSwap(T initial)
 {
     T value(initial);
 
-    ASSERT_TRUE(vespalib::Atomic::cmpSwap(&value, initial+1, initial));
+    ASSERT_TRUE(Atomic::cmpSwap(&value, initial+1, initial));
     ASSERT_TRUE(value == initial+1);
 
-    ASSERT_TRUE(!vespalib::Atomic::cmpSwap(&value, initial+2, initial));
+    ASSERT_TRUE(!Atomic::cmpSwap(&value, initial+2, initial));
     ASSERT_TRUE(value == initial+1);
 }
 
@@ -89,7 +91,7 @@ template <typename T>
 void Stress<T>::stressSwap(T & value)
 {
     for (T old = value; old > 0; old = value) {
-        if (vespalib::Atomic::cmpSwap(&value, old-1, old)) {
+        if (Atomic::cmpSwap(&value, old-1, old)) {
             _successCount++;
         } else {
             _failedCount++;
@@ -101,20 +103,6 @@ int Test::Main()
 {
     TEST_INIT("atomic");
 
-    {
-        std::atomic<uint32_t> uint32V;
-        ASSERT_TRUE(uint32V.is_lock_free());
-    }
-    {
-        std::atomic<uint64_t> uint64V;
-        ASSERT_TRUE(uint64V.is_lock_free());
-    }
-    {
-        std::atomic<vespamalloc::TaggedPtr> taggedPtr;
-        ASSERT_EQUAL(16, sizeof(vespamalloc::TaggedPtr));
-        ASSERT_TRUE(taggedPtr.is_lock_free());
-    }
-
     testSwap<uint32_t>(6);
     testSwap<uint32_t>(7);
     testSwap<uint32_t>(uint32_t(-6));
diff --git a/vespamalloc/src/vespamalloc/malloc/allocchunk.cpp b/vespamalloc/src/vespamalloc/malloc/allocchunk.cpp
index ac9da6a26af..1a21e6f1c14 100644
--- a/vespamalloc/src/vespamalloc/malloc/allocchunk.cpp
+++ b/vespamalloc/src/vespamalloc/malloc/allocchunk.cpp
@@ -1,36 +1,72 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include "allocchunk.h"
+#include <vespamalloc/malloc/allocchunk.h>
 
 namespace vespamalloc {
 
+char AFListBase::_atomicLinkSpace[sizeof(AFListBase::AtomicLink)];
+char AFListBase::_lockedLinkSpace[sizeof(AFListBase::LockedLink)];
+AFListBase::LinkI     *AFListBase::_link = NULL;
 
-void AFListBase::linkInList(AtomicHeadPtr & head, AFListBase * list)
+void AFListBase::init()  // Installs the link strategy object that the static linkIn()/linkOut() wrappers call through _link.
+{
+    _link =  new (_atomicLinkSpace)AtomicLink();  // placement-new into the static _atomicLinkSpace buffer; this function always picks AtomicLink
+}
+
+AFListBase::LinkI::~LinkI()  // Out-of-line definition of LinkI's virtual destructor; the body is empty.
+{
+}
+
+void AFListBase::linkInList(HeadPtr & head, AFListBase * list)
 {
     AFListBase * tail;
     for (tail = list; tail->_next != NULL ;tail = tail->_next) { }
     linkIn(head, list, tail);
 }
 
-void AFListBase::linkIn(AtomicHeadPtr & head, AFListBase * csl, AFListBase * tail)
+void AFListBase::AtomicLink::linkIn(HeadPtr & head, AFListBase * csl, AFListBase * tail)
 {
-    HeadPtr oldHead = head.load(std::memory_order_relaxed);
+    HeadPtr oldHead = head;
     HeadPtr newHead(csl, oldHead._tag + 1);
     tail->_next = static_cast<AFListBase *>(oldHead._ptr);
-    while ( ! head.compare_exchange_weak(oldHead, newHead, std::memory_order_release, std::memory_order_relaxed) ) {
+    while ( ! Atomic::cmpSwap(&head, newHead, oldHead) ) {
+        oldHead = head;
         newHead._tag =  oldHead._tag + 1;
         tail->_next = static_cast<AFListBase *>(oldHead._ptr);
     }
 }
 
-AFListBase * AFListBase::linkOut(AtomicHeadPtr & head)
+void AFListBase::LockedLink::linkIn(HeadPtr & head, AFListBase * csl, AFListBase * tail)  // Pushes the chain csl..tail onto the front of head while holding _mutex.
+{
+    Guard guard(_mutex);  // presumably a scoped lock on _mutex (Guard is defined outside this view - confirm)
+    HeadPtr newHead(csl, head._tag + 1);  // new head points at csl; the tag is bumped by one
+    tail->_next = static_cast<AFListBase *>(head._ptr);  // the previous list contents now follow tail
+    head = newHead;  // plain assignment here; AtomicLink::linkIn uses Atomic::cmpSwap instead
+}
+
+AFListBase * AFListBase::LockedLink::linkOut(HeadPtr & head)  // Unlinks and returns the first element of head while holding _mutex; returns NULL when head._ptr is NULL.
+{
+    Guard guard(_mutex);  // presumably a scoped lock on _mutex (Guard is defined outside this view - confirm)
+    HeadPtr oldHead = head;  // snapshot of the current head (pointer and tag)
+    AFListBase *csl = static_cast<AFListBase *>(oldHead._ptr);
+    if (csl == NULL) {
+        return NULL;  // empty list: nothing to unlink, head is left unchanged
+    }
+    HeadPtr newHead(csl->_next, oldHead._tag + 1);  // head advances to the successor; the tag is bumped by one
+    head = newHead;
+    csl->_next = NULL;  // detach the returned element from the rest of the list
+    return csl;
+}
+
+AFListBase * AFListBase::AtomicLink::linkOut(HeadPtr & head)
 {
-    HeadPtr oldHead = head.load(std::memory_order_relaxed);
+    HeadPtr oldHead = head;
     AFListBase *csl = static_cast<AFListBase *>(oldHead._ptr);
     if (csl == NULL) {
         return NULL;
     }
     HeadPtr newHead(csl->_next, oldHead._tag + 1);
-    while ( ! head.compare_exchange_weak(oldHead, newHead, std::memory_order_acquire, std::memory_order_relaxed) ) {
+    while ( ! Atomic::cmpSwap(&head, newHead, oldHead) ) {
+        oldHead = head;
         csl = static_cast<AFListBase *>(oldHead._ptr);
         if (csl == NULL) {
             return NULL;
diff --git a/vespamalloc/src/vespamalloc/malloc/allocchunk.h b/vespamalloc/src/vespamalloc/malloc/allocchunk.h
index 6db29678b3d..48128e12687 100644
--- a/vespamalloc/src/vespamalloc/malloc/allocchunk.h
+++ b/vespamalloc/src/vespamalloc/malloc/allocchunk.h
@@ -1,43 +1,53 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include "common.h"
+#include <vespamalloc/malloc/common.h>
 #include <algorithm>
 
 namespace vespamalloc {
 
-/**
- * @brief Pointer and tag - use instead of bare pointer for cmpSwap()
- *
- * When making a lock-free data structure by using cmpSwap
- * on pointers, you'll often run into the "ABA problem", see
- * http://en.wikipedia.org/wiki/ABA_problem for details.
- * The TaggedPtr makes it easy to do the woraround with tag bits,
- * but requires the double-word compare-and-swap instruction.
- * Very early Amd K7/8 CPUs are lacking this and will fail (Illegal Instruction).
- **/
-struct TaggedPtr {
-    TaggedPtr() noexcept : _ptr(nullptr), _tag(0) { }
-    TaggedPtr(void *h, size_t t) noexcept : _ptr(h), _tag(t) {}
-
-    void *_ptr;
-    size_t _tag;
-};
-
 class AFListBase
 {
 public:
-    using HeadPtr = TaggedPtr;
-    using AtomicHeadPtr = std::atomic<HeadPtr>;
+    typedef Atomic::TaggedPtr HeadPtr;
     AFListBase() : _next(NULL) { }
     void setNext(AFListBase * csl)           { _next = csl; }
     static void init();
-    static void linkInList(AtomicHeadPtr & head, AFListBase * list);
-    static void linkIn(AtomicHeadPtr & head, AFListBase * csl, AFListBase * tail);
+    static void enableThreadSupport()   { _link->enableThreadSupport(); }
+    static void linkInList(HeadPtr & head, AFListBase * list);
+    static void linkIn(HeadPtr & head, AFListBase * csl, AFListBase * tail) {
+        _link->linkIn(head, csl, tail);
+    }
 protected:
     AFListBase * getNext()                      { return _next; }
-    static AFListBase * linkOut(AtomicHeadPtr & head);
+    static AFListBase * linkOut(HeadPtr & head) { return _link->linkOut(head); }
 private:
+    class LinkI
+    {
+    public:
+        virtual ~LinkI();
+        virtual void enableThreadSupport() { }
+        virtual void linkIn(HeadPtr & head, AFListBase * csl, AFListBase * tail) = 0;
+        virtual AFListBase * linkOut(HeadPtr & head) = 0;
+    };
+    class AtomicLink : public LinkI
+    {
+    private:
+        virtual void linkIn(HeadPtr & head, AFListBase * csl, AFListBase * tail);
+        virtual AFListBase * linkOut(HeadPtr & head);
+    };
+    class LockedLink : public LinkI
+    {
+    public:
+        virtual void enableThreadSupport() { _mutex.init(); }
+    private:
+        virtual void linkIn(HeadPtr & head, AFListBase * csl, AFListBase * tail);
+        virtual AFListBase * linkOut(HeadPtr & head);
+        Mutex _mutex;
+    };
+    static char _atomicLinkSpace[sizeof(AtomicLink)];
+    static char _lockedLinkSpace[sizeof(LockedLink)];
+    static LinkI     *_link;
     AFListBase       *_next;
 };
 
@@ -64,7 +74,7 @@ public:
     bool full()                 const { return (_count == NumBlocks); }
     size_t fill(void * mem, SizeClassT sc, size_t blocksPerChunk = NumBlocks);
     AFList * getNext()                { return static_cast<AFList *>(AFListBase::getNext()); }
-    static AFList * linkOut(AtomicHeadPtr & head) {
+    static AFList * linkOut(HeadPtr & head) {
         return static_cast<AFList *>(AFListBase::linkOut(head));
     }
 private:
diff --git a/vespamalloc/src/vespamalloc/malloc/common.cpp b/vespamalloc/src/vespamalloc/malloc/common.cpp
index ed0551cd853..d14a0317630 100644
--- a/vespamalloc/src/vespamalloc/malloc/common.cpp
+++ b/vespamalloc/src/vespamalloc/malloc/common.cpp
@@ -1,10 +1,10 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include "common.h"
+#include <vespamalloc/malloc/common.h>
 #include <pthread.h>
 
 namespace vespamalloc {
 
-std::atomic<uint32_t> Mutex::_threadCount(0);
+uint32_t Mutex::_threadCount = 0;
 bool     Mutex::_stopRecursion = true;
 
 void Mutex::lock()
diff --git a/vespamalloc/src/vespamalloc/malloc/common.h b/vespamalloc/src/vespamalloc/malloc/common.h
index 5041b545b4d..ee08cfbafaa 100644
--- a/vespamalloc/src/vespamalloc/malloc/common.h
+++ b/vespamalloc/src/vespamalloc/malloc/common.h
@@ -1,10 +1,13 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
+#include <vespa/vespalib/util/atomic.h>
+#include <vespa/vespalib/util/optimized.h>
 #include <new>
-#include <atomic>
 #include <vespamalloc/util/osmem.h>
 
+using vespalib::Atomic;
+
 extern "C" void MallocRecurseOnSuspend(bool recurse) __attribute__ ((noinline));
 
 namespace vespamalloc {
@@ -55,18 +58,13 @@ typedef MmapMemory OSMemory;
 
 typedef int SizeClassT;
 
-   
-inline int msbIdx(uint64_t v) {
-    return v ? 63 - __builtin_clzl(v) : 0;
-}    
-
 template <size_t MinClassSizeC>
 class CommonT
 {
 public:
     enum {MinClassSize = MinClassSizeC};
     static inline SizeClassT sizeClass(size_t sz) {
-        SizeClassT tmp(msbIdx(sz - 1) - (MinClassSizeC - 1));
+        SizeClassT tmp(vespalib::Optimized::msbIdx(sz - 1) - (MinClassSizeC - 1));
         return (sz <= (1 << MinClassSizeC )) ? 0 : tmp;
     }
     static inline size_t classSize(SizeClassT sc) { return (size_t(1) << (sc + MinClassSizeC)); }
@@ -84,14 +82,14 @@ public:
     ~Mutex()           { quit(); }
     void lock();
     void unlock();
-    static void addThread()      { _threadCount.fetch_add(1); }
-    static void subThread()      { _threadCount.fetch_sub(1); }
+    static void addThread()      { Atomic::postInc(&_threadCount); }
+    static void subThread()      { Atomic::postDec(&_threadCount); }
     static void stopRecursion()  { _stopRecursion = true; }
     static void allowRecursion() { _stopRecursion = false; }
     void init();
     void quit();
 private:
-    static std::atomic<uint32_t> _threadCount;
+    static uint32_t _threadCount;
     static bool     _stopRecursion;
     Mutex(const Mutex & org);
     Mutex & operator = (const Mutex & org);
diff --git a/vespamalloc/src/vespamalloc/malloc/datasegment.h b/vespamalloc/src/vespamalloc/malloc/datasegment.h
index 50f19233f04..c50d43dc1d8 100644
--- a/vespamalloc/src/vespamalloc/malloc/datasegment.h
+++ b/vespamalloc/src/vespamalloc/malloc/datasegment.h
@@ -1,7 +1,7 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include <climits>
+#include <limits.h>
 #include <memory>
 #include <vespamalloc/malloc/common.h>
 #include <vespamalloc/util/traceutil.h>
diff --git a/vespamalloc/src/vespamalloc/malloc/globalpool.h b/vespamalloc/src/vespamalloc/malloc/globalpool.h
index 9757441524e..0669780b796 100644
--- a/vespamalloc/src/vespamalloc/malloc/globalpool.h
+++ b/vespamalloc/src/vespamalloc/malloc/globalpool.h
@@ -1,9 +1,9 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include "common.h"
-#include "allocchunk.h"
-#include "datasegment.h"
+#include <vespamalloc/malloc/common.h>
+#include <vespamalloc/malloc/allocchunk.h>
+#include <vespamalloc/malloc/datasegment.h>
 #include <algorithm>
 
 #define USE_STAT2(a) a
@@ -46,8 +46,8 @@ private:
     {
     public:
         AllocFree() : _full(), _empty() { }
-        typename ChunkSList::AtomicHeadPtr _full;
-        typename ChunkSList::AtomicHeadPtr _empty;
+        typename ChunkSList::HeadPtr _full;
+        typename ChunkSList::HeadPtr _empty;
     };
     class Stat
     {
@@ -58,13 +58,13 @@ private:
                  _exchangeFree(0),
                  _exactAlloc(0),
                  _return(0),_malloc(0) { }
-        std::atomic<size_t> _getAlloc;
-        std::atomic<size_t> _getFree;
-        std::atomic<size_t> _exchangeAlloc;
-        std::atomic<size_t> _exchangeFree;
-        std::atomic<size_t> _exactAlloc;
-        std::atomic<size_t> _return;
-        std::atomic<size_t> _malloc;
+        size_t _getAlloc;
+        size_t _getFree;
+        size_t _exchangeAlloc;
+        size_t _exchangeFree;
+        size_t _exactAlloc;
+        size_t _return;
+        size_t _malloc;
         bool isUsed()       const {
             // Do not count _getFree.
             return (_getAlloc || _exchangeAlloc || _exchangeFree || _exactAlloc || _return || _malloc);
@@ -73,11 +73,11 @@ private:
 
     Mutex                       _mutex;
     ChunkSList                * _chunkPool;
-    AllocFree                   _scList[NUM_SIZE_CLASSES];
+    AllocFree                   _scList[NUM_SIZE_CLASSES] VESPALIB_ATOMIC_TAGGEDPTR_ALIGNMENT;
     DataSegment<MemBlockPtrT> & _dataSegment;
-    std::atomic<size_t>         _getChunks;
-    std::atomic<size_t>         _getChunksSum;
-    std::atomic<size_t>         _allocChunkList;
+    size_t                      _getChunks;
+    size_t                      _getChunksSum;
+    size_t                      _allocChunkList;
     Stat                        _stat[NUM_SIZE_CLASSES];
     static size_t               _threadCacheLimit __attribute__((visibility("hidden")));
     static size_t               _alwaysReuseLimit __attribute__((visibility("hidden")));
diff --git a/vespamalloc/src/vespamalloc/malloc/globalpool.hpp b/vespamalloc/src/vespamalloc/malloc/globalpool.hpp
index d84305ac2ee..b620c388fb6 100644
--- a/vespamalloc/src/vespamalloc/malloc/globalpool.hpp
+++ b/vespamalloc/src/vespamalloc/malloc/globalpool.hpp
@@ -1,10 +1,12 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include "globalpool.h"
+#include <vespamalloc/malloc/globalpool.h>
 
 #define USE_STAT2(a) a
 
+using vespalib::Atomic;
+
 namespace vespamalloc {
 
 template <typename MemBlockPtrT>
@@ -20,6 +22,7 @@ AllocPoolT<MemBlockPtrT>::AllocPoolT(DataSegment<MemBlockPtrT> & ds)
       _getChunksSum(0),
       _allocChunkList(0)
 {
+    ChunkSList::init();
     memset(_scList, 0, sizeof(_scList));
 }
 
@@ -31,6 +34,7 @@ AllocPoolT<MemBlockPtrT>::~AllocPoolT()
 template <typename MemBlockPtrT>
 void AllocPoolT<MemBlockPtrT>::enableThreadSupport()
 {
+    ChunkSList::enableThreadSupport();
     _mutex.init();
 }
 
@@ -38,11 +42,11 @@ template <typename MemBlockPtrT>
 typename AllocPoolT<MemBlockPtrT>::ChunkSList *
 AllocPoolT<MemBlockPtrT>::getFree(SizeClassT sc)
 {
-    typename ChunkSList::AtomicHeadPtr & empty = _scList[sc]._empty;
+    typename ChunkSList::HeadPtr & empty = _scList[sc]._empty;
     ChunkSList * csl(NULL);
     while ((csl = ChunkSList::linkOut(empty)) == NULL) {
         Guard sync(_mutex);
-        if (empty.load(std::memory_order_relaxed)._ptr == NULL) {
+        if (empty._ptr == NULL) {
             ChunkSList * ncsl(getChunks(sync, 1));
             if (ncsl) {
                 ChunkSList::linkInList(empty, ncsl);
@@ -61,10 +65,10 @@ typename AllocPoolT<MemBlockPtrT>::ChunkSList *
 AllocPoolT<MemBlockPtrT>::getAlloc(SizeClassT sc)
 {
     ChunkSList * csl(NULL);
-    typename ChunkSList::AtomicHeadPtr & full = _scList[sc]._full;
+    typename ChunkSList::HeadPtr & full = _scList[sc]._full;
     while ((csl = ChunkSList::linkOut(full)) == NULL) {
         Guard sync(_mutex);
-        if (full.load(std::memory_order_relaxed)._ptr == NULL) {
+        if (full._ptr == NULL) {
             ChunkSList * ncsl(malloc(sync, sc));
             if (ncsl) {
                 ChunkSList::linkInList(full, ncsl);
@@ -72,7 +76,7 @@ AllocPoolT<MemBlockPtrT>::getAlloc(SizeClassT sc)
                 return NULL;
             }
         }
-        USE_STAT2(_stat[sc]._getAlloc.fetch_add(1, std::memory_order_relaxed));
+        USE_STAT2(Atomic::postInc(&_stat[sc]._getAlloc));
     }
     PARANOID_CHECK1( if (csl->empty() || (csl->count() > ChunkSList::NumBlocks)) { *(int*)0 = 0; } );
     return csl;
@@ -83,7 +87,7 @@ typename AllocPoolT<MemBlockPtrT>::ChunkSList *
 AllocPoolT<MemBlockPtrT>::getFree(SizeClassT sc, size_t UNUSED(minBlocks))
 {
     ChunkSList * csl = getFree(sc);
-    USE_STAT2(_stat[sc]._getFree.fetch_add(1, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_stat[sc]._getFree));
     return csl;
 }
 
@@ -95,7 +99,7 @@ AllocPoolT<MemBlockPtrT>::exchangeFree(SizeClassT sc, typename AllocPoolT<MemBlo
     AllocFree & af = _scList[sc];
     ChunkSList::linkIn(af._full, csl, csl);
     ChunkSList *ncsl = getFree(sc);
-    USE_STAT2(_stat[sc]._exchangeFree.fetch_add(1, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_stat[sc]._exchangeFree));
     return ncsl;
 }
 
@@ -107,7 +111,7 @@ AllocPoolT<MemBlockPtrT>::exchangeAlloc(SizeClassT sc, typename AllocPoolT<MemBl
     AllocFree & af = _scList[sc];
     ChunkSList::linkIn(af._empty, csl, csl);
     ChunkSList * ncsl = getAlloc(sc);
-    USE_STAT2(_stat[sc]._exchangeAlloc.fetch_add(1, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_stat[sc]._exchangeAlloc));
     PARANOID_CHECK1( if (ncsl->empty() || (ncsl->count() > ChunkSList::NumBlocks)) { *(int*)0 = 0; } );
     return ncsl;
 }
@@ -122,7 +126,7 @@ AllocPoolT<MemBlockPtrT>::exactAlloc(size_t exactSize, SizeClassT sc,
     MemBlockPtrT mem(exactBlock, MemBlockPtrT::unAdjustSize(adjustedSize));
     csl->add(mem);
     ChunkSList * ncsl = csl;
-    USE_STAT2(_stat[sc]._exactAlloc.fetch_add(1, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_stat[sc]._exactAlloc));
     mem.logBigBlock(exactSize, mem.adjustSize(exactSize), MemBlockPtrT::classSize(sc));
     PARANOID_CHECK1( if (ncsl->empty() || (ncsl->count() > ChunkSList::NumBlocks)) { *(int*)0 = 0; } );
     return ncsl;
@@ -145,7 +149,7 @@ AllocPoolT<MemBlockPtrT>::returnMemory(SizeClassT sc,
     }
     completelyEmpty = csl;
 #endif
-    USE_STAT2(_stat[sc]._return.fetch_add(1, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_stat[sc]._return));
     return completelyEmpty;
 }
 
@@ -189,7 +193,7 @@ AllocPoolT<MemBlockPtrT>::malloc(const Guard & guard, SizeClassT sc)
         }
     }
     PARANOID_CHECK1( for (ChunkSList * c(csl); c; c = c->getNext()) { if (c->empty()) { *(int*)1 = 1; } } );
-    USE_STAT2(_stat[sc]._malloc.fetch_add(1, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_stat[sc]._malloc));
     return csl;
 }
 
@@ -219,8 +223,8 @@ AllocPoolT<MemBlockPtrT>::getChunks(const Guard & guard, size_t numChunks)
     } else {
         csl = NULL;
     }
-    USE_STAT2(_getChunks.fetch_add(1, std::memory_order_relaxed));
-    USE_STAT2(_getChunksSum.fetch_add(numChunks, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_getChunks));
+    USE_STAT2(_getChunksSum+=numChunks);
     PARANOID_CHECK1( for (ChunkSList * c(csl); c; c = c->getNext()) { if ( ! c->empty()) { *(int*)1 = 1; } } );
     return csl;
 }
@@ -241,7 +245,7 @@ AllocPoolT<MemBlockPtrT>::allocChunkList(const Guard & guard)
         }
         newList[chunksInBlock-1].setNext(NULL);
     }
-    USE_STAT2(_allocChunkList.fetch_add(1, std::memory_order_relaxed));
+    USE_STAT2(Atomic::postInc(&_allocChunkList));
     return newList;
 }
 
@@ -250,16 +254,16 @@ void AllocPoolT<MemBlockPtrT>::info(FILE * os, size_t level)
 {
     if (level > 0) {
         fprintf(os, "GlobalPool getChunks(%ld, %ld) allocChunksList(%ld):\n",
-                _getChunks.load(), _getChunksSum.load(), _allocChunkList.load());
+                _getChunks, _getChunksSum, _allocChunkList);
         for (size_t i = 0; i < NELEMS(_stat); i++) {
             const Stat & s = _stat[i];
             if (s.isUsed()) {
                 fprintf(os, "SC %2ld(%10ld) GetAlloc(%6ld) GetFree(%6ld) "
                             "ExChangeAlloc(%6ld) ExChangeFree(%6ld) ExactAlloc(%6ld) "
                             "Returned(%6ld) Malloc(%6ld)\n",
-                            i, MemBlockPtrT::classSize(i), s._getAlloc.load(), s._getFree.load(),
-                            s._exchangeAlloc.load(), s._exchangeFree.load(), s._exactAlloc.load(),
-                            s._return.load(), s._malloc.load());
+                            i, MemBlockPtrT::classSize(i), s._getAlloc, s._getFree,
+                            s._exchangeAlloc, s._exchangeFree, s._exactAlloc,
+                            s._return, s._malloc);
             }
         }
     }
diff --git a/vespamalloc/src/vespamalloc/malloc/memorywatcher.h b/vespamalloc/src/vespamalloc/malloc/memorywatcher.h
index aceef34ed09..9bbfa38c416 100644
--- a/vespamalloc/src/vespamalloc/malloc/memorywatcher.h
+++ b/vespamalloc/src/vespamalloc/malloc/memorywatcher.h
@@ -7,6 +7,7 @@
 #include <sys/stat.h>
 #include <ctype.h>
 #include <fcntl.h>
+#include <vespa/defaults.h>
 #include <vespamalloc/malloc/malloc.h>
 #include <vespamalloc/util/callstack.h>
 
diff --git a/vespamalloc/src/vespamalloc/malloc/mmap.cpp b/vespamalloc/src/vespamalloc/malloc/mmap.cpp
index 22a2b8fffce..6da13d9ac92 100644
--- a/vespamalloc/src/vespamalloc/malloc/mmap.cpp
+++ b/vespamalloc/src/vespamalloc/malloc/mmap.cpp
@@ -4,6 +4,7 @@
 #include <dlfcn.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <vespa/vespalib/util/backtrace.h>
 
 extern "C" {
 
@@ -56,7 +57,7 @@ void * local_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t
         }
     }
     if ((length >= getLogLimit()) && !isFromVespaMalloc(addr)) {
-        fprintf (stderr, "mmap requesting block of size %ld from %s\n", length, "no backtrace");
+        fprintf (stderr, "mmap requesting block of size %ld from %s\n", length, vespalib::getStackTrace(0).c_str());
     }
     return (*real_func)(addr, length, prot, flags, fd, offset);
 }
@@ -72,7 +73,7 @@ void * local_mmap64(void *addr, size_t length, int prot, int flags, int fd, off6
         }
     }
     if (length >= getLogLimit() && !isFromVespaMalloc(addr)) {
-        fprintf (stderr, "mmap requesting block of size %ld from %s\n", length, "no backtrace");
+        fprintf (stderr, "mmap requesting block of size %ld from %s\n", length, vespalib::getStackTrace(0).c_str());
     }
     return (*real_func)(addr, length, prot, flags, fd, offset);
 }
@@ -88,7 +89,7 @@ int local_munmap(void *addr, size_t length)
         }
     }
     if ((length >= getLogLimit()) && !isFromVespaMalloc(addr)) {
-        fprintf (stderr, "munmap releasing block of size %ld from %s\n", length, "no backtrace");
+        fprintf (stderr, "munmap releasing block of size %ld from %s\n", length, vespalib::getStackTrace(0).c_str());
     }
     return (*real_func)(addr, length);
 }
diff --git a/vespamalloc/src/vespamalloc/malloc/threadlist.h b/vespamalloc/src/vespamalloc/malloc/threadlist.h
index 875aec2942f..9901c9f6960 100644
--- a/vespamalloc/src/vespamalloc/malloc/threadlist.h
+++ b/vespamalloc/src/vespamalloc/malloc/threadlist.h
@@ -1,7 +1,7 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include "threadpool.h"
+#include <vespamalloc/malloc/threadpool.h>
 
 namespace vespamalloc {
 
@@ -41,8 +41,8 @@ private:
     ThreadListT & operator = (const ThreadListT & tl);
     enum {ThreadStackSize=2048*1024};
     volatile bool              _isThreaded;
-    std::atomic<size_t>        _threadCount;
-    std::atomic<size_t>        _threadCountAccum;
+    volatile size_t            _threadCount;
+    volatile size_t            _threadCountAccum;
     ThreadPool                 _threadVector[NUM_THREADS];
     AllocPoolT<MemBlockPtrT> & _allocPool;
     static __thread ThreadPool * _myPool TLS_LINKAGE;
diff --git a/vespamalloc/src/vespamalloc/malloc/threadlist.hpp b/vespamalloc/src/vespamalloc/malloc/threadlist.hpp
index 8aa9b6a90b5..a1ea517beed 100644
--- a/vespamalloc/src/vespamalloc/malloc/threadlist.hpp
+++ b/vespamalloc/src/vespamalloc/malloc/threadlist.hpp
@@ -1,7 +1,7 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include "threadlist.h"
+#include <vespamalloc/malloc/threadlist.h>
 
 namespace vespamalloc {
 
@@ -48,7 +48,7 @@ bool ThreadListT<MemBlockPtrT, ThreadStatT>::quitThisThread()
 {
     ThreadPool & tp = getCurrent();
     tp.quit();
-    _threadCount.fetch_sub(1);
+    Atomic::postDec(&_threadCount);
     return true;
 }
 
@@ -56,8 +56,8 @@ template <typename MemBlockPtrT, typename ThreadStatT>
 bool ThreadListT<MemBlockPtrT, ThreadStatT>::initThisThread()
 {
     bool retval(true);
-    _threadCount.fetch_add(1);
-    size_t lidAccum = _threadCountAccum.fetch_add(1);
+    Atomic::postInc(&_threadCount);
+    size_t lidAccum = Atomic::postInc(&_threadCountAccum);
     long localId(-1);
     for(size_t i = 0; (localId < 0) && (i < getMaxNumThreads()); i++) {
         ThreadPool & tp = _threadVector[i];
diff --git a/vespamalloc/src/vespamalloc/malloc/threadproxy.cpp b/vespamalloc/src/vespamalloc/malloc/threadproxy.cpp
index 193215ef83d..17da09f9b35 100644
--- a/vespamalloc/src/vespamalloc/malloc/threadproxy.cpp
+++ b/vespamalloc/src/vespamalloc/malloc/threadproxy.cpp
@@ -1,7 +1,9 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "threadproxy.h"
+#include <vespamalloc/malloc/threadproxy.h>
 #include <dlfcn.h>
+#include <pthread.h>
+#include <cstdio>
+#include <cerrno>
 
 namespace vespamalloc {
 
@@ -32,7 +34,7 @@ typedef int (*pthread_create_function) (pthread_t *thread,
 int linuxthreads_pthread_getattr_np(pthread_t pid, pthread_attr_t *dst);
 
 static void * _G_mallocThreadProxyReturnAddress = NULL;
-static std::atomic<size_t> _G_threadCount(1);  // You always have the main thread.
+static volatile size_t _G_threadCount = 1;  // You always have the main thread.
 
 static void cleanupThread(void * arg)
 {
@@ -40,7 +42,7 @@ static void cleanupThread(void * arg)
     delete ta;
     vespamalloc::_G_myMemP->quitThisThread();
     vespamalloc::Mutex::subThread();
-    _G_threadCount.fetch_sub(1);
+    vespalib::Atomic::postDec(&_G_threadCount);
 }
 
 void * mallocThreadProxy (void * arg)
@@ -75,11 +77,11 @@ VESPA_DLL_EXPORT int local_pthread_create (pthread_t *thread,
                           void * (*start_routine) (void *),
                           void * arg)
 {
-    size_t numThreads = _G_threadCount;
-    while ((numThreads < vespamalloc::_G_myMemP->getMaxNumThreads())
-           && ! _G_threadCount.compare_exchange_strong(numThreads, numThreads+1))
-    { }
-
+    size_t numThreads;
+    for (numThreads = _G_threadCount
+        ;(numThreads < vespamalloc::_G_myMemP->getMaxNumThreads()) && ! vespalib::Atomic::cmpSwap(&_G_threadCount, numThreads+1, numThreads)
+        ; numThreads = _G_threadCount) {
+    }
     if (numThreads >= vespamalloc::_G_myMemP->getMaxNumThreads()) {
         return EAGAIN;
     }
diff --git a/vespamalloc/src/vespamalloc/malloc/threadproxy.h b/vespamalloc/src/vespamalloc/malloc/threadproxy.h
index 0d86bef9e95..4865e5fbd5f 100644
--- a/vespamalloc/src/vespamalloc/malloc/threadproxy.h
+++ b/vespamalloc/src/vespamalloc/malloc/threadproxy.h
@@ -1,7 +1,7 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include "common.h"
+#include <vespamalloc/malloc/common.h>
 
 namespace vespamalloc {
 
diff --git a/vespamalloc/src/vespamalloc/util/index.h b/vespamalloc/src/vespamalloc/util/index.h
index b2c20f7bf02..f7513114edc 100644
--- a/vespamalloc/src/vespamalloc/util/index.h
+++ b/vespamalloc/src/vespamalloc/util/index.h
@@ -1,7 +1,8 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include <cstddef>
+#include <vespa/vespalib/util/atomic.h>
+#include <stdio.h>
 
 namespace vespamalloc {
 
@@ -19,4 +20,19 @@ private:
     index_t _index;
 };
 
+class AtomicIndex // Index counter: every read-modify-write operator goes through an atomic primitive.
+{
+public:
+    typedef size_t index_t;
+    AtomicIndex(index_t index = 0) : _index(index) { } // intentionally implicit from index_t; starts at 0 by default
+    operator index_t ()       const { return _index; } // plain read of the current value
+    index_t operator ++ (int)       { return vespalib::Atomic::postInc(&_index); } // returns whatever Atomic::postInc yields
+    index_t operator -- (int)       { return vespalib::Atomic::postDec(&_index); } // returns whatever Atomic::postDec yields
+    index_t operator += (index_t v) { return __sync_add_and_fetch(&_index, v); } // atomic add; returns the new value, as `_index += v` did
+    index_t operator -= (index_t v) { return __sync_sub_and_fetch(&_index, v); } // atomic subtract; returns the new value
+private:
+    index_t _index;
+};
+
 }
+
diff --git a/vespamalloc/src/vespamalloc/util/osmem.h b/vespamalloc/src/vespamalloc/util/osmem.h
index b95f8ac72e8..f5c51c2000d 100644
--- a/vespamalloc/src/vespamalloc/util/osmem.h
+++ b/vespamalloc/src/vespamalloc/util/osmem.h
@@ -1,11 +1,11 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include <cctype>
-#include <cstdlib>
+#include <ctype.h>
+#include <stdlib.h>
 #include <unistd.h>
-#include <cassert>
-#include <cstring>
+#include <assert.h>
+#include <string.h>
 #include <algorithm>
 
 namespace vespamalloc {