[mathicgb] 122/393: Fixed bug in Atomic.hpp (relaxed load/store must not be cached in registers) and generally cleaned up the atomics code.
Doug Torrance
dtorrance-guest at moszumanska.debian.org
Fri Apr 3 15:58:45 UTC 2015
This is an automated email from the git hooks/post-receive script.
dtorrance-guest pushed a commit to branch upstream
in repository mathicgb.
commit a382a515bd4bb1f72e34f779cef48a6f091a24d1
Author: Bjarke Hammersholt Roune <bjarkehr.code at gmail.com>
Date: Tue Dec 11 21:01:25 2012 +0100
Fixed bug in Atomic.hpp (relaxed load/store must not be cached in registers) and generally cleaned up the atomics code.
---
src/mathicgb/Atomic.hpp | 256 ++++++++++++++++++++++++++++--------------------
1 file changed, 148 insertions(+), 108 deletions(-)
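The bug referred to in the commit message, as a minimal sketch (illustrative names, not taken from the patch): with a plain member read, the compiler may keep a relaxed load cached in a register, so a spin loop never observes another thread's write. The patch below makes every relaxed access go through a volatile reference so each load and store really touches memory.

    // Sketch of the failure mode: nothing in the loop writes to `value`,
    // so the compiler may load it once into a register and spin forever,
    // even after another thread writes 1.
    int value = 0;

    void spinUntilSet() {
      while (value == 0) {
      }
    }

    // The approach taken by the patch, applied here to a plain int: a
    // volatile access forces a real memory read/write on every call.
    int loadRelaxed() {
      return const_cast<volatile int&>(value);
    }

    void storeRelaxed(const int v) {
      const_cast<volatile int&>(value) = v;
    }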
diff --git a/src/mathicgb/Atomic.hpp b/src/mathicgb/Atomic.hpp
index d91c874..49c0690 100755
--- a/src/mathicgb/Atomic.hpp
+++ b/src/mathicgb/Atomic.hpp
@@ -5,42 +5,36 @@
// using std::atomic.
#include <atomic>
-#if defined(_MSC_VER) && defined(MATHICGB_USE_CUSTOM_ATOMIC_X86_X64)
-
-/// Tells the compiler (not the CPU) to not reorder reads across this line.
-#define MATHICGB_COMPILER_READ_MEMORY_BARRIER _ReadBarrier()
-
-/// Tells the compiler (not the CPU) to not reorder writes across this line.
-#define MATHICGB_COMPILER_WRITE_MEMORY_BARRIER _WriteBarrier()
+namespace AtomicInternal {
+ /// Tells the compiler (not the CPU) to not reorder reads across this line.
+ inline void compilerReadMemoryBarrier();
-/// Tells the compiler (not the CPU) to not reorder reads and writes across
-/// this line.
-#define MATHICGB_COMPILER_READ_WRITE_MEMORY_BARRIER _ReadWriteBarrier()
+ /// Tells the compiler (not the CPU) to not reorder writes across this line.
+ inline void compilerWriteMemoryBarrier();
-/// Tells the CPU and also the compiler to not reorder reads and writes
-/// across this line.
-#define MATHICGB_CPU_READ_WRITE_MEMORY_BARRIER MemoryBarrier()
+ /// Tells the compiler (not the CPU) to not reorder reads and writes across
+ /// this line.
+ inline void compilerReadWriteMemoryBarrier();
-/// Loads a variable with sequentially consistent ordering. The variable
-/// must be aligned and have a size such that aligned loads of that size
-/// are atomic.
-#define MATHICGB_SEQ_CST_LOAD(REF) \
- ::AtomicInternalMsvc::SeqCstSelect<decltype(REF)>::load(REF)
+ /// Tells the CPU and also the compiler to not reorder reads and writes
+ /// across this line.
+ inline void cpuReadWriteMemoryBarrier();
-/// Stores a variable with sequentially consistent ordering. The variable
-/// must be aligned and have a size such that aligned reads of that size
-/// are atomic.
-#define MATHICGB_SEQ_CST_STORE(VALUE, REF) \
- ::AtomicInternalMsvc::SeqCstSelect<decltype(REF)>::store(VALUE, REF)
+ /// Stores a variable with sequentially consistent ordering. The variable
+ /// must be aligned and have a size such that aligned reads of that size
+ /// are atomic. This requires a locked instruction like XCHG according to
+ /// http://goo.gl/U8xTK .
+ template<class T>
+ void seqCstStore(const T value, T& ref);
+}
+#if defined(_MSC_VER) && defined(MATHICGB_USE_CUSTOM_ATOMIC_X86_X64)
#include <Windows.h>
-namespace AtomicInternalMsvc {
- template<class T, size_t size> struct SeqCst {};
+namespace AtomicInternal {
+ template<class T, size_t size = sizeof(T)>
+ struct SeqCst {};
#ifdef MATHICGB_USE_CUSTOM_ATOMIC_4BYTE
template<class T> struct SeqCst<T, 4> {
- static T load(const T& ref) {
- return (T)_InterlockedOr((volatile LONG*)&ref, 0);
- }
static void store(const T value, T& ref) {
_InterlockedExchange((volatile LONG*)&ref, (LONG)value);
}
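A quick illustration of how the size-based dispatch above resolves (hypothetical call site, assuming MSVC with both MATHICGB_USE_CUSTOM_ATOMIC_4BYTE and MATHICGB_USE_CUSTOM_ATOMIC_8BYTE defined): the default template argument size = sizeof(T) selects the matching specialization, which in turn picks the right intrinsic.

    // Illustrative only; seqCstStore is the wrapper declared above and
    // defined further down in this file.
    void dispatchExample() {
      int       i32 = 0;
      long long i64 = 0;
      AtomicInternal::seqCstStore(1, i32);    // SeqCst<int, 4>       -> _InterlockedExchange
      AtomicInternal::seqCstStore(1LL, i64);  // SeqCst<long long, 8> -> _InterlockedExchange64
    }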
@@ -48,55 +42,46 @@ namespace AtomicInternalMsvc {
#endif
#ifdef MATHICGB_USE_CUSTOM_ATOMIC_8BYTE
template<class T> struct SeqCst<T, 8> {
- static T load(const T& ref) {
- return (T)_InterlockedOr64((volatile _LONGLONG*)&ref, 0);
- }
static void store(const T value, T& ref) {
_InterlockedExchange64((volatile _LONGLONG*)&ref, (_LONGLONG)value);
}
};
#endif
- template<class T> struct SeqCstSelect : public SeqCst<T, sizeof(T)> {};
+
+ inline void compilerReadMemoryBarrier() {_ReadBarrier();}
+ inline void compilerWriteMemoryBarrier() {_WriteBarrier();}
+ inline void compilerReadWriteMemoryBarrier() {_ReadWriteBarrier();}
+ inline void cpuReadWriteMemoryBarrier() {MemoryBarrier();}
+ template<class T>
+ void seqCstStore(const T value, T& ref) {SeqCst<T>::store(value, ref);}
}
#endif
#if defined(__GNUC__) && defined(MATHICGB_USE_CUSTOM_ATOMIC_X86_X64)
+namespace AtomicInternal {
+ inline void compilerReadMemoryBarrier() {
+ // As far as I can tell there is no way to do a partial optimization
+ // barrier on GCC, so we have to do the full barrier every time.
+ compilerReadWriteMemoryBarrier();
+ }
-// As far as I can tell this is not documented to work, but it is the
-// only way to do this on GCC and it is what the Linux kernel does, so
-// that will have to be good enough for me.
-#define MATHICGB_COMPILER_READ_WRITE_MEMORY_BARRIER \
- __asm__ __volatile__ ("" ::: "memory");
-
-// As far as I can tell there is no way to do a partial optimization
-// barrier on GCC, so we have to do the full barrier every time.
-#define MATHICGB_COMPILER_READ_MEMORY_BARRIER \
- MATHICGB_COMPILER_READ_WRITE_MEMORY_BARRIER
-
-#define MATHICGB_COMPILER_WRITE_MEMORY_BARRIER \
- MATHICGB_COMPILER_READ_WRITE_MEMORY_BARRIER
+ inline void compilerWriteMemoryBarrier() {compilerReadWriteMemoryBarrier();}
-#define MATHICGB_CPU_READ_WRITE_MEMORY_BARRIER __sync_synchronize()
+ inline void compilerReadWriteMemoryBarrier() {
+ // As far as I can tell this is not documented to work, but it is the
+ // only way to do this on GCC and it is what the Linux kernel does, so
+ // that will have to be good enough.
+ __asm__ __volatile__ ("" ::: "memory");
+ }
-#define MATHICGB_SEQ_CST_LOAD(REF) \
- AtomicInternalGCC::SeqCst<decltype(REF)>::load(REF)
-#define MATHICGB_SEQ_CST_STORE(VALUE, REF) \
- AtomicInternalGCC::SeqCst<decltype(REF)>::store(VALUE, REF)
+ inline void cpuReadWriteMemoryBarrier() {__sync_synchronize();}
-namespace AtomicInternalGCC {
- template<class T> struct SeqCst {
- static T load(const T& ref) {
- const auto ptr = static_cast<volatile T*>(const_cast<T*>(&ref));
- return __sync_fetch_and_or((volatile T*)&ref, 0);
- }
- static void store(const T value, T& ref) {
- const auto ptr = static_cast<volatile T*>(&ref);
- while (!__sync_bool_compare_and_swap(ptr, *ptr, value)) {}
- }
- };
- template<class T> struct SeqCst<const T> : public SeqCst<T> {};
+ template<class T>
+ void seqCstStore(const T value, T& ref) {
+ const auto ptr = static_cast<volatile T*>(&ref);
+ while (!__sync_bool_compare_and_swap(ptr, *ptr, value)) {}
+ }
}
-
#endif
namespace AtomicInternal {
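A hypothetical alternative for the GCC sequentially consistent store, not part of this commit: GCC 4.7 and later provide __atomic builtins that emit a single locked exchange, which would replace the compare-and-swap loop above.

    // Sketch only, assuming GCC >= 4.7.
    template<class T>
    void seqCstStoreBuiltin(const T value, T& ref) {
      __atomic_store_n(&ref, value, __ATOMIC_SEQ_CST);
    }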
@@ -104,8 +89,8 @@ namespace AtomicInternal {
// This class has the same interface as the actual custom atomic
// class but it does absolutely no synchronization and it does not
// constrain compiler optimizations in any way. The purpose of this class
- // is to enable it while running single core to compare the single core
- // overhead of the atomic ordering constraints.
+ // is to enable it while running only a single thread to determine the
+ // overhead imposed by the atomic operations.
template<class T>
class FakeAtomic {
public:
@@ -136,13 +121,19 @@ namespace AtomicInternal {
#ifdef MATHICGB_USE_CUSTOM_ATOMIC_X86_X64
namespace AtomicInternal {
- /// Custom Atomic class for x86 and x64. Uses special compiler instructions
+ /// Custom Atomic class for x86 and x64. Uses special compiler directives
/// for barriers. Only instantiate this for sizes where aligned reads and
/// writes are guaranteed to be atomic - this class only takes care of the
/// ordering constraints using CPU and compiler fences. Since the directives
- /// to achieve this are coming from the compiler it is very strange that
+ /// to achieve this are coming from the compiler it is surprising that
/// any compiler ships with a std::atomic that is worse than this - but
- /// that is very much the case.
+ /// that is very much the case. Though implementing atomic load and store
+ /// is very little code, as you can see, it is quite tricky and took me
+ /// a long time to understand everything well enough to actually know what
+ /// those few lines of code should say. Perhaps compiler vendors have
+ /// postponed fast atomics just because they want to focus on other things
+ /// since it's not that easy to figure out how to get right (on that topic,
+ /// bug reports on this code are welcome!).
///
/// There are 5 kinds of reorderings that we are concerned with here. Let
/// S,S' be stores and let L,L' be loads. Note that these short-hands may
@@ -161,28 +152,29 @@ namespace AtomicInternal {
/// DLL can happen on DEC Alpha if p->a is cached locally while p is not.
/// Then p will be loaded from memory while p->a is loaded from the cache,
/// which is functionally identical to loading p->a before p since we may
- /// see a value of p->a that was stored before the value of p. This happens
- /// even if the processor that stored p did a full memory barrier between
- /// storing p->a and storing p.
+ /// see a value of p->a from the past while we get an up to date value of p.
+ /// This can happen even if the processor that stored p did a full memory
+ /// barrier between storing p->a and storing p.
///
/// Compilers will also perform all of these reorderings to optimize the
/// code - even including DLL. DLL happens if the compiler guesses what
/// the value of p is, loads p->a and then checks that the guess for p
/// was correct. This directly causes p->a to be actually loaded before p.
- /// These kinds of optimizations turn up in profile-driven optimization,
- /// but it is always allowed unless we tell the compiler not to do it.
+ /// These kinds of optimizations turn up in profile-driven optimization.
///
/// You can check this out here:
/// http://en.wikipedia.org/wiki/Memory_ordering
///
- /// On x86 and x64 only SL is doe by the CPU, so we need a CPU barrier to
+ /// On x86 and x64 only SL is done by the CPU, so we need a CPU barrier to
/// prevent that and nothing else. The compiler is free to perform all of
/// these reorderings, so we need lots of compiler optimization barriers
/// to deal with all of these cases.
///
- /// Some of the quotes below are from
- ///
+ /// Some of the quotes below are from:
/// http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1525.htm
+ ///
+ /// Also interesting:
+ ///http://bartoszmilewski.com/2008/11/05/who-ordered-memory-fences-on-an-x86/
template<class T>
class CustomAtomicX86X64 {
public:
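The p / p->a scenario in the comment above is the usual pointer-publication pattern. A self-contained sketch (using std::atomic for brevity; names are illustrative) of the dependent-load ordering that memory_order_consume is meant to preserve:

    #include <atomic>

    struct Node { int a; };
    std::atomic<Node*> p(nullptr);

    void publisher(Node* n) {
      n->a = 42;                              // write the payload first
      p.store(n, std::memory_order_release);  // then publish the pointer
    }

    void reader() {
      // Without consume/acquire ordering a DLL reordering could deliver a
      // stale n->a even though p itself is current (the DEC Alpha case).
      Node* const n = p.load(std::memory_order_consume);
      if (n != nullptr) {
        const int payload = n->a;  // dependent load; must observe 42
        (void)payload;
      }
    }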
@@ -193,45 +185,87 @@ namespace AtomicInternal {
T load(const std::memory_order order) const {
switch (order) {
case std::memory_order_relaxed:
- // The only constraint here is that if you read *p, then you will never
+ // There are two constraints for memory_order_relaxed. The first
+ // constraint is that if you read *p, then you will never
// after that read a value of *p that was stored before the value
// you just read, where "before" is in terms of either the same thread
// that did the writing or external synchronization of another thread
// with this thread. This is automatically guaranteed on this platform
// and the compiler cannot break this guarantee.
- return mValue;
+ //
+ // The second constraint is that writes performed by other threads will
+ // appear "in a timely manner" when reading the written-to memory
+ // location. It is not clear how much time is allowed to elapse before
+ // this constraint is broken, but clearly the written value must turn
+ // up eventually. This constraint could be broken in cases like this:
+ //
+ // while (x.load(std::memory_order_relaxed) == 0) {
+ // // do something that does not write to x
+ // }
+ //
+ // The problem is that the compiler might observe that x is not being
+ // written to inside the loop, so x.load(...) should yield the same
+ // value on every iteration, and the code can then safely be transformed
+ // to:
+ //
+ // if (x.load(std::memory_order_relaxed) == 0) {
+ // while (true) {
+ // // ...
+ // }
+ // }
+ //
+ // So we need to tell the compiler that the value can change due to
+ // code not in view of the current thread of execution. There are two
+ // solutions that I have considered:
+ //
+ // * A compiler read memory barrier. This is overkill because it stops
+ // the compiler from moving all reads across this line.
+ //
+ // * A volatile read. This is pretty much exactly what we need on gcc,
+ // but on MSVC it provides extra guarantees that we do not need here:
+ // http://msdn.microsoft.com/en-us/library/12a04hfd%28v=vs.80%29.aspx
+ //
+ // I do not think there is any best choice on MSVC, so I went for a
+ // volatile read since it is the best choice on GCC.
+ return const_cast<volatile T&>(mValue);
case std::memory_order_consume: {
// Loads in this thread that depend on the loaded value must not be
// reordered to before this load. So no DLL reorderings past this
// load from after to before (up). So we need a read barrier AFTER the
- // load. It is a compiler only barrier since the CPU does not do DLL
- // reorderings.
+ // load. It is a compiler only barrier since x86 and x64 CPUs do not do
+ // DLL reorderings.
const auto value = mValue;
- MATHICGB_COMPILER_READ_MEMORY_BARRIER;
+ compilerReadMemoryBarrier();
return value;
}
case std::memory_order_acquire: {
// Loads in this thread must not be reordered to before this load.
// So no LL reorderings past this load from after to before (up).
- // So we need a barrier AFTER the load. It is a compiler only barrier
- // since the CPU does not do LL reorderings.
+ // So we need a read barrier AFTER the load. It is a compiler only
+ // barrier since x86 and x64 CPUs do not do LL reorderings.
const auto value = mValue;
- MATHICGB_COMPILER_READ_MEMORY_BARRIER;
- return mValue;
+ compilerReadMemoryBarrier();
+ return value;
}
- case std::memory_order_seq_cst:
+ case std::memory_order_seq_cst: {
// There must be some global order in which all sequentially consistent
- // atomic operations are considered to have happened in. This is automatic
- // on x64, ARM, SPARC and x64 too for reads (but not writes) - see:
- // http://www.stdthread.co.uk/forum/index.php?topic=72.0
- return MATHICGB_SEQ_CST_LOAD(mValue);
+ // atomic operations are considered to have happened in. On x86 and x64
+ // this is guaranteed by just a normal read as long as all writes use
+ // locked instructions like XCHG. See: http://goo.gl/U8xTK
+ //
+ // We still need to prevent the compiler from reordering the reads,
+ // which is the same constraint as for std::memory_order_acquire.
+ const auto value = mValue;
+ compilerReadMemoryBarrier();
+ return value;
+ }
case std::memory_order_release: // not available for load
case std::memory_order_acq_rel: // not available for load
- default:
+ default: // specified value is not a known std::memory_order
MATHICGB_UNREACHABLE;
}
}
@@ -240,17 +274,25 @@ namespace AtomicInternal {
void store(const T value, const std::memory_order order) {
switch (order) {
case std::memory_order_relaxed:
- // No ordering constraints here other than atomicity and as noted
- // for relaxed load so we can just store directly.
- mValue = value;
+ // There are no reordering constraints here but we need to tell the
+ // compiler that it must actually write out the value to memory in
+ // a scenario like this:
+ //
+ // x.store(1, std::memory_order_relaxed);
+ // while (true) {}
+ //
+ // So just as for the relaxed load we need either a volatile access or a
+ // memory barrier. I chose a volatile access for the same reason as for
+ // the load: MSVC has no best choice and for GCC the volatile is perfect.
+ const_cast<volatile T&>(mValue) = value;
break;
case std::memory_order_release:
// Stores in this thread must not be reordered to after this store.
// So no SS reorderings past this store from before to after (down).
// So we need a barrier BEFORE the store. It is a compiler only barrier
- // since the CPU does not do SS reorderings.
- MATHICGB_COMPILER_WRITE_MEMORY_BARRIER;
+ // since x86 and x64 CPUs do not do SS reorderings.
+ compilerWriteMemoryBarrier();
mValue = value;
break;
@@ -258,24 +300,22 @@ namespace AtomicInternal {
// Combine the guarantees for std::memory_order_acquire and
// std::memory_order_release. So no loads moved up past here (SL) and
// no stores moved down past here (SS). We need a compiler barrier
- // BEFORE the load to avoid LL and a CPU barrier (implies also a
- // compiler barrier AFTER the load to avoid SL, since the CPU can in
- // fact do SL reordering.
- MATHICGB_COMPILER_WRITE_MEMORY_BARRIER;
+ // BEFORE the store to avoid SS and a CPU (+compiler) barrier AFTER the
+ // store to avoid SL, since x86 and x64 CPUs can in fact do SL
+ // reordering.
+ compilerWriteMemoryBarrier();
mValue = value;
- MATHICGB_CPU_READ_WRITE_MEMORY_BARRIER;
+ cpuReadWriteMemoryBarrier();
break;
case std::memory_order_seq_cst:
- // All operations happen in a globally consistent linear order. I am
- // sure if this can be achieved with barriers but I know that it can be
- // achieved with locked instructions, so I am using that.
- MATHICGB_SEQ_CST_STORE(value, mValue);
+ // All operations happen in a globally consistent total order.
+ seqCstStore(value, mValue);
break;
case std::memory_order_consume: // not available for store
case std::memory_order_acquire: // not available for store
- default:
+ default: // specified value is not a known std::memory_order
MATHICGB_UNREACHABLE;
}
}
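Why the sequentially consistent store needs a locked instruction while the sequentially consistent load does not: x86 and x64 may reorder a store with a later load (SL). The standard two-flag illustration (sketch only; names are not from the patch):

    #include <atomic>

    std::atomic<int> flag0(0), flag1(0);

    void thread0() {
      flag0.store(1, std::memory_order_seq_cst);  // locked XCHG or a full fence
      if (flag1.load(std::memory_order_seq_cst) == 0) {
        // At most one thread can get here. With plain stores the CPU could
        // move each store past the following load (SL), letting both
        // threads read 0 and both enter.
      }
    }

    void thread1() {
      flag1.store(1, std::memory_order_seq_cst);
      if (flag0.load(std::memory_order_seq_cst) == 0) {
        // mirror of thread0
      }
    }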
@@ -300,7 +340,7 @@ namespace AtomicInternal {
}
#endif
-/// This class is equivalent to std::atomic<T*>. Some functions from the
+/// This class is equivalent to std::atomic<T>. Some functions from the
/// interface of std::atomic are missing - add them as necessary. Do not add
/// operator= and operator T() --- it is better to make the code explicit
/// about when and how loading and storing of atomic variables occurs.
@@ -340,7 +380,7 @@ private:
void operator=(const Atomic<T>&); // not available
bool debugAligned() const {
- return reinterpret_cast<size_t>(&mValue) % sizeof(void*) == 0;
+ return reinterpret_cast<size_t>(&mValue) % sizeof(T) == 0;
}
typename AtomicInternal::ChooseAtomic<T, sizeof(T)>::type mValue;
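For reference, a usage sketch of the Atomic<T> wrapper (default construction and namespace placement are assumed from the visible interface; adjust to the actual header): loads and stores are spelled out explicitly since operator= and operator T() are intentionally not provided.

    #include "mathicgb/Atomic.hpp"

    Atomic<int> counter;  // default construction assumed

    void example() {
      counter.store(1, std::memory_order_release);               // explicit store
      const int seen = counter.load(std::memory_order_acquire);  // explicit load
      (void)seen;
    }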
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/mathicgb.git