/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 *
 * Copyright (c) 2004-2008, Erik Lindahl
 *
 * Unfortunately, some of the constructs in this file are _very_ sensitive
 * to compiler optimizations and architecture changes. If you find any such
 * errors, please send a message to lindahl@cbr.su.se to help us fix the
 * upstream version too.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#ifndef _GMX_ATOMIC_H_
#define _GMX_ATOMIC_H_

/*! \file gmx_atomic.h
 *
 *  @brief Atomic operations for fast SMP synchronization
 *
 *  This file defines atomic integer operations and spinlocks for
 *  fast synchronization in performance-critical regions of Gromacs.
 *
 *  In general, the best option is to use functions without explicit
 *  locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
 *
 *  Not all architectures support atomic operations through inline assembly,
 *  and even if they do it might not be implemented here. In that case
 *  we use a fallback mutex implementation, so you can always count on
 *  the function interfaces working in Gromacs.
 *
 *  Don't use spinlocks in non-performance-critical regions like file I/O.
 *  Since they always spin busy they would waste CPU cycles instead of
 *  properly yielding to a computation thread while waiting for the disk.
 *
 *  Finally, note that all our spinlock operations are defined to return
 *  0 if initialization or locking completes successfully.
 *  This is the opposite of some other implementations, but the same standard
 *  as used for pthread mutexes. So, if you e.g. are trying to lock a spinlock,
 *  you have gotten the lock if the return value is 0.
 *
 *  gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
 *  and 0 if it is available, though...
 */
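/* Illustrative sketch (not part of this header) of the pthreads-style return
 * convention described above: a lock attempt succeeded exactly when the call
 * returns 0. The variable names below are hypothetical.
 *
 *     gmx_spinlock_t lk = GMX_SPINLOCK_INITIALIZER;
 *
 *     if (gmx_spinlock_trylock(&lk) == 0)
 *     {
 *         // Return value 0: we own the lock. Do the critical work.
 *         gmx_spinlock_unlock(&lk);
 *     }
 *     else
 *     {
 *         // Non-zero return: somebody else holds the lock.
 *     }
 */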
#include <stdio.h>
/* The fallback implementation at the end of this file synchronizes with a
 * pthread mutex, so the pthreads header is always needed.
 */
#include <pthread.h>

#ifdef __cplusplus
extern "C"
{
#endif
#if 0
} /* Avoids screwing up auto-indentation */
#endif

#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
        (defined(i386) || defined(__x86_64__)) )                                      \
      || defined (DOXYGEN) )

/* This code is executed for x86 and x86-64, with these compilers:
 * GNU
 * Intel
 * Pathscale
 * All these support GCC-style inline assembly.
 * We also use this section for the documentation.
 */

/*! \brief Memory barrier operation
 *
 *  Modern CPUs rely heavily on out-of-order execution, and one common feature
 *  is that load/stores might be reordered. Also, when using inline assembly
 *  the compiler might already have loaded the variable we are changing into
 *  a register, so any update to memory won't be visible.
 *
 *  This command creates a memory barrier, i.e. all memory results before
 *  it in the code should be visible to all memory operations after it - the
 *  CPU cannot propagate load/stores across it.
 */
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")

/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
#define __builtin_constant_p(i) (1)
#endif
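/* Illustrative sketch (not part of this header): publishing data to another
 * thread with an explicit barrier, so neither the compiler nor the CPU
 * reorders the payload store past the flag store. Note that on platforms
 * where gmx_atomic_memory_barrier() is only a compiler barrier this relies
 * on the hardware's ordering guarantees; `shared_data' and `ready' are
 * hypothetical names.
 *
 *     static int          shared_data;
 *     static volatile int ready = 0;
 *
 *     // Producer thread:
 *     shared_data = 42;                  // write the payload first
 *     gmx_atomic_memory_barrier();       // make it visible before the flag
 *     ready = 1;
 *
 *     // Consumer thread:
 *     while (!ready)
 *     {
 *         gmx_atomic_memory_barrier();   // force a fresh read of the flag
 *     }
 *     // safe to read shared_data here
 */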
/*! \brief Gromacs atomic operations datatype
 *
 *  Portable synchronization primitives like mutexes are effective for
 *  many purposes, but usually not very high performance.
 *  One of the problems is that you have the overhead of a function call,
 *  and another is that mutexes often have extra overhead to make the
 *  scheduling fair. Finally, if performance is important we don't want
 *  to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
 *  CPU usage until the resource is available (e.g. increment a counter).
 *
 *  These things can often be implemented with inline assembly or other
 *  system-dependent functions, and we provide such functionality for the
 *  most common platforms. For portability we also have a fallback
 *  implementation using a mutex for locking.
 *
 *  Performance-wise, the fastest solution is always to avoid locking
 *  completely (obvious, but remember it!). If you cannot do that, the
 *  next best thing is to use atomic operations that e.g. increment a
 *  counter without explicit locking. Spinlocks are useful to lock an
 *  entire region, but lead to more overhead and can be difficult to
 *  debug - it is up to you to make sure that only the thread owning the
 *  lock unlocks it!
 *
 *  You should normally NOT use atomic operations for things like
 *  I/O threads. These should yield to other threads while waiting for
 *  the disk instead of spinning at 100% CPU usage.
 *
 *  It is imperative that you use the provided routines for reading
 *  and writing, since some implementations require memory barriers before
 *  the CPU or memory sees an updated result. The structure contents are
 *  only visible here so the operations can be inlined for performance - they
 *  might change without further notice.
 *
 *  \note No initialization is required for atomic variables.
 *
 *  Currently, we have (real) atomic operations for:
 *
 *  - x86 or x86_64, using GNU compilers
 *  - x86 or x86_64, using Intel compilers
 *  - x86 or x86_64, using Pathscale compilers
 *  - Itanium, using GNU compilers
 *  - Itanium, using Intel compilers
 *  - Itanium, using HP compilers
 *  - PowerPC, using GNU compilers
 *  - PowerPC, using IBM AIX compilers
 *  - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
 */
typedef struct gmx_atomic
{
    volatile int value;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;

/*! \brief Gromacs spinlock
 *
 *  Spinlocks provide faster synchronization than mutexes,
 *  although they consume CPU cycles while waiting. They are implemented
 *  with atomic operations and inline assembly whenever possible, and
 *  otherwise we use a fallback implementation where a spinlock is identical
 *  to a mutex (this is one of the reasons why you have to initialize them).
 *
 *  There are no guarantees whatsoever about fair scheduling or
 *  debugging if you make a mistake and unlock a variable somebody
 *  else has locked - performance is the primary goal of spinlocks.
 */
typedef struct gmx_spinlock
{
    volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;

/*! \brief Spinlock static initializer
 *
 *  This is used for static spinlock initialization, and has the same
 *  properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
 *  This is only for inlining in the gmx_thread.h header file. Whether
 *  it is 0, 1, or something else when unlocked depends on the platform.
 *  Don't assume anything about it. It might even be a mutex when using the
 *  fallback implementation!
 */
#define GMX_SPINLOCK_INITIALIZER   { 1 }

/*! \brief Return value of an atomic integer
 *
 *  Also implements proper memory barriers when necessary.
 *  The actual implementation is system-dependent.
 *
 *  \param  a   Atomic variable to read
 *  \return     Integer value of the atomic variable
 */
#define gmx_atomic_read(a)   ((a)->value)

/*! \brief Write value to an atomic integer
 *
 *  Also implements proper memory barriers when necessary.
 *  The actual implementation is system-dependent.
 *
 *  \param  a   Atomic variable
 *  \param  i   Integer to set the atomic variable to.
 */
#define gmx_atomic_set(a,i)  (((a)->value) = (i))

/*! \brief Add integer to atomic variable
 *
 *  Also implements proper memory barriers when necessary.
 *  The actual implementation is system-dependent.
 *
 *  \param a   atomic datatype to modify
 *  \param i   integer to increment with. Use i<0 to subtract atomically.
 *
 *  \return The new value (after summation).
 */
static inline int
gmx_atomic_add_return(gmx_atomic_t *     a,
                      volatile int       i)
{
    int __i;

    __i = i;
    /* xadd also modifies the memory operand, so it must be listed as an
     * output ("+m"), and we clobber memory so the compiler cannot cache
     * the value across the instruction.
     */
    __asm__ __volatile__("lock ; xaddl %0, %1;"
                         : "=r"(i), "+m"(a->value)
                         : "0"(i)
                         : "memory");
    return i + __i;
}

/*! \brief Add to variable, return the old value.
 *
 *  This operation is quite useful for synchronization counters.
 *  By performing a fetch-add with N, a thread can e.g. reserve a chunk
 *  with the next N iterations, and the return value is the index
 *  of the first element to treat.
 *
 *  Also implements proper memory barriers when necessary.
 *  The actual implementation is system-dependent.
 *
 *  \param a   atomic datatype to modify
 *  \param i   integer to increment with. Use i<0 to subtract atomically.
 *
 *  \return    The value of the atomic variable before addition.
 */
static inline int
gmx_atomic_fetch_add(gmx_atomic_t *     a,
                     volatile int       i)
{
    /* After xadd, the register operand holds the previous memory value */
    __asm__ __volatile__("lock ; xaddl %0, %1;"
                         : "=r"(i), "+m"(a->value)
                         : "0"(i)
                         : "memory");
    return i;
}
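/* Illustrative sketch (not part of this header): reserving work chunks with
 * gmx_atomic_fetch_add(), as described above. Each thread grabs the next
 * CHUNK iterations; the names below are hypothetical.
 *
 *     #define CHUNK 8
 *     static gmx_atomic_t next_index;    // set to 0 before the threads start:
 *                                        // gmx_atomic_set(&next_index, 0);
 *
 *     void worker(int n, double x[])
 *     {
 *         int first;
 *         // fetch_add returns the value *before* the addition, so each
 *         // thread gets a unique, non-overlapping range [first, first+CHUNK)
 *         while ((first = gmx_atomic_fetch_add(&next_index, CHUNK)) < n)
 *         {
 *             int i, last = (first + CHUNK < n) ? (first + CHUNK) : n;
 *             for (i = first; i < last; i++)
 *             {
 *                 x[i] *= 2.0;   // placeholder for real per-element work
 *             }
 *         }
 *     }
 */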
/*! \brief Atomic compare-exchange operation
 *
 *  The \a oldval value is compared with the memory value in the atomic
 *  datatype. If they are identical, the atomic type is updated to the new
 *  value, and otherwise left unchanged.
 *
 *  This is a very useful synchronization primitive: You can start by reading
 *  a value (without locking anything), perform some calculations, and then
 *  atomically try to update it in memory unless it has changed. If it has
 *  changed you will get an error return code - reread the new value
 *  and repeat the calculations in that case.
 *
 *  \param a        Atomic datatype ('memory' value)
 *  \param oldval   Integer value read from the atomic type at an earlier point
 *  \param newval   New value to write to the atomic type if it currently is
 *                  identical to the old value.
 *
 *  \return The value of the atomic memory variable in memory when this
 *          instruction was executed. Thus, if the operation succeeded the
 *          return value is identical to the \a oldval parameter, and if not
 *          it returns the updated value in memory so you can repeat your
 *          operations on it.
 *
 *  \note   The exchange occurred if the return value is identical to
 *          \a oldval.
 */
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t *    a,
                   int               oldval,
                   int               newval)
{
    int prev;

    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
                         : "=a"(prev)
                         : "q"(newval), "m"(a->value), "0"(oldval)
                         : "memory");

    return prev;
}
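/* Illustrative sketch (not part of this header): a lock-free update using
 * the read/compute/compare-exchange retry loop described above. This keeps
 * a running maximum without any lock; the function name is hypothetical.
 *
 *     void atomic_update_max(gmx_atomic_t *a, int candidate)
 *     {
 *         int oldval;
 *         do
 *         {
 *             oldval = gmx_atomic_read(a);
 *             if (candidate <= oldval)
 *             {
 *                 return;   // current maximum is already >= candidate
 *             }
 *             // cmpxchg returns the old memory contents, so success means
 *             // the return value equals what we read; otherwise another
 *             // thread intervened and we retry with the fresh value.
 *         }
 *         while (gmx_atomic_cmpxchg(a, oldval, candidate) != oldval);
 *     }
 */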
/*! \brief Initialize spinlock
 *
 *  In theory you can call this from multiple threads, but remember
 *  that we don't check for errors. If the first thread proceeded to
 *  lock the spinlock after initialization, the second will happily
 *  overwrite the contents and unlock it without warning you.
 *
 *  \param x      Gromacs spinlock pointer.
 */
static inline void
gmx_spinlock_init(gmx_spinlock_t *   x)
{
    x->lock = 1;
}

/*! \brief Acquire spinlock
 *
 *  This routine blocks until the spinlock is available, and
 *  then locks it before returning.
 *
 *  \param x     Gromacs spinlock pointer
 */
static inline void
gmx_spinlock_lock(gmx_spinlock_t *  x)
{
    __asm__ __volatile__("\n1:\t"
                         "lock ; decb %0\n\t"
                         "jns 3f\n"
                         "2:\t"
                         "rep;nop\n\t"
                         "cmpb $0,%0\n\t"
                         "jle 2b\n\t"
                         "jmp 1b\n"
                         "3:\n\t"
                         : "=m" (x->lock)
                         :
                         : "memory");
}

/*! \brief Attempt to acquire spinlock
 *
 *  This routine acquires the spinlock if possible, but if it is
 *  already locked it returns an error code immediately.
 *
 *  \param x     Gromacs spinlock pointer
 *
 *  \return 0 if the mutex was available so we could lock it,
 *          otherwise a non-zero integer (1) if the lock is busy.
 */
static inline int
gmx_spinlock_trylock(gmx_spinlock_t *  x)
{
    char old_value;

    /* Atomically swap in 0 (locked); the previous value tells us whether
     * the lock was free (positive) or already taken.
     */
    __asm__ __volatile__("xchgb %b0,%1"
                         : "=q"(old_value), "=m"(x->lock)
                         : "0"(0)
                         : "memory");
    return (old_value <= 0);
}

/*! \brief Release spinlock
 *
 *  Unlocks the spinlock, regardless of which thread locked it.
 *
 *  \param x     Gromacs spinlock pointer
 */
static inline void
gmx_spinlock_unlock(gmx_spinlock_t *  x)
{
    char old_value = 1;

    __asm__ __volatile__("xchgb %b0, %1"
                         : "=q"(old_value), "=m"(x->lock)
                         : "0"(old_value)
                         : "memory");
}

/*! \brief Check if spinlock is locked
 *
 *  This routine returns immediately with the lock status.
 *
 *  \param x     Gromacs spinlock pointer
 *
 *  \return 1 if the spinlock is locked, 0 otherwise.
 */
static inline int
gmx_spinlock_islocked(gmx_spinlock_t *  x)
{
    return (*(volatile signed char *)(&(x)->lock) <= 0);
}

/*! \brief Wait for a spinlock to become available
 *
 *  This routine blocks until the spinlock is unlocked,
 *  but in contrast to gmx_spinlock_lock() it returns without
 *  trying to lock the spinlock.
 *
 *  \param x     Gromacs spinlock pointer
 */
static inline void
gmx_spinlock_wait(gmx_spinlock_t *   x)
{
    do
    {
        gmx_atomic_memory_barrier();
    }
    while (gmx_spinlock_islocked(x));
}
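/* Illustrative sketch (not part of this header): protecting a short critical
 * region with a spinlock. Remember the pthreads-style convention above:
 * locking succeeds with return value 0, and only the thread that took the
 * lock should unlock it. The names are hypothetical.
 *
 *     static gmx_spinlock_t table_lock = GMX_SPINLOCK_INITIALIZER;
 *     static int            table_size = 0;
 *
 *     void append_entry(void)
 *     {
 *         gmx_spinlock_lock(&table_lock);    // spins until acquired
 *         table_size++;                      // short, non-blocking work only
 *         gmx_spinlock_unlock(&table_lock);
 *     }
 */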
#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))

/* PowerPC using proper GCC inline assembly.
 * Recent versions of xlC (>=7.0) _partially_ support this, but since it is
 * not 100% compatible we provide a separate implementation for xlC in
 * the next section.
 */

/* Compiler-dependent stuff: GCC memory barrier */
#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")

typedef struct gmx_atomic
{
    volatile int value;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;

typedef struct gmx_spinlock
{
    volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;

#define GMX_SPINLOCK_INITIALIZER   { 0 }

#define gmx_atomic_read(a)   ((a)->value)
#define gmx_atomic_set(a,i)  (((a)->value) = (i))

static inline int
gmx_atomic_add_return(gmx_atomic_t *    a,
                      int               i)
{
    int t;

    __asm__ __volatile__("1:     lwarx   %0,0,%2\n"
                         "\tadd     %0,%1,%0\n"
                         "\tstwcx.  %0,0,%2\n"
                         "\tbne-    1b\n"
                         "\tisync\n"
                         : "=&r" (t)
                         : "r" (i), "r" (&a->value)
                         : "cc", "memory");
    return t;
}

static inline int
gmx_atomic_fetch_add(gmx_atomic_t *     a,
                     int                i)
{
    int t;

    __asm__ __volatile__("\teieio\n"
                         "1:     lwarx   %0,0,%2\n"
                         "\tadd     %0,%1,%0\n"
                         "\tstwcx.  %0,0,%2\n"
                         "\tbne-    1b\n"
                         "\tisync\n"
                         : "=&r" (t)
                         : "r" (i), "r" (&a->value)
                         : "cc", "memory");
    return (t - i);
}

static inline int
gmx_atomic_cmpxchg(gmx_atomic_t *       a,
                   int                  oldval,
                   int                  newval)
{
    int prev;

    __asm__ __volatile__ ("1:    lwarx   %0,0,%2\n"
                          "\tcmpw    0,%0,%3\n"
                          "\tbne     2f\n"
                          "\tstwcx.  %4,0,%2\n"
                          "\tbne-    1b\n"
                          "\tsync\n"
                          "2:\n"
                          : "=&r" (prev), "=m" (a->value)
                          : "r" (&a->value), "r" (oldval), "r" (newval),
                            "m" (a->value)
                          : "cc", "memory");

    return prev;
}

static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
    x->lock = 0;
}

static inline void
gmx_spinlock_lock(gmx_spinlock_t *  x)
{
    unsigned int tmp;

    __asm__ __volatile__("\tb      1f\n"
                         "2:      lwzx    %0,0,%1\n"
                         "\tcmpwi   0,%0,0\n"
                         "\tbne+    2b\n"
                         "1:      lwarx   %0,0,%1\n"
                         "\tcmpwi   0,%0,0\n"
                         "\tbne-    2b\n"
                         "\tstwcx.  %2,0,%1\n"
                         "\tbne-    2b\n"
                         "\tisync\n"
                         : "=&r"(tmp)
                         : "r"(&x->lock), "r"(1)
                         : "cr0", "memory");
}

static inline int
gmx_spinlock_trylock(gmx_spinlock_t *  x)
{
    unsigned int           old, t;
    unsigned int           mask = 1;
    volatile unsigned int *p    = &x->lock;

    __asm__ __volatile__("\teieio\n"
                         "1:      lwarx   %0,0,%4\n"
                         "\tor      %1,%0,%3\n"
                         "\tstwcx.  %1,0,%4\n"
                         "\tbne     1b\n"
                         "\tsync\n"
                         : "=&r" (old), "=&r" (t), "=m" (*p)
                         : "r" (mask), "r" (p), "m" (*p)
                         : "cc", "memory");

    return ((old & mask) != 0);
}

static inline void
gmx_spinlock_unlock(gmx_spinlock_t *  x)
{
    __asm__ __volatile__("\teieio\n": : :"memory");
    x->lock = 0;
}

static inline int
gmx_spinlock_islocked(gmx_spinlock_t *   x)
{
    return (x->lock != 0);
}

static inline void
gmx_spinlock_wait(gmx_spinlock_t *x)
{
    do
    {
        gmx_atomic_memory_barrier();
    }
    while (gmx_spinlock_islocked(x));
}
#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM))  && \
        (defined(__powerpc__) || defined(__ppc__)))

/* PowerPC using xlC inline assembly.
 * Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
 * if you use the option -qasm=gcc but we have had to hack things a bit, in
 * particular when it comes to clobbered variables. Since this implementation
 * _could_ be buggy, we have separated it from the known-to-be-working gcc
 * one above.
 */

/* memory barrier - no idea how to create one with xlc! */
#define gmx_atomic_memory_barrier()

typedef struct gmx_atomic
{
    volatile int value;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;

typedef struct gmx_spinlock
{
    volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;

#define GMX_SPINLOCK_INITIALIZER   { 0 }

#define gmx_atomic_read(a)   ((a)->value)
#define gmx_atomic_set(a,i)  (((a)->value) = (i))

static inline int
gmx_atomic_add_return(gmx_atomic_t *    a,
                      int               i)
{
    int t;

    __asm__ __volatile__("1:     lwarx   %0,0,%2 \n"
                         "\t add     %0,%1,%0 \n"
                         "\t stwcx.  %0,0,%2 \n"
                         "\t bne-    1b \n"
                         "\t isync \n"
                         : "=&r" (t)
                         : "r" (i), "r" (&a->value));
    return t;
}

static inline int
gmx_atomic_fetch_add(gmx_atomic_t *     a,
                     int                i)
{
    int t;

    __asm__ __volatile__("\t eieio\n"
                         "1:     lwarx   %0,0,%2 \n"
                         "\t add     %0,%1,%0 \n"
                         "\t stwcx.  %0,0,%2 \n"
                         "\t bne-    1b \n"
                         "\t isync \n"
                         : "=&r" (t)
                         : "r" (i), "r" (&a->value));
    return (t - i);
}

static inline int
gmx_atomic_cmpxchg(gmx_atomic_t *    a,
                   int               oldval,
                   int               newval)
{
    int prev;

    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
                          "\t cmpw    0,%0,%3 \n"
                          "\t bne     2f \n"
                          "\t stwcx.  %4,0,%2 \n"
                          "\t bne-    1b \n"
                          "\t sync \n"
                          "2: \n"
                          : "=&r" (prev), "=m" (a->value)
                          : "r" (&a->value), "r" (oldval), "r" (newval),
                            "m" (a->value));

    return prev;
}

static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
    x->lock = 0;
}

static inline void
gmx_spinlock_lock(gmx_spinlock_t *  x)
{
    unsigned int tmp;

    __asm__ __volatile__("\t b      1f \n"
                         "2:      lwzx    %0,0,%1 \n"
                         "\t cmpwi   0,%0,0 \n"
                         "\t bne+    2b \n"
                         "1:      lwarx   %0,0,%1 \n"
                         "\t cmpwi   0,%0,0 \n"
                         "\t bne-    2b \n"
                         "\t stwcx.  %2,0,%1 \n"
                         "\t bne-    2b \n"
                         "\t isync\n"
                         : "=&r"(tmp)
                         : "r"(&x->lock), "r"(1));
}

static inline int
gmx_spinlock_trylock(gmx_spinlock_t *  x)
{
    unsigned int           old, t;
    unsigned int           mask = 1;
    volatile unsigned int *p    = &x->lock;

    __asm__ __volatile__("\t eieio\n"
                         "1:      lwarx   %0,0,%4 \n"
                         "\t or      %1,%0,%3 \n"
                         "\t stwcx.  %1,0,%4 \n"
                         "\t bne     1b \n"
                         "\t sync \n"
                         : "=&r" (old), "=&r" (t), "=m" (*p)
                         : "r" (mask), "r" (p), "m" (*p));

    return ((old & mask) != 0);
}

static inline void
gmx_spinlock_unlock(gmx_spinlock_t *  x)
{
    __asm__ __volatile__("\t eieio \n");
    x->lock = 0;
}

static inline int
gmx_spinlock_islocked(gmx_spinlock_t *   x)
{
    return (x->lock != 0);
}

static inline void
gmx_spinlock_wait(gmx_spinlock_t *   x)
{
    do
    {
        gmx_atomic_memory_barrier();
    }
    while (gmx_spinlock_islocked(x));
}

#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))

/* ia64 with GCC or Intel compilers. Since we need to define everything through
 * cmpxchg and fetchadd on ia64, we merge the different compilers and only
 * provide different implementations for that single function.
 * Documentation? Check the gcc/x86 section.
 */

typedef struct gmx_atomic
{
    volatile int value;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;

typedef struct gmx_spinlock
{
    volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;

#define GMX_SPINLOCK_INITIALIZER   { 0 }

#define gmx_atomic_read(a)   ((a)->value)
#define gmx_atomic_set(a,i)  (((a)->value) = (i))

/* Compiler thingies */
#ifdef __INTEL_COMPILER
void __memory_barrier(void);
int _InterlockedCompareExchange(volatile int *dest, int xchg, int comp);
unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);

/* ia64 memory barrier */
#  define gmx_atomic_memory_barrier() __memory_barrier()
/* ia64 cmpxchg */
#  define gmx_atomic_cmpxchg(a, oldval, newval) \
          _InterlockedCompareExchange(&a->value, newval, oldval)
/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
#  define gmx_ia64_fetchadd(a, inc)  __fetchadd4_rel(a, inc)

#elif defined __GNUC__
/* ia64 memory barrier */
#  define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
/* ia64 cmpxchg */
static inline int
gmx_atomic_cmpxchg(gmx_atomic_t *   a,
                   int              oldval,
                   int              newval)
{
    volatile int res;

    asm volatile ("mov ar.ccv=%0;;" :: "rO"(oldval));
    asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv"
                  : "=r"(res)
                  : "r"(&a->value), "r"(newval)
                  : "memory");

    return res;
}
/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16 */
#define gmx_ia64_fetchadd(a, inc)                                    \
({  unsigned long res;                                               \
    asm volatile ("fetchadd4.rel %0=[%1],%2"                         \
                  : "=r"(res) : "r"(a), "i" (inc) : "memory");       \
    res;                                                             \
})

#else  /* Unknown compiler */
#  error Unknown ia64 compiler (not GCC or ICC) - modify gmx_atomic.h!
#endif

static inline int
gmx_atomic_add_return(gmx_atomic_t *       a,
                      volatile int         i)
{
    volatile int oldval, newval;
    volatile int __i = i;

    /* Use fetchadd if, and only if, the increment value can be determined
     * at compile time (otherwise this check is optimized away) and it is
     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
     */
    if (__builtin_constant_p(i) &&
        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||
          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
    {
        oldval = gmx_ia64_fetchadd(a, __i);
        newval = oldval + i;
    }
    else
    {
        /* Use compare-exchange addition that works with any value */
        do
        {
            oldval = gmx_atomic_read(a);
            newval = oldval + i;
        }
        while (gmx_atomic_cmpxchg(a, oldval, newval) != oldval);
    }
    return newval;
}

static inline int
gmx_atomic_fetch_add(gmx_atomic_t *       a,
                     volatile int         i)
{
    volatile int oldval, newval;
    volatile int __i = i;

    /* Use ia64 fetchadd if, and only if, the increment value can be determined
     * at compile time (otherwise this check is optimized away) and it is
     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
     */
    if (__builtin_constant_p(i) &&
        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||
          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
    {
        oldval = gmx_ia64_fetchadd(a, __i);
        newval = oldval + i;
    }
    else
    {
        /* Use compare-exchange addition that works with any value */
        do
        {
            oldval = gmx_atomic_read(a);
            newval = oldval + i;
        }
        while (gmx_atomic_cmpxchg(a, oldval, newval) != oldval);
    }
    return oldval;
}

static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
    x->lock = 0;
}

static inline void
gmx_spinlock_lock(gmx_spinlock_t *   x)
{
    gmx_atomic_t *a = (gmx_atomic_t *) x;
    int           value;

    value = gmx_atomic_cmpxchg(a, 0, 1);
    if (value)
    {
        do
        {
            while (a->value != 0)
            {
                gmx_atomic_memory_barrier();
            }
            value = gmx_atomic_cmpxchg(a, 0, 1);
        }
        while (value);
    }
}

static inline int
gmx_spinlock_trylock(gmx_spinlock_t *   x)
{
    return (gmx_atomic_cmpxchg((gmx_atomic_t *)x, 0, 1) != 0);
}

static inline void
gmx_spinlock_unlock(gmx_spinlock_t *   x)
{
    gmx_atomic_memory_barrier();
    x->lock = 0;
}

static inline int
gmx_spinlock_islocked(gmx_spinlock_t *   x)
{
    return (x->lock != 0);
}

static inline void
gmx_spinlock_wait(gmx_spinlock_t *   x)
{
    do
    {
        gmx_atomic_memory_barrier();
    }
    while (gmx_spinlock_islocked(x));
}

#undef gmx_ia64_fetchadd
#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)

/* HP compiler on ia64 */
#include <machine/sys/inline.h>

#define gmx_atomic_memory_barrier() _Asm_mf()

#define gmx_hpia64_fetchadd(a, i)                           \
    _Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL,    \
                  (UInt32*)a,(unsigned int) i,              \
                  (_Asm_ldhint)LDHINT_NONE)

typedef struct gmx_atomic
{
    volatile int value;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;

typedef struct gmx_spinlock
{
    volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;

static inline int
gmx_atomic_cmpxchg(gmx_atomic_t *   a,
                   int              oldval,
                   int              newval)
{
    int ret;

    _Asm_mov_to_ar((_Asm_app_reg)_AREG_CCV,(Uint32)oldval,
                   (_Asm_fence)(_UP_CALL_FENCE | _UP_SYS_FENCE |
                                _DOWN_CALL_FENCE | _DOWN_SYS_FENCE));

    ret = _Asm_cmpxchg((_Asm_sz)SZ_W,(_Asm_sem)_SEM_ACQ,(Uint32*)a,
                       (Uint32)newval,(_Asm_ldhint)_LDHINT_NONE);

    return ret;
}

#define GMX_SPINLOCK_INITIALIZER   { 0 }

#define gmx_atomic_read(a)   ((a)->value)
#define gmx_atomic_set(a,i)  (((a)->value) = (i))

static inline int
gmx_atomic_add_return(gmx_atomic_t *   a,
                      int              i)
{
    int oldval, newval;
    int __i = i;

    /* On HP-UX we don't know any macro to determine whether the increment
     * is known at compile time, but hopefully the call uses something simple
     * like a constant, and then the optimizer should be able to do the job.
     */
    if ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||
         (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
    {
        oldval = gmx_hpia64_fetchadd(a, __i);
        newval = oldval + i;
    }
    else
    {
        /* Use compare-exchange addition that works with any value */
        do
        {
            oldval = gmx_atomic_read(a);
            newval = oldval + i;
        }
        while (gmx_atomic_cmpxchg(a, oldval, newval) != oldval);
    }
    return newval;
}

static inline int
gmx_atomic_fetch_add(gmx_atomic_t *   a,
                     int              i)
{
    int oldval, newval;
    int __i = i;

    /* On HP-UX we don't know any macro to determine whether the increment
     * is known at compile time, but hopefully the call uses something simple
     * like a constant, and then the optimizer should be able to do the job.
     */
    if ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||
         (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
    {
        oldval = gmx_hpia64_fetchadd(a, __i);
        newval = oldval + i;
    }
    else
    {
        /* Use compare-exchange addition that works with any value */
        do
        {
            oldval = gmx_atomic_read(a);
            newval = oldval + i;
        }
        while (gmx_atomic_cmpxchg(a, oldval, newval) != oldval);
    }
    return oldval;
}

static inline void
gmx_spinlock_init(gmx_spinlock_t *x)
{
    x->lock = 0;
}

static inline int
gmx_spinlock_trylock(gmx_spinlock_t *x)
{
    int rc;

    rc = _Asm_xchg((_Asm_sz)_SZ_W, (unsigned int *)x, 1,
                   (_Asm_ldhint)_LDHINT_NONE);

    return ( (rc > 0) ? 1 : 0);
}

static inline void
gmx_spinlock_lock(gmx_spinlock_t *x)
{
    int status = 1;

    do
    {
        if (x->lock == 0)
        {
            status = gmx_spinlock_trylock(x);
        }
    }
    while (status != 0);
}

static inline void
gmx_spinlock_unlock(gmx_spinlock_t *   x)
{
    _Asm_fetchadd((_Asm_fasz)_SZ_W,(_Asm_sem)_SEM_REL,
                  (unsigned int *)x, -1, (_Asm_ldhint)_LDHINT_NONE);
}

static inline int
gmx_spinlock_islocked(gmx_spinlock_t *   x)
{
    return (x->lock != 0);
}

static inline void
gmx_spinlock_wait(gmx_spinlock_t *   x)
{
    do
    {
        gmx_atomic_memory_barrier();
    }
    while (gmx_spinlock_islocked(x));
}

#undef gmx_hpia64_fetchadd

#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))

/* Microsoft Visual C on x86, define taken from FFTW, who got it from
 * Morten Nissov.
 */
#include <windows.h>

#define gmx_atomic_memory_barrier()

typedef struct gmx_atomic
{
    LONG volatile value;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;

typedef struct gmx_spinlock
{
    LONG volatile lock;    /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;

#define GMX_SPINLOCK_INITIALIZER   { 0 }

#define gmx_atomic_read(a)   ((a)->value)
#define gmx_atomic_set(a,i)  (((a)->value) = (i))

/* The Interlocked routines must be handed the address of the value/lock
 * field, not the struct pointer itself.
 */
#define gmx_atomic_fetch_add(a, i)  \
    InterlockedExchangeAdd(&((a)->value), (LONG) (i))

#define gmx_atomic_add_return(a, i)  \
    ( (i) + InterlockedExchangeAdd(&((a)->value), (LONG) (i)) )

#define gmx_atomic_cmpxchg(a, oldval, newval)  \
    InterlockedCompareExchange(&((a)->value), (LONG) (newval), (LONG) (oldval))

static inline void
gmx_spinlock_init(gmx_spinlock_t *   x)
{
    x->lock = 0;
}

#define gmx_spinlock_lock(x)  \
    while (InterlockedCompareExchange(&((x)->lock), 1, 0) != 0)

#define gmx_spinlock_trylock(x)  \
    InterlockedCompareExchange(&((x)->lock), 1, 0)

static inline void
gmx_spinlock_unlock(gmx_spinlock_t *   x)
{
    x->lock = 0;
}

static inline int
gmx_spinlock_islocked(gmx_spinlock_t *   x)
{
    return (x->lock != 0);
}

static inline void
gmx_spinlock_wait(gmx_spinlock_t *   x)
{
    /* Yield the rest of our timeslice instead of spinning hot */
    while (gmx_spinlock_islocked(x))
    {
        Sleep(0);
    }
}

#elif defined(__xlC__) && defined (_AIX)

/* IBM xlC compiler on AIX */
#include <sys/atomic_op.h>

#define gmx_atomic_memory_barrier()

typedef struct gmx_atomic
{
    volatile int value;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_atomic_t;

typedef struct gmx_spinlock
{
    volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
}
gmx_spinlock_t;

#define GMX_SPINLOCK_INITIALIZER   { 0 }

#define gmx_atomic_read(a)   ((a)->value)
#define gmx_atomic_set(a,i)  (((a)->value) = (i))

static inline int
gmx_atomic_cmpxchg(gmx_atomic_t *   a,
                   int              oldval,
                   int              newval)
{
    int t;

    if (__check_lock((atomic_p)&a->value, oldval, newval))
    {
        /* Not successful - value had changed in memory. Reload value. */
        t = a->value;
    }
    else
    {
        /* replacement succeeded */
        t = oldval;
    }
    return t;
}

static inline int
gmx_atomic_add_return(gmx_atomic_t *   a,
                      int              i)
{
    int oldval, newval;

    do
    {
        oldval = gmx_atomic_read(a);
        newval = oldval + i;
    }
    while (__check_lock((atomic_p)&a->value, oldval, newval));

    return newval;
}

static inline int
gmx_atomic_fetch_add(gmx_atomic_t *   a,
                     int              i)
{
    int oldval, newval;

    do
    {
        oldval = gmx_atomic_read(a);
        newval = oldval + i;
    }
    while (__check_lock((atomic_p)&a->value, oldval, newval));

    return oldval;
}

static inline void
gmx_spinlock_init(gmx_spinlock_t *   x)
{
    __clear_lock((atomic_p)x, 0);
}

static inline void
gmx_spinlock_lock(gmx_spinlock_t *   x)
{
    do
    {
        ;
    }
    while (__check_lock((atomic_p)x, 0, 1));
}

static inline int
gmx_spinlock_trylock(gmx_spinlock_t *   x)
{
    /* Return 0 if we got the lock */
    return (__check_lock((atomic_p)x, 0, 1) != 0);
}

static inline void
gmx_spinlock_unlock(gmx_spinlock_t *   x)
{
    __clear_lock((atomic_p)x, 0);
}

static inline int
gmx_spinlock_islocked(gmx_spinlock_t *   x)
{
    return (*((atomic_p)x) != 0);
}

static inline void
gmx_spinlock_wait(gmx_spinlock_t *   x)
{
    while (gmx_spinlock_islocked(x))
    {
        ;
    }
}

#else

/* No atomic operations, use mutex fallback. Documentation is in x86 section */

#define gmx_atomic_memory_barrier()

/* System mutex used for locking to guarantee atomicity */
static pthread_mutex_t
gmx_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;

typedef struct gmx_atomic
{
    int value;
}
gmx_atomic_t;

#define gmx_spinlock_t   pthread_mutex_t

#define GMX_SPINLOCK_INITIALIZER   PTHREAD_MUTEX_INITIALIZER

/* Since mutexes guarantee memory barriers this works fine */
#define gmx_atomic_read(a)   ((a)->value)

static inline void
gmx_atomic_set(gmx_atomic_t *   a,
               int              i)
{
    /* Mutexes here are necessary to guarantee memory visibility */
    pthread_mutex_lock(&gmx_atomic_mutex);
    a->value = i;
    pthread_mutex_unlock(&gmx_atomic_mutex);
}

static inline int
gmx_atomic_add_return(gmx_atomic_t *   a,
                      int              i)
{
    int t;

    pthread_mutex_lock(&gmx_atomic_mutex);
    t = a->value + i;
    a->value = t;
    pthread_mutex_unlock(&gmx_atomic_mutex);
    return t;
}

static inline int
gmx_atomic_fetch_add(gmx_atomic_t *   a,
                     int              i)
{
    int old_value;

    pthread_mutex_lock(&gmx_atomic_mutex);
    old_value = a->value;
    a->value  = old_value + i;
    pthread_mutex_unlock(&gmx_atomic_mutex);
    return old_value;
}

static inline int
gmx_atomic_cmpxchg(gmx_atomic_t *   a,
                   int              oldv,
                   int              newv)
{
    int t;

    pthread_mutex_lock(&gmx_atomic_mutex);
    t = a->value;
    if (t == oldv)
    {
        a->value = newv;
    }
    pthread_mutex_unlock(&gmx_atomic_mutex);
    return t;
}

/* pthread_mutex_init() takes an attribute argument; NULL gives the default */
#define gmx_spinlock_init(lock)      pthread_mutex_init(lock, NULL)
#define gmx_spinlock_lock(lock)      pthread_mutex_lock(lock)
#define gmx_spinlock_trylock(lock)   pthread_mutex_trylock(lock)
#define gmx_spinlock_unlock(lock)    pthread_mutex_unlock(lock)

static inline int
gmx_spinlock_islocked(gmx_spinlock_t *   x)
{
    if (gmx_spinlock_trylock(x) != 0)
    {
        /* It was locked */
        return 1;
    }
    else
    {
        /* We just locked it */
        gmx_spinlock_unlock(x);
        return 0;
    }
}

static inline void
gmx_spinlock_wait(gmx_spinlock_t *   x)
{
    gmx_spinlock_lock(x);
    /* Got the lock now, so the waiting is over */
    gmx_spinlock_unlock(x);
}

#endif

/*! \brief Spinlock-based barrier type
 *
 *  This barrier has the same functionality as the standard
 *  gmx_thread_barrier_t, but since it is based on spinlocks
 *  it provides faster synchronization at the cost of busy-waiting.
 *
 *  Variables of this type should be initialized by calling
 *  gmx_spinlock_barrier_init() to set the number of threads
 *  that should be synchronized.
 */
typedef struct gmx_spinlock_barrier
{
    gmx_atomic_t    count;     /*!< Number of threads remaining     */
    int             threshold; /*!< Total number of threads         */
    volatile int    cycle;     /*!< Current cycle (alternating 0/1) */
}
gmx_spinlock_barrier_t;

/*! \brief Initialize spinlock-based barrier
 *
 *  \param barrier  Pointer to _spinlock_ barrier. Note that this is not
 *                  the same datatype as the full, thread based, barrier.
 *  \param count    Number of threads to synchronize. All threads
 *                  will be released after \a count calls to
 *                  gmx_spinlock_barrier_wait().
 */
static inline void
gmx_spinlock_barrier_init(gmx_spinlock_barrier_t *    barrier,
                          int                         count)
{
    barrier->threshold = count;
    barrier->cycle     = 0;
    gmx_atomic_set(&(barrier->count), count);
}

/*! \brief Perform busy-waiting barrier synchronization
 *
 *  This routine blocks until it has been called N times,
 *  where N is the count value the barrier was initialized with.
 *  After N total calls all threads return. The barrier automatically
 *  cycles, and thus requires another N calls to unblock another time.
 *
 *  Note that spinlock-based barriers are completely different from
 *  standard ones (using mutexes and condition variables); only the
 *  functionality and names are similar.
 *
 *  \param barrier  Pointer to previously created barrier.
 *
 *  \return The last thread returns -1, all the others 0.
 */
static inline int
gmx_spinlock_barrier_wait(gmx_spinlock_barrier_t *   barrier)
{
    int cycle;
    int status;

    /* We don't need to lock or use atomic ops here, since the cycle index
     * cannot change until after the last thread has performed the check
     * further down. Further, they cannot reach this point in the next
     * barrier iteration until all of them have been released, and that
     * happens after the cycle value has been updated.
     *
     * No synchronization == fast synchronization.
     */
    cycle = barrier->cycle;

    /* Decrement the count atomically and check if it is zero.
     * This will only be true for the last thread calling us.
     */
    if (gmx_atomic_add_return(&(barrier->count), -1) == 0)
    {
        gmx_atomic_set(&(barrier->count), barrier->threshold);
        barrier->cycle = !barrier->cycle;

        status = -1;
    }
    else
    {
        /* Wait until the last thread changes the cycle index.
         * We are both using a memory barrier, and an explicit
         * volatile pointer cast to make sure the compiler
         * doesn't try to be smart and cache the contents.
         */
        do
        {
            gmx_atomic_memory_barrier();
        }
        while (*(volatile int *)(&(barrier->cycle)) == cycle);

        status = 0;
    }
    return status;
}
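/* Illustrative sketch (not part of this header): synchronizing a fixed team
 * of threads between two computation phases. The names, thread count, and
 * compute_phase_*() functions are hypothetical; every one of the N_THREADS
 * threads runs thread_main().
 *
 *     #define N_THREADS 4
 *     static gmx_spinlock_barrier_t phase_barrier;
 *
 *     // Called once, before the threads start:
 *     //     gmx_spinlock_barrier_init(&phase_barrier, N_THREADS);
 *
 *     void thread_main(void)
 *     {
 *         compute_phase_one();
 *
 *         // Blocks until all N_THREADS threads have arrived; the last
 *         // arrival gets -1 back, everyone else 0.
 *         if (gmx_spinlock_barrier_wait(&phase_barrier) == -1)
 *         {
 *             // e.g. let exactly one thread do serial bookkeeping here
 *         }
 *
 *         compute_phase_two();
 *     }
 */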
#ifdef __cplusplus
}
#endif

#endif /* _GMX_ATOMIC_H_ */