
sst/elements/genericProc/programs/qthread-1.4/include/qthread/qthread.h

00001 #ifndef _QTHREAD_H_
00002 #define _QTHREAD_H_
00003 
00004 #include <errno.h>                     /* for ENOMEM */
00005 
00006 #include <qthread/qthread-int.h>       /* for uint32_t and uint64_t */
00007 #include <qthread/common.h>            /* important configuration options */
00008 
00009 #include <string.h>                    /* for memcpy() */
00010 
00011 #ifdef QTHREAD_NEEDS_IA64INTRIN
00012 # ifdef HAVE_IA64INTRIN_H
00013 #  include <ia64intrin.h>
00014 # elif defined(HAVE_IA32INTRIN_H)
00015 #  include <ia32intrin.h>
00016 # endif
00017 #endif
00018 
00019 /*****************************************************************************
00020  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
00021  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
00022  * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  NOTE  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! *
00023  *                                                                           *
00024  *    The most complete documentation is going to be in the man pages. The   *
00025  *    documentation here is just to give you a general idea of what each     *
00026  *    function does.                                                         *
00027  *                                                                           *
00028  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
00029  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
00030  *****************************************************************************/
00031 
00032 /* Return Codes */
00033 #define QTHREAD_REDUNDANT       1
00034 #define QTHREAD_SUCCESS         0
00035 #define QTHREAD_BADARGS         -1
00036 #define QTHREAD_PTHREAD_ERROR   -2
00037 #define QTHREAD_NOT_ALLOWED     -3
00038 #define QTHREAD_MALLOC_ERROR    ENOMEM
00039 #define QTHREAD_THIRD_PARTY_ERROR -4
00040 #define QTHREAD_TIMEOUT         -5     /* neither ETIME nor ETIMEDOUT seems appropriate, strictly speaking */
00041 #ifdef EOVERFLOW
00042 # define QTHREAD_OVERFLOW       EOVERFLOW
00043 #else
00044 # define QTHREAD_OVERFLOW       -6
00045 #endif
00046 #define NO_SHEPHERD ((qthread_shepherd_id_t)-1)
00047 
00048 #ifdef __cplusplus
00049 #define Q_STARTCXX extern "C" {
00050 #define Q_ENDCXX }
00051 #else
00052 #define Q_STARTCXX
00053 #define Q_ENDCXX
00054 #endif
00055 
00056 #ifdef QTHREAD_ALIGNEDDATA_ALLOWED
00057 # define Q_ALIGNED(x) __attribute__((aligned(x)))
00058 #endif
00059 
00060 Q_STARTCXX /* */
00061 /* NOTE!!!!!!!!!!!
00062  * Reads and writes operate on aligned_t-size segments of memory.
00063  *
00064  * FEB locking only works on aligned addresses. On 32-bit architectures, this
00065  * isn't too much of an inconvenience. On 64-bit architectures, it's a pain in
00066  * the BUTT! This is here to try and help a little bit. */
00067 #if QTHREAD_SIZEOF_ALIGNED_T == 4
00068 typedef uint32_t Q_ALIGNED(QTHREAD_ALIGNMENT_ALIGNED_T) aligned_t;
00069 typedef int32_t Q_ALIGNED(QTHREAD_ALIGNMENT_ALIGNED_T) saligned_t;
00070 #elif QTHREAD_SIZEOF_ALIGNED_T == 8
00071 typedef uint64_t Q_ALIGNED(QTHREAD_ALIGNMENT_ALIGNED_T) aligned_t;
00072 typedef int64_t Q_ALIGNED(QTHREAD_ALIGNMENT_ALIGNED_T) saligned_t;
00073 #else
00074 #error "Don't know type for sizeof aligned_t"
00075 #endif
00076 
00077 typedef struct _syncvar_s {
00078     volatile union {
00079         volatile uint64_t w;
00080         volatile struct {
00081 #ifdef BITFIELD_ORDER_FORWARD
00082             volatile uint64_t data : 60;
00083             volatile unsigned state : 3;
00084             volatile unsigned lock : 1;
00085 #else
00086             volatile unsigned lock : 1;
00087             volatile unsigned state : 3;
00088             volatile uint64_t data : 60;
00089 #endif
00090         } s;
00091     } u;
00092 } syncvar_t;
00093 #define SYNCVAR_STATIC_INITIALIZER { { 0 } }
00094 Q_ENDCXX /* */
00095 
00096 #ifdef QTHREAD_SST_PRIMITIVES
00097 # include <qthread/qthread-sst.h>
00098 #else
00099 
00100 Q_STARTCXX /* */
00101 typedef struct qthread_s qthread_t;
00102 typedef unsigned int qthread_shepherd_id_t;
00103 
00104 /* for convenient arguments to qthread_fork */
00105 typedef aligned_t(*qthread_f) (qthread_t * me, void *arg);
00106 
00107 /* use this function to initialize the qthreads environment before spawning any
00108  * qthreads. The argument to this function used to specify the number of
00109  * pthreads that will be spawned to shepherd the qthreads. This number is now
00110  * ignored, the qthread_init() function is deprecated, and qthread_initialize()
00111  * now takes its place. If you MUST specify the number of shepherds, use the
00112  * environment variable QTHREAD_NUM_SHEPHERDS. */
00113 int qthread_init(qthread_shepherd_id_t nshepherds);
00114 int qthread_initialize(void);
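
Illustrative sketch (not part of the original header): a minimal program that brings the
library up and tears it down. The program structure and error handling here are assumptions
for illustration only.

    #include <stdio.h>
    #include <qthread/qthread.h>

    int main(void)
    {
        /* preferred over the deprecated qthread_init(); the shepherd count can be
         * set externally via the QTHREAD_NUM_SHEPHERDS environment variable */
        if (qthread_initialize() != QTHREAD_SUCCESS) {
            fprintf(stderr, "could not initialize qthreads\n");
            return 1;
        }
        /* ... fork and synchronize qthreads here ... */
        qthread_finalize();            /* optional; also run automatically at exit */
        return 0;
    }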
00115 
00116 /* use this function to clean up the qthreads environment after execution of
00117  * the program is finished. This function will terminate any currently running
00118  * qthreads, so only use it when you are certain that execution has completed.
00119  * This function is automatically called when the program exits, so only use it
00120  * if reclaiming resources from the library is necessary before the program exits.
00121  */
00122 void qthread_finalize(void);
00123 
00124 /* use this function to tell a shepherd to stop accepting new threads and to
00125  * offload its existing threads to nearby shepherds. The offloading may not
00126  * take effect immediately; it may only take effect when the currently
00127  * executing qthread on that shepherd next stops executing */
00128 int qthread_disable_shepherd(const qthread_shepherd_id_t shep);
00129 void qthread_enable_shepherd(const qthread_shepherd_id_t shep);
00130 
00131 /* this function allows a qthread to specifically give up control of the
00132  * processor even though it has not blocked. This is useful for things like
00133  * busy-waits or cooperative multitasking. Without this function, threads will
00134  * only ever allow other threads assigned to the same pthread to execute when
00135  * they block. */
00136 void qthread_yield(qthread_t * me);
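
A hedged sketch of cooperative polling built on qthread_yield(); the flag word and whoever
eventually sets it are assumptions for illustration.

    static aligned_t poller(qthread_t * me, void *arg)
    {
        volatile aligned_t *flag = (volatile aligned_t *)arg;  /* assumed set by another thread */

        while (*flag == 0) {
            qthread_yield(me);         /* let other qthreads on this shepherd run */
        }
        return 0;
    }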
00137 
00138 /* this function allows a qthread to retrieve its qthread_t pointer if it has
00139  * been lost for some reason */
00140 qthread_t *qthread_self(void);
00141 
00142 /* these are the functions for generating a new qthread.
00143  *
00144  * Using qthread_fork() and variants:
00145  *
00146  *     The specified function (the first argument; note that it is a qthread_f
00147  *     and not a qthread_t) will be run to completion. You can detect that a
00148  *     thread has finished by specifying a location to store the return value
00149  *     (which will be stored with a qthread_writeF call). The qthread_fork_to
00150  *     function spawns the thread to a specific shepherd.
00151  */
00152 int qthread_fork(const qthread_f f, const void *const arg, aligned_t * ret);
00153 int qthread_fork_syncvar(const qthread_f f, const void *const arg, syncvar_t * ret);
00154 int qthread_fork_to(const qthread_f f, const void *const arg, aligned_t * ret,
00155                     const qthread_shepherd_id_t shepherd);
00156 int qthread_fork_syncvar_to(const qthread_f f, const void *const arg, syncvar_t * ret,
00157                     const qthread_shepherd_id_t shepherd);
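
Illustrative sketch: spawn a qthread and wait for its result. Because the return value is
stored with a qthread_writeF call, a qthread_readFF on the return location blocks until the
worker has finished. The add_one worker and the explicit qthread_empty are assumptions for
illustration (the empty simply ensures the return slot starts out empty so the readFF blocks).

    static aligned_t add_one(qthread_t * me, void *arg)
    {
        return *(aligned_t *)arg + 1;
    }

    static void spawn_and_wait(qthread_t * me)   /* me may be NULL outside a qthread */
    {
        aligned_t arg = 41;
        aligned_t ret = 0;
        aligned_t result;

        qthread_empty(me, &ret);                 /* make sure the return slot is empty */
        qthread_fork(add_one, &arg, &ret);       /* worker's return value will fill &ret */
        qthread_readFF(me, &result, &ret);       /* blocks until &ret becomes full */
        /* result now holds 42 */
    }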
00158 
00159 /* Using qthread_prepare()/qthread_schedule() and variants:
00160  *
00161  *     The combination of these two functions works like qthread_fork().
00162  *     First, qthread_prepare() creates a qthread_t object that is ready to be
00163  *     run (almost), but has not been scheduled. Next, qthread_schedule puts
00164  *     the finishing touches on the qthread_t structure and places it into an
00165  *     active queue.
00166  */
00167 qthread_t *qthread_prepare(const qthread_f f, const void *const arg,
00168                            aligned_t * ret);
00169 qthread_t *qthread_prepare_for(const qthread_f f, const void *const arg,
00170                                aligned_t * ret,
00171                                const qthread_shepherd_id_t shepherd);
00172 
00173 int qthread_schedule(qthread_t * t);
00174 int qthread_schedule_on(qthread_t * t, const qthread_shepherd_id_t shepherd);
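
A sketch of the two-step spawn, under the same assumptions as the fork sketch above
(add_one is the hypothetical worker): qthread_prepare() builds the thread, and
qthread_schedule() makes it runnable.

    static void spawn_deferred(aligned_t * arg, aligned_t * ret)
    {
        qthread_t *t = qthread_prepare(add_one, arg, ret);

        if (t != NULL) {
            /* ... any additional setup can happen here ... */
            qthread_schedule(t);       /* only now can the thread actually run */
        }
    }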
00175 
00176 /* This is a function to move a thread from one shepherd to another. */
00177 int qthread_migrate_to(qthread_t * me, const qthread_shepherd_id_t shepherd);
00178 
00179 /* This function sets the debug level if debugging has been enabled */
00180 int qthread_debuglevel(int);
00181 
00182 /* these are accessor functions for use by the qthreads to retrieve information
00183  * about themselves */
00184 unsigned qthread_id(const qthread_t * t);
00185 qthread_shepherd_id_t qthread_shep(const qthread_t * t);
00186 size_t qthread_stackleft(const qthread_t * t);
00187 aligned_t *qthread_retloc(const qthread_t * t);
00188 int qthread_shep_ok(const qthread_t * t);
00189 
00190 /* returns the distance from one shepherd to another */
00191 int qthread_distance(const qthread_shepherd_id_t src,
00192                      const qthread_shepherd_id_t dest);
00193 /* returns a list of shepherds, sorted by their distance from either this
00194  * qthread or the specified shepherd */
00195 const qthread_shepherd_id_t *qthread_sorted_sheps(const qthread_t * t);
00196 const qthread_shepherd_id_t *qthread_sorted_sheps_remote(const
00197                                                          qthread_shepherd_id_t
00198                                                          src);
00199 /* returns the number of shepherds (i.e. one more than the largest valid shepherd id) */
00200 qthread_shepherd_id_t qthread_num_shepherds(void);
00201 
00202 /****************************************************************************
00203  * functions to implement FEB locking/unlocking
00204  ****************************************************************************
00205  *
00206  * These are the FEB functions. All but empty/fill have the potential of
00207  * blocking until the corresponding precondition is met. All FEB
00208  * blocking/reading/writing is done on a machine-word basis. Memory is assumed
00209  * to be full unless otherwise asserted, and as such memory that is full and
00210  * does not have dependencies (i.e. no threads are waiting for it to become
00211  * empty) does not require state data to be stored. It is expected that while
00212  * there may be locks instantiated at one time or another for a very large
00213  * number of addresses in the system, relatively few will be in a non-default
00214  * (full, no waiters) state at any one time.
00215  */
00216 
00217 /* This function is just to assist with debugging; it returns 1 if the address
00218  * is full, and 0 if the address is empty */
00219 int qthread_feb_status(const aligned_t * addr);
00220 int qthread_syncvar_status(syncvar_t *const v);
00221 
00222 /* The empty/fill functions merely assert the empty or full state of the given
00223  * address. You may be wondering why they require a qthread_t argument. The
00224  * reason for this is memory pooling; memory is allocated on a per-shepherd
00225  * basis (to avoid needing to lock the memory pool). Anyway, if you pass it a
00226  * NULL qthread_t, it will still work; it just won't be as fast. */
00227 int qthread_empty(qthread_t * me, const aligned_t * dest);
00228 int qthread_syncvar_empty(qthread_t * restrict const me, syncvar_t * restrict const dest);
00229 int qthread_fill(qthread_t * me, const aligned_t * dest);
00230 int qthread_syncvar_fill(qthread_t * restrict const me, syncvar_t * restrict const dest);
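
A small sketch, assuming a hypothetical FEB-protected word named slot: since memory is full
by default, a slot is usually emptied once before the first producer/consumer exchange.

    static aligned_t slot;             /* hypothetical FEB-protected word */

    static void init_slot(qthread_t * me)
    {
        qthread_empty(me, &slot);      /* NULL also works for me, just more slowly */
    }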
00231 
00232 /* These functions wait for memory to become empty, and then fill it. When
00233  * memory becomes empty, only one thread blocked like this will be awoken. Data
00234  * is read from src and written to dest.
00235  *
00236  * The semantics of writeEF are:
00237  * 1 - destination's FEB state must be "empty"
00238  * 2 - data is copied from src to destination
00239  * 3 - the destination's FEB state gets changed from empty to full
00240  *
00241  * This function takes a qthread_t pointer as an argument. If this is called
00242  * from somewhere other than a qthread, use NULL for the me argument. If you
00243  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00244  * (which, conveniently, returns NULL if you aren't a qthread).
00245  */
00246 int qthread_writeEF(qthread_t * me, aligned_t * restrict const dest,
00247                     const aligned_t * restrict const src);
00248 int qthread_writeEF_const(qthread_t * me, aligned_t * const dest,
00249                           const aligned_t src);
00250 int qthread_syncvar_writeEF(qthread_t *restrict me, syncvar_t *restrict
00251                             const dest, const uint64_t *restrict const src);
00252 int qthread_syncvar_writeEF_const(qthread_t *restrict me, syncvar_t *restrict
00253                             const dest, const uint64_t src);
00254 
00255 /* This function is a cross between qthread_fill() and qthread_writeEF(). It
00256  * does not wait for memory to become empty, but performs the write and sets
00257  * the state to full atomically with respect to other FEB-based actions. Data
00258  * is read from src and written to dest.
00259  *
00260  * The semantics of writeF are:
00261  * 1 - data is copied from src to destination
00262  * 2 - the destination's FEB state gets set to full
00263  *
00264  * This function takes a qthread_t pointer as an argument. If this is called
00265  * from somewhere other than a qthread, use NULL for the me argument. If you
00266  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00267  * (which, conveniently, returns NULL if you aren't a qthread).
00268  */
00269 int qthread_writeF(qthread_t * me, aligned_t * restrict const dest,
00270                    const aligned_t * restrict const src);
00271 int qthread_writeF_const(qthread_t * me, aligned_t * const dest,
00272                          const aligned_t src);
00273 int qthread_syncvar_writeF(qthread_t *restrict me, syncvar_t *restrict
00274                            const dest, const uint64_t *restrict const src);
00275 int qthread_syncvar_writeF_const(qthread_t *restrict me, syncvar_t *restrict
00276                            const dest, const uint64_t src);
00277 
00278 /* This function waits for memory to become full, and then reads it and leaves
00279  * the memory as full. When memory becomes full, all threads waiting for it to
00280  * become full with a readFF will receive the value at once and will be queued
00281  * to run. Data is read from src and stored in dest.
00282  *
00283  * The semantics of readFF are:
00284  * 1 - src's FEB state must be "full"
00285  * 2 - data is copied from src to destination
00286  *
00287  * This function takes a qthread_t pointer as an argument. If this is called
00288  * from somewhere other than a qthread, use NULL for the me argument. If you
00289  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00290  * (which, conveniently, returns NULL if you aren't a qthread).
00291  */
00292 int qthread_readFF(qthread_t * me, aligned_t * const dest,
00293                    const aligned_t * const src);
00294 int qthread_syncvar_readFF(qthread_t * restrict const me, uint64_t * restrict const dest,
00295                    syncvar_t * restrict const src);
00296 
00297 /* These functions wait for memory to become full, and then empty it. When
00298  * memory becomes full, only one thread blocked like this will be awoken. Data
00299  * is read from src and written to dest.
00300  *
00301  * The semantics of readFE are:
00302  * 1 - src's FEB state must be "full"
00303  * 2 - data is copied from src to destination
00304  * 3 - the src's FEB bits get changed from full to empty when the data is copied
00305  *
00306  * This function takes a qthread_t pointer as an argument. If this is called
00307  * from somewhere other than a qthread, use NULL for the me argument. If you
00308  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00309  * (which, conveniently, returns NULL if you aren't a qthread).
00310  */
00311 int qthread_readFE(qthread_t * me, aligned_t * const dest,
00312                    const aligned_t * const src);
00313 int qthread_syncvar_readFE(qthread_t * restrict const me, uint64_t * restrict const dest,
00314                    syncvar_t * restrict const src);
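
A hedged producer/consumer sketch using the FEB calls above. The shared word mailbox is an
assumption for illustration, and it is presumed to have been emptied once (e.g. with
qthread_empty) before either thread starts, since memory is full by default.

    static aligned_t mailbox;          /* hypothetical shared, FEB-protected word */

    static aligned_t producer(qthread_t * me, void *arg)
    {
        /* blocks until mailbox is empty, then writes 42 and marks it full */
        qthread_writeEF_const(me, &mailbox, 42);
        return 0;
    }

    static aligned_t consumer(qthread_t * me, void *arg)
    {
        aligned_t value;

        /* blocks until mailbox is full, then reads it and marks it empty again */
        qthread_readFE(me, &value, &mailbox);
        return value;
    }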
00315 
00316 /* functions to implement FEB-ish locking/unlocking
00317  *
00318  * These are atomic and functional, but do not have the same semantics as full
00319  * FEB locking/unlocking (namely, unlocking cannot block); because of this,
00320  * however, they have lower overhead.
00321  *
00322  * These functions take a qthread_t pointer as an argument. If this is called
00323  * from somewhere other than a qthread, use NULL for the me argument. If you
00324  * have lost your qthread_t pointer, it can be reclaimed using qthread_self().
00325  */
00326 int qthread_lock(qthread_t * me, const aligned_t * a);
00327 int qthread_unlock(qthread_t * me, const aligned_t * a);
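
A brief critical-section sketch; the lock word and the counter it protects are assumptions
for illustration. Any conveniently aligned word can serve as the lock address.

    static aligned_t counter_lock;     /* hypothetical word used as the lock address */
    static int shared_counter = 0;

    static void bump_counter(qthread_t * me)   /* me may be NULL outside a qthread */
    {
        qthread_lock(me, &counter_lock);
        shared_counter++;              /* critical section */
        qthread_unlock(me, &counter_lock);
    }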
00328 
00329 #if defined(QTHREAD_MUTEX_INCREMENT) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32)
00330 uint32_t qthread_incr32_(volatile uint32_t *, const int32_t);
00331 uint64_t qthread_incr64_(volatile uint64_t *, const int64_t);
00332 float qthread_fincr_(volatile float *, const float);
00333 double qthread_dincr_(volatile double *, const double);
00334 uint32_t qthread_cas32_(volatile uint32_t *, const uint32_t, const uint32_t);
00335 uint64_t qthread_cas64_(volatile uint64_t *, const uint64_t, const uint64_t);
00336 #endif
00337 
00338 /* the following functions implement variations on atomic increment. They are
00339  * implemented with architecture-specific assembly (on supported architectures,
00340  * when possible) and do NOT use FEBs or lock/unlock unless the architecture
00341  * is unsupported or cannot perform atomic operations at the right granularity.
00342  * All of these functions return the value of the contents of the operand
00343  * *before* incrementing (i.e. they have fetch-and-add semantics).
00344  */
00345 static QINLINE float qthread_fincr(volatile float *operand, const float incr)
00346 {                                      /*{{{ */
00347 #if defined (QTHREAD_MUTEX_INCREMENT)
00348     return qthread_fincr_(operand,incr);
00349 #else
00350 # if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
00351     union
00352     {
00353         float f;
00354         uint32_t i;
00355     } retval;
00356     register float incremented_value;
00357     register uint32_t scratch_int;
00358     uint32_t conversion_memory = conversion_memory;
00359     __asm__ __volatile__("1:\n\t"
00360             "lwarx  %0,0,%4\n\t"
00361             // convert from int to float
00362             "stw    %0,%2\n\t"
00363             "lfs    %1,%2\n\t"
00364             // do the addition
00365             "fadds  %1,%1,%5\n\t"
00366             // convert from float to int
00367             "stfs   %1,%2\n\t"
00368             "lwz    %3,%2\n\t"
00369             // store back to original location
00370             "stwcx. %3,0,%4\n\t"
00371             "bne-   1b\n\t"
00372             "isync"
00373             :"=&b" (retval.i),          /* %0 */
00374              "=&f" (incremented_value), /* %1 */
00375              "=m"  (conversion_memory), /* %2 */
00376              "=&r" (scratch_int)        /* %3 */
00377             :"r"   (operand),           /* %4 */
00378              "f"   (incr)               /* %5 */
00379             :"cc", "memory");
00380 
00381     return retval.f;
00382 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
00383     union
00384     {
00385         float f;
00386         uint32_t i;
00387     } oldval, newval;
00388 
00389     /* newval.f = *operand; */
00390     do {
00391         /* you *should* be able to move the *operand reference outside the
00392          * loop and use the output of the CAS (namely, newval) instead.
00393          * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
00394          * that, the while() comparison uses a temporary register value for
00395          * newval that has nothing to do with the output of the CAS
00396          * instruction. (See how obviously wrong that is?) For some reason that
00397          * I haven't been able to figure out, moving the *operand reference
00398          * inside the loop fixes that problem, even at -O2 optimization. */
00399         oldval.f = *operand;
00400         newval.f = oldval.f + incr;
00401         __asm__ __volatile__
00402                 ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00403                  "cas [%1], %2, %0"
00404                  :"+r"(newval.i)
00405                  :"r"    (operand), "r"(oldval.i)
00406                  :"cc", "memory");
00407     } while (oldval.i != newval.i);
00408     return oldval.f;
00409 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
00410     union
00411     {
00412         float f;
00413         uint32_t i;
00414     } oldval, newval, res;
00415 
00416     do {
00417         oldval.f = *operand;
00418         newval.f = oldval.f + incr;
00419         __asm__ __volatile__("mov ar.ccv=%0;;"::"rO"(oldval.i));
00420         __asm__ __volatile__("cmpxchg4.acq %0=[%1],%2,ar.ccv"
00421                              :"=r"(res.i)
00422                              :"r"    (operand), "r"(newval.i)
00423                              :"memory");
00424     } while (res.i != oldval.i);       /* if res!=old, the calc is out of date */
00425     return oldval.f;
00426 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
00427     union
00428     {
00429         float f;
00430         uint32_t i;
00431     } oldval, newval, retval;
00432 
00433     do {
00434         oldval.f = *operand;
00435         newval.f = oldval.f + incr;
00436         __asm__ __volatile__("lock; cmpxchg %1, (%2)"
00437                              :"=a"(retval.i)    /* store from EAX */
00438                              :"r"    (newval.i),
00439                               "r"(operand),
00440                               "0"(oldval.i)     /* load into EAX */
00441                              :"cc", "memory");
00442     } while (retval.i != oldval.i);
00443     return oldval.f;
00444 # else
00445 #  error Unsupported assembly architecture for qthread_fincr
00446 # endif
00447 #endif
00448 }                                      /*}}} */
00449 
00450 static QINLINE double qthread_dincr(volatile double *operand,
00451                                     const double incr)
00452 {                                      /*{{{ */
00453 #if defined(QTHREAD_MUTEX_INCREMENT) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32)
00454     return qthread_dincr_(operand, incr);
00455 #else
00456 # if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
00457     register uint64_t scratch_int;
00458     register double incremented_value;
00459     union
00460     {
00461         uint64_t i;
00462         double d;
00463     } retval;
00464     uint64_t conversion_memory = conversion_memory;
00465     __asm__ __volatile__("1:\n\t"
00466                          "ldarx %0,0,%4\n\t"
00467                          /*convert from integer to floating point */
00468                          "std   %0,%2\n\t"      // %2 is scratch memory (NOT a register)
00469                          "lfd   %1,%2\n\t"      // %1 is a scratch floating point register
00470                          /* do the addition */
00471                          "fadd  %1,%1,%5\n\t"   // %5 is the input increment
00472                          /* convert from floating point to integer */
00473                          "stfd   %1,%2\n\t"
00474                          "ld     %3,%2\n\t"
00475                          /* store back to original location */
00476                          "stdcx. %3,0,%4\n\t"
00477                          "bne-   1b\n\t"
00478                          "isync"
00479                          :"=&b" (retval.i),             /* %0 */
00480                           "=&f" (incremented_value),    /* %1 */
00481                           "=m"  (conversion_memory),    /* %2 */
00482                           "=&r" (scratch_int)           /* %3 */
00483                          :"r"   (operand),              /* %4 */
00484                           "f"   (incr)                  /* %5 */
00485                          :"cc", "memory");
00486 
00487     return retval.d;
00488 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
00489     double oldval, newval;
00490 
00491     newval = *operand;
00492     do {
00493         /* this allows the compiler to be as flexible as possible with register
00494          * assignments */
00495         register uint64_t tmp1 = tmp1;
00496         register uint64_t tmp2 = tmp2;
00497 
00498         oldval = newval;
00499         newval = oldval + incr;
00500         __asm__ __volatile__("ldx %0, %1\n\t"
00501                              "ldx %4, %2\n\t"
00502                              "membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00503                              "casx [%3], %2, %1\n\t"
00504                              "stx %1, %0"
00505                              /* h means 64-BIT REGISTER
00506                               * (probably unnecessary, but why take chances?) */
00507                              :"=m"   (newval), "=&h"(tmp1), "=&h"(tmp2)
00508                              :"r"    (operand), "m"(oldval)
00509                              :"memory");
00510     } while (oldval != newval);
00511     return oldval;
00512 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
00513     union
00514     {
00515         uint64_t i;
00516         double d;
00517     } oldval, newval;
00518 
00519     /*newval.d = *operand; */
00520     do {
00521         /* you *should* be able to move the *operand reference outside the
00522          * loop and use the output of the CAS (namely, newval) instead.
00523          * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
00524          * that, the while() comparison uses a temporary register value for
00525          * newval that has nothing to do with the output of the CAS
00526          * instruction. (See how obviously wrong that is?) For some reason that
00527          * I haven't been able to figure out, moving the *operand reference
00528          * inside the loop fixes that problem, even at -O2 optimization. */
00529         oldval.d = *operand;
00530         newval.d = oldval.d + incr;
00531         __asm__ __volatile__
00532                 ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00533                  "casx [%1], %2, %0"
00534                  :"+r"(newval.i)
00535                  :"r"(operand), "r"(oldval.i)
00536                  :"memory");
00537     } while (oldval.d != newval.d);
00538     return oldval.d;
00539 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
00540     union
00541     {
00542         uint64_t i;
00543         double d;
00544     } oldval, newval, res;
00545 
00546     do {
00547         oldval.d = *operand;
00548         newval.d = oldval.d + incr;
00549         __asm__ __volatile__("mov ar.ccv=%0;;"::"rO"(oldval.i));
00550         __asm__ __volatile__("cmpxchg8.acq %0=[%1],%2,ar.ccv"
00551                              :"=r"(res.i)
00552                              :"r"    (operand), "r"(newval.i)
00553                              :"memory");
00554     } while (res.i != oldval.i);       /* if res!=old, the calc is out of date */
00555     return oldval.d;
00556 
00557 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
00558     union
00559     {
00560         double d;
00561         uint64_t i;
00562     } oldval, newval, retval;
00563 
00564     do {
00565         oldval.d = *operand;
00566         newval.d = oldval.d + incr;
00567 #  ifdef __PGI
00568         __asm__ __volatile__("lock; cmpxchgq %1, (%2)\n\t"
00569                 "mov %%rax,(%0)"
00570                              ::"r"(&retval.i),
00571                               "r"(newval.i), "r"(operand),
00572                               "a"(oldval.i)
00573                              :"memory");
00574 #  else
00575         __asm__ __volatile__("lock; cmpxchgq %1, (%2)"
00576                              :"=a"(retval.i)
00577                              :"r"(newval.i), "r"(operand),
00578                               "0"(oldval.i)
00579                              :"memory");
00580 #  endif
00581     } while (retval.i != oldval.i);
00582     return oldval.d;
00583 
00584 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
00585     union
00586     {
00587         double d;
00588         uint64_t i;
00589         struct
00590         {
00591             /* note: the ordering of these is both important and
00592              * counter-intuitive; welcome to little-endian! */
00593             uint32_t l;
00594             uint32_t h;
00595         } s;
00596     } oldval, newval;
00597     register char test;
00598 
00599     do {
00600 #ifdef __PIC__
00601        /* this saves off %ebx to make PIC code happy :P */
00602 # define QTHREAD_PIC_PREFIX "xchg %%ebx, %4\n\t"
00603        /* this restores it */
00604 # define QTHREAD_PIC_SUFFIX "\n\txchg %%ebx, %4"
00605 # define QTHREAD_PIC_REG_4 "r"
00606 #else
00607 # define QTHREAD_PIC_PREFIX
00608 # define QTHREAD_PIC_SUFFIX
00609 # define QTHREAD_PIC_REG_4 "b"
00610 #endif
00611         oldval.d = *operand;
00612         newval.d = oldval.d + incr;
00613         /* Yeah, this is weird looking, but it really makes sense when you
00614          * understand the instruction's semantics (which make sense when you
00615          * consider that it's doing a 64-bit op on a 32-bit proc):
00616          *
00617          *    "Compares the 64-bit value in EDX:EAX with the operand
00618          *    (destination operand). If the values are equal, the 64-bit value
00619          *    in ECX:EBX is stored in the destination operand. Otherwise, the
00620          *    value in the destination operand is loaded into EDX:EAX."
00621          *
00622          * So what happens is the oldval is loaded into EDX:EAX and the newval
00623          * is loaded into ECX:EBX to start with (i.e. as inputs). Then
00624          * CMPXCHG8B does its business, after which EDX:EAX is guaranteed to
00625          * contain the value of *operand when the instruction executed. We test
00626          * the ZF field to see if the operation succeeded. We *COULD* save
00627          * EDX:EAX back into oldval to save ourselves a step when the loop
00628          * fails, but that's a waste when the loop succeeds (i.e. in the common
00629          * case). Optimizing for the common case, in this situation, means
00630          * minimizing our extra write-out to the one-byte test variable.
00631          */
00632         __asm__ __volatile__(QTHREAD_PIC_PREFIX
00633                              "lock; cmpxchg8b (%1)\n\t"
00634                              "setne %0" /* test = (ZF==0) */
00635                              QTHREAD_PIC_SUFFIX
00636                              :"=q"(test)
00637                              :"r"(operand),
00638                              /*EAX*/ "a"(oldval.s.l),
00639                              /*EDX*/ "d"(oldval.s.h),
00640                              /*EBX*/ QTHREAD_PIC_REG_4(newval.s.l),
00641                              /*ECX*/ "c"(newval.s.h)
00642                              :"memory");
00643     } while (test);                    /* if ZF was cleared, the calculation is out of date */
00644     return oldval.d;
00645 
00646 # else
00647 #  error Unimplemented assembly architecture for qthread_dincr
00648 # endif
00649 #endif
00650 }                                      /*}}} */
00651 
00652 static QINLINE uint32_t qthread_incr32(volatile uint32_t * operand,
00653                                        const uint32_t incr)
00654 {                                      /*{{{ */
00655 #ifdef QTHREAD_MUTEX_INCREMENT
00656     return qthread_incr32_(operand,incr);
00657 #else
00658 # if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || \
00659      (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
00660     uint32_t retval;
00661     register unsigned int incrd = incrd;        /* no initializing */
00662     __asm__ __volatile__("1:\tlwarx  %0,0,%2\n\t"
00663                          "add    %1,%0,%3\n\t"
00664                          "stwcx. %1,0,%2\n\t"
00665                          "bne-   1b\n\t"        /* if it failed, try again */
00666                          "isync"        /* make sure it wasn't all a dream */
00667                          :"=&b"  (retval), "=&r"(incrd)
00668                          :"r"    (operand), "r"(incr)
00669                          :"cc", "memory");
00670 
00671     return retval;
00672 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32) || \
00673        (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
00674     register uint32_t oldval, newval;
00675 
00676     /* newval = *operand; */
00677     do {
00678         /* you *should* be able to move the *operand reference outside the
00679          * loop and use the output of the CAS (namely, newval) instead.
00680          * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
00681          * that, the while() comparison uses a temporary register value for
00682          * newval that has nothing to do with the output of the CAS
00683          * instruction. (See how obviously wrong that is?) For some reason that
00684          * I haven't been able to figure out, moving the *operand reference
00685          * inside the loop fixes that problem, even at -O2 optimization. */
00686         oldval = *operand;
00687         newval = oldval + incr;
00688         /* newval always gets the value of *operand; if it's
00689          * the same as oldval, then the swap was successful */
00690         __asm__ __volatile__
00691                 ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00692                  "cas [%1] , %2, %0"
00693                  :"+r"  (newval)
00694                  :"r"    (operand), "r"(oldval)
00695                  :"cc", "memory");
00696     } while (oldval != newval);
00697     return oldval;
00698 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
00699     uint32_t res;
00700 
00701     if (incr == 1) {
00702         asm volatile ("fetchadd4.rel %0=[%1],1"
00703                       :"=r" (res)
00704                       :"r"  (operand));
00705     } else {
00706         uint32_t old, newval;
00707 
00708         do {
00709             old = *operand;            /* atomic, because operand is aligned */
00710             newval = old + incr;
00711             asm volatile ("mov ar.ccv=%0;;":    /* no output */
00712                           :"rO"    (old));
00713 
00714             /* separate so the compiler can insert its junk */
00715             asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv"
00716                           :"=r"(res)
00717                           :"r" (operand), "r"(newval)
00718                           :"memory");
00719         } while (res != old);          /* if res!=old, the calc is out of date */
00720     }
00721     return res;
00722 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32) || \
00723       (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
00724 
00725     uint32_t retval = incr;
00726     __asm__ __volatile__ ("lock ;  xaddl %0, (%1);"
00727                   :"+r" (retval)
00728                   :"r"  (operand)
00729                   :"memory");
00730 
00731     return retval;
00732 # else
00733 #  error Unimplemented assembly architecture for qthread_incr32
00734 # endif
00735 #endif
00736 }                                      /*}}} */
00737 
00738 static QINLINE uint64_t qthread_incr64(volatile uint64_t * operand,
00739                                        const uint64_t incr)
00740 {                                      /*{{{ */
00741 #if defined(QTHREAD_MUTEX_INCREMENT) || \
00742     (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || \
00743     (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
00744     return qthread_incr64_(operand,incr);
00745 #else
00746 # if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
00747     uint64_t retval;
00748     register uint64_t incrd = incrd;    /* no initializing */
00749 
00750     asm volatile ("1:\tldarx  %0,0,%2\n\t"
00751                   "add    %1,%0,%3\n\t"
00752                   "stdcx. %1,0,%2\n\t"
00753                   "bne-   1b\n\t"       /* if it failed, try again */
00754                   "isync"       /* make sure it wasn't all a dream */
00755                   :"=&b"   (retval), "=&r"(incrd)
00756                   :"r"     (operand), "r"(incr)
00757                   :"cc", "memory");
00758 
00759     return retval;
00760 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
00761     uint64_t oldval, newval = *operand;
00762 
00763     do {
00764         /* this allows the compiler to be as flexible as possible with register
00765          * assignments */
00766         register uint64_t tmp1 = tmp1;
00767         register uint64_t tmp2 = tmp2;
00768 
00769         oldval = newval;
00770         newval += incr;
00771         /* newval always gets the value of *operand; if it's
00772          * the same as oldval, then the swap was successful */
00773         __asm__ __volatile__("ldx %0, %1\n\t"
00774                              "ldx %4, %2\n\t"
00775                              "membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00776                              "casx [%3] , %2, %1\n\t"
00777                              "stx %1, %0"
00778                              /* h means 64-BIT REGISTER
00779                               * (probably unnecessary, but why take chances?) */
00780                              :"=m"   (newval), "=&h"(tmp1), "=&h"(tmp2)
00781                              :"r"    (operand), "m"(oldval)
00782                              :"cc", "memory");
00783     } while (oldval != newval);
00784     return oldval;
00785 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
00786     register uint64_t oldval, newval;
00787 
00788 #  ifdef QTHREAD_ATOMIC_CAS
00789     newval = *operand;
00790     do {
00791         oldval = newval;
00792         newval = __sync_val_compare_and_swap(operand, oldval, oldval + incr);
00793     } while (oldval != newval);
00794 #  else
00795     do {
00796         /* you *should* be able to move the *operand reference outside the
00797          * loop and use the output of the CAS (namely, newval) instead.
00798          * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
00799          * that, the while() comparison uses a temporary register value for
00800          * newval that has nothing to do with the output of the CAS
00801          * instruction. (See how obviously wrong that is?) For some reason that
00802          * I haven't been able to figure out, moving the *operand reference
00803          * inside the loop fixes that problem, even at -O2 optimization. */
00804         oldval = *operand;
00805         newval = oldval + incr;
00806         /* newval always gets the value of *operand; if it's
00807          * the same as oldval, then the swap was successful */
00808         __asm__ __volatile__
00809                 ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00810                  "casx [%1] , %2, %0"
00811                  :"+r"(newval)
00812                  :"r"    (operand), "r"(oldval)
00813                  :"cc", "memory");
00814     } while (oldval != newval);
00815 #  endif
00816     return oldval;
00817 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
00818     uint64_t res;
00819 
00820     if (incr == 1) {
00821         asm volatile ("fetchadd8.rel %0=%1,1"
00822                       :"=r" (res)
00823                       :"m"     (*operand));
00824     } else {
00825         uint64_t old, newval;
00826 
00827         do {
00828             old = *operand;            /* atomic, because operand is aligned */
00829             newval = old + incr;
00830             asm volatile ("mov ar.ccv=%0;;":    /* no output */
00831                           :"rO"    (old));
00832 
00833             /* separate so the compiler can insert its junk */
00834             asm volatile ("cmpxchg8.acq %0=[%1],%2,ar.ccv"
00835                           :"=r" (res)
00836                           :"r"     (operand), "r"(newval)
00837                           :"memory");
00838         } while (res != old);          /* if res!=old, the calc is out of date */
00839     }
00840     return res;
00841 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
00842     union
00843     {
00844         uint64_t i;
00845         struct
00846         {
00847             /* note: the ordering of these is both important and
00848              * counter-intuitive; welcome to little-endian! */
00849             uint32_t l;
00850             uint32_t h;
00851         } s;
00852     } oldval, newval;
00853     register char test;
00854 
00855     do {
00856 #ifndef QTHREAD_PIC_PREFIX
00857 # ifdef __PIC__
00858         /* should share this code with the dincr stuff */
00859         /* this saves off %ebx to make PIC code happy :P */
00860 #  define QTHREAD_PIC_PREFIX "xchg %%ebx, %4\n\t"
00861         /* this restores it */
00862 #  define QTHREAD_PIC_SUFFIX "\n\txchg %%ebx, %4"
00863 #  define QTHREAD_PIC_REG_4 "r"
00864 # else
00865 #  define QTHREAD_PIC_PREFIX
00866 #  define QTHREAD_PIC_SUFFIX
00867 #  define QTHREAD_PIC_REG_4 "b"
00868 # endif
00869 #endif
00870         oldval.i = *operand;
00871         newval.i = oldval.i + incr;
00872         /* Yeah, this is weird looking, but it really makes sense when you
00873          * understand the instruction's semantics (which make sense when you
00874          * consider that it's doing a 64-bit op on a 32-bit proc):
00875          *
00876          *    "Compares the 64-bit value in EDX:EAX with the operand
00877          *    (destination operand). If the values are equal, the 64-bit value
00878          *    in ECX:EBX is stored in the destination operand. Otherwise, the
00879          *    value in the destination operand is loaded into EDX:EAX."
00880          *
00881          * So what happens is the oldval is loaded into EDX:EAX and the newval
00882          * is loaded into ECX:EBX to start with (i.e. as inputs). Then
00883          * CMPXCHG8B does its business, after which EDX:EAX is guaranteed to
00884          * contain the value of *operand when the instruction executed. We test
00885          * the ZF field to see if the operation succeeded. We *COULD* save
00886          * EDX:EAX back into oldval to save ourselves a step when the loop
00887          * fails, but that's a waste when the loop succeeds (i.e. in the common
00888          * case). Optimizing for the common case, in this situation, means
00889          * minimizing our extra write-out to the one-byte test variable.
00890          */
00891         __asm__ __volatile__(QTHREAD_PIC_PREFIX
00892                              "lock; cmpxchg8b (%1)\n\t"
00893                              "setne %0" /* test = (ZF==0) */
00894                              QTHREAD_PIC_SUFFIX
00895                              :"=q"(test)
00896                              :"r"    (operand),
00897                              /*EAX*/ "a"(oldval.s.l),
00898                              /*EDX*/ "d"(oldval.s.h),
00899                              /*EBX*/ QTHREAD_PIC_REG_4(newval.s.l),
00900                              /*ECX*/ "c"(newval.s.h)
00901                              :"memory");
00902     } while (test);                    /* if ZF was cleared, the calculation is out of date */
00903     return oldval.i;
00904 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
00905     uint64_t retval = incr;
00906 
00907 #  ifdef __PGI
00908     /* this is a workaround for a bug in the PGI compiler where the width of
00909      * retval is not respected and %eax is used instead of %rax */
00910     __asm__ __volatile__ ("lock xaddq %0, (%2)\n\t"
00911             "mov %0,(%1)"
00912                   ::"r" (incr),
00913                   "r"(&retval),
00914                   "r" (operand)
00915                   :"memory");
00916 #  else
00917     __asm__ __volatile__ ("lock ; xaddq %0, (%1);"
00918                   :"+r" (retval)
00919                   :"r" (operand)
00920                   :"memory");
00921 #  endif
00922 
00923     return retval;
00924 # else
00925 #  error Unimplemented assembly architecture for qthread_incr64
00926 # endif
00927 #endif
00928 }                                      /*}}} */
00929 
00930 static QINLINE unsigned long qthread_incr_xx(
00931     volatile void *addr,
00932     const long int incr,
00933     const size_t length)
00934 {                                      /*{{{ */
00935     switch (length) {
00936         case 4:
00937             return qthread_incr32((volatile uint32_t *)addr, incr);
00938         case 8:
00939             return qthread_incr64((volatile uint64_t *)addr, incr);
00940         default:
00941             /* This should never happen, so deliberately cause a seg fault
00942              * for corefile analysis */
00943             *(int *)(0) = 0;
00944     }
00945     return 0;                          /* compiler check */
00946 }                                      /*}}} */
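
A usage sketch for the increment family, assuming a hypothetical shared counter named hits;
the return value is the counter's contents from before this particular increment, which makes
it usable as a unique, ordered ticket.

    static volatile uint64_t hits = 0; /* hypothetical shared counter */

    static uint64_t count_hit(void)
    {
        return qthread_incr64(&hits, 1);   /* returns the pre-increment value */
    }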
00947 
00948 #if ! defined(QTHREAD_ATOMIC_CAS) || defined(QTHREAD_MUTEX_INCREMENT)
00949 static QINLINE uint32_t qthread_cas32(volatile uint32_t * operand,
00950                                       const uint32_t oldval,
00951                                       const uint32_t newval)
00952 {                                      /*{{{ */
00953 #ifdef QTHREAD_MUTEX_INCREMENT // XXX: this is only valid if you don't read *operand without the lock
00954     return qthread_cas32_(operand,oldval,newval);
00955 #else
00956 # if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || \
00957       (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
00958     register uint32_t result;
00959     __asm__ __volatile__ ("1:\n\t"
00960             "lwarx  %0,0,%3\n\t"
00961             "cmpw   %0,%1\n\t"
00962             "bne    2f\n\t"
00963             "stwcx. %2,0,%3\n\t"
00964             "bne-   1b\n"
00965             "2:"
00966             "isync" /* make sure it wasn't all a dream */
00967             :"=&b" (result)
00968             :"r"(oldval), "r"(newval), "r"(operand)
00969             :"cc", "memory");
00970     return result;
00971 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32) || \
00972         (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
00973     register uint32_t newv = newval;
00974     __asm__ __volatile__
00975         ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00976          "cas [%1], %2, %0"
00977          : "+r" (newv)
00978          : "r" (operand), "r"(oldval)
00979          : "cc", "memory");
00980     return newv;
00981 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
00982     register uint32_t retval;
00983     __asm__ __volatile__ ("mov ar.ccv=%0;;": :"rO" (oldval));
00984     __asm__ __volatile__ ("cmpxchg4.acq %0=[%1],%2,ar.ccv"
00985             :"=r"(retval)
00986             :"r"(operand), "r"(newval)
00987             :"memory");
00988     return retval;
00989 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64) || \
00990         (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
00991     uint32_t retval;
00992     /* note that this is GNU/Linux syntax (aka AT&T syntax), not Intel syntax.
00993      * Thus, this instruction has the form:
00994      * [lock] cmpxchg reg, reg/mem
00995      *                src, dest
00996      */
00997     __asm__ __volatile__ ("lock; cmpxchg %1,(%2)"
00998             : "=&a"(retval) /* store from EAX */
00999             : "r"(newval), "r" (operand),
01000               "0"(oldval) /* load into EAX */
01001             :"cc","memory");
01002     return retval;
01003 # else
01004 #  error Unimplemented assembly architecture for qthread_cas32
01005 # endif
01006 #endif
01007 }                                      /*}}} */
01008 
01009 static QINLINE uint64_t qthread_cas64(volatile uint64_t * operand,
01010                                       const uint64_t oldval,
01011                                       const uint64_t newval)
01012 {                                      /*{{{ */
01013 #ifdef QTHREAD_MUTEX_INCREMENT
01014     return qthread_cas64_(operand, oldval, newval);
01015 #else
01016 # if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
01017     register uint64_t result;
01018     __asm__ __volatile__ ("1:\n\t"
01019             "ldarx  %0,0,%3\n\t"
01020             "cmpd   %0,%1\n\t"
01021             "bne    2f\n\t"
01022             "stdcx. %2,0,%3\n\t"
01023             "bne-   1b\n"
01024             "2:"
01025             "isync" /* make sure it wasn't all a dream */
01026             :"=&b" (result)
01027             :"r"(oldval), "r"(newval), "r"(operand)
01028             :"cc", "memory");
01029     return result;
01030 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
01031     register uint64_t tmp1=tmp1;
01032     register uint64_t tmp2=tmp2;
01033     uint64_t newv = newval;
01034     __asm__ __volatile__
01035         ("ldx %0, %1\n\t"
01036          "ldx %4, %2\n\t"
01037          "membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
01038          "casx [%3], %2, %1\n\t"
01039          "stx %1, %0"
01040          /* h means 64-BIT REGISTER
01041           * (probably unnecessary, but why take chances?) */
01042          : "+m" (newv), "=&h" (tmp1), "=&h"(tmp2)
01043          : "r" (operand), "m"(oldval)
01044          : "cc", "memory");
01045     return newv;
01046 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
01047     register uint64_t newv = newval;
01048     __asm__ __volatile__
01049         ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
01050          "casx [%1], %2, %0"
01051          : "+r" (newv)
01052          : "r" (operand), "r"(oldval)
01053          : "cc", "memory");
01054     return newv;
01055 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
01056     register uint64_t retval;
01057     __asm__ __volatile__ ("mov ar.ccv=%0;;": :"rO" (oldval));
01058     __asm__ __volatile__ ("cmpxchg8.acq %0=[%1],%2,ar.ccv"
01059             :"=r"(retval)
01060             :"r"(operand), "r"(newval)
01061             :"memory");
01062     return retval;
01063 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
01064     union {
01065         uint64_t i;
01066         struct {
01067             /* note: the ordering of these is both important and
01068              * counter-intuitive; welcome to little-endian! */
01069             uint32_t l;
01070             uint32_t h;
01071         } s;
01072     } oldv, newv, ret;
01073     oldv.i = oldval;
01074     newv.i = newval;
01075 #ifndef QTHREAD_PIC_PREFIX
01076 #ifdef __PIC__
01077        /* this saves off %ebx to make PIC code happy :P */
01078 # define QTHREAD_PIC_PREFIX "xchg %%ebx, %4\n\t"
01079        /* this restores it */
01080 # define QTHREAD_PIC_SUFFIX "\n\txchg %%ebx, %4"
01081 # define QTHREAD_PIC_REG_4 "r"
01082 #else
01083 # define QTHREAD_PIC_PREFIX
01084 # define QTHREAD_PIC_SUFFIX
01085 # define QTHREAD_PIC_REG_4 "b"
01086 #endif
01087 #endif
01088     /* the PIC stuff is already defined above */
01089     __asm__ __volatile__ (
01090             QTHREAD_PIC_PREFIX
01091             "lock; cmpxchg8b (%2)"
01092             QTHREAD_PIC_SUFFIX
01093             :/*EAX*/"=a"(ret.s.l),
01094             /*EDX*/"=d"(ret.s.h)
01095             :"r"(operand),
01096             /*EAX*/"a"(oldv.s.l),
01097             /*EBX*/QTHREAD_PIC_REG_4(newv.s.l),
01098             /*EDX*/"d"(oldv.s.h),
01099             /*ECX*/"c"(newv.s.h)
01100             :"memory");
01101     return ret.i;
01102 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
01103     /* note that this is GNU/Linux syntax (aka AT&T syntax), not Intel syntax.
01104      * Thus, this instruction has the form:
01105      * [lock] cmpxchg reg, reg/mem
01106      *                src, dest
01107      */
01108 #  ifdef __PGI
01109     /* this is a workaround for a bug in the PGI compiler where the width of
01110      * retval is not respected and %eax is used instead of %rax */
01111     uint64_t retval;
01112     __asm__ __volatile__ ("lock cmpxchg %1,(%2)\n\t"
01113             "mov %%rax,(%0)"
01114             ::"r"(&retval), "r"(newval), "r"(operand),
01115             "a"(oldval) /* load into RAX */
01116             :"cc","memory");
01117     return retval;
01118 #  else
01119     uint64_t retval;
01120     __asm__ __volatile__ ("lock; cmpxchg %1,(%2)"
01121             : "=a"(retval) /* store from RAX */
01122             : "r"(newval), "r" (operand),
01123               "a"(oldval) /* load into RAX */
01124             :"cc","memory");
01125     return retval;
01126 #  endif
01127 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32)
01128     /* In general, RISC doesn't provide a way to do 64 bit operations from 32
01129      * bit code. Sorry! */
01130     uint64_t retval;
01131     qthread_t *me = qthread_self();
01132 
01133     qthread_lock(me, (aligned_t*)operand);
01134     retval = *operand;
01135     if (retval == oldval) {
01136         *operand = newval;
01137     }
01138     qthread_unlock(me, (aligned_t*)operand);
01139     return retval;
01140 # else
01141 #  error Unimplemented assembly architecture for qthread_cas64
01142 # endif
01143 #endif
01144 }                                      /*}}} */
01145 
01146 static QINLINE aligned_t qthread_cas_xx(
01147     volatile aligned_t * addr,
01148     const aligned_t oldval,
01149     const aligned_t newval,
01150     const size_t length)
01151 {                                      /*{{{ */
01152     switch (length) {
01153         case 4:
01154             return qthread_cas32((volatile uint32_t *)addr, oldval, newval);
01155         case 8:
01156             return qthread_cas64((volatile uint64_t *)addr, oldval, newval);
01157         default:
01158             /* This should never happen, so deliberately cause a seg fault
01159              * for corefile analysis */
01160             *(int *)(0) = 0;
01161     }
01162     return 0;                          /* compiler check */
01163 }                                      /*}}} */
01164 
01165 static QINLINE void *qthread_cas_ptr_(
01166     void *volatile *const addr,
01167     void *const oldval,
01168     void *const newval)
01169 {
01170 #if (SIZEOF_VOIDP == 4)
01171     return (void *)(uintptr_t) qthread_cas32((volatile uint32_t *)
01172                                              addr, (uint32_t) (uintptr_t)
01173                                              oldval, (uint32_t) (uintptr_t)
01174                                              newval);
01175 #elif (SIZEOF_VOIDP == 8)
01176     return (void *)(uintptr_t) qthread_cas64((volatile uint64_t *)
01177                                              addr, (uint64_t) (uintptr_t)
01178                                              oldval, (uint64_t) (uintptr_t)
01179                                              newval);
01180 #else
01181 #error The size of void* either could not be determined, or is very unusual.
01182     /* This should never happen, so deliberately cause a seg fault for
01183      * corefile analysis */
01184     *(int *)(0) = 0;
01185     return NULL;                       /* compiler check */
01186 #endif
01187 }
01188 #endif /* !defined(QTHREAD_ATOMIC_CAS) || defined(QTHREAD_MUTEX_INCREMENT) */
01189 
01190 #ifdef QTHREAD_ATOMIC_CAS
01191 # define qthread_cas(ADDR, OLDV, NEWV) \
01192     __sync_val_compare_and_swap((ADDR), (OLDV), (NEWV))
01193 # define qthread_cas_ptr(ADDR, OLDV, NEWV) \
01194     (void*)__sync_val_compare_and_swap((ADDR), (OLDV), (NEWV))
01195 #else
01196 # define qthread_cas(ADDR, OLDV, NEWV) \
01197     qthread_cas_xx((volatile aligned_t*)(ADDR), (aligned_t)(OLDV), (aligned_t)(NEWV), sizeof(*(ADDR)))
01198 # ifdef QTHREAD_ATOMIC_CAS_PTR
01199 #  define qthread_cas_ptr(ADDR, OLDV, NEWV) \
01200     (void*)__sync_val_compare_and_swap((ADDR), (OLDV), (NEWV))
01201 # else
01202 #  define qthread_cas_ptr(ADDR, OLDV, NEWV) \
01203     qthread_cas_ptr_((void*volatile*const)(ADDR), (void*const)(OLDV), (void*const)(NEWV))
01204 # endif
01205 #endif
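
A hedged sketch of qthread_cas() used as a one-shot claim; the owner word and the id scheme
are assumptions for illustration. qthread_cas() evaluates to the value observed at the
address before the (attempted) swap, so the swap succeeded exactly when that value equals
the expected old value.

    static volatile aligned_t owner = 0;   /* hypothetical claim word; 0 means unclaimed */

    /* returns 1 if this caller claimed ownership, 0 if someone else already had it */
    static int try_claim(aligned_t my_id)
    {
        return qthread_cas(&owner, 0, my_id) == 0;
    }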
01206 
01207 #ifdef QTHREAD_USE_ROSE_EXTENSIONS
01208 extern int __qthreads_temp;
01209 void qthread_reset_forCount(qthread_t *);
01210 #endif
01211 
01212 Q_ENDCXX /* */
01213 
01214 #ifndef __cplusplus
01215 
01216 # if defined(QTHREAD_ATOMIC_INCR) && !defined(QTHREAD_MUTEX_INCREMENT)
01217 #  define qthread_incr( ADDR, INCVAL ) \
01218     __sync_fetch_and_add(ADDR, INCVAL)
01219 # else
01220 #  define qthread_incr( ADDR, INCVAL )                  \
01221    qthread_incr_xx( (volatile void*)(ADDR), (long int)(INCVAL), sizeof(*(ADDR)) )
01222 # endif
01223 
01224 #else /* ifdef __cplusplus */
01225 # include <qthread/qthread.hpp>
01226 #endif /* __cplusplus */
01227 
01228 #endif /* QTHREAD_SST_PRIMITIVES */
01229 
01230 #endif /* _QTHREAD_H_ */
