• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

sst/elements/genericProc/programs/qthread-1.4/include/qthread/qthread-sst.h

00001 #ifndef _QTHREAD_SST_H_
00002 #define _QTHREAD_SST_H_
00003 
00004 #include <errno.h>                     /* for ENOMEM */
00005 
00006 #include <qthread/qthread-int.h>       /* for uint32_t and uint64_t */
00007 #include <qthread/common.h>            /* important configuration options */
00008 
00009 #include <string.h>                    /* for memcpy() */
00010 #include <ppcPimCalls.h>
00011 
00012 /*****************************************************************************
00013  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
00014  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
00015  * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  NOTE  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! *
00016  *                                                                           *
00017  *    The most complete documentaton is going to be in the man pages. The    *
00018  *    documentation here is just to give you a general idea of what each     *
00019  *    function does.                                                         *
00020  *                                                                           *
00021  *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *   *
00022  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
00023  *****************************************************************************/
00024 
/* C linkage guards: C++ translation units see these declarations wrapped in
 * extern "C" { ... }; plain C compilers see the guards expand to nothing. */
#ifdef __cplusplus
#define Q_STARTCXX extern "C" {
#define Q_ENDCXX }
#else
#define Q_STARTCXX
#define Q_ENDCXX
#endif

Q_STARTCXX /* */
/* in the SST environment a thread handle is just an integer; qthread_self()
 * below recovers it from the PIM thread sequence number */
typedef int qthread_t;
typedef unsigned short qthread_shepherd_id_t;   /* doubt we'll run more than 65k shepherds */

/* for convenient arguments to qthread_fork */
typedef aligned_t(*qthread_f) (qthread_t * me, void *arg);

/* While this function is *required* for UNIX, in a PIM environment, it serves
 * primarily to prove that qthreads are being used (thus the quickPrint call
 * with the "scalable" argument: 0x5ca1ab1e) */
#define qthread_init(x) PIM_quickPrint(0x5ca1ab1e,x,PIM_readSpecial(PIM_CMD_LOC_COUNT))
#define qthread_initialize() PIM_quickPrint(0x5ca1ab1e,0,PIM_readSpecial(PIM_CMD_LOC_COUNT))

/* XXX: not sure how to handle this in a truly multithreaded environment */
#define qthread_finalize() PIM_quickPrint(0xaced,0,0)

/* means nothing in a truly multithreaded environment; expands to nothing */
#define qthread_yield(x)

/* cannot be done without hardware support; these expand to nothing */
#define qthread_disable_shepherd(x)
#define qthread_enable_shepherd(x)
00055 
00056 /* this function allows a qthread to retrieve its qthread_t pointer if it has
00057  * been lost for some reason */
00058 static inline qthread_t *qthread_self(void) {
00059     return (qthread_t*)PIM_readSpecial(PIM_CMD_THREAD_SEQ);
00060 }
00061 
/* these are the functions for generating a new qthread.
 *
 * Using qthread_fork() and variants:
 *
 *     The specified function (the first argument; note that it is a qthread_f
 *     and not a qthread_t) will be run to completion. You can detect that a
 *     thread has finished by specifying a location to store the return value
 *     (which will be stored with a qthread_writeF call). The qthread_fork_to
 *     function spawns the thread to a specific shepherd.
 *
 * In the SST PIM environment, the shepherd is the CPU ID number.
 */
#define qthread_fork(f, arg, ret) qthread_fork_to((f), (arg), (ret), NO_SHEPHERD)
/* the syncvar variants share the plain implementation here; the ret pointer
 * is simply re-cast to aligned_t* */
#define qthread_fork_syncvar(f, arg, ret) qthread_fork_to((f), (arg), (aligned_t*)(ret), NO_SHEPHERD)
/* spawn f(arg) on the given shepherd; f's return value is delivered to *ret
 * via qthread_writeF (see the comment above) */
int qthread_fork_to(const qthread_f f, const void *arg, aligned_t * ret,
                    const qthread_shepherd_id_t shepherd);
#define qthread_fork_syncvar_to(f, arg, ret, shep) qthread_fork_to((f), (arg), (aligned_t*)(ret), (shep))

/* Using qthread_prepare()/qthread_schedule() and variants:
 *
 *     The combination of these two functions works like qthread_fork().
 *     First, qthread_prepare() creates a qthread_t object that is ready to be
 *     run (almost), but has not been scheduled. Next, qthread_schedule puts
 *     the finishing touches on the qthread_t structure and places it into an
 *     active queue.
 */
#define qthread_prepare(f, arg, ret) qthread_prepare_for((f), (arg), (ret), NO_SHEPHERD)
qthread_t *qthread_prepare_for(const qthread_f f, const void *arg,
                               aligned_t * ret,
                               const qthread_shepherd_id_t shepherd);

#define qthread_schedule(t) qthread_schedule_on(t, NO_SHEPHERD)
/* scheduling maps directly onto (re)starting a stopped PIM thread */
#define qthread_schedule_on(t,shep) PIM_startStoppedThread((int)t,(int)shep)
00095 
/* Migrate the calling thread to the given shepherd (CPU) by issuing the
 * SS_PIM_MOVE_TO system call directly.  The qthread_t argument is unused in
 * the SST environment.  Always reports success (returns 0). */
static inline
unsigned qthread_migrate_to(const qthread_t *me, const int shepherd)
{
    __asm__ __volatile__ (
            "mr r3, %1\n\t" /* move the shepherd id (%1) into r3, the syscall argument register */
            "li r0, %0\n\t" /* load the syscall number into r0 */
            "sc"            /* trap into the PIM syscall handler */
            ::"M"(SS_PIM_MOVE_TO),
            "r"(shepherd)
            :"r0","r3");
    return 0;
}
00108 
00109 /* these are accessor functions for use by the qthreads to retrieve information
00110  * about themselves */
00111 static inline
00112 unsigned qthread_id(const qthread_t * t)
00113 {
00114     return PIM_readSpecial(PIM_CMD_THREAD_SEQ);
00115 }
00116 static inline
00117 qthread_shepherd_id_t qthread_shep(const qthread_t * t)
00118 {
00119     return PIM_readSpecial(PIM_CMD_PROC_NUM);
00120 }
00121 static inline
00122 size_t qthread_stackleft(const qthread_t * t)
00123 {
00124     return 0;                          /* XXX: this is a bug! */
00125 }
00126 static inline
00127 aligned_t *qthread_retloc(const qthread_t * t)
00128 {
00129     return 0;                          /* XXX: this is a bug! */
00130 }
00131 static inline
00132 int qthread_shep_ok(const qthread_t * t)
00133 {
00134     return 1;
00135 }
00136 
00137 /* returns the distance from one shepherd to another */
00138 static inline
00139 int qthread_distance(const qthread_shepherd_id_t src,
00140                      const qthread_shepherd_id_t dest)
00141 {
00142     return 0;                          /* XXX: this is a bug! */
00143 }
00144 /* returns a list of shepherds, sorted by their distance from either this
00145  * qthread or the specified shepherd */
00146 static inline
00147 const qthread_shepherd_id_t *qthread_sorted_sheps(const qthread_t * t)
00148 {
00149     return NULL;
00150 }
00151 static inline
00152 const qthread_shepherd_id_t *qthread_sorted_sheps_remote(const
00153                                                          qthread_shepherd_id_t
00154                                                          src)
00155 {
00156     return NULL;
00157 }
/* returns the number of shepherds (i.e. one more than the largest valid shepherd id) */
/* in the SST environment this is the PIM "location count" special register */
#define qthread_num_shepherds() ((qthread_shepherd_id_t) PIM_readSpecial(PIM_CMD_LOC_COUNT))
00160 
00161 /****************************************************************************
00162  * functions to implement FEB locking/unlocking
00163  ****************************************************************************
00164  *
00165  * These are the FEB functions. All but empty/fill have the potential of
00166  * blocking until the corresponding precondition is met. All FEB
00167  * blocking/reading/writing is done on a machine-word basis. Memory is assumed
00168  * to be full unless otherwise asserted, and as such memory that is full and
00169  * does not have dependencies (i.e. no threads are waiting for it to become
00170  * empty) does not require state data to be stored. It is expected that while
00171  * there may be locks instantiated at one time or another for a very large
00172  * number of addresses in the system, relatively few will be in a non-default
00173  * (full, no waiters) state at any one time.
00174  */
00175 
00176 /* This function is just to assist with debugging; it returns 1 if the address
00177  * is full, and 0 if the address is empty */
00178 static inline int qthread_feb_status(const aligned_t *addr)
00179 {
00180     return PIM_feb_is_full((unsigned int*)addr);
00181 }
00182 #define qthread_syncvar_status(addr) qthread_feb_status((const aligned_t*)addr)
00183 
00184 /* The empty/fill functions merely assert the empty or full state of the given
00185  * address. */
00186 static inline
00187 int qthread_empty(qthread_t * me, const aligned_t *dest)
00188 {
00189     PIM_feb_empty((unsigned int*)dest);
00190     return 0;
00191 }
00192 #define qthread_syncvar_empty(me, dest) qthread_empty((me), (aligned_t*)(dest))
00193 static inline
00194 int qthread_fill(qthread_t * me, const aligned_t *dest)
00195 {
00196     PIM_feb_fill((unsigned int*)dest);
00197     return 0;
00198 }
00199 #define qthread_syncvar_fill(me, dest) qthread_fill((me), (aligned_t*)(dest))
00200 
00201 /* These functions wait for memory to become empty, and then fill it. When
00202  * memory becomes empty, only one thread blocked like this will be awoken. Data
00203  * is read from src and written to dest.
00204  *
00205  * The semantics of writeEF are:
00206  * 1 - destination's FEB state must be "empty"
00207  * 2 - data is copied from src to destination
00208  * 3 - the destination's FEB state gets changed from empty to full
00209  *
00210  * This function takes a qthread_t pointer as an argument. If this is called
00211  * from somewhere other than a qthread, use NULL for the me argument. If you
00212  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00213  * (which, conveniently, returns NULL if you aren't a qthread).
00214  */
00215 static inline
00216 int qthread_writeEF(qthread_t * me, aligned_t * const dest,
00217                     const aligned_t * const src)
00218 {
00219     PIM_feb_writeef(dest, *src);
00220     return 0;
00221 }
00222 static inline
00223 int qthread_writeEF_const(qthread_t * me, aligned_t * const dest,
00224                           const aligned_t src)
00225 {
00226     PIM_feb_writeef(dest, src);
00227     return 0;
00228 }
00229 #define qthread_syncvar_writeEF(me, dest, src) qthread_writeEF((me), (aligned_t*)(dest), (aligned_t*)(src))
00230 #define qthread_syncvar_writeEF_const(me, dest, src) qthread_writeEF_const((me), (aligned_t*)(dest), (aligned_t*)(src))
00231 
00232 /* This function is a cross between qthread_fill() and qthread_writeEF(). It
00233  * does not wait for memory to become empty, but performs the write and sets
00234  * the state to full atomically with respect to other FEB-based actions. Data
00235  * is read from src and written to dest.
00236  *
00237  * The semantics of writeF are:
00238  * 1 - data is copied from src to destination
00239  * 2 - the destination's FEB state gets set to full
00240  *
00241  * This function takes a qthread_t pointer as an argument. If this is called
00242  * from somewhere other than a qthread, use NULL for the me argument. If you
00243  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00244  * (which, conveniently, returns NULL if you aren't a qthread).
00245  */
00246 static inline int qthread_writeF(qthread_t * me, aligned_t * const dest,
00247                                  const aligned_t * const src)
00248 {
00249     *dest = *src;
00250     PIM_feb_fill(dest);
00251     return 0;
00252 }
00253 static inline int qthread_writeF_const(qthread_t * me, aligned_t * const dest,
00254                                        const aligned_t src)
00255 {
00256     *dest = src;
00257     PIM_feb_fill(dest);
00258     return 0;
00259 }
00260 #define qthread_syncvar_writeF(me, dest, src) qthread_writeF((me), (aligned_t*)(dest), (aligned_t*)(src))
00261 #define qthread_syncvar_writeF_const(me, dest, src) qthread_writeF_const((me), (aligned_t*)(dest), (aligned_t)(src))
00262 
00263 /* This function waits for memory to become full, and then reads it and leaves
00264  * the memory as full. When memory becomes full, all threads waiting for it to
00265  * become full with a readFF will receive the value at once and will be queued
00266  * to run. Data is read from src and stored in dest.
00267  *
00268  * The semantics of readFF are:
00269  * 1 - src's FEB state must be "full"
00270  * 2 - data is copied from src to destination
00271  *
00272  * This function takes a qthread_t pointer as an argument. If this is called
00273  * from somewhere other than a qthread, use NULL for the me argument. If you
00274  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00275  * (which, conveniently, returns NULL if you aren't a qthread).
00276  */
00277 static inline
00278 int qthread_readFF(qthread_t * me, aligned_t * const dest,
00279                    const aligned_t * const src)
00280 {
00281     if (dest != NULL && dest != src) {
00282         *dest = PIM_feb_readff((aligned_t * const)src);
00283     } else {
00284         PIM_feb_readff((aligned_t * const)src);
00285     }
00286     return 0;
00287 }
00288 #define qthread_syncvar_readFF(me, dest, src) qthread_readFF((me), (aligned_t*)(dest), (aligned_t*)(src))
00289 
00290 /* These functions wait for memory to become full, and then empty it. When
00291  * memory becomes full, only one thread blocked like this will be awoken. Data
00292  * is read from src and written to dest.
00293  *
00294  * The semantics of readFE are:
00295  * 1 - src's FEB state must be "full"
00296  * 2 - data is copied from src to destination
00297  * 3 - the src's FEB bits get changed from full to empty when the data is copied
00298  *
00299  * This function takes a qthread_t pointer as an argument. If this is called
00300  * from somewhere other than a qthread, use NULL for the me argument. If you
00301  * have lost your qthread_t pointer, it can be reclaimed using qthread_self()
00302  * (which, conveniently, returns NULL if you aren't a qthread).
00303  */
00304 static inline
00305 int qthread_readFE(qthread_t * me, aligned_t * restrict const dest,
00306                    const aligned_t * restrict const src)
00307 {
00308     if (dest != NULL && dest != src) {
00309         *dest = PIM_feb_readfe((aligned_t * const)src);
00310     } else {
00311         PIM_feb_readfe((aligned_t * const)src);
00312     }
00313     return 0;
00314 }
00315 #define qthread_syncvar_readFE(me, dest, src) qthread_readFE((me), (aligned_t*)(dest), (aligned_t*)(src))
00316 
00317 /* functions to implement FEB-ish locking/unlocking
00318  *
00319  * These are atomic and functional, but do not have the same semantics as full
00320  * FEB locking/unlocking (namely, unlocking cannot block), however because of
00321  * this, they have lower overhead.
00322  *
00323  * These functions take a qthread_t pointer as an argument. If this is called
00324  * from somewhere other than a qthread, use NULL for the me argument. If you
00325  * have lost your qthread_t pointer, it can be reclaimed using qthread_self().
00326  */
00327 static inline
00328 int qthread_lock(qthread_t * me, const aligned_t * a)
00329 {
00330     PIM_feb_lock((aligned_t * const)a);
00331     return 0;
00332 }
00333 static inline
00334 int qthread_unlock(qthread_t * me, const aligned_t * a)
00335 {
00336     PIM_feb_unlock((aligned_t * const)a);
00337     return 0;
00338 }
00339 
/* the following three functions implement variations on atomic increment. It
 * is done with architecture-specific assembly (on supported architectures,
 * when possible) and does NOT use FEB's or lock/unlock unless the architecture
 * is unsupported or cannot perform atomic operations at the right granularity.
 * All of these functions return the value of the contents of the operand
 * *before* incrementing (i.e. fetch-and-add semantics; every implementation
 * below returns the old value).
 */
00347 
/* Atomic fetch-and-add on a 32-bit float: adds incr to *operand and returns
 * the value that was observed in *operand before the addition (every branch
 * below returns the pre-increment value).  The per-architecture strategies
 * are: PowerPC uses an lwarx/stwcx. load-reserve/store-conditional loop,
 * SPARC/IA64/x86 use compare-and-swap retry loops, and the fallback
 * serializes a plain read-modify-write under a FEB lock. */
static QINLINE float qthread_fincr(volatile float *operand, const float incr)
{                                      /*{{{ */
#if defined(HAVE_GCC_INLINE_ASSEMBLY)
# if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
    union
    {
        float f;
        uint32_t i;
    } retval;
    register float incremented_value;
    register uint32_t scratch_int;
    /* self-initialization quiets "uninitialized" warnings; the asm stores to
     * this memory (stw) before ever reading it (lfs) */
    uint32_t conversion_memory = conversion_memory;
    __asm__ __volatile__("1:\n\t"
            "lwarx  %0,0,%4\n\t"
            // convert from int to float
            "stw    %0,%2\n\t"
            "lfs    %1,%2\n\t"
            // do the addition
            "fadds  %1,%1,%5\n\t"
            // convert from float to int
            "stfs   %1,%2\n\t"
            "lwz    %3,%2\n\t"
            // store back to original location
            "stwcx. %3,0,%4\n\t"
            "bne-   1b\n\t"
            "isync"
            :"=&b" (retval.i),          /* %0 */
             "=&f" (incremented_value), /* %1 */
             "=m"  (conversion_memory), /* %2 */
             "=&r" (scratch_int)        /* %3 */
            :"r"   (operand),           /* %4 */
             "f"   (incr)               /* %5 */
            :"cc", "memory");

    return retval.f;
# elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
    union
    {
        float f;
        uint32_t i;
    } oldval, newval;

    /* newval.f = *operand; */
    do {
        /* you *should* be able to move the *operand reference outside the
         * loop and use the output of the CAS (namely, newval) instead.
         * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
         * that, the while() comparison uses a temporary register value for
         * newval that has nothing to do with the output of the CAS
         * instruction. (See how obviously wrong that is?) For some reason that
         * I haven't been able to figure out, moving the *operand reference
         * inside the loop fixes that problem, even at -O2 optimization. */
        oldval.f = *operand;
        newval.f = oldval.f + incr;
#if defined(__SUNPRO_CC)
        asm volatile
#else
        __asm__ __volatile__
#endif
                ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
                 "cas [%1], %2, %0"
                 :"=&r"(newval.i)
                 :"r"    (operand), "r"(oldval.i), "0"(newval.i)
                 :"cc", "memory");
        /* cas wrote the witnessed value into newval; equality with oldval
         * means the swap succeeded */
    } while (oldval.i != newval.i);
    return oldval.f;
# elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
    union
    {
        float f;
        uint32_t i;
    } oldval, newval, res;

    do {
        oldval.f = *operand;
        newval.f = oldval.f + incr;
        /* ar.ccv holds the comparison value for cmpxchg */
        __asm__ __volatile__("mov ar.ccv=%0;;"::"rO"(oldval.i));
        __asm__ __volatile__("cmpxchg4.acq %0=[%1],%2,ar.ccv"
                             :"=r"(res.i)
                             :"r"    (operand), "r"(newval.i)
                             :"memory");
    } while (res.i != oldval.i);       /* if res!=old, the calc is out of date */
    return oldval.f;
# elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
    union
    {
        float f;
        uint32_t i;
    } oldval, newval, retval;

    do {
        oldval.f = *operand;
        newval.f = oldval.f + incr;
        __asm__ __volatile__("lock; cmpxchg %1, (%2)"
                             :"=a"(retval.i)    /* store from EAX */
                             :"r"    (newval.i),
                              "r"(operand),
                              "0"(oldval.i)     /* load into EAX */
                             :"cc", "memory");
    } while (retval.i != oldval.i);
    return oldval.f;
# endif
#elif defined (QTHREAD_MUTEX_INCREMENT)

    /* fallback: serialize the read-modify-write under the FEB lock */
    float retval;
    qthread_t *me = qthread_self();

    qthread_lock(me, (aligned_t *) operand);
    retval = *operand;
    *operand += incr;
    qthread_unlock(me, (aligned_t *) operand);
    return retval;
#else
#error "Neither atomic nor mutex increment enabled; needed for qthread_fincr"
#endif
}                                      /*}}} */
00464 
00465 static QINLINE double qthread_dincr(volatile double *operand,
00466                                     const double incr)
00467 {                                      /*{{{ */
00468 #if defined(HAVE_GCC_INLINE_ASSEMBLY) && (QTHREAD_ASSEMBLY_ARCH != QTHREAD_POWERPC32)
00469 #if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
00470     register uint64_t scratch_int;
00471     register double incremented_value;
00472     union
00473     {
00474         uint64_t i;
00475         double d;
00476     } retval;
00477     uint64_t conversion_memory = conversion_memory;
00478     __asm__ __volatile__("1:\n\t"
00479                          "ldarx %0,0,%4\n\t"
00480                          /*convert from integer to floating point */
00481                          "std   %0,%2\n\t"      // %2 is scratch memory (NOT a register)
00482                          "lfd   %1,%2\n\t"      // %1 is a scratch floating point register
00483                          /* do the addition */
00484                          "fadd  %1,%1,%5\n\t"   // %4 is the input increment
00485                          /* convert from floating point to integer */
00486                          "stfd   %1,%2\n\t"
00487                          "ld     %3,%2\n\t"
00488                          /* store back to original location */
00489                          "stdcx. %3,0,%4\n\t"
00490                          "bne-   1b\n\t"
00491                          "isync"
00492                          :"=&b" (retval.i),             /* %0 */
00493                           "=&f" (incremented_value),    /* %1 */
00494                           "=m"  (conversion_memory),    /* %2 */
00495                           "=r&" (scratch_int)           /* %3 */
00496                          :"r"   (operand),              /* %4 */
00497                           "f"   (incr)                  /* %5 */
00498                          :"cc", "memory");
00499 
00500     return retval.d;
00501 #elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
00502     double oldval, newval;
00503 
00504     newval = *operand;
00505     do {
00506         /* this allows the compiler to be as flexible as possible with register
00507          * assignments */
00508         register uint64_t tmp1 = tmp1;
00509         register uint64_t tmp2 = tmp2;
00510 
00511         oldval = newval;
00512         newval = oldval + incr;
00513         __asm__ __volatile__("ldx %0, %1\n\t"
00514                              "ldx %4, %2\n\t"
00515                              "membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00516                              "casx [%3], %2, %1\n\t"
00517                              "stx %1, %0"
00518                              /* h means 64-BIT REGISTER
00519                               * (probably unnecessary, but why take chances?) */
00520                              :"=m"   (newval), "=&h"(tmp1), "=&h"(tmp2)
00521                              :"r"    (operand), "m"(oldval)
00522                              :"memory");
00523     } while (oldval != newval);
00524     return oldval;
00525 #elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
00526     union
00527     {
00528         uint64_t i;
00529         double d;
00530     } oldval, newval;
00531 
00532     /*newval.d = *operand; */
00533     do {
00534         /* you *should* be able to move the *operand reference outside the
00535          * loop and use the output of the CAS (namely, newval) instead.
00536          * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
00537          * that, the while() comparison uses a temporary register value for
00538          * newval that has nothing to do with the output of the CAS
00539          * instruction. (See how obviously wrong that is?) For some reason that
00540          * I haven't been able to figure out, moving the *operand reference
00541          * inside the loop fixes that problem, even at -O2 optimization. */
00542         oldval.d = *operand;
00543         newval.d = oldval.d + incr;
00544 #if defined(__SUNPRO_CC)
00545         asm volatile
00546 #else
00547         __asm__ __volatile__
00548 #endif
00549                 ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
00550                  "casx [%1], %2, %0"
00551                  :"=&r"(newval.i)
00552                  :"r"(operand), "r"(oldval.i), "0"(newval.i)
00553                  :"memory");
00554     } while (oldval.d != newval.d);
00555     return oldval.d;
00556 #elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
00557     union
00558     {
00559         uint64_t i;
00560         double d;
00561     } oldval, newval, res;
00562 
00563     do {
00564         oldval.d = *operand;
00565         newval.d = oldval.d + incr;
00566         __asm__ __volatile__("mov ar.ccv=%0;;"::"rO"(oldval.i));
00567         __asm__ __volatile__("cmpxchg8.acq %0=[%1],%2,ar.ccv"
00568                              :"=r"(res.i)
00569                              :"r"    (operand), "r"(newval.i)
00570                              :"memory");
00571     } while (res.i != oldval.i);       /* if res!=old, the calc is out of date */
00572     return oldval.d;
00573 
00574 #elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
00575     union
00576     {
00577         double d;
00578         uint64_t i;
00579     } oldval, newval, retval;
00580 
00581     do {
00582         oldval.d = *operand;
00583         newval.d = oldval.d + incr;
00584         __asm__ __volatile__("lock; cmpxchgq %1, (%2)"
00585                              :"=a"(retval.i)
00586                              :"r"(newval.i), "r"(operand),
00587                               "0"(oldval.i)
00588                              :"memory");
00589     } while (retval.i != oldval.i);
00590     return oldval.d;
00591 
00592 #elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
00593     union
00594     {
00595         double d;
00596         uint64_t i;
00597         struct
00598         {
00599             /* note: the ordering of these is both important and
00600              * counter-intuitive; welcome to little-endian! */
00601             uint32_t l;
00602             uint32_t h;
00603         } s;
00604     } oldval, newval;
00605     register char test;
00606 
00607     do {
00608 #ifdef __PIC__
00609        /* this saves off %ebx to make PIC code happy :P */
00610 # define QTHREAD_PIC_PREFIX "xchg %%ebx, %4\n\t"
00611        /* this restores it */
00612 # define QTHREAD_PIC_SUFFIX "\n\txchg %%ebx, %4"
00613 # define QTHREAD_PIC_REG_4 "r"
00614 #else
00615 # define QTHREAD_PIC_PREFIX
00616 # define QTHREAD_PIC_SUFFIX
00617 # define QTHREAD_PIC_REG_4 "b"
00618 #endif
00619         oldval.d = *operand;
00620         newval.d = oldval.d + incr;
00621         /* Yeah, this is weird looking, but it really makes sense when you
00622          * understand the instruction's semantics (which make sense when you
00623          * consider that it's doing a 64-bit op on a 32-bit proc):
00624          *
00625          *    Compares the 64-bit value in EDX:EAX with the operand
00626          *    (destination operand). If the values are equal, the 64-bit value
00627          *    in ECX:EBX is stored in the destination operand. Otherwise, the
00628          *    value in the destination operand is loaded into EDX:EAX."
00629          *
00630          * So what happens is the oldval is loaded into EDX:EAX and the newval
00631          * is loaded into ECX:EBX to start with (i.e. as inputs). Then
00632          * CMPXCHG8B does its business, after which EDX:EAX is guaranteed to
00633          * contain the value of *operand when the instruction executed. We test
00634          * the ZF field to see if the operation succeeded. We *COULD* save
00635          * EDX:EAX back into oldval to save ourselves a step when the loop
00636          * fails, but that's a waste when the loop succeeds (i.e. in the common
00637          * case). Optimizing for the common case, in this situation, means
00638          * minimizing our extra write-out to the one-byte test variable.
00639          */
00640         __asm__ __volatile__(QTHREAD_PIC_PREFIX
00641                              "lock; cmpxchg8b (%1)\n\t"
00642                              "setne %0" /* test = (ZF==0) */
00643                              QTHREAD_PIC_SUFFIX
00644                              :"=q"(test)
00645                              :"r"(operand),
00646                              /*EAX*/ "a"(oldval.s.l),
00647                              /*EDX*/ "d"(oldval.s.h),
00648                              /*EBX*/ QTHREAD_PIC_REG_4(newval.s.l),
00649                              /*ECX*/ "c"(newval.s.h)
00650                              :"memory");
00651     } while (test);                    /* if ZF was cleared, the calculation is out of date */
00652     return oldval.d;
00653 
00654 #else
00655 #error "Unimplemented assembly architecture"
00656 #endif
00657 #elif defined (QTHREAD_MUTEX_INCREMENT) || (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32)
00658 
00659     double retval;
00660     qthread_t *me = qthread_self();
00661 
00662     qthread_lock(me, (aligned_t *) operand);
00663     retval = *operand;
00664     *operand += incr;
00665     qthread_unlock(me, (aligned_t *) operand);
00666     return retval;
00667 #else
00668 #error "Neither atomic nor mutex increment enabled; needed for qthread_dincr"
00669 #endif
00670 }                                      /*}}} */
00671 
/* Atomic 32-bit fetch-and-add.
 *
 * operand: pointer to an aligned 32-bit counter.
 * incr:    signed amount to add.
 * Returns the value *operand held BEFORE the addition.
 *
 * One branch per supported architecture; with no inline-assembly
 * implementation, falls back to a qthread lock around the
 * read-modify-write (QTHREAD_MUTEX_INCREMENT). */
static QINLINE uint32_t qthread_incr32(volatile uint32_t * operand,
                                       const int incr)
{                                      /*{{{ */
#if defined(HAVE_GCC_INLINE_ASSEMBLY)

#if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || \
    (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
    /* PowerPC: load-reserved/store-conditional (lwarx/stwcx.) retry loop. */
    uint32_t retval;
    register unsigned int incrd = incrd;        /* no initializing */
    /* NOTE(review): incrd is a scratch register that the asm WRITES (via
     * "add %3,..."), yet it is listed as a read-only input operand; strictly
     * it should be an early-clobber output ("=&r").  The self-initialization
     * idiom only suppresses the uninitialized-use warning — verify against
     * upstream qthreads. */
    __asm__ __volatile__("1:\tlwarx  %0,0,%1\n\t"
                         "add    %3,%0,%2\n\t"
                         "stwcx. %3,0,%1\n\t"
                         "bne-   1b\n\t"        /* if it failed, try again */
                         "isync"        /* make sure it wasn't all a dream */
                         :"=&b"  (retval)
                         :"r"    (operand), "r"(incr), "r"(incrd)
                         :"cc", "memory");

    return retval;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32) || \
      (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
    /* SPARC v9: CAS retry loop with a full membar before each attempt. */
    register uint32_t oldval, newval;

    /* newval = *operand; */
    do {
        /* you *should* be able to move the *operand reference outside the
         * loop and use the output of the CAS (namely, newval) instead.
         * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
         * that, the while() comparison uses a temporary register value for
         * newval that has nothing to do with the output of the CAS
         * instruction. (See how obviously wrong that is?) For some reason that
         * I haven't been able to figure out, moving the *operand reference
         * inside the loop fixes that problem, even at -O2 optimization. */
        oldval = *operand;
        newval = oldval + incr;
        /* newval always gets the value of *operand; if it's
         * the same as oldval, then the swap was successful */
#if defined(__SUNPRO_CC)
        asm volatile
#else
        __asm__ __volatile__
#endif
                ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
                 "cas [%1] , %2, %0"
                 :"=&r"  (newval)
                 :"r"    (operand), "r"(oldval), "0"(newval)
                 :"cc", "memory");
    } while (oldval != newval);
    return oldval;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
    /* Itanium: fetchadd4 handles the common incr==1 fast path; anything
     * else needs a cmpxchg4 retry loop with ar.ccv holding the expected
     * old value. */
    uint32_t res;

    if (incr == 1) {
        asm volatile ("fetchadd4.rel %0=[%1],1"
                      :"=r" (res)
                      :"r"  (operand));
    } else {
        uint32_t old, newval;

        do {
            old = *operand;            /* atomic, because operand is aligned */
            newval = old + incr;
            asm volatile ("mov ar.ccv=%0;;":    /* no output */
                          :"rO"    (old));

            /* separate so the compiler can insert its junk */
            asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv"
                          :"=r"(res)
                          :"r" (operand), "r"(newval)
                          :"memory");
        } while (res != old);          /* if res!=old, the calc is out of date */
    }
    return res;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32) || \
      (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
    /* x86: single locked xadd does the whole fetch-and-add. */
    uint32_t retval = incr;
    asm volatile ("lock ;  xaddl %0, (%1);"
                  :"=r" (retval)
                  :"r"  (operand), "0"(retval)
                  :"memory");

    return retval;
#else

#error "Unimplemented assembly architecture"

#endif

#elif defined(QTHREAD_MUTEX_INCREMENT)
    /* Portable fallback: serialize through the qthread FEB lock keyed on
     * the operand's address. */
    uint32_t retval;
    qthread_t *me = qthread_self();

    qthread_lock(me, (aligned_t *) operand);
    retval = *operand;
    *operand += incr;
    qthread_unlock(me, (aligned_t *) operand);
    return retval;
#else

#error "Architecture unsupported for 32-bit atomic ops, and FEB increment not enabled"

#endif
}                                      /*}}} */
00776 
/* Atomic 64-bit fetch-and-add.
 *
 * operand: pointer to an aligned 64-bit counter.
 * incr:    signed amount to add.
 * Returns the value *operand held BEFORE the addition.
 *
 * NOTE: the IA32 branch defines the QTHREAD_PIC_PREFIX/SUFFIX/REG_4 macros
 * that qthread_cas64() below also relies on. */
static QINLINE uint64_t qthread_incr64(volatile uint64_t * operand,
                                       const int incr)
{                                      /*{{{ */
#if defined(HAVE_GCC_INLINE_ASSEMBLY)

#if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
    /* PowerPC64: ldarx/stdcx. (doubleword load-reserved/store-conditional)
     * retry loop. */
    uint64_t retval;
    register uint64_t incrd = incrd;    /* no initializing */
    /* NOTE(review): as in qthread_incr32, incrd is written by the asm but
     * passed as a read-only input operand — verify against upstream. */

    asm volatile ("1:\tldarx  %0,0,%1\n\t"
                  "add    %3,%0,%2\n\t"
                  "stdcx. %3,0,%1\n\t"
                  "bne-   1b\n\t"       /* if it failed, try again */
                  "isync"       /* make sure it wasn't all a dream */
                  :"=&b"   (retval)
                  :"r"     (operand), "r"(incr), "r"(incrd)
                  :"cc", "memory");

    return retval;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
    /* SPARC v9 in 32-bit mode: 64-bit values live in memory, so the asm
     * loads them into 64-bit registers (ldx), does casx, and stores the
     * result back (stx). */
    uint64_t oldval, newval = *operand;

    do {
        /* this allows the compiler to be as flexible as possible with register
         * assignments */
        register uint64_t tmp1 = tmp1;
        register uint64_t tmp2 = tmp2;

        oldval = newval;
        newval += incr;
        /* newval always gets the value of *operand; if it's
         * the same as oldval, then the swap was successful */
        __asm__ __volatile__("ldx %0, %1\n\t"
                             "ldx %4, %2\n\t"
                             "membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
                             "casx [%3] , %2, %1\n\t"
                             "stx %1, %0"
                             /* h means 64-BIT REGISTER
                              * (probably unnecessary, but why take chances?) */
                             :"=m"   (newval), "=&h"(tmp1), "=&h"(tmp2)
                             :"r"    (operand), "m"(oldval)
                             :"cc", "memory");
    } while (oldval != newval);
    return oldval;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
    /* SPARC v9 in 64-bit mode: plain casx retry loop. */
    register uint64_t oldval, newval;

    /* newval = *operand; */
    do {
        /* you *should* be able to move the *operand reference outside the
         * loop and use the output of the CAS (namely, newval) instead.
         * However, there seems to be a bug in gcc 4.0.4 wherein, if you do
         * that, the while() comparison uses a temporary register value for
         * newval that has nothing to do with the output of the CAS
         * instruction. (See how obviously wrong that is?) For some reason that
         * I haven't been able to figure out, moving the *operand reference
         * inside the loop fixes that problem, even at -O2 optimization. */
        oldval = *operand;
        newval = oldval + incr;
        /* newval always gets the value of *operand; if it's
         * the same as oldval, then the swap was successful */
#if defined(__SUNPRO_CC)
        /* Sun's C++ compiler doesn't do __asm__, but their C compiler does
         * :P */
        asm volatile
#else
        __asm__ __volatile__
#endif
                ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
                 "casx [%1] , %2, %0"
                 :"=&r"(newval)
                 :"r"    (operand), "r"(oldval), "0"(newval)
                 :"cc", "memory");
    } while (oldval != newval);
    return oldval;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
    /* Itanium: fetchadd8 fast path for incr==1, cmpxchg8 retry loop
     * otherwise. */
    uint64_t res;

    if (incr == 1) {
        asm volatile ("fetchadd8.rel %0=%1,1"
                      :"=r" (res)
                      :"m"     (*operand));
    } else {
        uint64_t old, newval;

        do {
            old = *operand;            /* atomic, because operand is aligned */
            newval = old + incr;
            asm volatile ("mov ar.ccv=%0;;":    /* no output */
                          :"rO"    (old));

            /* separate so the compiler can insert its junk */
            asm volatile ("cmpxchg8.acq %0=[%1],%2,ar.ccv"
                          :"=r" (res)
                          :"r"     (operand), "r"(newval)
                          :"memory");
        } while (res != old);          /* if res!=old, the calc is out of date */
    }
    return res;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
    /* 32-bit x86: 64-bit atomics require CMPXCHG8B, which works on the
     * EDX:EAX / ECX:EBX register pairs, hence the union splitting the
     * 64-bit value into 32-bit halves. */
    union
    {
        uint64_t i;
        struct
        {
            /* note: the ordering of these is both important and
             * counter-intuitive; welcome to little-endian! */
            uint32_t l;
            uint32_t h;
        } s;
    } oldval, newval;
    register char test;

    do {
#ifndef QTHREAD_PIC_PREFIX
# ifdef __PIC__
        /* should share this code with the dincr stuff */
        /* this saves off %ebx to make PIC code happy :P */
#  define QTHREAD_PIC_PREFIX "xchg %%ebx, %4\n\t"
        /* this restores it */
#  define QTHREAD_PIC_SUFFIX "\n\txchg %%ebx, %4"
#  define QTHREAD_PIC_REG_4 "r"
# else
#  define QTHREAD_PIC_PREFIX
#  define QTHREAD_PIC_SUFFIX
#  define QTHREAD_PIC_REG_4 "b"
# endif
#endif
        oldval.i = *operand;
        newval.i = oldval.i + incr;
        /* Yeah, this is weird looking, but it really makes sense when you
         * understand the instruction's semantics (which make sense when you
         * consider that it's doing a 64-bit op on a 32-bit proc):
         *
         *    Compares the 64-bit value in EDX:EAX with the operand
         *    (destination operand). If the values are equal, the 64-bit value
         *    in ECX:EBX is stored in the destination operand. Otherwise, the
         *    value in the destination operand is loaded into EDX:EAX."
         *
         * So what happens is the oldval is loaded into EDX:EAX and the newval
         * is loaded into ECX:EBX to start with (i.e. as inputs). Then
         * CMPXCHG8B does its business, after which EDX:EAX is guaranteed to
         * contain the value of *operand when the instruction executed. We test
         * the ZF field to see if the operation succeeded. We *COULD* save
         * EDX:EAX back into oldval to save ourselves a step when the loop
         * fails, but that's a waste when the loop succeeds (i.e. in the common
         * case). Optimizing for the common case, in this situation, means
         * minimizing our extra write-out to the one-byte test variable.
         */
        __asm__ __volatile__(QTHREAD_PIC_PREFIX
                             "lock; cmpxchg8b (%1)\n\t"
                             "setne %0" /* test = (ZF==0) */
                             QTHREAD_PIC_SUFFIX
                             :"=q"(test)
                             :"r"    (operand),
                             /*EAX*/ "a"(oldval.s.l),
                             /*EDX*/ "d"(oldval.s.h),
                             /*EBX*/ QTHREAD_PIC_REG_4(newval.s.l),
                             /*ECX*/ "c"(newval.s.h)
                             :"memory");
    } while (test);                    /* if ZF was cleared, the calculation is out of date */
    return oldval.i;
#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
    /* 64-bit x86: single locked xadd. */
    uint64_t retval = incr;

    asm volatile ("lock ; xaddq %0, (%1);"
                  :"=r" (retval)
                  :"r"     (operand), "0"(retval)
                  :"memory");

    return retval;

#elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || \
      (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)

    /* In general, RISC doesn't provide a way to do 64 bit
     * operations from 32 bit code.  Sorry. */
    uint64_t retval;
    qthread_t *me = qthread_self();

    qthread_lock(me, (aligned_t *) operand);
    retval = *operand;
    *operand += incr;
    qthread_unlock(me, (aligned_t *) operand);
    return retval;

#else

#error "Unimplemented assembly architecture"

#endif

#elif defined(QTHREAD_MUTEX_INCREMENT)

    /* Portable fallback: serialize through the qthread FEB lock keyed on
     * the operand's address. */
    uint64_t retval;
    qthread_t *me = qthread_self();

    qthread_lock(me, (aligned_t *) operand);
    retval = *operand;
    *operand += incr;
    qthread_unlock(me, (aligned_t *) operand);
    return retval;

#else

#error "Architecture unsupported for 64-bit atomic ops, and FEB increment not enabled"

#endif
}                                      /*}}} */
00986 
00987 static QINLINE unsigned long qthread_incr_xx(volatile void *addr, const int incr,
00988                                              const size_t length)
00989 {
00990     switch (length) {
00991         case 4:
00992             return qthread_incr32((volatile uint32_t *)addr, incr);
00993         case 8:
00994             return qthread_incr64((volatile uint64_t *)addr, incr);
00995         default:
00996             /* This should never happen, so deliberately cause a seg fault
00997              * for corefile analysis */
00998             *(int *)(0) = 0;
00999     }
01000     return 0;                          /* compiler check */
01001 }
01002 
/* Atomic 32-bit compare-and-swap: if *operand == oldval, store newval.
 *
 * Returns the value *operand held before the attempt; equal to oldval
 * exactly when the swap succeeded. */
static QINLINE uint32_t qthread_cas32(volatile uint32_t * operand,
                                      const uint32_t oldval,
                                      const uint32_t newval)
{                                      /*{{{ */
#if defined(HAVE_GCC_INLINE_ASSEMBLY)
# if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32) || \
      (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
    /* PowerPC: lwarx/stwcx. loop; bail to label 2 as soon as the loaded
     * value differs from oldval. */
    register uint32_t result;
    /* NOTE(review): the adjacent literals "2:" "isync" concatenate into a
     * single asm line "2:isync" (label immediately followed by the
     * instruction); gas accepts this, but a "\n\t" separator would be
     * clearer. */
    __asm__ __volatile__ ("1:\n\t"
            "lwarx  %0,0,%3\n\t"
            "cmpw   %0,%1\n\t"
            "bne    2f\n\t"
            "stwcx. %2,0,%3\n\t"
            "bne-   1b\n"
            "2:"
            "isync" /* make sure it wasn't all a dream */
            :"=&b" (result)
            :"r"(oldval), "r"(newval), "r"(operand)
            :"cc", "memory");
    return result;
# elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32) || \
        (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
    /* SPARC v9: single cas instruction; newv comes back holding the prior
     * memory contents. */
    register uint32_t newv = newval;
#  if defined(__SUNPRO_CC)
    asm volatile
#  else
    __asm__ __volatile__
#  endif
        ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
         "cas [%1], %2, %0"
         : "=&r" (newv)
         : "r" (operand), "r"(oldval), "0"(newv)
         : "cc", "memory");
    return newv;
# elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
    /* Itanium: the expected old value must be staged in ar.ccv before the
     * cmpxchg4 executes. */
    register uint32_t retval;
    __asm__ __volatile__ ("mov ar.ccv=%0;;": :"rO" (oldval));
    __asm__ __volatile__ ("cmpxchg4.acq %0=[%1],%2,ar.ccv"
            :"=r"(retval)
            :"r"(operand), "r"(newval)
            :"memory");
    return retval;
# elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64) || \
        (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
    /* x86: locked cmpxchg with oldval staged in EAX; EAX afterwards holds
     * the prior memory contents. */
    uint32_t retval;
    /* note that this is GNU/Linux syntax (aka AT&T syntax), not Intel syntax.
     * Thus, this instruction has the form:
     * [lock] cmpxchg reg, reg/mem
     *                src, dest
     */
    __asm__ __volatile__ ("lock; cmpxchg %1,(%2)"
            : "=&a"(retval) /* store from EAX */
            : "r"(newval), "r" (operand),
              "0"(oldval) /* load into EAX */
            :"cc","memory");
    return retval;
# else
#  error "Don't have a qthread_cas implementation for this architecture"
# endif
#else
# error "CAS needs inline assembly OR __sync_val_compare_and_swap"
#endif
}                                      /*}}} */
01066 
01067 static QINLINE uint64_t qthread_cas64(volatile uint64_t * operand,
01068                                       const uint64_t oldval,
01069                                       const uint64_t newval)
01070 {                                      /*{{{ */
01071 #if defined(HAVE_GCC_INLINE_ASSEMBLY)
01072 # if (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC64)
01073     register uint64_t result;
01074     __asm__ __volatile__ ("1:\n\t"
01075             "ldarx  %0,0,%3\n\t"
01076             "cmpw   %0,%1\n\t"
01077             "bne    2f\n\t"
01078             "stdcx. %2,0,%3\n\t"
01079             "bne-   1b\n"
01080             "2:"
01081             "isync" /* make sure it wasn't all a dream */
01082             :"=&b" (result)
01083             :"r"(oldval), "r"(newval), "r"(operand)
01084             :"cc", "memory");
01085     return result;
01086 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_32)
01087     register uint64_t tmp1=tmp1;
01088     register uint64_t tmp2=tmp2;
01089     uint64_t newv = newval;
01090 #  if defined(__SUNPRO_CC)
01091     asm volatile
01092 #  else
01093     __asm__ __volatile__
01094 #  endif
01095         ("ldx %0, %1\n\t"
01096          "ldx %4, %2\n\t"
01097          "membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
01098          "casx [%3], %2, %1\n\t"
01099          "stx %1, %0"
01100          /* h means 64-BIT REGISTER
01101           * (probably unneecessary, but why take chances?) */
01102          : "=m" (newv), "=&h" (tmp1), "=&h"(tmp2)
01103          : "r" (operand), "m"(oldval), "0"(newv)
01104          : "cc", "memory");
01105     return newv;
01106 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_SPARCV9_64)
01107     register uint64_t newv = newval;
01108 #  if defined(__SUNPRO_CC)
01109     asm volatile
01110 #  else
01111     __asm__ __volatile__
01112 #  endif
01113         ("membar #StoreStore|#LoadStore|#StoreLoad|#LoadLoad\n\t"
01114          "casx [%1], %2, %0"
01115          : "=&r" (newv)
01116          : "r" (operand), "r"(oldval), "0"(newv)
01117          : "cc", "memory");
01118     return newv;
01119 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA64)
01120     register uint32_t retval;
01121     __asm__ __volatile__ ("mov ar.ccv=%0;;": :"rO" (oldval));
01122     __asm__ __volatile__ ("cmpxchg8.acq %0=[%1],%2,ar.ccv"
01123             :"=r"(retval)
01124             :"r"(operand), "r"(newval)
01125             :"memory");
01126     return retval;
01127 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_IA32)
01128     union {
01129         uint64_t i;
01130         struct {
01131             /* note: the ordering of these is both important and
01132              * counter-intuitive; welcome to little-endian! */
01133             uint32_t l;
01134             uint32_t h;
01135         } s;
01136     } oldv, newv, ret;
01137     oldv.i = oldval;
01138     newv.i = newval;
01139     /* the PIC stuff is already defined above */
01140     __asm__ __volatile__ (
01141             QTHREAD_PIC_PREFIX
01142             "lock; cmpxchg8b (%2)"
01143             QTHREAD_PIC_SUFFIX
01144             :/*EAX*/"=a"(ret.s.l),
01145             /*EDX*/"=d"(ret.s.h)
01146             :"r"(operand),
01147             /*EAX*/"a"(oldv.s.l),
01148             /*EBX*/QTHREAD_PIC_REG_4(newv.s.l),
01149             /*EDX*/"d"(oldv.s.h),
01150             /*ECX*/"c"(newv.s.h)
01151             :"memory");
01152     return ret.i;
01153 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_AMD64)
01154     register uint64_t retval;
01155     /* note that this is GNU/Linux syntax (aka AT&T syntax), not Intel syntax.
01156      * Thus, this instruction has the form:
01157      * [lock] cmpxchg reg, reg/mem
01158      *                src, dest
01159      */
01160     __asm__ __volatile__ ("lock; cmpxchg %1,(%2)"
01161             : "=&a"(retval) /* store from EAX */
01162             : "r"(newval), "r" (operand),
01163               "0"(oldval) /* load into EAX */
01164             :"cc","memory");
01165     return retval;
01166 # elif (QTHREAD_ASSEMBLY_ARCH == QTHREAD_POWERPC32)
01167     /* In general, RISC doesn't provide a way to do 64 bit operations from 32
01168      * bit code. Sorry! */
01169     uint64_t retval;
01170     qthread_t *me = qthread_self();
01171 
01172     qthread_lock(me, (aligned_t*)operand);
01173     retval = *operand;
01174     if (retval == oldval) {
01175         *operand = newval;
01176     }
01177     qthread_unlock(me, (aligned_t*)operand);
01178     return retval;
01179 # else
01180 #  error "Don't have a qthread_cas64 implementation for this architecture"
01181 # endif
01182 #else
01183 # error "CAS needs inline assembly OR __sync_val_compare_and_swap"
01184 #endif
01185 }                                      /*}}} */
01186 
01187 static QINLINE aligned_t qthread_cas_xx(volatile aligned_t * addr,
01188                                         const aligned_t oldval,
01189                                         const aligned_t newval, const size_t length)
01190 {
01191     switch (length) {
01192         case 4:
01193             return qthread_cas32((volatile uint32_t *)addr, oldval, newval);
01194         case 8:
01195             return qthread_cas64((volatile uint64_t *)addr, oldval, newval);
01196         default:
01197             /* This should never happen, so deliberately cause a seg fault
01198              * for corefile analysis */
01199             *(int *)(0) = 0;
01200     }
01201     return 0;                          /* compiler check */
01202 }
01203 
01204 static QINLINE void *qthread_cas_ptr_(void *volatile*const addr,
01205                                      void *const oldval, void *const newval)
01206 {
01207     switch (sizeof(void *)) {
01208         case 4:
01209             return (void *)(uintptr_t) qthread_cas32((volatile uint32_t *)
01210                                                      addr,
01211                                                      (uint32_t)(uintptr_t)
01212                                                      oldval,
01213                                                      (uint32_t)(uintptr_t)
01214                                                      newval);
01215         case 8:
01216             return (void *)(uintptr_t) qthread_cas64((volatile uint64_t *)
01217                                                      addr,
01218                                                      (uint64_t)(uintptr_t)
01219                                                      oldval,
01220                                                      (uint64_t)(uintptr_t)
01221                                                      newval);
01222         default:
01223             /* This should never happen, so deliberately cause a seg fault for
01224              * corefile analysis */
01225             *(int *)(0) = 0;
01226     }
01227     return NULL;                       /* compiler check */
01228 }
01229 
/* Public CAS entry points.  When the compiler provides atomic builtins
 * (QTHREAD_ATOMIC_CAS / QTHREAD_ATOMIC_CAS_PTR), map straight onto
 * __sync_val_compare_and_swap; otherwise dispatch by operand size through
 * the helpers above. */
#ifdef QTHREAD_ATOMIC_CAS
# define qthread_cas(ADDR, OLDV, NEWV) \
    __sync_val_compare_and_swap((ADDR), (OLDV), (NEWV))
# define qthread_cas_ptr(ADDR, OLDV, NEWV) \
    __sync_val_compare_and_swap((ADDR), (OLDV), (NEWV))
#else
/* sizeof(*(ADDR)) selects the 32- or 64-bit path inside qthread_cas_xx. */
# define qthread_cas(ADDR, OLDV, NEWV) \
    qthread_cas_xx((volatile aligned_t*)(ADDR), (aligned_t)(OLDV), (aligned_t)(NEWV), sizeof(*(ADDR)))
# ifdef QTHREAD_ATOMIC_CAS_PTR
#  define qthread_cas_ptr(ADDR, OLDV, NEWV) \
    __sync_val_compare_and_swap((ADDR), (OLDV), (NEWV))
# else
#  define qthread_cas_ptr(ADDR, OLDV, NEWV) \
    qthread_cas_ptr_((void*volatile*const)(ADDR), (void*const)(OLDV), (void*const)(NEWV))
# endif
#endif
01246 
Q_ENDCXX /* */

#ifndef __cplusplus

/* C callers get qthread_incr as a macro: either the compiler's atomic
 * fetch-and-add builtin, or size dispatch through qthread_incr_xx().
 * C++ callers get the overloads from qthread.hpp instead. */
# ifdef QTHREAD_ATOMIC_INCR
#  define qthread_incr( ADDR, INCVAL ) \
    __sync_fetch_and_add(ADDR, INCVAL)
# else
#  define qthread_incr( ADDR, INCVAL )                  \
   qthread_incr_xx( (volatile void*)(ADDR), (int)(INCVAL), sizeof(*(ADDR)) )
# endif

#else /* ifdef __cplusplus */
# include "qthread.hpp"
#endif /* __cplusplus */

#endif /* _QTHREAD_SST_H_ */

Generated on Fri Oct 22 2010 11:02:24 for SST by  doxygen 1.7.1