00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
/*
 * Register-width configuration for this expansion of the template.
 * A non-zero SHIFT selects the XMM register type and lane accessors,
 * SHIFT == 0 selects the MMX ones.  XMM_ONLY() keeps its argument only
 * in the XMM expansion, and SUFFIX is glued onto every helper name.
 */
#if SHIFT != 0
#define Reg XMMReg
#define XMM_ONLY(x...) x
#define B(n) XMM_B(n)
#define W(n) XMM_W(n)
#define L(n) XMM_L(n)
#define Q(n) XMM_Q(n)
#define SUFFIX _xmm
#else
#define Reg MMXReg
#define XMM_ONLY(x...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
/* An MMX register has a single quadword, so the index is ignored. */
#define Q(n) q
#define SUFFIX _mmx
#endif
00038
00039 void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
00040 {
00041 int shift;
00042
00043 if (s->Q(0) > 15) {
00044 d->Q(0) = 0;
00045 #if SHIFT == 1
00046 d->Q(1) = 0;
00047 #endif
00048 } else {
00049 shift = s->B(0);
00050 d->W(0) >>= shift;
00051 d->W(1) >>= shift;
00052 d->W(2) >>= shift;
00053 d->W(3) >>= shift;
00054 #if SHIFT == 1
00055 d->W(4) >>= shift;
00056 d->W(5) >>= shift;
00057 d->W(6) >>= shift;
00058 d->W(7) >>= shift;
00059 #endif
00060 }
00061 }
00062
00063 void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
00064 {
00065 int shift;
00066
00067 if (s->Q(0) > 15) {
00068 shift = 15;
00069 } else {
00070 shift = s->B(0);
00071 }
00072 d->W(0) = (int16_t)d->W(0) >> shift;
00073 d->W(1) = (int16_t)d->W(1) >> shift;
00074 d->W(2) = (int16_t)d->W(2) >> shift;
00075 d->W(3) = (int16_t)d->W(3) >> shift;
00076 #if SHIFT == 1
00077 d->W(4) = (int16_t)d->W(4) >> shift;
00078 d->W(5) = (int16_t)d->W(5) >> shift;
00079 d->W(6) = (int16_t)d->W(6) >> shift;
00080 d->W(7) = (int16_t)d->W(7) >> shift;
00081 #endif
00082 }
00083
00084 void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)
00085 {
00086 int shift;
00087
00088 if (s->Q(0) > 15) {
00089 d->Q(0) = 0;
00090 #if SHIFT == 1
00091 d->Q(1) = 0;
00092 #endif
00093 } else {
00094 shift = s->B(0);
00095 d->W(0) <<= shift;
00096 d->W(1) <<= shift;
00097 d->W(2) <<= shift;
00098 d->W(3) <<= shift;
00099 #if SHIFT == 1
00100 d->W(4) <<= shift;
00101 d->W(5) <<= shift;
00102 d->W(6) <<= shift;
00103 d->W(7) <<= shift;
00104 #endif
00105 }
00106 }
00107
00108 void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
00109 {
00110 int shift;
00111
00112 if (s->Q(0) > 31) {
00113 d->Q(0) = 0;
00114 #if SHIFT == 1
00115 d->Q(1) = 0;
00116 #endif
00117 } else {
00118 shift = s->B(0);
00119 d->L(0) >>= shift;
00120 d->L(1) >>= shift;
00121 #if SHIFT == 1
00122 d->L(2) >>= shift;
00123 d->L(3) >>= shift;
00124 #endif
00125 }
00126 }
00127
00128 void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
00129 {
00130 int shift;
00131
00132 if (s->Q(0) > 31) {
00133 shift = 31;
00134 } else {
00135 shift = s->B(0);
00136 }
00137 d->L(0) = (int32_t)d->L(0) >> shift;
00138 d->L(1) = (int32_t)d->L(1) >> shift;
00139 #if SHIFT == 1
00140 d->L(2) = (int32_t)d->L(2) >> shift;
00141 d->L(3) = (int32_t)d->L(3) >> shift;
00142 #endif
00143 }
00144
00145 void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)
00146 {
00147 int shift;
00148
00149 if (s->Q(0) > 31) {
00150 d->Q(0) = 0;
00151 #if SHIFT == 1
00152 d->Q(1) = 0;
00153 #endif
00154 } else {
00155 shift = s->B(0);
00156 d->L(0) <<= shift;
00157 d->L(1) <<= shift;
00158 #if SHIFT == 1
00159 d->L(2) <<= shift;
00160 d->L(3) <<= shift;
00161 #endif
00162 }
00163 }
00164
00165 void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)
00166 {
00167 int shift;
00168
00169 if (s->Q(0) > 63) {
00170 d->Q(0) = 0;
00171 #if SHIFT == 1
00172 d->Q(1) = 0;
00173 #endif
00174 } else {
00175 shift = s->B(0);
00176 d->Q(0) >>= shift;
00177 #if SHIFT == 1
00178 d->Q(1) >>= shift;
00179 #endif
00180 }
00181 }
00182
00183 void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
00184 {
00185 int shift;
00186
00187 if (s->Q(0) > 63) {
00188 d->Q(0) = 0;
00189 #if SHIFT == 1
00190 d->Q(1) = 0;
00191 #endif
00192 } else {
00193 shift = s->B(0);
00194 d->Q(0) <<= shift;
00195 #if SHIFT == 1
00196 d->Q(1) <<= shift;
00197 #endif
00198 }
00199 }
00200
00201 #if SHIFT == 1
00202 void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
00203 {
00204 int shift, i;
00205
00206 shift = s->L(0);
00207 if (shift > 16)
00208 shift = 16;
00209 for(i = 0; i < 16 - shift; i++)
00210 d->B(i) = d->B(i + shift);
00211 for(i = 16 - shift; i < 16; i++)
00212 d->B(i) = 0;
00213 }
00214
00215 void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
00216 {
00217 int shift, i;
00218
00219 shift = s->L(0);
00220 if (shift > 16)
00221 shift = 16;
00222 for(i = 15; i >= shift; i--)
00223 d->B(i) = d->B(i - shift);
00224 for(i = 0; i < shift; i++)
00225 d->B(i) = 0;
00226 }
00227 #endif
00228
/*
 * Define helper `name`+SUFFIX that applies the binary lane operation F to
 * every byte lane: d[i] = F(d[i], s[i]) for 8 (MMX) or 16 (XMM) lanes.
 */
#define SSE_HELPER_B(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < (8 << SHIFT); lane_++) {\
        d->B(lane_) = F(d->B(lane_), s->B(lane_));\
    }\
}
00251
/*
 * Define helper `name`+SUFFIX that applies the binary lane operation F to
 * every 16-bit lane: d[i] = F(d[i], s[i]) for 4 (MMX) or 8 (XMM) lanes.
 */
#define SSE_HELPER_W(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < (4 << SHIFT); lane_++) {\
        d->W(lane_) = F(d->W(lane_), s->W(lane_));\
    }\
}
00266
/*
 * Define helper `name`+SUFFIX that applies the binary lane operation F to
 * every 32-bit lane: d[i] = F(d[i], s[i]) for 2 (MMX) or 4 (XMM) lanes.
 */
#define SSE_HELPER_L(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < (2 << SHIFT); lane_++) {\
        d->L(lane_) = F(d->L(lane_), s->L(lane_));\
    }\
}
00277
/*
 * Define helper `name`+SUFFIX that applies the binary lane operation F to
 * every 64-bit lane: one lane for MMX, two for XMM.
 */
#define SSE_HELPER_Q(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < (1 << SHIFT); lane_++) {\
        d->Q(lane_) = F(d->Q(lane_), s->Q(lane_));\
    }\
}
00286
00287 #if SHIFT == 0
/* Clamp x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    if (x > 255) {
        return 255;
    }
    return (x < 0) ? 0 : x;
}
00297
/* Clamp x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    if (x > 65535) {
        return 65535;
    }
    return (x < 0) ? 0 : x;
}
00307
/* Clamp x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    if (x > 127) {
        return 127;
    }
    return (x < -128) ? -128 : x;
}
00317
/* Clamp x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    if (x > 32767) {
        return 32767;
    }
    return (x < -32768) ? -32768 : x;
}
00327
/*
 * Per-lane binary operations plugged into the SSE_HELPER_{B,W,L,Q}
 * generators.  Every expansion is fully parenthesized so it keeps its
 * meaning inside a larger expression, not only as the right-hand side of
 * an assignment.
 */

/* Wrapping and saturating addition. */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* Wrapping and saturating subtraction. */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/* Minimum / maximum: unsigned bytes, signed words. */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* Bitwise logic. */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Comparisons yield an all-ones lane (-1) when true, zero otherwise. */
#define FCMPGTB(a, b) (((int8_t)(a) > (int8_t)(b)) ? -1 : 0)
#define FCMPGTW(a, b) (((int16_t)(a) > (int16_t)(b)) ? -1 : 0)
#define FCMPGTL(a, b) (((int32_t)(a) > (int32_t)(b)) ? -1 : 0)
#define FCMPEQ(a, b) (((a) == (b)) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned forms multiply as uint32_t: promoted
 * 16-bit operands would be multiplied as signed int, and
 * 0xFFFF * 0xFFFF does not fit in a 32-bit int (undefined behaviour).
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) (((int16_t)(a) * (int16_t)(b)) >> 16)

/* Average rounded up. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
00360 #endif
00361
00362 SSE_HELPER_B(helper_paddb, FADD)
00363 SSE_HELPER_W(helper_paddw, FADD)
00364 SSE_HELPER_L(helper_paddl, FADD)
00365 SSE_HELPER_Q(helper_paddq, FADD)
00366
00367 SSE_HELPER_B(helper_psubb, FSUB)
00368 SSE_HELPER_W(helper_psubw, FSUB)
00369 SSE_HELPER_L(helper_psubl, FSUB)
00370 SSE_HELPER_Q(helper_psubq, FSUB)
00371
00372 SSE_HELPER_B(helper_paddusb, FADDUB)
00373 SSE_HELPER_B(helper_paddsb, FADDSB)
00374 SSE_HELPER_B(helper_psubusb, FSUBUB)
00375 SSE_HELPER_B(helper_psubsb, FSUBSB)
00376
00377 SSE_HELPER_W(helper_paddusw, FADDUW)
00378 SSE_HELPER_W(helper_paddsw, FADDSW)
00379 SSE_HELPER_W(helper_psubusw, FSUBUW)
00380 SSE_HELPER_W(helper_psubsw, FSUBSW)
00381
00382 SSE_HELPER_B(helper_pminub, FMINUB)
00383 SSE_HELPER_B(helper_pmaxub, FMAXUB)
00384
00385 SSE_HELPER_W(helper_pminsw, FMINSW)
00386 SSE_HELPER_W(helper_pmaxsw, FMAXSW)
00387
00388 SSE_HELPER_Q(helper_pand, FAND)
00389 SSE_HELPER_Q(helper_pandn, FANDN)
00390 SSE_HELPER_Q(helper_por, FOR)
00391 SSE_HELPER_Q(helper_pxor, FXOR)
00392
00393 SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
00394 SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
00395 SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
00396
00397 SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
00398 SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
00399 SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
00400
00401 SSE_HELPER_W(helper_pmullw, FMULLW)
00402 #if SHIFT == 0
00403 SSE_HELPER_W(helper_pmulhrw, FMULHRW)
00404 #endif
00405 SSE_HELPER_W(helper_pmulhuw, FMULHUW)
00406 SSE_HELPER_W(helper_pmulhw, FMULHW)
00407
00408 SSE_HELPER_B(helper_pavgb, FAVG)
00409 SSE_HELPER_W(helper_pavgw, FAVG)
00410
00411 void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s)
00412 {
00413 d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
00414 #if SHIFT == 1
00415 d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
00416 #endif
00417 }
00418
00419 void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s)
00420 {
00421 int i;
00422
00423 for(i = 0; i < (2 << SHIFT); i++) {
00424 d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) +
00425 (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1);
00426 }
00427 }
00428
00429 #if SHIFT == 0
/* Absolute value of a. */
static inline int abs1(int a)
{
    return (a < 0) ? -a : a;
}
00437 #endif
00438 void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s)
00439 {
00440 unsigned int val;
00441
00442 val = 0;
00443 val += abs1(d->B(0) - s->B(0));
00444 val += abs1(d->B(1) - s->B(1));
00445 val += abs1(d->B(2) - s->B(2));
00446 val += abs1(d->B(3) - s->B(3));
00447 val += abs1(d->B(4) - s->B(4));
00448 val += abs1(d->B(5) - s->B(5));
00449 val += abs1(d->B(6) - s->B(6));
00450 val += abs1(d->B(7) - s->B(7));
00451 d->Q(0) = val;
00452 #if SHIFT == 1
00453 val = 0;
00454 val += abs1(d->B(8) - s->B(8));
00455 val += abs1(d->B(9) - s->B(9));
00456 val += abs1(d->B(10) - s->B(10));
00457 val += abs1(d->B(11) - s->B(11));
00458 val += abs1(d->B(12) - s->B(12));
00459 val += abs1(d->B(13) - s->B(13));
00460 val += abs1(d->B(14) - s->B(14));
00461 val += abs1(d->B(15) - s->B(15));
00462 d->Q(1) = val;
00463 #endif
00464 }
00465
00466 void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0)
00467 {
00468 int i;
00469 for(i = 0; i < (8 << SHIFT); i++) {
00470 if (s->B(i) & 0x80)
00471 stb(a0 + i, d->B(i));
00472 }
00473 }
00474
00475 void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val)
00476 {
00477 d->L(0) = val;
00478 d->L(1) = 0;
00479 #if SHIFT == 1
00480 d->Q(1) = 0;
00481 #endif
00482 }
00483
00484 #ifdef TARGET_X86_64
00485 void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val)
00486 {
00487 d->Q(0) = val;
00488 #if SHIFT == 1
00489 d->Q(1) = 0;
00490 #endif
00491 }
00492 #endif
00493
00494 #if SHIFT == 0
00495 void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order)
00496 {
00497 Reg r;
00498 r.W(0) = s->W(order & 3);
00499 r.W(1) = s->W((order >> 2) & 3);
00500 r.W(2) = s->W((order >> 4) & 3);
00501 r.W(3) = s->W((order >> 6) & 3);
00502 *d = r;
00503 }
00504 #else
00505 void helper_shufps(Reg *d, Reg *s, int order)
00506 {
00507 Reg r;
00508 r.L(0) = d->L(order & 3);
00509 r.L(1) = d->L((order >> 2) & 3);
00510 r.L(2) = s->L((order >> 4) & 3);
00511 r.L(3) = s->L((order >> 6) & 3);
00512 *d = r;
00513 }
00514
00515 void helper_shufpd(Reg *d, Reg *s, int order)
00516 {
00517 Reg r;
00518 r.Q(0) = d->Q(order & 1);
00519 r.Q(1) = s->Q((order >> 1) & 1);
00520 *d = r;
00521 }
00522
00523 void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order)
00524 {
00525 Reg r;
00526 r.L(0) = s->L(order & 3);
00527 r.L(1) = s->L((order >> 2) & 3);
00528 r.L(2) = s->L((order >> 4) & 3);
00529 r.L(3) = s->L((order >> 6) & 3);
00530 *d = r;
00531 }
00532
00533 void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order)
00534 {
00535 Reg r;
00536 r.W(0) = s->W(order & 3);
00537 r.W(1) = s->W((order >> 2) & 3);
00538 r.W(2) = s->W((order >> 4) & 3);
00539 r.W(3) = s->W((order >> 6) & 3);
00540 r.Q(1) = s->Q(1);
00541 *d = r;
00542 }
00543
00544 void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order)
00545 {
00546 Reg r;
00547 r.Q(0) = s->Q(0);
00548 r.W(4) = s->W(4 + (order & 3));
00549 r.W(5) = s->W(4 + ((order >> 2) & 3));
00550 r.W(6) = s->W(4 + ((order >> 4) & 3));
00551 r.W(7) = s->W(4 + ((order >> 6) & 3));
00552 *d = r;
00553 }
00554 #endif
00555
00556 #if SHIFT == 1
00557
00558
00559
/*
 * Generate the four floating-point variants of one operation:
 *   helper_<name>ps - all four single-precision lanes
 *   helper_<name>ss - lowest single-precision lane only
 *   helper_<name>pd - both double-precision lanes
 *   helper_<name>sd - lowest double-precision lane only
 * F(size, a, b) is invoked with size 32 or 64.
 */
#define SSE_HELPER_S(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < 4; lane_++) {\
        d->XMM_S(lane_) = F(32, d->XMM_S(lane_), s->XMM_S(lane_));\
    }\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < 2; lane_++) {\
        d->XMM_D(lane_) = F(64, d->XMM_D(lane_), s->XMM_D(lane_));\
    }\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}
00583
00584 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
00585 #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
00586 #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
00587 #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
00588 #define FPU_MIN(size, a, b) (a) < (b) ? (a) : (b)
00589 #define FPU_MAX(size, a, b) (a) > (b) ? (a) : (b)
00590 #define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
00591
00592 SSE_HELPER_S(add, FPU_ADD)
00593 SSE_HELPER_S(sub, FPU_SUB)
00594 SSE_HELPER_S(mul, FPU_MUL)
00595 SSE_HELPER_S(div, FPU_DIV)
00596 SSE_HELPER_S(min, FPU_MIN)
00597 SSE_HELPER_S(max, FPU_MAX)
00598 SSE_HELPER_S(sqrt, FPU_SQRT)
00599
00600
00601
00602 void helper_cvtps2pd(Reg *d, Reg *s)
00603 {
00604 float32 s0, s1;
00605 s0 = s->XMM_S(0);
00606 s1 = s->XMM_S(1);
00607 d->XMM_D(0) = float32_to_float64(s0, &env->sse_status);
00608 d->XMM_D(1) = float32_to_float64(s1, &env->sse_status);
00609 }
00610
00611 void helper_cvtpd2ps(Reg *d, Reg *s)
00612 {
00613 d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
00614 d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status);
00615 d->Q(1) = 0;
00616 }
00617
00618 void helper_cvtss2sd(Reg *d, Reg *s)
00619 {
00620 d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status);
00621 }
00622
00623 void helper_cvtsd2ss(Reg *d, Reg *s)
00624 {
00625 d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
00626 }
00627
00628
00629 void helper_cvtdq2ps(Reg *d, Reg *s)
00630 {
00631 d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status);
00632 d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status);
00633 d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status);
00634 d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status);
00635 }
00636
00637 void helper_cvtdq2pd(Reg *d, Reg *s)
00638 {
00639 int32_t l0, l1;
00640 l0 = (int32_t)s->XMM_L(0);
00641 l1 = (int32_t)s->XMM_L(1);
00642 d->XMM_D(0) = int32_to_float64(l0, &env->sse_status);
00643 d->XMM_D(1) = int32_to_float64(l1, &env->sse_status);
00644 }
00645
00646 void helper_cvtpi2ps(XMMReg *d, MMXReg *s)
00647 {
00648 d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
00649 d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
00650 }
00651
00652 void helper_cvtpi2pd(XMMReg *d, MMXReg *s)
00653 {
00654 d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
00655 d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
00656 }
00657
00658 void helper_cvtsi2ss(XMMReg *d, uint32_t val)
00659 {
00660 d->XMM_S(0) = int32_to_float32(val, &env->sse_status);
00661 }
00662
00663 void helper_cvtsi2sd(XMMReg *d, uint32_t val)
00664 {
00665 d->XMM_D(0) = int32_to_float64(val, &env->sse_status);
00666 }
00667
00668 #ifdef TARGET_X86_64
00669 void helper_cvtsq2ss(XMMReg *d, uint64_t val)
00670 {
00671 d->XMM_S(0) = int64_to_float32(val, &env->sse_status);
00672 }
00673
00674 void helper_cvtsq2sd(XMMReg *d, uint64_t val)
00675 {
00676 d->XMM_D(0) = int64_to_float64(val, &env->sse_status);
00677 }
00678 #endif
00679
00680
00681 void helper_cvtps2dq(XMMReg *d, XMMReg *s)
00682 {
00683 d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
00684 d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
00685 d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status);
00686 d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status);
00687 }
00688
00689 void helper_cvtpd2dq(XMMReg *d, XMMReg *s)
00690 {
00691 d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
00692 d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
00693 d->XMM_Q(1) = 0;
00694 }
00695
00696 void helper_cvtps2pi(MMXReg *d, XMMReg *s)
00697 {
00698 d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
00699 d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
00700 }
00701
00702 void helper_cvtpd2pi(MMXReg *d, XMMReg *s)
00703 {
00704 d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
00705 d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
00706 }
00707
00708 int32_t helper_cvtss2si(XMMReg *s)
00709 {
00710 return float32_to_int32(s->XMM_S(0), &env->sse_status);
00711 }
00712
00713 int32_t helper_cvtsd2si(XMMReg *s)
00714 {
00715 return float64_to_int32(s->XMM_D(0), &env->sse_status);
00716 }
00717
00718 #ifdef TARGET_X86_64
00719 int64_t helper_cvtss2sq(XMMReg *s)
00720 {
00721 return float32_to_int64(s->XMM_S(0), &env->sse_status);
00722 }
00723
00724 int64_t helper_cvtsd2sq(XMMReg *s)
00725 {
00726 return float64_to_int64(s->XMM_D(0), &env->sse_status);
00727 }
00728 #endif
00729
00730
00731 void helper_cvttps2dq(XMMReg *d, XMMReg *s)
00732 {
00733 d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
00734 d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
00735 d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status);
00736 d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status);
00737 }
00738
00739 void helper_cvttpd2dq(XMMReg *d, XMMReg *s)
00740 {
00741 d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
00742 d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
00743 d->XMM_Q(1) = 0;
00744 }
00745
00746 void helper_cvttps2pi(MMXReg *d, XMMReg *s)
00747 {
00748 d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
00749 d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
00750 }
00751
00752 void helper_cvttpd2pi(MMXReg *d, XMMReg *s)
00753 {
00754 d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
00755 d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
00756 }
00757
00758 int32_t helper_cvttss2si(XMMReg *s)
00759 {
00760 return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
00761 }
00762
00763 int32_t helper_cvttsd2si(XMMReg *s)
00764 {
00765 return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
00766 }
00767
00768 #ifdef TARGET_X86_64
00769 int64_t helper_cvttss2sq(XMMReg *s)
00770 {
00771 return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status);
00772 }
00773
00774 int64_t helper_cvttsd2sq(XMMReg *s)
00775 {
00776 return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status);
00777 }
00778 #endif
00779
00780 void helper_rsqrtps(XMMReg *d, XMMReg *s)
00781 {
00782 d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
00783 d->XMM_S(1) = approx_rsqrt(s->XMM_S(1));
00784 d->XMM_S(2) = approx_rsqrt(s->XMM_S(2));
00785 d->XMM_S(3) = approx_rsqrt(s->XMM_S(3));
00786 }
00787
00788 void helper_rsqrtss(XMMReg *d, XMMReg *s)
00789 {
00790 d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
00791 }
00792
00793 void helper_rcpps(XMMReg *d, XMMReg *s)
00794 {
00795 d->XMM_S(0) = approx_rcp(s->XMM_S(0));
00796 d->XMM_S(1) = approx_rcp(s->XMM_S(1));
00797 d->XMM_S(2) = approx_rcp(s->XMM_S(2));
00798 d->XMM_S(3) = approx_rcp(s->XMM_S(3));
00799 }
00800
00801 void helper_rcpss(XMMReg *d, XMMReg *s)
00802 {
00803 d->XMM_S(0) = approx_rcp(s->XMM_S(0));
00804 }
00805
00806 void helper_haddps(XMMReg *d, XMMReg *s)
00807 {
00808 XMMReg r;
00809 r.XMM_S(0) = d->XMM_S(0) + d->XMM_S(1);
00810 r.XMM_S(1) = d->XMM_S(2) + d->XMM_S(3);
00811 r.XMM_S(2) = s->XMM_S(0) + s->XMM_S(1);
00812 r.XMM_S(3) = s->XMM_S(2) + s->XMM_S(3);
00813 *d = r;
00814 }
00815
00816 void helper_haddpd(XMMReg *d, XMMReg *s)
00817 {
00818 XMMReg r;
00819 r.XMM_D(0) = d->XMM_D(0) + d->XMM_D(1);
00820 r.XMM_D(1) = s->XMM_D(0) + s->XMM_D(1);
00821 *d = r;
00822 }
00823
00824 void helper_hsubps(XMMReg *d, XMMReg *s)
00825 {
00826 XMMReg r;
00827 r.XMM_S(0) = d->XMM_S(0) - d->XMM_S(1);
00828 r.XMM_S(1) = d->XMM_S(2) - d->XMM_S(3);
00829 r.XMM_S(2) = s->XMM_S(0) - s->XMM_S(1);
00830 r.XMM_S(3) = s->XMM_S(2) - s->XMM_S(3);
00831 *d = r;
00832 }
00833
00834 void helper_hsubpd(XMMReg *d, XMMReg *s)
00835 {
00836 XMMReg r;
00837 r.XMM_D(0) = d->XMM_D(0) - d->XMM_D(1);
00838 r.XMM_D(1) = s->XMM_D(0) - s->XMM_D(1);
00839 *d = r;
00840 }
00841
00842 void helper_addsubps(XMMReg *d, XMMReg *s)
00843 {
00844 d->XMM_S(0) = d->XMM_S(0) - s->XMM_S(0);
00845 d->XMM_S(1) = d->XMM_S(1) + s->XMM_S(1);
00846 d->XMM_S(2) = d->XMM_S(2) - s->XMM_S(2);
00847 d->XMM_S(3) = d->XMM_S(3) + s->XMM_S(3);
00848 }
00849
00850 void helper_addsubpd(XMMReg *d, XMMReg *s)
00851 {
00852 d->XMM_D(0) = d->XMM_D(0) - s->XMM_D(0);
00853 d->XMM_D(1) = d->XMM_D(1) + s->XMM_D(1);
00854 }
00855
00856
/*
 * Generate the four variants (ps, ss, pd, sd) of a floating-point
 * comparison.  F(size, a, b) yields the integer mask that is stored in
 * the corresponding 32-bit (ps/ss) or 64-bit (pd/sd) lane of d.
 */
#define SSE_HELPER_CMP(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < 4; lane_++) {\
        d->XMM_L(lane_) = F(32, d->XMM_S(lane_), s->XMM_S(lane_));\
    }\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    int lane_;\
\
    for (lane_ = 0; lane_ < 2; lane_++) {\
        d->XMM_Q(lane_) = F(64, d->XMM_D(lane_), s->XMM_D(lane_));\
    }\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}
00880
00881 #define FPU_CMPEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? -1 : 0
00882 #define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0
00883 #define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0
00884 #define FPU_CMPUNORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? - 1 : 0
00885 #define FPU_CMPNEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? 0 : -1
00886 #define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1
00887 #define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1
00888 #define FPU_CMPORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? 0 : -1
00889
00890 SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
00891 SSE_HELPER_CMP(cmplt, FPU_CMPLT)
00892 SSE_HELPER_CMP(cmple, FPU_CMPLE)
00893 SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
00894 SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
00895 SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
00896 SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
00897 SSE_HELPER_CMP(cmpord, FPU_CMPORD)
00898
00899 const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
00900
00901 void helper_ucomiss(Reg *d, Reg *s)
00902 {
00903 int ret;
00904 float32 s0, s1;
00905
00906 s0 = d->XMM_S(0);
00907 s1 = s->XMM_S(0);
00908 ret = float32_compare_quiet(s0, s1, &env->sse_status);
00909 CC_SRC = comis_eflags[ret + 1];
00910 }
00911
00912 void helper_comiss(Reg *d, Reg *s)
00913 {
00914 int ret;
00915 float32 s0, s1;
00916
00917 s0 = d->XMM_S(0);
00918 s1 = s->XMM_S(0);
00919 ret = float32_compare(s0, s1, &env->sse_status);
00920 CC_SRC = comis_eflags[ret + 1];
00921 }
00922
00923 void helper_ucomisd(Reg *d, Reg *s)
00924 {
00925 int ret;
00926 float64 d0, d1;
00927
00928 d0 = d->XMM_D(0);
00929 d1 = s->XMM_D(0);
00930 ret = float64_compare_quiet(d0, d1, &env->sse_status);
00931 CC_SRC = comis_eflags[ret + 1];
00932 }
00933
00934 void helper_comisd(Reg *d, Reg *s)
00935 {
00936 int ret;
00937 float64 d0, d1;
00938
00939 d0 = d->XMM_D(0);
00940 d1 = s->XMM_D(0);
00941 ret = float64_compare(d0, d1, &env->sse_status);
00942 CC_SRC = comis_eflags[ret + 1];
00943 }
00944
00945 uint32_t helper_movmskps(Reg *s)
00946 {
00947 int b0, b1, b2, b3;
00948 b0 = s->XMM_L(0) >> 31;
00949 b1 = s->XMM_L(1) >> 31;
00950 b2 = s->XMM_L(2) >> 31;
00951 b3 = s->XMM_L(3) >> 31;
00952 return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
00953 }
00954
00955 uint32_t helper_movmskpd(Reg *s)
00956 {
00957 int b0, b1;
00958 b0 = s->XMM_L(1) >> 31;
00959 b1 = s->XMM_L(3) >> 31;
00960 return b0 | (b1 << 1);
00961 }
00962
00963 #endif
00964
00965 uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
00966 {
00967 uint32_t val;
00968 val = 0;
00969 val |= (s->B(0) >> 7);
00970 val |= (s->B(1) >> 6) & 0x02;
00971 val |= (s->B(2) >> 5) & 0x04;
00972 val |= (s->B(3) >> 4) & 0x08;
00973 val |= (s->B(4) >> 3) & 0x10;
00974 val |= (s->B(5) >> 2) & 0x20;
00975 val |= (s->B(6) >> 1) & 0x40;
00976 val |= (s->B(7)) & 0x80;
00977 #if SHIFT == 1
00978 val |= (s->B(8) << 1) & 0x0100;
00979 val |= (s->B(9) << 2) & 0x0200;
00980 val |= (s->B(10) << 3) & 0x0400;
00981 val |= (s->B(11) << 4) & 0x0800;
00982 val |= (s->B(12) << 5) & 0x1000;
00983 val |= (s->B(13) << 6) & 0x2000;
00984 val |= (s->B(14) << 7) & 0x4000;
00985 val |= (s->B(15) << 8) & 0x8000;
00986 #endif
00987 return val;
00988 }
00989
00990 void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s)
00991 {
00992 Reg r;
00993
00994 r.B(0) = satsb((int16_t)d->W(0));
00995 r.B(1) = satsb((int16_t)d->W(1));
00996 r.B(2) = satsb((int16_t)d->W(2));
00997 r.B(3) = satsb((int16_t)d->W(3));
00998 #if SHIFT == 1
00999 r.B(4) = satsb((int16_t)d->W(4));
01000 r.B(5) = satsb((int16_t)d->W(5));
01001 r.B(6) = satsb((int16_t)d->W(6));
01002 r.B(7) = satsb((int16_t)d->W(7));
01003 #endif
01004 r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
01005 r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
01006 r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
01007 r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
01008 #if SHIFT == 1
01009 r.B(12) = satsb((int16_t)s->W(4));
01010 r.B(13) = satsb((int16_t)s->W(5));
01011 r.B(14) = satsb((int16_t)s->W(6));
01012 r.B(15) = satsb((int16_t)s->W(7));
01013 #endif
01014 *d = r;
01015 }
01016
01017 void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s)
01018 {
01019 Reg r;
01020
01021 r.B(0) = satub((int16_t)d->W(0));
01022 r.B(1) = satub((int16_t)d->W(1));
01023 r.B(2) = satub((int16_t)d->W(2));
01024 r.B(3) = satub((int16_t)d->W(3));
01025 #if SHIFT == 1
01026 r.B(4) = satub((int16_t)d->W(4));
01027 r.B(5) = satub((int16_t)d->W(5));
01028 r.B(6) = satub((int16_t)d->W(6));
01029 r.B(7) = satub((int16_t)d->W(7));
01030 #endif
01031 r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
01032 r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
01033 r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
01034 r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
01035 #if SHIFT == 1
01036 r.B(12) = satub((int16_t)s->W(4));
01037 r.B(13) = satub((int16_t)s->W(5));
01038 r.B(14) = satub((int16_t)s->W(6));
01039 r.B(15) = satub((int16_t)s->W(7));
01040 #endif
01041 *d = r;
01042 }
01043
01044 void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s)
01045 {
01046 Reg r;
01047
01048 r.W(0) = satsw(d->L(0));
01049 r.W(1) = satsw(d->L(1));
01050 #if SHIFT == 1
01051 r.W(2) = satsw(d->L(2));
01052 r.W(3) = satsw(d->L(3));
01053 #endif
01054 r.W((2 << SHIFT) + 0) = satsw(s->L(0));
01055 r.W((2 << SHIFT) + 1) = satsw(s->L(1));
01056 #if SHIFT == 1
01057 r.W(6) = satsw(s->L(2));
01058 r.W(7) = satsw(s->L(3));
01059 #endif
01060 *d = r;
01061 }
01062
01063 #define UNPCK_OP(base_name, base) \
01064 \
01065 void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s) \
01066 { \
01067 Reg r; \
01068 \
01069 r.B(0) = d->B((base << (SHIFT + 2)) + 0); \
01070 r.B(1) = s->B((base << (SHIFT + 2)) + 0); \
01071 r.B(2) = d->B((base << (SHIFT + 2)) + 1); \
01072 r.B(3) = s->B((base << (SHIFT + 2)) + 1); \
01073 r.B(4) = d->B((base << (SHIFT + 2)) + 2); \
01074 r.B(5) = s->B((base << (SHIFT + 2)) + 2); \
01075 r.B(6) = d->B((base << (SHIFT + 2)) + 3); \
01076 r.B(7) = s->B((base << (SHIFT + 2)) + 3); \
01077 XMM_ONLY( \
01078 r.B(8) = d->B((base << (SHIFT + 2)) + 4); \
01079 r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
01080 r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
01081 r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
01082 r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
01083 r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
01084 r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
01085 r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
01086 ) \
01087 *d = r; \
01088 } \
01089 \
01090 void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s) \
01091 { \
01092 Reg r; \
01093 \
01094 r.W(0) = d->W((base << (SHIFT + 1)) + 0); \
01095 r.W(1) = s->W((base << (SHIFT + 1)) + 0); \
01096 r.W(2) = d->W((base << (SHIFT + 1)) + 1); \
01097 r.W(3) = s->W((base << (SHIFT + 1)) + 1); \
01098 XMM_ONLY( \
01099 r.W(4) = d->W((base << (SHIFT + 1)) + 2); \
01100 r.W(5) = s->W((base << (SHIFT + 1)) + 2); \
01101 r.W(6) = d->W((base << (SHIFT + 1)) + 3); \
01102 r.W(7) = s->W((base << (SHIFT + 1)) + 3); \
01103 ) \
01104 *d = r; \
01105 } \
01106 \
01107 void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s) \
01108 { \
01109 Reg r; \
01110 \
01111 r.L(0) = d->L((base << SHIFT) + 0); \
01112 r.L(1) = s->L((base << SHIFT) + 0); \
01113 XMM_ONLY( \
01114 r.L(2) = d->L((base << SHIFT) + 1); \
01115 r.L(3) = s->L((base << SHIFT) + 1); \
01116 ) \
01117 *d = r; \
01118 } \
01119 \
01120 XMM_ONLY( \
01121 void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s) \
01122 { \
01123 Reg r; \
01124 \
01125 r.Q(0) = d->Q(base); \
01126 r.Q(1) = s->Q(base); \
01127 *d = r; \
01128 } \
01129 )
01130
01131 UNPCK_OP(l, 0)
01132 UNPCK_OP(h, 1)
01133
01134
01135 #if SHIFT == 0
01136 void helper_pi2fd(MMXReg *d, MMXReg *s)
01137 {
01138 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
01139 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
01140 }
01141
01142 void helper_pi2fw(MMXReg *d, MMXReg *s)
01143 {
01144 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
01145 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
01146 }
01147
01148 void helper_pf2id(MMXReg *d, MMXReg *s)
01149 {
01150 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
01151 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
01152 }
01153
01154 void helper_pf2iw(MMXReg *d, MMXReg *s)
01155 {
01156 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status));
01157 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status));
01158 }
01159
01160 void helper_pfacc(MMXReg *d, MMXReg *s)
01161 {
01162 MMXReg r;
01163 r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
01164 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
01165 *d = r;
01166 }
01167
01168 void helper_pfadd(MMXReg *d, MMXReg *s)
01169 {
01170 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
01171 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
01172 }
01173
01174 void helper_pfcmpeq(MMXReg *d, MMXReg *s)
01175 {
01176 d->MMX_L(0) = float32_eq(d->MMX_S(0), s->MMX_S(0), &env->mmx_status) ? -1 : 0;
01177 d->MMX_L(1) = float32_eq(d->MMX_S(1), s->MMX_S(1), &env->mmx_status) ? -1 : 0;
01178 }
01179
01180 void helper_pfcmpge(MMXReg *d, MMXReg *s)
01181 {
01182 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
01183 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
01184 }
01185
01186 void helper_pfcmpgt(MMXReg *d, MMXReg *s)
01187 {
01188 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
01189 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
01190 }
01191
01192 void helper_pfmax(MMXReg *d, MMXReg *s)
01193 {
01194 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status))
01195 d->MMX_S(0) = s->MMX_S(0);
01196 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status))
01197 d->MMX_S(1) = s->MMX_S(1);
01198 }
01199
01200 void helper_pfmin(MMXReg *d, MMXReg *s)
01201 {
01202 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status))
01203 d->MMX_S(0) = s->MMX_S(0);
01204 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status))
01205 d->MMX_S(1) = s->MMX_S(1);
01206 }
01207
01208 void helper_pfmul(MMXReg *d, MMXReg *s)
01209 {
01210 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
01211 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
01212 }
01213
01214 void helper_pfnacc(MMXReg *d, MMXReg *s)
01215 {
01216 MMXReg r;
01217 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
01218 r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
01219 *d = r;
01220 }
01221
01222 void helper_pfpnacc(MMXReg *d, MMXReg *s)
01223 {
01224 MMXReg r;
01225 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
01226 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
01227 *d = r;
01228 }
01229
01230 void helper_pfrcp(MMXReg *d, MMXReg *s)
01231 {
01232 d->MMX_S(0) = approx_rcp(s->MMX_S(0));
01233 d->MMX_S(1) = d->MMX_S(0);
01234 }
01235
01236 void helper_pfrsqrt(MMXReg *d, MMXReg *s)
01237 {
01238 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
01239 d->MMX_S(1) = approx_rsqrt(d->MMX_S(1));
01240 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
01241 d->MMX_L(0) = d->MMX_L(1);
01242 }
01243
01244 void helper_pfsub(MMXReg *d, MMXReg *s)
01245 {
01246 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
01247 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
01248 }
01249
01250 void helper_pfsubr(MMXReg *d, MMXReg *s)
01251 {
01252 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
01253 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
01254 }
01255
01256 void helper_pswapd(MMXReg *d, MMXReg *s)
01257 {
01258 MMXReg r;
01259 r.MMX_L(0) = s->MMX_L(1);
01260 r.MMX_L(1) = s->MMX_L(0);
01261 *d = r;
01262 }
01263 #endif
01264
01265
01266 void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s)
01267 {
01268 int i;
01269 Reg r;
01270
01271 for (i = 0; i < (8 << SHIFT); i++)
01272 r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
01273
01274 *d = r;
01275 }
01276
01277 void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s)
01278 {
01279 d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
01280 d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
01281 XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
01282 XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
01283 d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
01284 d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
01285 XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
01286 XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
01287 }
01288
01289 void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s)
01290 {
01291 d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
01292 XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
01293 d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
01294 XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
01295 }
01296
01297 void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s)
01298 {
01299 d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
01300 d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
01301 XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
01302 XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
01303 d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
01304 d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
01305 XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
01306 XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
01307 }
01308
01309 void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s)
01310 {
01311 d->W(0) = satsw((int8_t)s->B( 0) * (uint8_t)d->B( 0) +
01312 (int8_t)s->B( 1) * (uint8_t)d->B( 1));
01313 d->W(1) = satsw((int8_t)s->B( 2) * (uint8_t)d->B( 2) +
01314 (int8_t)s->B( 3) * (uint8_t)d->B( 3));
01315 d->W(2) = satsw((int8_t)s->B( 4) * (uint8_t)d->B( 4) +
01316 (int8_t)s->B( 5) * (uint8_t)d->B( 5));
01317 d->W(3) = satsw((int8_t)s->B( 6) * (uint8_t)d->B( 6) +
01318 (int8_t)s->B( 7) * (uint8_t)d->B( 7));
01319 #if SHIFT == 1
01320 d->W(4) = satsw((int8_t)s->B( 8) * (uint8_t)d->B( 8) +
01321 (int8_t)s->B( 9) * (uint8_t)d->B( 9));
01322 d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
01323 (int8_t)s->B(11) * (uint8_t)d->B(11));
01324 d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
01325 (int8_t)s->B(13) * (uint8_t)d->B(13));
01326 d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
01327 (int8_t)s->B(15) * (uint8_t)d->B(15));
01328 #endif
01329 }
01330
01331 void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s)
01332 {
01333 d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
01334 d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
01335 XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
01336 XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
01337 d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
01338 d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
01339 XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
01340 XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
01341 }
01342
01343 void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s)
01344 {
01345 d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
01346 XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
01347 d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
01348 XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
01349 }
01350
01351 void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s)
01352 {
01353 d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
01354 d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
01355 XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
01356 XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
01357 d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
01358 d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
01359 XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
01360 XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
01361 }
01362
01363 #define FABSB(_, x) x > INT8_MAX ? -(int8_t ) x : x
01364 #define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x
01365 #define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x
01366 SSE_HELPER_B(helper_pabsb, FABSB)
01367 SSE_HELPER_W(helper_pabsw, FABSW)
01368 SSE_HELPER_L(helper_pabsd, FABSL)
01369
01370 #define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15
01371 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
01372
01373 #define FSIGNB(d, s) s <= INT8_MAX ? s ? d : 0 : -(int8_t ) d
01374 #define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d
01375 #define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d
01376 SSE_HELPER_B(helper_psignb, FSIGNB)
01377 SSE_HELPER_W(helper_psignw, FSIGNW)
01378 SSE_HELPER_L(helper_psignd, FSIGNL)
01379
01380 void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
01381 {
01382 Reg r;
01383
01384
01385 if (shift >= (16 << SHIFT)) {
01386 r.Q(0) = 0;
01387 XMM_ONLY(r.Q(1) = 0);
01388 } else {
01389 shift <<= 3;
01390 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
01391 #if SHIFT == 0
01392 r.Q(0) = SHR(s->Q(0), shift - 0) |
01393 SHR(d->Q(0), shift - 64);
01394 #else
01395 r.Q(0) = SHR(s->Q(0), shift - 0) |
01396 SHR(s->Q(1), shift - 64) |
01397 SHR(d->Q(0), shift - 128) |
01398 SHR(d->Q(1), shift - 192);
01399 r.Q(1) = SHR(s->Q(0), shift + 64) |
01400 SHR(s->Q(1), shift - 0) |
01401 SHR(d->Q(0), shift - 64) |
01402 SHR(d->Q(1), shift - 128);
01403 #endif
01404 #undef SHR
01405 }
01406
01407 *d = r;
01408 }
01409
01410 #define XMM0 env->xmm_regs[0]
01411
01412 #if SHIFT == 1
/*
 * SSE_HELPER_V(name, elem, num, F): define a two-operand helper that
 * applies F(d_elem, s_elem, mask_elem) to each of the 'num' elements,
 * in ascending order, where mask_elem is the matching element of XMM0.
 * 'elem' is the lane accessor (B, W, L or Q).
 */
#define SSE_HELPER_V(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int i;\
\
    for (i = 0; i < num; i++) {\
        d->elem(i) = F(d->elem(i), s->elem(i), XMM0.elem(i));\
    }\
}
01439
/*
 * SSE_HELPER_I(name, elem, num, F): define a helper taking an immediate
 * that applies F(d_elem, s_elem, bit) to each of the 'num' elements, in
 * ascending order, where bit is bit number i of 'imm' for element i.
 * 'elem' is the lane accessor (B, W, L or Q).
 */
#define SSE_HELPER_I(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
{\
    int i;\
\
    for (i = 0; i < num; i++) {\
        d->elem(i) = F(d->elem(i), s->elem(i), ((imm >> i) & 1));\
    }\
}
01466
01467
01468 #define FBLENDVB(d, s, m) (m & 0x80) ? s : d
01469 #define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d
01470 #define FBLENDVPD(d, s, m) (m & 0x8000000000000000LL) ? s : d
01471 SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
01472 SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
01473 SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
01474
01475 void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s)
01476 {
01477 uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1));
01478 uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
01479
01480 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
01481 }
01482
01483 #define SSE_HELPER_F(name, elem, num, F)\
01484 void glue(name, SUFFIX) (Reg *d, Reg *s)\
01485 {\
01486 d->elem(0) = F(0);\
01487 d->elem(1) = F(1);\
01488 if (num > 2) {\
01489 d->elem(2) = F(2);\
01490 d->elem(3) = F(3);\
01491 if (num > 4) {\
01492 d->elem(4) = F(4);\
01493 d->elem(5) = F(5);\
01494 d->elem(6) = F(6);\
01495 d->elem(7) = F(7);\
01496 }\
01497 }\
01498 }
01499
01500 SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
01501 SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
01502 SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
01503 SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
01504 SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
01505 SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
01506 SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
01507 SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
01508 SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
01509 SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
01510 SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
01511 SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
01512
01513 void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s)
01514 {
01515 d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0);
01516 d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2);
01517 }
01518
01519 #define FCMPEQQ(d, s) d == s ? -1 : 0
01520 SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
01521
01522 void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s)
01523 {
01524 d->W(0) = satuw((int32_t) d->L(0));
01525 d->W(1) = satuw((int32_t) d->L(1));
01526 d->W(2) = satuw((int32_t) d->L(2));
01527 d->W(3) = satuw((int32_t) d->L(3));
01528 d->W(4) = satuw((int32_t) s->L(0));
01529 d->W(5) = satuw((int32_t) s->L(1));
01530 d->W(6) = satuw((int32_t) s->L(2));
01531 d->W(7) = satuw((int32_t) s->L(3));
01532 }
01533
01534 #define FMINSB(d, s) MIN((int8_t) d, (int8_t) s)
01535 #define FMINSD(d, s) MIN((int32_t) d, (int32_t) s)
01536 #define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s)
01537 #define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s)
01538 SSE_HELPER_B(helper_pminsb, FMINSB)
01539 SSE_HELPER_L(helper_pminsd, FMINSD)
01540 SSE_HELPER_W(helper_pminuw, MIN)
01541 SSE_HELPER_L(helper_pminud, MIN)
01542 SSE_HELPER_B(helper_pmaxsb, FMAXSB)
01543 SSE_HELPER_L(helper_pmaxsd, FMAXSD)
01544 SSE_HELPER_W(helper_pmaxuw, MAX)
01545 SSE_HELPER_L(helper_pmaxud, MAX)
01546
01547 #define FMULLD(d, s) (int32_t) d * (int32_t) s
01548 SSE_HELPER_L(helper_pmulld, FMULLD)
01549
01550 void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s)
01551 {
01552 int idx = 0;
01553
01554 if (s->W(1) < s->W(idx))
01555 idx = 1;
01556 if (s->W(2) < s->W(idx))
01557 idx = 2;
01558 if (s->W(3) < s->W(idx))
01559 idx = 3;
01560 if (s->W(4) < s->W(idx))
01561 idx = 4;
01562 if (s->W(5) < s->W(idx))
01563 idx = 5;
01564 if (s->W(6) < s->W(idx))
01565 idx = 6;
01566 if (s->W(7) < s->W(idx))
01567 idx = 7;
01568
01569 d->Q(1) = 0;
01570 d->L(1) = 0;
01571 d->W(1) = idx;
01572 d->W(0) = s->W(idx);
01573 }
01574
01575 void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
01576 {
01577 signed char prev_rounding_mode;
01578
01579 prev_rounding_mode = env->sse_status.float_rounding_mode;
01580 if (!(mode & (1 << 2)))
01581 switch (mode & 3) {
01582 case 0:
01583 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
01584 break;
01585 case 1:
01586 set_float_rounding_mode(float_round_down, &env->sse_status);
01587 break;
01588 case 2:
01589 set_float_rounding_mode(float_round_up, &env->sse_status);
01590 break;
01591 case 3:
01592 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
01593 break;
01594 }
01595
01596 d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
01597 d->L(1) = float64_round_to_int(s->L(1), &env->sse_status);
01598 d->L(2) = float64_round_to_int(s->L(2), &env->sse_status);
01599 d->L(3) = float64_round_to_int(s->L(3), &env->sse_status);
01600
01601 #if 0
01602 if (mode & (1 << 3))
01603 set_float_exception_flags(
01604 get_float_exception_flags(&env->sse_status) &
01605 ~float_flag_inexact,
01606 &env->sse_status);
01607 #endif
01608 env->sse_status.float_rounding_mode = prev_rounding_mode;
01609 }
01610
01611 void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
01612 {
01613 signed char prev_rounding_mode;
01614
01615 prev_rounding_mode = env->sse_status.float_rounding_mode;
01616 if (!(mode & (1 << 2)))
01617 switch (mode & 3) {
01618 case 0:
01619 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
01620 break;
01621 case 1:
01622 set_float_rounding_mode(float_round_down, &env->sse_status);
01623 break;
01624 case 2:
01625 set_float_rounding_mode(float_round_up, &env->sse_status);
01626 break;
01627 case 3:
01628 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
01629 break;
01630 }
01631
01632 d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
01633 d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status);
01634
01635 #if 0
01636 if (mode & (1 << 3))
01637 set_float_exception_flags(
01638 get_float_exception_flags(&env->sse_status) &
01639 ~float_flag_inexact,
01640 &env->sse_status);
01641 #endif
01642 env->sse_status.float_rounding_mode = prev_rounding_mode;
01643 }
01644
01645 void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
01646 {
01647 signed char prev_rounding_mode;
01648
01649 prev_rounding_mode = env->sse_status.float_rounding_mode;
01650 if (!(mode & (1 << 2)))
01651 switch (mode & 3) {
01652 case 0:
01653 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
01654 break;
01655 case 1:
01656 set_float_rounding_mode(float_round_down, &env->sse_status);
01657 break;
01658 case 2:
01659 set_float_rounding_mode(float_round_up, &env->sse_status);
01660 break;
01661 case 3:
01662 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
01663 break;
01664 }
01665
01666 d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
01667
01668 #if 0
01669 if (mode & (1 << 3))
01670 set_float_exception_flags(
01671 get_float_exception_flags(&env->sse_status) &
01672 ~float_flag_inexact,
01673 &env->sse_status);
01674 #endif
01675 env->sse_status.float_rounding_mode = prev_rounding_mode;
01676 }
01677
01678 void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
01679 {
01680 signed char prev_rounding_mode;
01681
01682 prev_rounding_mode = env->sse_status.float_rounding_mode;
01683 if (!(mode & (1 << 2)))
01684 switch (mode & 3) {
01685 case 0:
01686 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
01687 break;
01688 case 1:
01689 set_float_rounding_mode(float_round_down, &env->sse_status);
01690 break;
01691 case 2:
01692 set_float_rounding_mode(float_round_up, &env->sse_status);
01693 break;
01694 case 3:
01695 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
01696 break;
01697 }
01698
01699 d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
01700
01701 #if 0
01702 if (mode & (1 << 3))
01703 set_float_exception_flags(
01704 get_float_exception_flags(&env->sse_status) &
01705 ~float_flag_inexact,
01706 &env->sse_status);
01707 #endif
01708 env->sse_status.float_rounding_mode = prev_rounding_mode;
01709 }
01710
01711 #define FBLENDP(d, s, m) m ? s : d
01712 SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
01713 SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
01714 SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
01715
01716 void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
01717 {
01718 float32 iresult = 0 ;
01719
01720 if (mask & (1 << 4))
01721 iresult = float32_add(iresult,
01722 float32_mul(d->L(0), s->L(0), &env->sse_status),
01723 &env->sse_status);
01724 if (mask & (1 << 5))
01725 iresult = float32_add(iresult,
01726 float32_mul(d->L(1), s->L(1), &env->sse_status),
01727 &env->sse_status);
01728 if (mask & (1 << 6))
01729 iresult = float32_add(iresult,
01730 float32_mul(d->L(2), s->L(2), &env->sse_status),
01731 &env->sse_status);
01732 if (mask & (1 << 7))
01733 iresult = float32_add(iresult,
01734 float32_mul(d->L(3), s->L(3), &env->sse_status),
01735 &env->sse_status);
01736 d->L(0) = (mask & (1 << 0)) ? iresult : 0 ;
01737 d->L(1) = (mask & (1 << 1)) ? iresult : 0 ;
01738 d->L(2) = (mask & (1 << 2)) ? iresult : 0 ;
01739 d->L(3) = (mask & (1 << 3)) ? iresult : 0 ;
01740 }
01741
01742 void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
01743 {
01744 float64 iresult = 0 ;
01745
01746 if (mask & (1 << 4))
01747 iresult = float64_add(iresult,
01748 float64_mul(d->Q(0), s->Q(0), &env->sse_status),
01749 &env->sse_status);
01750 if (mask & (1 << 5))
01751 iresult = float64_add(iresult,
01752 float64_mul(d->Q(1), s->Q(1), &env->sse_status),
01753 &env->sse_status);
01754 d->Q(0) = (mask & (1 << 0)) ? iresult : 0 ;
01755 d->Q(1) = (mask & (1 << 1)) ? iresult : 0 ;
01756 }
01757
01758 void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset)
01759 {
01760 int s0 = (offset & 3) << 2;
01761 int d0 = (offset & 4) << 0;
01762 int i;
01763 Reg r;
01764
01765 for (i = 0; i < 8; i++, d0++) {
01766 r.W(i) = 0;
01767 r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
01768 r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
01769 r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
01770 r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
01771 }
01772
01773 *d = r;
01774 }
01775
01776
01777
01778 #define FCMPGTQ(d, s) d > s ? -1 : 0
01779 SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
01780
01781 static inline int pcmp_elen(int reg, uint32_t ctrl)
01782 {
01783 int val;
01784
01785
01786 if (ctrl >> 8)
01787 val = abs1((int64_t) env->regs[reg]);
01788 else
01789 val = abs1((int32_t) env->regs[reg]);
01790
01791 if (ctrl & 1) {
01792 if (val > 8)
01793 return 8;
01794 } else
01795 if (val > 16)
01796 return 16;
01797
01798 return val;
01799 }
01800
01801 static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
01802 {
01803 int val = 0;
01804
01805 if (ctrl & 1) {
01806 while (val < 8 && r->W(val))
01807 val++;
01808 } else
01809 while (val < 16 && r->B(val))
01810 val++;
01811
01812 return val;
01813 }
01814
01815 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
01816 {
01817 switch ((ctrl >> 0) & 3) {
01818 case 0:
01819 return r->B(i);
01820 case 1:
01821 return r->W(i);
01822 case 2:
01823 return (int8_t) r->B(i);
01824 case 3:
01825 default:
01826 return (int16_t) r->W(i);
01827 }
01828 }
01829
01830 static inline unsigned pcmpxstrx(Reg *d, Reg *s,
01831 int8_t ctrl, int valids, int validd)
01832 {
01833 unsigned int res = 0;
01834 int v;
01835 int j, i;
01836 int upper = (ctrl & 1) ? 7 : 15;
01837
01838 valids--;
01839 validd--;
01840
01841 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
01842
01843 switch ((ctrl >> 2) & 3) {
01844 case 0:
01845 for (j = valids; j >= 0; j--) {
01846 res <<= 1;
01847 v = pcmp_val(s, ctrl, j);
01848 for (i = validd; i >= 0; i--)
01849 res |= (v == pcmp_val(d, ctrl, i));
01850 }
01851 break;
01852 case 1:
01853 for (j = valids; j >= 0; j--) {
01854 res <<= 1;
01855 v = pcmp_val(s, ctrl, j);
01856 for (i = ((validd - 1) | 1); i >= 0; i -= 2)
01857 res |= (pcmp_val(d, ctrl, i - 0) <= v &&
01858 pcmp_val(d, ctrl, i - 1) >= v);
01859 }
01860 break;
01861 case 2:
01862 res = (2 << (upper - MAX(valids, validd))) - 1;
01863 res <<= MAX(valids, validd) - MIN(valids, validd);
01864 for (i = MIN(valids, validd); i >= 0; i--) {
01865 res <<= 1;
01866 v = pcmp_val(s, ctrl, i);
01867 res |= (v == pcmp_val(d, ctrl, i));
01868 }
01869 break;
01870 case 3:
01871 for (j = valids - validd; j >= 0; j--) {
01872 res <<= 1;
01873 res |= 1;
01874 for (i = MIN(upper - j, validd); i >= 0; i--)
01875 res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
01876 }
01877 break;
01878 }
01879
01880 switch ((ctrl >> 4) & 3) {
01881 case 1:
01882 res ^= (2 << upper) - 1;
01883 break;
01884 case 3:
01885 res ^= (2 << valids) - 1;
01886 break;
01887 }
01888
01889 if (res)
01890 CC_SRC |= CC_C;
01891 if (res & 1)
01892 CC_SRC |= CC_O;
01893
01894 return res;
01895 }
01896
/*
 * Return the 1-based position of the highest set bit of val, found by
 * binary search over halving shift distances.  Returns 1 for val == 0.
 */
static inline int rffs1(unsigned int val)
{
    int pos = 1;
    int step = sizeof(val) * 4;

    while (step) {
        if (val >> step) {
            /* A bit is set in the upper part: continue the search there. */
            val >>= step;
            pos += step;
        }
        step /= 2;
    }

    return pos;
}
01909
/*
 * Return the 1-based position of the lowest set bit of val (1 for bit 0,
 * 32 for bit 31 of a 32-bit unsigned int).
 *
 * Binary search over halving distances: whenever the low 'hi' bits are
 * all zero, the lowest set bit lies above them, so they are shifted out
 * and counted.  Testing "val << hi" instead would ask whether anything
 * exists below the top 'hi' bits and yields 32 - ctz(val), e.g. 32 for
 * val == 1.
 *
 * val must be non-zero for the result to be meaningful (the callers only
 * use it on a non-zero mask); for val == 0 the function returns the bit
 * width.
 */
static inline int ffs1(unsigned int val)
{
    int ret = 1, hi;

    for (hi = sizeof(val) * 4; hi; hi /= 2) {
        if (!(val & ((1u << hi) - 1))) {
            val >>= hi;
            ret += hi;
        }
    }

    return ret;
}
01922
01923 void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
01924 {
01925 unsigned int res = pcmpxstrx(d, s, ctrl,
01926 pcmp_elen(R_EDX, ctrl),
01927 pcmp_elen(R_EAX, ctrl));
01928
01929 if (res)
01930 env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
01931 else
01932 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
01933 }
01934
01935 void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
01936 {
01937 int i;
01938 unsigned int res = pcmpxstrx(d, s, ctrl,
01939 pcmp_elen(R_EDX, ctrl),
01940 pcmp_elen(R_EAX, ctrl));
01941
01942 if ((ctrl >> 6) & 1) {
01943 if (ctrl & 1)
01944 for (i = 0; i <= 8; i--, res >>= 1)
01945 d->W(i) = (res & 1) ? ~0 : 0;
01946 else
01947 for (i = 0; i <= 16; i--, res >>= 1)
01948 d->B(i) = (res & 1) ? ~0 : 0;
01949 } else {
01950 d->Q(1) = 0;
01951 d->Q(0) = res;
01952 }
01953 }
01954
01955 void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
01956 {
01957 unsigned int res = pcmpxstrx(d, s, ctrl,
01958 pcmp_ilen(s, ctrl),
01959 pcmp_ilen(d, ctrl));
01960
01961 if (res)
01962 env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
01963 else
01964 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
01965 }
01966
01967 void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
01968 {
01969 int i;
01970 unsigned int res = pcmpxstrx(d, s, ctrl,
01971 pcmp_ilen(s, ctrl),
01972 pcmp_ilen(d, ctrl));
01973
01974 if ((ctrl >> 6) & 1) {
01975 if (ctrl & 1)
01976 for (i = 0; i <= 8; i--, res >>= 1)
01977 d->W(i) = (res & 1) ? ~0 : 0;
01978 else
01979 for (i = 0; i <= 16; i--, res >>= 1)
01980 d->B(i) = (res & 1) ? ~0 : 0;
01981 } else {
01982 d->Q(1) = 0;
01983 d->Q(0) = res;
01984 }
01985 }
01986
01987 #define CRCPOLY 0x1edc6f41
01988 #define CRCPOLY_BITREV 0x82f63b78
01989 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
01990 {
01991 target_ulong crc = (msg & ((target_ulong) -1 >>
01992 (TARGET_LONG_BITS - len))) ^ crc1;
01993
01994 while (len--)
01995 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
01996
01997 return crc;
01998 }
01999
02000 #define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
02001 #define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i))
02002 target_ulong helper_popcnt(target_ulong n, uint32_t type)
02003 {
02004 CC_SRC = n ? 0 : CC_Z;
02005
02006 n = POPCOUNT(n, 0);
02007 n = POPCOUNT(n, 1);
02008 n = POPCOUNT(n, 2);
02009 n = POPCOUNT(n, 3);
02010 if (type == 1)
02011 return n & 0xff;
02012
02013 n = POPCOUNT(n, 4);
02014 #ifndef TARGET_X86_64
02015 return n;
02016 #else
02017 if (type == 2)
02018 return n & 0xff;
02019
02020 return POPCOUNT(n, 5);
02021 #endif
02022 }
02023 #endif
02024
02025 #undef SHIFT
02026 #undef XMM_ONLY
02027 #undef Reg
02028 #undef B
02029 #undef W
02030 #undef L
02031 #undef Q
02032 #undef SUFFIX