| /* APPLE LOCAL file 5612787 mainline sse4 */ |
| /* { dg-do run { target i?86-*-* x86_64-*-* } } */ |
| /* { dg-require-effective-target sse4 } */ |
| /* { dg-options "-O2 -msse4.1" } */ |
| |
| #include "sse4_1-check.h" |
| |
| #include <smmintrin.h> |
| |
/* Low-nibble masks for the dot-product immediate.  Bit N corresponds
   to result element N; the digits in each name list the elements
   whose bits are set ("N" = none, "A" = all four).  */
#define lmskN 0x0
#define lmsk0 (1 << 0)
#define lmsk1 (1 << 1)
#define lmsk2 (1 << 2)
#define lmsk3 (1 << 3)
#define lmsk01 (lmsk0 | lmsk1)
#define lmsk02 (lmsk0 | lmsk2)
#define lmsk03 (lmsk0 | lmsk3)
#define lmsk12 (lmsk1 | lmsk2)
#define lmsk13 (lmsk1 | lmsk3)
#define lmsk23 (lmsk2 | lmsk3)
#define lmskA (lmsk01 | lmsk23)

/* High-nibble masks: the same patterns as above, moved up four bits.
   Bit (4 + N) corresponds to input element N.  */
#define hmskN (lmskN << 4)
#define hmskA (lmskA << 4)
#define hmsk0 (lmsk0 << 4)
#define hmsk1 (lmsk1 << 4)
#define hmsk2 (lmsk2 << 4)
#define hmsk3 (lmsk3 << 4)
#define hmsk01 (lmsk01 << 4)
#define hmsk02 (lmsk02 << 4)
#define hmsk03 (lmsk03 << 4)
#define hmsk12 (lmsk12 << 4)
#define hmsk13 (lmsk13 << 4)
#define hmsk23 (lmsk23 << 4)

/* High mask used by the test.  A definition supplied before this
   point takes precedence; otherwise all four elements are used.  */
#if !defined (HIMASK)
#define HIMASK hmskA
#endif
| |
| static void |
| sse4_1_test (void) |
| { |
| union |
| { |
| __m128 x; |
| float f[4]; |
| } val1, val2, res[16]; |
| int masks[16]; |
| int i, j; |
| |
| val1.f[0] = 2.; |
| val1.f[1] = 3.; |
| val1.f[2] = 4.; |
| val1.f[3] = 5.; |
| |
| val2.f[0] = 10.; |
| val2.f[1] = 100.; |
| val2.f[2] = 1000.; |
| val2.f[3] = 10000.; |
| |
| res[0].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk0); |
| res[1].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk1); |
| res[2].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk2); |
| res[3].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk3); |
| res[4].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk01); |
| res[5].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk02); |
| res[6].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk03); |
| res[7].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk12); |
| res[8].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk13); |
| res[9].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk23); |
| res[10].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk0)); |
| res[11].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk1)); |
| res[12].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk2)); |
| res[13].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk3)); |
| res[14].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskN); |
| res[15].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskA); |
| |
| masks[0] = HIMASK | lmsk0; |
| masks[1] = HIMASK | lmsk1; |
| masks[2] = HIMASK | lmsk2; |
| masks[3] = HIMASK | lmsk3; |
| masks[4] = HIMASK | lmsk01; |
| masks[5] = HIMASK | lmsk02; |
| masks[6] = HIMASK | lmsk03; |
| masks[7] = HIMASK | lmsk12; |
| masks[8] = HIMASK | lmsk13; |
| masks[9] = HIMASK | lmsk23; |
| masks[10] = HIMASK | (0x0F & ~lmsk0); |
| masks[11] = HIMASK | (0x0F & ~lmsk1); |
| masks[12] = HIMASK | (0x0F & ~lmsk2); |
| masks[13] = HIMASK | (0x0F & ~lmsk3); |
| masks[14] = HIMASK | lmskN; |
| masks[15] = HIMASK | lmskA; |
| |
| for (i = 0; i <= 15; i++) |
| { |
| float tmp = 0.; |
| |
| for (j = 0; j < 4; j++) |
| if ((HIMASK & (0x10 << j))) |
| tmp += val1.f[j] * val2.f[j]; |
| |
| for (j = 0; j < 4; j++) |
| if ((masks[i] & (1 << j)) && res[i].f[j] != tmp) |
| abort (); |
| } |
| } |