| /* APPLE LOCAL file 4271691 */ |
| #include <xmmintrin.h> |
| #include <emmintrin.h> |
| /* { dg-do compile { target i?86-*-* } } */ |
| /* { dg-options "-O2 -msse2" } */ |
| |
| void Coefs(unsigned char *current_part_ptr, int current_part_stride, unsigned char *ref_part_ptr, int ref_part_stride, unsigned char *coef_buf, int n) { |
| |
| |
| static const unsigned short c_32[8] = {32, 32, 32, 32, 32, 32, 32, 32}; |
| |
| int i; |
| |
| __m128i v_row0_0, v_row0_1; |
| __m128i v_temp_0, v_temp_1; |
| __m128i v_result; |
| |
| __m128i vZero; |
| vZero = _mm_setzero_si128(); |
| __m128i v_32 = _mm_loadu_si128((__m128i*)c_32); |
| |
| __m128i* coef_ptr = (__m128i*) coef_buf; |
| |
| v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr); |
| v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9); |
| v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr+8), 3); |
| ref_part_ptr += ref_part_stride; |
| // row0: 0 1 2 3 4 5 6 7 |
| // row1: 2 3 4 5 6 7 8 9 |
| |
| v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero); |
| v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero); |
| |
| for ( i = 0; i < n; i++ ) |
| { |
| v_row0_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[0]); |
| v_row0_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[1]); |
| |
| v_result = v_32; |
| v_result = _mm_add_epi16(v_result, v_row0_0); |
| v_result = _mm_add_epi16(v_result, v_row0_1); |
| |
| v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr); |
| v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9); |
| v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr+8), 3); |
| ref_part_ptr += ref_part_stride; |
| v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero); |
| v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero); |
| v_temp_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[2]); |
| v_temp_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[3]); |
| |
| v_result = _mm_add_epi16(v_result, v_temp_0); |
| v_result = _mm_add_epi16(v_result, v_temp_1); |
| v_result = _mm_srli_epi16(v_result, 6); |
| |
| _mm_store_si128((__m128i*)(current_part_ptr), v_result); |
| current_part_ptr += current_part_stride; |
| } |
| |
| } |
| |
| /* Should be no reg-reg copies. */ |
| /* { dg-final { scan-assembler-not "movdqa\t%xmm\[0-7\], %xmm\[0-7\]" } } */ |