lib/Target/X86/README-MMX.txt - llvm - Git at Google

 //===---------------------------------------------------------------------===//
 // Random ideas for the X86 backend: MMX-specific stuff.
 //===---------------------------------------------------------------------===//

 //===---------------------------------------------------------------------===//

 This:

 #include <mmintrin.h>

 __v2si qux(int A) {
   return (__v2si){ 0, A };
 }

 is compiled into:

 _qux:
         subl $28, %esp
         movl 32(%esp), %eax
         movd %eax, %mm0
         movq %mm0, (%esp)
         movl (%esp), %eax
         movl %eax, 20(%esp)
         movq %mm0, 8(%esp)
         movl 12(%esp), %eax
         movl %eax, 16(%esp)
         movq 16(%esp), %mm0
         addl $28, %esp
         ret

 Yuck!

 GCC gives us:

 _qux:
         subl    $12, %esp
         movl    16(%esp), %eax
         movl    20(%esp), %edx
         movl    $0, (%eax)
         movl    %edx, 4(%eax)
         addl    $12, %esp
         ret     $4

 //===---------------------------------------------------------------------===//

 We generate crappy code for this (32-bit output shown below the source):

 __m64 t() {
   return _mm_cvtsi32_si64(1);
 }

 _t:
 	subl	$12, %esp
 	movl	$1, %eax
 	movd	%eax, %mm0
 	movq	%mm0, (%esp)
 	movl	(%esp), %eax
 	movl	4(%esp), %edx
 	addl	$12, %esp
 	ret

 The extra stack traffic is covered in the previous entry. The other problem
 is that we are not smart about materializing constants in MMX registers. With
 -m64, we generate:

 	movl	$1, %eax
 	movd	%eax, %mm0
 	movd	%mm0, %rax
 	ret

 We should be using a constant pool load instead:
 	movq	LC0(%rip), %rax
	//===---------------------------------------------------------------------===//
	// Random ideas for the X86 backend: MMX-specific stuff.
	//===---------------------------------------------------------------------===//

	//===---------------------------------------------------------------------===//

	This:

	#include <mmintrin.h>

	__v2si qux(int A) {
	return (__v2si){ 0, A };
	}

	is compiled into:

	_qux:
	subl $28, %esp
	movl 32(%esp), %eax
	movd %eax, %mm0
	movq %mm0, (%esp)
	movl (%esp), %eax
	movl %eax, 20(%esp)
	movq %mm0, 8(%esp)
	movl 12(%esp), %eax
	movl %eax, 16(%esp)
	movq 16(%esp), %mm0
	addl $28, %esp
	ret

	Yuck!

	GCC gives us:

	_qux:
	subl $12, %esp
	movl 16(%esp), %eax
	movl 20(%esp), %edx
	movl $0, (%eax)
	movl %edx, 4(%eax)
	addl $12, %esp
	ret $4

	//===---------------------------------------------------------------------===//

	We generate crappy code for this (32-bit output shown below the source):

	__m64 t() {
	return _mm_cvtsi32_si64(1);
	}

	_t:
	subl $12, %esp
	movl $1, %eax
	movd %eax, %mm0
	movq %mm0, (%esp)
	movl (%esp), %eax
	movl 4(%esp), %edx
	addl $12, %esp
	ret

	The extra stack traffic is covered in the previous entry. The other problem
	is that we are not smart about materializing constants in MMX registers. With
	-m64, we generate:

	movl $1, %eax
	movd %eax, %mm0
	movd %mm0, %rax
	ret

	We should be using a constant pool load instead:
	movq LC0(%rip), %rax