One of my other hobbies is game development. I remember my family's first PC, a then state-of-the-art IBM/PC Intel 80386. We had a few games for the machine, courtesy of my father. I remember being fascinated with the DOS version of Rogue (you can still find the binary and source for it here!). I played that and other games a lot growing up, and ever since, I've been really interested in computer game development.

I've been working, off and on, on my own game development projects. Most are entirely for fun and learning, and likely no released game will ever come of them. Some time ago I wrote a simple vector/matrix library, since the basis of most game engines is largely a lot of vector and matrix manipulations. Recently I was curious how the compiler would translate these operations. Mainly I wondered if it would use the MMX/SIMD instructions. These have been available widely since 1996, so surely the compiler technology has had time to catch up?

Consider the following code:

// Four-component single-precision vector (x, y, z, w).
class vector4 {

public:
  // Default construction leaves all four components uninitialized.
  vector4() {}
  vector4(float x, float y, float z, float w) : _x(x), _y(y), _z(z), _w(w) {}

  // Stores the 3D cross product vector_1 x vector_2 in this vector's
  // x, y and z components and sets w to 1. The w components of the
  // operands are ignored.
  //
  // Neither operand may be this object: the components are written one at
  // a time as they are computed, so an aliased operand would be read after
  // it has already been partially overwritten.
  void cross_product(const vector4 &vector_1, const vector4 &vector_2)
  {
    _x = (vector_1._y * vector_2._z) - (vector_1._z * vector_2._y);
    _y = (vector_1._z * vector_2._x) - (vector_1._x * vector_2._z);
    _z = (vector_1._x * vector_2._y) - (vector_1._y * vector_2._x);
    _w = 1.0f;
  }


private:
  float _x;
  float _y;
  float _z;
  float _w;
};


Pretty straightforward. The method cross_product will calculate the cross product of the two vectors passed to it. Now take a look at the compiled code (using gcc 4.8.1 on Linux). You'll probably note that this includes argument setup, implying a function call. So before you ask: yes, this is compiled without any optimization flags, but I did check with maximum optimization and the result is basically the same (except that the optimized build inlined all the function calls).


  400b5e:	55                   	push   %rbp
  400b5f:	48 89 e5             	mov    %rsp,%rbp
  400b62:	48 89 7d f8          	mov    %rdi,-0x8(%rbp)
  400b66:	48 89 75 f0          	mov    %rsi,-0x10(%rbp)
  400b6a:	48 89 55 e8          	mov    %rdx,-0x18(%rbp)
  400b6e:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  400b72:	f3 0f 10 48 04       	movss  0x4(%rax),%xmm1
  400b77:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  400b7b:	f3 0f 10 40 08       	movss  0x8(%rax),%xmm0
  400b80:	f3 0f 59 c1          	mulss  %xmm1,%xmm0
  400b84:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  400b88:	f3 0f 10 50 08       	movss  0x8(%rax),%xmm2
  400b8d:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  400b91:	f3 0f 10 48 04       	movss  0x4(%rax),%xmm1
  400b96:	f3 0f 59 ca          	mulss  %xmm2,%xmm1
  400b9a:	f3 0f 5c c1          	subss  %xmm1,%xmm0
  400b9e:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  400ba2:	f3 0f 11 00          	movss  %xmm0,(%rax)
  400ba6:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  400baa:	f3 0f 10 48 08       	movss  0x8(%rax),%xmm1
  400baf:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  400bb3:	f3 0f 10 00          	movss  (%rax),%xmm0
  400bb7:	f3 0f 59 c1          	mulss  %xmm1,%xmm0
  400bbb:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  400bbf:	f3 0f 10 10          	movss  (%rax),%xmm2
  400bc3:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  400bc7:	f3 0f 10 48 08       	movss  0x8(%rax),%xmm1
  400bcc:	f3 0f 59 ca          	mulss  %xmm2,%xmm1
  400bd0:	f3 0f 5c c1          	subss  %xmm1,%xmm0
  400bd4:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  400bd8:	f3 0f 11 40 04       	movss  %xmm0,0x4(%rax)
  400bdd:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  400be1:	f3 0f 10 08          	movss  (%rax),%xmm1
  400be5:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  400be9:	f3 0f 10 40 04       	movss  0x4(%rax),%xmm0
  400bee:	f3 0f 59 c1          	mulss  %xmm1,%xmm0
  400bf2:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  400bf6:	f3 0f 10 50 04       	movss  0x4(%rax),%xmm2
  400bfb:	48 8b 45 e8          	mov    -0x18(%rbp),%rax
  400bff:	f3 0f 10 08          	movss  (%rax),%xmm1
  400c03:	f3 0f 59 ca          	mulss  %xmm2,%xmm1
  400c07:	f3 0f 5c c1          	subss  %xmm1,%xmm0
  400c0b:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  400c0f:	f3 0f 11 40 08       	movss  %xmm0,0x8(%rax)
  400c14:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
  400c18:	8b 05 6e 01 00 00    	mov    0x16e(%rip),%eax        # 400d8c <_IO_stdin_used+0xc>
  400c1e:	89 42 0c             	mov    %eax,0xc(%rdx)
  400c21:	5d                   	pop    %rbp
  400c22:	c3                   	retq


Interesting. It does use the SSE operations, but does not use the packed variants. Consider this: floats in C++ are 32-bit values, and the XMM registers used by SSE are 128 bits wide. A vector contains four 32-bit values. What if you could pack all 4 of those in there and do multiple operations at once? Well, you can, thanks to the mulps operation. The cross product function can be modified like so to force the use of the packed operations, rather than handling the components one at a time. (Apologies in advance for the horrible GAS syntax. I prefer Intel syntax, but I don't think it's possible to force GCC/GAS to use/accept Intel syntax.) We also want to shuffle the floats around as we perform the operations. They are moved into the xmm registers in series (x, y, z, w), but we need them to "line up" properly before we can execute the multiply and subtraction operations.


void cross_product(vector4 &vector_1, vector4 &vector_2)
{
  asm("movq %0, %%rsi;"
	  "movq %1, %%rdi;"
	  "movaps (%%rsi), %%xmm0;"
	  "movaps (%%rdi), %%xmm1;"
	  "movaps %%xmm0, %%xmm2;"
	  "movaps %%xmm1, %%xmm3;"
	  "shufps $0xC9, %%xmm0, %%xmm0;"
	  "shufps $0xD2, %%xmm1, %%xmm1;"
	  "mulps %%xmm1, %%xmm0;"
	  "shufps $0xD2, %%xmm2, %%xmm2;"
	  "shufps $0xC9, %%xmm3, %%xmm3;"
	  "mulps %%xmm3, %%xmm2;"
	  "subps %%xmm2, %%xmm0;"
	  "movq %2, %%rsi;"
	  "movaps %%xmm0, (%%rsi);"
	  : /* none */
	  : "r" (&vector_1), "r" (&vector_2), "r" (this)
	  : "%rsi", "%rdi"
	 );

    _w = 1.0;
}



The generated code has the expected setup, but is clearly less intensive computationally than the GCC produced original.


  400b02:	55                   	push   %rbp
  400b03:	48 89 e5             	mov    %rsp,%rbp
  400b06:	48 89 7d f8          	mov    %rdi,-0x8(%rbp)
  400b0a:	48 89 75 f0          	mov    %rsi,-0x10(%rbp)
  400b0e:	48 89 55 e8          	mov    %rdx,-0x18(%rbp)
  400b12:	48 8b 45 f0          	mov    -0x10(%rbp),%rax
  400b16:	48 8b 55 e8          	mov    -0x18(%rbp),%rdx
  400b1a:	48 8b 4d f8          	mov    -0x8(%rbp),%rcx
  400b1e:	48 89 c6             	mov    %rax,%rsi
  400b21:	48 89 d7             	mov    %rdx,%rdi
  400b24:	0f 28 06             	movaps (%rsi),%xmm0
  400b27:	0f 28 0f             	movaps (%rdi),%xmm1
  400b2a:	0f 28 d0             	movaps %xmm0,%xmm2
  400b2d:	0f 28 d9             	movaps %xmm1,%xmm3
  400b30:	0f c6 c0 c9          	shufps $0xc9,%xmm0,%xmm0
  400b34:	0f c6 c9 d2          	shufps $0xd2,%xmm1,%xmm1
  400b38:	0f 59 c1             	mulps  %xmm1,%xmm0
  400b3b:	0f c6 d2 d2          	shufps $0xd2,%xmm2,%xmm2
  400b3f:	0f c6 db c9          	shufps $0xc9,%xmm3,%xmm3
  400b43:	0f 59 d3             	mulps  %xmm3,%xmm2
  400b46:	0f 5c c2             	subps  %xmm2,%xmm0
  400b49:	48 89 ce             	mov    %rcx,%rsi
  400b4c:	0f 29 06             	movaps %xmm0,(%rsi)
  400b4f:	48 8b 55 f8          	mov    -0x8(%rbp),%rdx
  400b53:	8b 05 33 02 00 00    	mov    0x233(%rip),%eax        # 400d8c <_IO_stdin_used+0xc>
  400b59:	89 42 0c             	mov    %eax,0xc(%rdx)
  400b5c:	5d                   	pop    %rbp
  400b5d:	c3                   	retq


Even with maximum optimizations, GCC will still leave this code as is (it might inline it), so you can force better behavior, when necessary.