Friday, September 11, 2009

Benefits of flatter/more balanced expression trees

I am (partly) rewriting my magnum opus. This time, I have overloaded the operators of a simple class wrapping the __m128i datatype. This allows me to generate much flatter expression trees, and boy, the code generated is fantastic. I have never seen such awesome assembly code generated by gcc from my code.

Here is the assembly dump for the brave and/or the foolish :)

0000000000400ba0 <_z11addressegenpkspkis0_s0_s0_s0_ssss>:
400ba0: 66 44 0f 6e 5c 24 08 movd 0x8(%rsp),%xmm11
400ba7: 44 0f bf 54 24 20 movswl 0x20(%rsp),%r10d
400bad: 66 0f 6e 74 24 10 movd 0x10(%rsp),%xmm6
400bb3: 4c 8b 1d 06 18 20 00 mov 0x201806(%rip),%r11 # 6023c0
400bba: 66 45 0f 61 db punpcklwd %xmm11,%xmm11
400bbf: 66 0f 61 f6 punpcklwd %xmm6,%xmm6
400bc3: 66 44 0f 6e 74 24 18 movd 0x18(%rsp),%xmm14
400bca: 41 f7 da neg %r10d
400bcd: 66 45 0f ef ff pxor %xmm15,%xmm15
400bd2: 66 41 0f 70 e3 00 pshufd $0x0,%xmm11,%xmm4
400bd8: 66 0f 70 de 00 pshufd $0x0,%xmm6,%xmm3
400bdd: 66 44 0f 6e 6c 24 20 movd 0x20(%rsp),%xmm13
400be4: 66 45 0f 61 f6 punpcklwd %xmm14,%xmm14
400be9: 66 41 0f fd 63 40 paddw 0x40(%r11),%xmm4
400bef: 66 45 0f 61 ed punpcklwd %xmm13,%xmm13
400bf4: 66 44 0f 6f d4 movdqa %xmm4,%xmm10
400bf9: 66 41 0f fd 5b 50 paddw 0x50(%r11),%xmm3
400bff: 44 89 54 24 d4 mov %r10d,-0x2c(%rsp)
400c04: 66 44 0f 6f cb movdqa %xmm3,%xmm9
400c09: 66 0f 6e 44 24 d4 movd -0x2c(%rsp),%xmm0
400c0f: 66 41 0f 70 ee 00 pshufd $0x0,%xmm14,%xmm5
400c15: 66 41 0f 6f 53 70 movdqa 0x70(%r11),%xmm2
400c1b: f3 44 0f 10 f8 movss %xmm0,%xmm15
400c20: 66 45 0f 70 e5 00 pshufd $0x0,%xmm13,%xmm12
400c26: 66 44 0f db ca pand %xmm2,%xmm9
400c2b: 66 45 0f 61 ff punpcklwd %xmm15,%xmm15
400c30: 66 44 0f 6f f3 movdqa %xmm3,%xmm14
400c35: 66 44 0f db d2 pand %xmm2,%xmm10
400c3a: 66 41 0f fd ec paddw %xmm12,%xmm5
400c3f: 66 41 0f 71 f1 02 psllw $0x2,%xmm9
400c45: 66 44 0f 6f e4 movdqa %xmm4,%xmm12
400c4a: 66 41 0f 71 d6 0f psrlw $0xf,%xmm14
400c50: 66 41 0f 70 ff 00 pshufd $0x0,%xmm15,%xmm7
400c56: 66 0f 6f f3 movdqa %xmm3,%xmm6
400c5a: 66 41 0f 71 f2 04 psllw $0x4,%xmm10
400c60: 66 45 0f eb d1 por %xmm9,%xmm10
400c65: 66 41 0f 71 d4 0f psrlw $0xf,%xmm12
400c6b: 66 41 0f ef 7b 60 pxor 0x60(%r11),%xmm7
400c71: 66 0f fd ef paddw %xmm7,%xmm5
400c75: 66 44 0f 6f c5 movdqa %xmm5,%xmm8
400c7a: 66 44 0f 6f ed movdqa %xmm5,%xmm13
400c7f: 66 0f 6f fd movdqa %xmm5,%xmm7
400c83: 66 44 0f db c2 pand %xmm2,%xmm8
400c88: 66 41 0f 71 d5 0f psrlw $0xf,%xmm13
400c8e: 66 0f 6f d5 movdqa %xmm5,%xmm2
400c92: 66 45 0f eb d0 por %xmm8,%xmm10
400c97: 66 0f 71 d2 04 psrlw $0x4,%xmm2
400c9c: 66 44 0f 6f c6 movdqa %xmm6,%xmm8
400ca1: 44 0f 29 17 movaps %xmm10,(%rdi)
400ca5: 0f 29 22 movaps %xmm4,(%rdx)
400ca8: 0f 29 19 movaps %xmm3,(%rcx)
400cab: 41 0f 29 28 movaps %xmm5,(%r8)
400caf: 48 8b 05 0a 17 20 00 mov 0x20170a(%rip),%rax # 6023c0
400cb6: 66 0f 6f 80 a0 00 00 movdqa 0xa0(%rax),%xmm0
400cbd: 00
400cbe: 66 44 0f 6f b8 90 00 movdqa 0x90(%rax),%xmm15
400cc5: 00 00
400cc7: 66 0f f9 c3 psubw %xmm3,%xmm0
400ccb: 66 0f 71 d0 0f psrlw $0xf,%xmm0
400cd0: 66 44 0f db f0 pand %xmm0,%xmm14
400cd5: 66 0f 6f c2 movdqa %xmm2,%xmm0
400cd9: 66 44 0f f9 fc psubw %xmm4,%xmm15
400cde: 66 41 0f 71 d7 0f psrlw $0xf,%xmm15
400ce4: 66 45 0f db e7 pand %xmm15,%xmm12
400ce9: 66 0f 6f 88 b0 00 00 movdqa 0xb0(%rax),%xmm1
400cf0: 00
400cf1: 66 0f 71 f0 06 psllw $0x6,%xmm0
400cf6: 66 0f f9 cd psubw %xmm5,%xmm1
400cfa: 66 45 0f db e6 pand %xmm14,%xmm12
400cff: 66 0f 71 d1 0f psrlw $0xf,%xmm1
400d04: 66 44 0f db e9 pand %xmm1,%xmm13
400d09: 66 0f 6f ec movdqa %xmm4,%xmm5
400d0d: 66 0f ef c9 pxor %xmm1,%xmm1
400d11: 66 0f 71 d5 04 psrlw $0x4,%xmm5
400d16: 66 45 0f db e5 pand %xmm13,%xmm12
400d1b: 66 0f 61 e9 punpcklwd %xmm1,%xmm5
400d1f: 66 0f 61 c1 punpcklwd %xmm1,%xmm0
400d23: 66 44 0f eb a0 80 00 por 0x80(%rax),%xmm12
400d2a: 00 00
400d2c: 45 0f 29 21 movaps %xmm12,(%r9)
400d30: 4c 8b 1d 89 16 20 00 mov 0x201689(%rip),%r11 # 6023c0
400d37: 66 45 0f 6f 8b c0 00 movdqa 0xc0(%r11),%xmm9
400d3e: 00 00
400d40: 66 41 0f db f9 pand %xmm9,%xmm7
400d45: 66 45 0f db c1 pand %xmm9,%xmm8
400d4a: 66 44 0f db cc pand %xmm4,%xmm9
400d4f: 66 0f 6f e3 movdqa %xmm3,%xmm4
400d53: 66 0f 71 d7 02 psrlw $0x2,%xmm7
400d58: 66 0f 71 d4 04 psrlw $0x4,%xmm4
400d5d: 66 0f 61 e1 punpcklwd %xmm1,%xmm4
400d61: 66 41 0f 71 f1 02 psllw $0x2,%xmm9
400d67: 66 45 0f eb c1 por %xmm9,%xmm8
400d6c: 66 44 0f eb c7 por %xmm7,%xmm8
400d71: 66 44 0f 61 c1 punpcklwd %xmm1,%xmm8
400d76: 44 0f 29 06 movaps %xmm8,(%rsi)
400d7a: 4c 8b 1d 3f 16 20 00 mov 0x20163f(%rip),%r11 # 6023c0
400d81: 0f 29 6c 24 e8 movaps %xmm5,-0x18(%rsp)
400d86: 41 8b bb d0 00 00 00 mov 0xd0(%r11),%edi
400d8d: 0f af 7c 24 e8 imul -0x18(%rsp),%edi
400d92: 01 3e add %edi,(%rsi)
400d94: 48 8d 7e 04 lea 0x4(%rsi),%rdi
400d98: 41 8b 8b d4 00 00 00 mov 0xd4(%r11),%ecx
400d9f: 0f af 4c 24 ec imul -0x14(%rsp),%ecx
400da4: 01 0f add %ecx,(%rdi)
400da6: 41 8b 93 d8 00 00 00 mov 0xd8(%r11),%edx
400dad: 48 8d 4e 08 lea 0x8(%rsi),%rcx
400db1: 0f af 54 24 f0 imul -0x10(%rsp),%edx
400db6: 0f 29 44 24 d8 movaps %xmm0,-0x28(%rsp)
400dbb: 01 11 add %edx,(%rcx)
400dbd: 48 8d 56 0c lea 0xc(%rsi),%rdx
400dc1: 45 8b 93 dc 00 00 00 mov 0xdc(%r11),%r10d
400dc8: 44 0f af 54 24 f4 imul -0xc(%rsp),%r10d
400dce: 44 01 12 add %r10d,(%rdx)
400dd1: 0f 29 64 24 e8 movaps %xmm4,-0x18(%rsp)
400dd6: 45 8b 8b e0 00 00 00 mov 0xe0(%r11),%r9d
400ddd: 44 0f af 4c 24 e8 imul -0x18(%rsp),%r9d
400de3: 44 01 0e add %r9d,(%rsi)
400de6: 44 8b 4c 24 d8 mov -0x28(%rsp),%r9d
400deb: 45 8b 83 e4 00 00 00 mov 0xe4(%r11),%r8d
400df2: 44 0f af 44 24 ec imul -0x14(%rsp),%r8d
400df8: 44 01 07 add %r8d,(%rdi)
400dfb: 41 8b 83 e8 00 00 00 mov 0xe8(%r11),%eax
400e02: 0f af 44 24 f0 imul -0x10(%rsp),%eax
400e07: 01 01 add %eax,(%rcx)
400e09: 45 8b 93 ec 00 00 00 mov 0xec(%r11),%r10d
400e10: 44 0f af 54 24 f4 imul -0xc(%rsp),%r10d
400e16: 44 01 12 add %r10d,(%rdx)
400e19: 0f 29 44 24 e8 movaps %xmm0,-0x18(%rsp)
400e1e: 44 01 0e add %r9d,(%rsi)
400e21: 44 8b 44 24 ec mov -0x14(%rsp),%r8d
400e26: 44 01 07 add %r8d,(%rdi)
400e29: 8b 74 24 f0 mov -0x10(%rsp),%esi
400e2d: 01 31 add %esi,(%rcx)
400e2f: 8b 44 24 f4 mov -0xc(%rsp),%eax
400e33: 01 02 add %eax,(%rdx)
400e35: c3 retq
400e36: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
400e3d: 00 00 00

To be fair, I am comparing gcc 4.3 to gcc 4.4.

The compiler runs out of steam at around 400d86, but this is still great. I am using Eigen after that point. I have no idea why those operations have not been vectorized; I need to look into that as well.
But still, +1 to this.

EDIT: The vectorization for the code after 400d86 can be fixed if you follow this and this.

No comments: