C-- pipeline/NCG fails to optimize simple repeated addition
While debugging #14346 (closed) I noticed some rather abhorrent code in the disassembly of the `newPinnedByteArray#` primop:
Dump of assembler code for function stg_newPinnedByteArrayzh:
0x00000000004a8518 <+0>: mov 0x378(%r13),%rax
0x00000000004a851f <+7>: cmpq $0x0,0x10(%rax)
0x00000000004a8524 <+12>: je 0x4a8593 <stg_newPinnedByteArrayzh+123>
0x00000000004a8526 <+14>: mov 0x4f5730,%rax
0x00000000004a852e <+22>: mov 0x38(%rax),%rax
0x00000000004a8532 <+26>: cmp 0x4f5718,%rax
0x00000000004a853a <+34>: jae 0x4a8593 <stg_newPinnedByteArrayzh+123>
0x00000000004a853c <+36>: mov %rbx,%rax
0x00000000004a853f <+39>: lea 0x7(%rax),%rcx
0x00000000004a8543 <+43>: shr $0x3,%rcx
0x00000000004a8547 <+47>: add $0x10,%rax <--- starts here
0x00000000004a854b <+51>: add $0xf,%rax
0x00000000004a854f <+55>: add $0x7,%rax
0x00000000004a8553 <+59>: shr $0x3,%rax
0x00000000004a8557 <+63>: mov $0x49d820,%ecx
...
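That is three successive `add` instructions on the same register (`%rax`), adding the constants 0x10, 0xf and 0x7; surely those should be collapsed into one (a single `add $0x26,%rax`) by the Cmm-to-Cmm pipeline.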