Poor register allocation and redundant moves when using `foreign import prim`
This may well be the same as #12232 but I didn't want to risk hijacking that one.
I've been looking at the dumped asm from a library where I use `foreign import prim`
to call a simple assembly routine (that detail may be irrelevant). I see many movq
instructions, and when I traced them by hand, many or most of them appeared to be superfluous saves and restores of registers.
Here is a contiguous and mostly self-contained snippet of the assembly, with notes. There are four values I'm interested in (V0, V1, V2 and V3), which I started tracing at the note "registers good for sipRound_s_x2 at this point".
Below I specifically follow V3 through the assembly, marking lines with "*** V3 ***",
and the moves don't seem sensible to my (untrained) eye:
_c7Ns:
movq %r9,%rbx
movq %rax,%r9
movq %r8,%rax ; *** V3 ***
movq %rbx,%r8
movq %rdi,%rbx
movq %rax,%rdi ; *** V3 (back to rdi!) ***
movq %rsi,%rax
movq %rbx,%rsi
movq %r14,%rbx
movq %rax,%r14
movq %rcx,16(%rbp)
addq $16,%rbp
jmp *8(%rbp)
.align 8
.quad 836
.quad 32
block_info:
_c7Nk:
movq 16(%rbp),%rax
movq $8,16(%rbp)
movq 24(%rbp),%rcx
incq %rcx
movq %rcx,24(%rbp)
movq 32(%rbp),%rdx
incq %rdx
movq %rdx,32(%rbp)
xorq 8(%rbp),%rbx
addq $16,%rbp
movl $8,%r8d
xorl %r9d,%r9d
_n7T1:
movq %rax,64(%rsp)
movq %r8,%rax
; --- registers good for sipRound_s_x2 at this point ------
movq %rdi,%r8 ; moving V3 *OUT* of rdi *** V3 ***
movq %rsi,%rdi ; moving V2 *OUT* of rsi (into rdi)
movq %r14,%rsi ; moving V1 *OUT* of r14
movq %rbx,%r14 ; moving V0 *OUT* of rbx
movq 64(%rsp),%rbx
_c7My: ; (NOTE: there are a few jumps to here from other sections not included here)
cmpq 24(%rbx),%rdx
je _c7Ns
_c7Nr:
movq 16(%rbx),%r10
movzbl (%r10,%rdx,1),%r10d
cmpq $1,%rax
jne _c7N9
_c7Nl:
movq $block_info,-16(%rbp)
shlq $8,%r9
orq %r10,%r9
;----- at this point (name: register): V0: r14, V1: rsi, V2: rdi, V3: r8
movq %rdi,%rax ; save rdi (V2), which needs to end up in rsi
movq %r8,%rdi ; prepared rdi(V3) *** V3 ***
xorq %r9,%rdi
movq %rsi,%rcx
movq %rax,%rsi ; prepared rsi(V2)
movq %r14,%rax
movq %rcx,%r14 ; prepared r14(V1)
movq %rbx,%rcx
movq %rax,%rbx ; prepared rbx(V0)
movq %r9,-8(%rbp)
movq %rcx,(%rbp)
addq $-16,%rbp
jmp sipRound_s_x2
.align 8
.quad 1733
.quad 32
block_info:
_c7N2:
movq 24(%rbp),%rax
movq $7,24(%rbp)
movq 32(%rbp),%rcx
incq %rcx
movq %rcx,32(%rbp)
movq 40(%rbp),%rdx
incq %rdx
movq %rdx,40(%rbp)
movq 16(%rbp),%r8
xorq 8(%rbp),%rbx
addq $24,%rbp
movl $7,%r9d
_n7T2:
movq %rax,64(%rsp)
movq %r9,%rax
movq %r8,%r9
movq %rdi,%r8
movq %rsi,%rdi
movq %r14,%rsi
movq %rbx,%r14
movq 64(%rsp),%rbx
jmp _c7My
_c7N9:
cmpq $1,%rax
jbe _c7N3
_c7MW:
decq %rax
movq %rax,(%rbp)
incq %rcx
movq %rcx,8(%rbp)
incq %rdx
movq %rdx,16(%rbp)
shlq $8,%r9
orq %r10,%r9
jmp _c7My
_c7N3:
movq $block_info,-24(%rbp)
movq %rdi,%rax
movq %r8,%rdi ; *** V3 (back to rdi!) ***
xorq %r9,%rdi
movq %rsi,%rcx
movq %rax,%rsi
movq %r14,%rax
movq %rcx,%r14
movq %rbx,%rcx
movq %rax,%rbx
movq %r9,-16(%rbp)
movq %r10,-8(%rbp)
movq %rcx,(%rbp)
addq $-24,%rbp
jmp sipRound_s_x2
.size $whashRemainingBytes_info, .-$whashRemainingBytes_info
This is my first time looking closely at assembly, so maybe this is normal or no big deal performance-wise (I haven't gotten around to trying to correlate the number of moves with the performance of my variations yet), or I'm missing something obvious. I wasn't able to make sense of an objdump-ed version of the LLVM-compiled program. The code for the version where the same sipRound stuff is implemented in normal Haskell also has a lot of moves (even more, in fact),
but they seem interspersed throughout and are less easy to sort through.
EDIT: extraneous information removed
You can check out the branch here, which I'll keep at 838b27a:
You can build and observe the assembly with:
$ cabal configure -fdev --enable-benchmarks && cabal build core
$ gvim ./dist/build/core/core-tmp/core.dump-asm