Poor register allocation and redundant moves when using `foreign import prim`

This may well be the same as #12232 but I didn't want to risk hijacking that one.

I've been looking at the dumped asm from a library where I use foreign import prim to call a simple assembly routine (that may be irrelevant). I see many movq instructions, and when I traced them by hand many or most seemed superfluous saving and restoring of registers.

Here is a contiguous and mostly self-contained snippet of the assembly, with notes. I have four values I'm interested in V0, V1, V2, V3 which I started tracing at the note "registers good for sipRound_s_x2 at this point".

Below I specifically follow V3 through the assmebly, marking lines with "*** V3 ***", and the moves don't seem sensible to my (untrained) eye:

_c7Ns:
	movq %r9,%rbx
	movq %rax,%r9
	movq %r8,%rax     ;               *** V3  ***
	movq %rbx,%r8
	movq %rdi,%rbx
	movq %rax,%rdi    ;               *** V3 (back to rdi!) ***
	movq %rsi,%rax
	movq %rbx,%rsi
	movq %r14,%rbx
	movq %rax,%r14
	movq %rcx,16(%rbp)
	addq $16,%rbp
	jmp *8(%rbp)
.align 8
	.quad	836
	.quad	32
block_info:
_c7Nk:
	movq 16(%rbp),%rax
	movq $8,16(%rbp)
	movq 24(%rbp),%rcx
	incq %rcx
	movq %rcx,24(%rbp)
	movq 32(%rbp),%rdx
	incq %rdx
	movq %rdx,32(%rbp)
	xorq 8(%rbp),%rbx
	addq $16,%rbp
	movl $8,%r8d
	xorl %r9d,%r9d
_n7T1:
	movq %rax,64(%rsp)
	movq %r8,%rax
; --- registers good for sipRound_s_x2 at this point ------
	movq %rdi,%r8   ; moving V3 *OUT* of rdi         *** V3 ***
	movq %rsi,%rdi  ; moving V2 *OUT* of rdi
	movq %r14,%rsi  ; moving V1 *OUT* of r14   
	movq %rbx,%r14  ; moving V0 *OUT* of rbx   
	movq 64(%rsp),%rbx
_c7My:              ; (NOTE: there are a few jumps to here from other sections not included here)
	cmpq 24(%rbx),%rdx
	je _c7Ns
_c7Nr:
	movq 16(%rbx),%r10
	movzbl (%r10,%rdx,1),%r10d
	cmpq $1,%rax
	jne _c7N9
_c7Nl:
	movq $block_info,-16(%rbp)
	shlq $8,%r9
	orq %r10,%r9
;----- at this point (name: register): V0: r14, V1: rsi, V2: rdi, V3: r8
	movq %rdi,%rax ; save rdi, trying to be rsi
	movq %r8,%rdi  ; prepared rdi(V3)                  *** V3 ***
	xorq %r9,%rdi
	movq %rsi,%rcx
	movq %rax,%rsi ; prepared rsi(V2)
	movq %r14,%rax
	movq %rcx,%r14 ; prepared r14(V1)
	movq %rbx,%rcx
	movq %rax,%rbx ; prepared rbx(V0)
	movq %r9,-8(%rbp)
	movq %rcx,(%rbp)
	addq $-16,%rbp
	jmp sipRound_s_x2
.align 8
	.quad	1733
	.quad	32
block_info:
_c7N2:
	movq 24(%rbp),%rax
	movq $7,24(%rbp)
	movq 32(%rbp),%rcx
	incq %rcx
	movq %rcx,32(%rbp)
	movq 40(%rbp),%rdx
	incq %rdx
	movq %rdx,40(%rbp)
	movq 16(%rbp),%r8
	xorq 8(%rbp),%rbx
	addq $24,%rbp
	movl $7,%r9d
_n7T2:
	movq %rax,64(%rsp)
	movq %r9,%rax
	movq %r8,%r9
	movq %rdi,%r8
	movq %rsi,%rdi
	movq %r14,%rsi
	movq %rbx,%r14
	movq 64(%rsp),%rbx
	jmp _c7My
_c7N9:
	cmpq $1,%rax
	jbe _c7N3
_c7MW:
	decq %rax
	movq %rax,(%rbp)
	incq %rcx
	movq %rcx,8(%rbp)
	incq %rdx
	movq %rdx,16(%rbp)
	shlq $8,%r9
	orq %r10,%r9
	jmp _c7My
_c7N3:
	movq $block_info,-24(%rbp)
	movq %rdi,%rax
	movq %r8,%rdi   ;                   *** V3 (back to rdi!) ***
	xorq %r9,%rdi
	movq %rsi,%rcx
	movq %rax,%rsi
	movq %r14,%rax
	movq %rcx,%r14
	movq %rbx,%rcx
	movq %rax,%rbx
	movq %r9,-16(%rbp)
	movq %r10,-8(%rbp)
	movq %rcx,(%rbp)
	addq $-24,%rbp
	jmp sipRound_s_x2
	.size $whashRemainingBytes_info, .-$whashRemainingBytes_info

This is my first time looking closely at assembly, so maybe this is normal or no big deal performance-wise (I haven't gotten around to trying to correlate number of moves with performance of my variations yet), or I'm missing something obvious. I wasn't able to make sense of an objdump-ed version of the llvm-compiled program. The code for the version where the same sipRound stuff is implemented in normal haskell also has a lot of moves (even more in fact), but they seem interspersed throughout and are less easy to sort through.

EDIT: extraneous information removed

You can check out the branch here, which I'll keep at 838b27a:

https://github.com/jberryman/hashabler/tree/saved-branch-exhibiting-seemingly-bad-register-allocation

You can build and observe the assembly with:

$ cabal configure -fdev --enable-benchmarks   && cabal build  core
$ gvim ./dist/build/core/core-tmp/core.dump-asm

Edited Mar 10, 2019 by jberryman

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information