Opened 9 years ago

Closed 5 years ago

#2253 closed bug (fixed)

Native code generator could do better

Reported by: dons Owned by:
Priority: normal Milestone: 7.6.1
Component: Compiler (NCG) Version: 6.8.2
Keywords: Cc: dons@…, pho@…, lemmih@…, morrow@…
Operating System: Unknown/Multiple Architecture: Unknown/Multiple
Type of failure: Runtime performance bug Test Case:
Blocked By: Blocking:
Related Tickets: Differential Rev(s):
Wiki Page:

Description

An example set of programs that came up in the ndp library, where the C backend outperforms the current native code generator. Logging them here so we don't forget to check again with the new backend.

Program 1

import Data.Array.Vector
import Data.Bits
main = print . sumU $ zipWith3U (\x y z -> x * y * z)
                        (enumFromToU 1 (100000000 :: Int))
                        (enumFromToU 2 (100000001 :: Int))
                        (enumFromToU 7 (100000008 :: Int))

Core:

Main.$s$wfold =
  \ (sc_sPH :: Int#)
    (sc1_sPI :: Int#)
    (sc2_sPJ :: Int#)
    (sc3_sPK :: Int#) ->
    case ># sc2_sPJ 100000000 of wild_aJo {
      False ->
        case ># sc1_sPI 100000001 of wild1_XK6 {
          False ->
            case ># sc_sPH 100000008 of wild2_XKd {
              False ->
                Main.$s$wfold
                  (+# sc_sPH 1)
                  (+# sc1_sPI 1)
                  (+# sc2_sPJ 1)
                  (+# sc3_sPK (*# (*# sc2_sPJ sc1_sPI) sc_sPH));
              True -> sc3_sPK
            };
          True -> sc3_sPK
        };
      True -> sc3_sPK
    }

}}}

Which is great.

C backend:

{{{

Main_zdszdwfold_info:
  .text
  .p2align 4,,15
.text
  .align 8
  .type     Main_zdszdwfold_info, @function
  cmpq        $100000000, %r8
  jg  .L9
  cmpq        $100000001, %rdi
  jg  .L9
  cmpq        $100000008, %rsi
  jg  .L9
  movq        %r8, %rdx
  incq        %r8
  imulq       %rdi, %rdx
  incq        %rdi
  imulq       %rsi, %rdx
  incq        %rsi
  addq        %rdx, %r9
  jmp Main_zdszdwfold_info
.L5:
.L7:
  .p2align 6,,7
.L9:
  movq        %r9, %rbx
  jmp *(%rbp)


}}}


Native code generator:


{{{

Main_zdszdwfold_info:
  cmpq $100000000,%r8
  jg .LcRP
  cmpq $100000001,%rdi
  jg .LcRR
  cmpq $100000008,%rsi
  jg .LcRU
  movq %rdi,%rax
  imulq %rsi,%rax
  movq %r8,%rcx
  imulq %rax,%rcx
  movq %r9,%rax
  addq %rcx,%rax
  leaq 1(%r8),%rcx
  leaq 1(%rdi),%rdx
  incq %rsi
  movq %rdx,%rdi
  movq %rcx,%r8
  movq %rax,%r9
  jmp Main_zdszdwfold_info
.LcRP:
  movq %r9,%rbx
  jmp *(%rbp)
.LcRR:
  movq %r9,%rbx
  jmp *(%rbp)
.LcRU:
  movq %r9,%rbx
  jmp *(%rbp)

}}}

Runtime performance:

  C backend:    0.269
  Asm backend:  0.410s


Program 2

Source:

{{{

import Data.Array.Vector
import Data.Bits
main = print . sumU . mapU (`shiftL` 2) $
            appendU (replicateU 1000000000 (1::Int))
                    (replicateU 1000000000 (7::Int))

}}}

Core:

{{{

$s$wfold_rPr =
  \ (sc_sOw :: Int#) (sc1_sOx :: Int#) ->
    case sc_sOw of wild_X1j {
      __DEFAULT -> $s$wfold_rPr (+# wild_X1j 1) (+# sc1_sOx 28);
      1000000000 -> sc1_sOx
    }
}}}

Runtime:

    Native backend: 2.637
    C backend:      2.365

Change History (17)

comment:1 Changed 9 years ago by dons

Architecture: x86_64 (amd64) → Multiple

Program 3

Source:

import Data.Array.Vector
main = print . sumU . consU 0xdeadbeef . replicateU (100000000::Int) $ (8::Int)

Core:

Main.$s$wfold =
  \ (sc_sMc :: Int#) (sc1_sMd :: Int#) ->
    case sc_sMc of wild_X13 {
      __DEFAULT -> Main.$s$wfold (+# wild_X13 1) (+# sc1_sMd 8);
      100000000 -> sc1_sMd
    }

Native backend:

Main_zdszdwfold_info:
  movq %rsi,%rax
  cmpq $100000000,%rax
  jne .LcND
  movq %rdi,%rbx
  jmp *(%rbp)
.LcND:
  leaq 8(%rdi),%rcx
  leaq 1(%rax),%rsi
  movq %rcx,%rdi
  jmp Main_zdszdwfold_info

C backend:

Main_zdszdwfold_info:
  cmpq        $100000000, %rsi
  je  .L5
.L3:
  leaq        1(%rsi), %rsi
  addq        $8, %rdi
  jmp Main_zdszdwfold_info

.L5:
  movq        %rdi, %rbx
  jmp *(%rbp)

Runtime:

Native backend: 0.143 C backend: 0.120

Program 4

Source:

import Data.Array.Vector
import Data.Bits
main = print . sumU . mapU (`shiftL` 1) . filterU (<20). mapU (*2) . mapU (+1) . replicateU (100000000::Int) $ (8::Int)

Core:

Main.$wfold =
  \ (ww_sNZ :: Int#) (ww1_sO3 :: Int#) ->
    case ww1_sO3 of wild_X1j {
      __DEFAULT -> Main.$wfold (+# ww_sNZ 36) (+# wild_X1j 1);
      100000000 -> ww_sNZ
    }

(Ridiculously awesome!)

Native backend:

Main_zdwfold_info:
  movq %rdi,%rax
  cmpq $100000000,%rax
  jne .LcPY
  movq %rsi,%rbx
  jmp *(%rbp)
.LcPY:
  incq %rax
  addq $36,%rsi
  movq %rax,%rdi
  jmp Main_zdwfold_info

C backend:

Main_zdwfold_info:
  cmpq        $100000000, %rdi
  je  .L5
.L3:
  addq        $36, %rsi
  leaq        1(%rdi), %rdi
  jmp Main_zdwfold_info
.L5:
  movq        %rsi, %rbx
  jmp *(%rbp)


Runtime:

C backend: 0.120s Native backend: 0.195s

Program 5

Source:

import Data.Array.Vector
import Data.Bits
main = print . sumU . mapU fstS $ zipU
                        (enumFromToU 1 (100000000 :: Int))
                        (enumFromToU 2 (100000001 :: Int))

Core:

Main.$s$wfold =
  \ (sc_sRJ :: Int#)
    (sc1_sRK :: Int#)
    (sc2_sRL :: Int#) ->
    case ># sc1_sRK 100000000 of wild_aM2 {
      False ->
        case ># sc_sRJ 100000001 of wild1_XMw {
          False ->
            Main.$s$wfold
              (+# sc_sRJ 1) (+# sc1_sRK 1) (+# sc2_sRL sc1_sRK);
          True -> sc2_sRL
        };
      True -> sc2_sRL
    }

Native backend:

Main_zdszdwfold_info:
  cmpq $100000000,%rdi
  jg .LcTr
  cmpq $100000001,%rsi
  jg .LcTu
  movq %r8,%rax
  addq %rdi,%rax
  leaq 1(%rdi),%rcx
  incq %rsi
  movq %rcx,%rdi
  movq %rax,%r8
  jmp Main_zdszdwfold_info
.LcTr:
  movq %r8,%rbx
  jmp *(%rbp)
.LcTu:
  movq %r8,%rbx
  jmp *(%rbp)

C backend:

Main_zdszdwfold_info:
  cmpq        $100000000, %rdi
  jg  .L5
  cmpq        $100000001, %rsi
  jg  .L5
  leaq        (%rdi,%r8), %rax
  incq        %rsi
  incq        %rdi
  movq        %rax, %r8
  jmp Main_zdszdwfold_info
.L3:
.L5:
  movq        %r8, %rbx
  jmp *(%rbp)

Runtime:

Native backend: 0.216 C backend: 0.194

Program 6

Source:

n = 40000000

main = do
      let c = replicateU n (2::Double)
          a = mapU fromIntegral (enumFromToU 0 (n-1) ) :: UArr Double
      print (sumU (zipWithU (*) c a))

Core

Main.$s$wfold =
  \ (sc_sXT :: Int#)
    (sc1_sXU :: Int#)
    (sc2_sXV :: Double#) ->
    case sc1_sXU of wild_X1h {
      __DEFAULT ->
        case ># sc_sXT 39999999 of wild1_aMi {
          False ->
            Main.$s$wfold
              (+# sc_sXT 1)
              (+# wild_X1h 1)
              (+## sc2_sXV (*## 2.0 (int2Double# sc_sXT)));
          True -> sc2_sXV
        };
      40000000 -> sc2_sXV
    }

Native backend:

Main_zdszdwfold_info:
  movq %rdi,%rax
  cmpq $40000000,%rax
  jne .LcZK
  jmp *(%rbp)

.LcZK:
  cmpq $39999999,%rsi
  jg .LcZN
  cvtsi2sdq %rsi,%xmm0
  mulsd .LnZP(%rip),%xmm0
  movsd %xmm5,%xmm7
  addsd %xmm0,%xmm7
  incq %rax
  incq %rsi
  movq %rax,%rdi
  movsd %xmm7,%xmm5
  jmp Main_zdszdwfold_info

.LcZN:
  jmp *(%rbp)

C backend:

Main_zdszdwfold_info:
  cmpq        $40000000, %rdi
  je  .L9
.L5:
  cmpq        $39999999, %rsi
  jg  .L9
  cvtsi2sdq   %rsi, %xmm0
  leaq        1(%rdi), %rdi
  incq        %rsi
  addsd       %xmm0, %xmm0
  addsd       %xmm0, %xmm5
  jmp Main_zdszdwfold_info
.L9:
  jmp *(%rbp)

Runtime:

Native backend: 0.192 C backend: 0.142

comment:2 Changed 9 years ago by igloo

difficulty: Unknown
Milestone: 6.10 branch

Thanks don!

comment:3 Changed 9 years ago by PHO

Cc: pho@… added

comment:5 Changed 9 years ago by simonmar

Architecture: Multiple → Unknown/Multiple

comment:6 Changed 9 years ago by simonmar

Operating System: Unknown → Unknown/Multiple

comment:7 Changed 9 years ago by igloo

Milestone: 6.10 branch → 6.12 branch

comment:8 Changed 8 years ago by Lemmih

Cc: lemmih@… added

comment:9 Changed 8 years ago by simonmar

The main problem here is those extra reg->reg moves between argument registers (e.g. %rsi) and temporary registers (e.g. %rax).

One way to fix it is to make cmmMiniInline a little smarter, by substituting for assignments of temporaries from global regs. The new register allocator might also do better. However, the new backend will solve this in a much better way, by having liveness information about global registers.

comment:10 Changed 8 years ago by simonmar

Type of failure: Runtime performance bug

comment:11 in reply to:  1 Changed 8 years ago by morrow

Cc: morrow@… added

Replying to dons:

Program 6

..
  let c = replicateU n (2::Double)
..

Native backend:

  ..
  mulsd .LnZP(%rip),%xmm0   /*.LnZP==2.0*/
  addsd %xmm0,%xmm5
  ..

C backend:

  ..
  addsd       %xmm0, %xmm0
  addsd       %xmm0, %xmm5
  ..

Also, gcc is strength-reducing (2.0 * x) to (x + x).

MULSD mem64,xmmreg    ==> latency=6
ADDSD xmmreg2,xmmreg1 ==> latency=4

comment:12 Changed 7 years ago by igloo

Milestone: 6.12 branch → 6.12.3

comment:13 Changed 7 years ago by igloo

Milestone: 6.12.3 → 6.14.1
Priority: normal → low

comment:14 Changed 7 years ago by igloo

Blocked By: 4258 added
Milestone: 6.14.1 → 6.16.1

comment:15 Changed 6 years ago by igloo

Milestone: 7.4.1 → 7.6.1
Priority: low → normal

comment:16 Changed 5 years ago by simonmar

Blocked By: 4258 removed

I came to check these with the new backend, and it turns out that the old backend is doing just fine on these now. It might be mostly due to this: 3d8ab554ced45c51f39951f29cc53277d5788c37.

These are compiled with HEAD as of yesterday, with -O2.

Program 1:

Main_mainzuzdszdwfoldlMzqzuloop_info:
.Lc2vG:
	cmpq $100000000,%rsi
	jle .Lc2vM
	movq %r14,%rbx
	jmp *0(%rbp)
.Lc2vM:
	cmpq $100000001,%rdi
	jle .Lc2vO
	movq %r14,%rbx
	jmp *0(%rbp)
.Lc2vO:
	cmpq $100000008,%r8
	jle .Lc2vR
	movq %r14,%rbx
	jmp *0(%rbp)
.Lc2vR:
	movq %rdi,%rbx
	imulq %r8,%rbx
	movq %rsi,%rax
	imulq %rbx,%rax
	addq %rax,%r14
	incq %rsi
	incq %rdi
	incq %r8
	jmp Main_mainzuzdszdwfoldlMzqzuloop_info

The new code generator does a bit better, commoning up the duplicate blocks:

Main_mainzuzdszdwfoldlMzqzuloop_info:
.Lc2vW:
	cmpq $100000000,%rsi
	jle .Lc2wt
.Lc2wj:
	movq %r14,%rbx
	jmp *(%rbp)
.Lc2wt:
	cmpq $100000001,%rdi
	jg .Lc2wj
	cmpq $100000008,%r8
	jg .Lc2wj
	movq %rdi,%rbx
	imulq %r8,%rbx
	movq %rsi,%rax
	imulq %rbx,%rax
	addq %rax,%r14
	incq %rsi
	incq %rdi
	incq %r8
	jmp Main_mainzuzdszdwfoldlMzqzuloop_info

Program 2 (with -O2 -fno-regs-graph, the graph-colouring allocator generates a tiny bit worse code on this one):

Main_mainzuzdszdwfoldlMzqzuloop_info:
.Lc2mJ:
	testq %rsi,%rsi
	jle .Lc2mR
.Lc2mS:
	addq $4,%r14
	decq %rsi
	jmp Main_mainzuzdszdwfoldlMzqzuloop_info
.Lc2mR:
	movl $1000000000,%esi
	jmp r2kR_info

r2kR_info:
.Lc2m8:
	testq %rsi,%rsi
	jle .Lc2mg
.Lc2mh:
	addq $28,%r14
	decq %rsi
	jmp r2kR_info
.Lc2mg:
	movq %r14,%rbx
	jmp *(%rbp)

Program 3:

Main_mainzuzdszdwfoldlMzqzuloop_info:
.Lc2hW:
	testq %rsi,%rsi
	jle .Lc2i1
	addq $8,%r14
	decq %rsi
	jmp Main_mainzuzdszdwfoldlMzqzuloop_info
.Lc2i1:
	movq %r14,%rbx
	jmp *0(%rbp)

Program 4:

Main_mainzuzdszdwfoldlMzqzuloop_info:
.Lc2lj:
	testq %rsi,%rsi
	jle .Lc2lo
	addq $36,%r14
	decq %rsi
	jmp Main_mainzuzdszdwfoldlMzqzuloop_info
.Lc2lo:
	movq %r14,%rbx
	jmp *0(%rbp)

Program 5:

Main_mainzuzdszdwfoldlMzqzuloop_info:
.Lc2rk:
	cmpq $100000000,%rsi
	jle .Lc2ro
	movq %r14,%rbx
	jmp *0(%rbp)
.Lc2ro:
	cmpq $100000001,%rdi
	jle .Lc2rr
	movq %r14,%rbx
	jmp *0(%rbp)
.Lc2rr:
	addq %rsi,%r14
	incq %rsi
	incq %rdi
	jmp Main_mainzuzdszdwfoldlMzqzuloop_info

Program 6:

Main_mainzuzdszdwfoldlMzqzuloop_info:
.Lc2tu:
	testq %r14,%r14
	jle .Lc2tA
	cmpq $39999999,%rsi
	jle .Lc2tD
	jmp *0(%rbp)
.Lc2tA:
	jmp *0(%rbp)
.Lc2tD:
	cvtsi2sdq %rsi,%xmm0
	movsd .Ln2tF(%rip),%xmm1
	mulsd %xmm0,%xmm1
	addsd %xmm1,%xmm5
	decq %r14
	incq %rsi
	jmp Main_mainzuzdszdwfoldlMzqzuloop_info

We still need the strength reduction, I'll make a separate ticket for that.

comment:17 Changed 5 years ago by simonmar

Resolution: fixed
Status: new → closed
Note: See TracTickets for help on using tickets.