Opened 8 years ago

Closed 7 years ago

Last modified 4 years ago

#3553 closed bug (fixed)

parallel gc suffers badly if one thread is descheduled

Reported by: simonmar
Owned by: simonmar
Priority: normal
Milestone: 6.12.2
Component: Runtime System
Version: 6.10.4
Keywords:
Cc: sveina@…, simonmar
Operating System: Unknown/Multiple
Architecture: Unknown/Multiple
Type of failure: None/Unknown
Test Case:
Blocked By:
Blocking:
Related Tickets:
Differential Rev(s):
Wiki Page:

Description

The parallel GC synchronisation uses pure spinlocks, which leads to a severe decline in performance when one thread is descheduled. This is the main cause of the "last core parallel slowdown": using a -N value that matches the number of cores in the machine can be slower than using one fewer. The effect seems to be quite bad on Linux; reports are that it is less of an issue on OS X.

Switching to mutexes would help, but it isn't easy because we sometimes unlock these from a different thread than they were locked from, and standard mutexes don't let you do that (the locks in question are mut_spin and gc_spin in the GcThread structure).
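
For context: POSIX makes unlocking a pthread mutex from a thread other than the one that locked it undefined behaviour (or an error, for error-checking mutexes), so a straight swap is out; the replacement would have to be something like a condition-variable-based binary semaphore, which any thread may release. A rough sketch (not the RTS code; the names are illustrative):

  #include <pthread.h>

  /* A binary semaphore: unlike a pthread mutex, it may legally be
     released by a different thread than the one that acquired it. */
  typedef struct {
      pthread_mutex_t lock;
      pthread_cond_t  cond;
      int             available;    /* 1 = free, 0 = held */
  } BinSem;

  static void binsem_init(BinSem *s, int available)
  {
      pthread_mutex_init(&s->lock, NULL);
      pthread_cond_init(&s->cond, NULL);
      s->available = available;
  }

  static void binsem_acquire(BinSem *s)
  {
      pthread_mutex_lock(&s->lock);
      while (!s->available)
          pthread_cond_wait(&s->cond, &s->lock);
      s->available = 0;
      pthread_mutex_unlock(&s->lock);
  }

  /* Safe to call from any thread, not just the acquirer. */
  static void binsem_release(BinSem *s)
  {
      pthread_mutex_lock(&s->lock);
      s->available = 1;
      pthread_cond_signal(&s->cond);
      pthread_mutex_unlock(&s->lock);
  }

As comment:1 below shows, a variant of this was tried, and the extra kernel traffic made it significantly slower than spinning.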

Attachments (1)

futex-spinlocks.patch.bz2 (207.0 KB) - added by simonmar 7 years ago.

Change History (16)

comment:1 Changed 8 years ago by simonmar

I tried using condition variables, but it was significantly slower than spinlocks. Experimental patch attached (or would be, if it wasn't so large).

Tue Oct  6 16:32:58 BST 2009  Simon Marlow <marlowsd@gmail.com>
  * EXPERIMENT: use condition variables instead of spinlocks for the GC barrier
  This was significantly slower, averaging +20% with 7 cores on
  nofib/parallel.
    {
    hunk ./rts/sm/GC.c 132
    +#if defined(THREADED_RTS)
    +Mutex     gc_threads_ready_lock;
    +Condition gc_threads_ready_cond;
    +nat       gc_threads_ready;
    +
    +Mutex     gc_threads_go_lock;
    +Condition gc_threads_go_cond;
    +rtsBool   gc_threads_go;
    +#endif
    +
    hunk ./rts/sm/GC.c 885
    -    initSpinLock(&t->gc_spin);
    -    initSpinLock(&t->mut_spin);
    -    ACQUIRE_SPIN_LOCK(&t->gc_spin);
    hunk ./rts/sm/GC.c 937
    +
    +        initMutex(&gc_threads_ready_lock);
    +        initCondition(&gc_threads_ready_cond);
    +        gc_threads_ready = 0;
    +        initMutex(&gc_threads_go_lock);
    +        initCondition(&gc_threads_go_cond);
    +        gc_threads_go = rtsFalse;
    hunk ./rts/sm/GC.c 1094
    -    // Wait until we're told to wake up
    -    RELEASE_SPIN_LOCK(&gct->mut_spin);
    hunk ./rts/sm/GC.c 1095
    +
    +    // Wait until we're told to wake up
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    gc_threads_ready++;
    +    debugTrace(DEBUG_gc, "GC thread %d: gc_threads_ready = %d", gct->thread_index, gc_threads_ready);
    +    if (gc_threads_ready >= RtsFlags.ParFlags.nNodes-1) {
    +        signalCondition(&gc_threads_ready_cond);
    +    }
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    +
    hunk ./rts/sm/GC.c 1106
    -    ACQUIRE_SPIN_LOCK(&gct->gc_spin);
    +
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    while (!gc_threads_go) {
    +        waitCondition(&gc_threads_go_cond, &gc_threads_go_lock);
    +    }
    +    RELEASE_LOCK(&gc_threads_go_lock);
    hunk ./rts/sm/GC.c 1134
    -    // Wait until we're told to continue
    -    RELEASE_SPIN_LOCK(&gct->gc_spin);
    hunk ./rts/sm/GC.c 1135
    +
    +    // Wait until we're told to continue
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    gc_threads_ready++;
    +    if (gc_threads_ready >= RtsFlags.ParFlags.nNodes-1) {
    +        signalCondition(&gc_threads_ready_cond);
    +    }
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    +
    hunk ./rts/sm/GC.c 1146
    -    ACQUIRE_SPIN_LOCK(&gct->mut_spin);
    +
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    while (gc_threads_go) {
    +        waitCondition(&gc_threads_go_cond, &gc_threads_go_lock);
    +    }
    +    RELEASE_LOCK(&gc_threads_go_lock);
    +
    hunk ./rts/sm/GC.c 1155
    +    gct->wakeup = GC_THREAD_INACTIVE;
    +
    hunk ./rts/sm/GC.c 1169
    -    nat i, j;
    +    nat i;
    hunk ./rts/sm/GC.c 1172
    -    while(retry) {
    -        for (i=0; i < n_threads; i++) {
    -            if (i == me) continue;
    -            if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
    -                prodCapability(&capabilities[i], cap->running_task);
    -            }
    -        }
    -        for (j=0; j < 10000000; j++) {
    -            retry = rtsFalse;
    -            for (i=0; i < n_threads; i++) {
    -                if (i == me) continue;
    -                write_barrier();
    -                setContextSwitches();
    -                if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
    -                    retry = rtsTrue;
    -                }
    -            }
    -            if (!retry) break;
    +    for (i=0; i < n_threads; i++) {
    +        if (i == me) continue;
    +        if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
    +            prodCapability(&capabilities[i], cap->running_task);
    hunk ./rts/sm/GC.c 1178
    +    setContextSwitches();
    +    $
    +    gc_threads_go = rtsFalse;
    +    $
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    while (gc_threads_ready < n_threads-1) {
    +        debugTrace(DEBUG_gc, "waitForGcThreads: gc_threads_ready = %d", gc_threads_ready);
    +        waitCondition(&gc_threads_ready_cond, &gc_threads_ready_lock);
    +    } $
    +    gc_threads_ready = 0;
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    hunk ./rts/sm/GC.c 1213
    -        ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
    -        RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
    +//        ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
    +//        RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
    hunk ./rts/sm/GC.c 1216
    +
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    gc_threads_go = rtsTrue;
    +    broadcastCondition(&gc_threads_go_cond);
    +    RELEASE_LOCK(&gc_threads_go_lock);
    hunk ./rts/sm/GC.c 1231
    -    nat i;
    -    for (i=0; i < n_threads; i++) {
    -        if (i == me) continue;
    -        while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); }
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    while (gc_threads_ready < n_threads-1) {
    +        debugTrace(DEBUG_gc, "shutdown_gc_threads: %d", gc_threads_ready);
    +        waitCondition(&gc_threads_ready_cond, &gc_threads_ready_lock);
    hunk ./rts/sm/GC.c 1236
    +    gc_threads_ready = 0;
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    hunk ./rts/sm/GC.c 1243
    -releaseGCThreads (Capability *cap USED_IF_THREADS)
    +releaseGCThreads (Capability *cap STG_UNUSED)
    hunk ./rts/sm/GC.c 1245
    -    nat n_threads = RtsFlags.ParFlags.nNodes;
    -    nat me = cap->no;
    -    nat i;
    -    for (i=0; i < n_threads; i++) {
    -        if (i == me) continue;
    -        if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) $
    -            barf("releaseGCThreads");
    -        $
    -        gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
    -        ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
    -        RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
    -    }
    +    debugTrace(DEBUG_gc, "releaseGCThreads");
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    gc_threads_go = rtsFalse;
    +    broadcastCondition(&gc_threads_go_cond);
    +    RELEASE_LOCK(&gc_threads_go_lock);
    hunk ./rts/sm/GCThread.h 119
    -    SpinLock   gc_spin;
    -    SpinLock   mut_spin;
    }

comment:2 Changed 7 years ago by simonmar

Owner: set to igloo
Type: changed from bug to merge
Type of failure: None/Unknown

This patch helps a lot:

Fri Jan 22 16:49:11 GMT 2010  Simon Marlow <marlowsd@gmail.com>
  * When acquiring a spinlock, yieldThread() every 1000 spins (#3553, #3758)
  
  This helps when the thread holding the lock has been descheduled,
  which is the main cause of the "last-core slowdown" problem.  With
  this patch, I get much better results with -N8 on an 8-core box,
  although some benchmarks are still worse than with 7 cores.
  
  I also added a yieldThread() into the any_work() loop of the parallel
  GC when it has no work to do. Oddly, this seems to improve performance
  on the parallel GC benchmarks even when all the cores are busy.
  Perhaps it is due to reducing contention on the memory bus.
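
For reference, the spin-then-yield idea in that patch amounts to roughly the following (a sketch in plain C with GCC atomics; it is not the RTS source, and the exact primitives and spin count are illustrative). The lock word is 1 when free and 0 when held:

  #include <sched.h>

  #define SPIN_THRESHOLD 1000

  typedef unsigned int SpinLock;    /* initialise to 1 (unlocked) */

  static void spin_lock(SpinLock *p)
  {
      for (;;) {
          int i;
          for (i = 0; i < SPIN_THRESHOLD; i++) {
              /* atomically swap in 0; a non-zero old value means we got the lock */
              if (__atomic_exchange_n(p, 0, __ATOMIC_ACQUIRE) != 0)
                  return;
          }
          /* the holder has probably been descheduled: give up our
             timeslice instead of burning the rest of it spinning */
          sched_yield();
      }
  }

  static void spin_unlock(SpinLock *p)
  {
      __atomic_store_n(p, 1, __ATOMIC_RELEASE);
  }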

See this blog post for more info.

I'm going to close the bug, since I think this is the best we can do until we implement on-the-fly minor GCs.

comment:3 Changed 7 years ago by wuzzeb

I would be careful about adding sched_yield. Ingo Molnar, the developer who wrote and maintains the Linux scheduler, says he has never seen a valid use of sched_yield. See http://kerneltrap.org/Linux/Using_sched_yield_Improperly for the discussion. At least with the CFS scheduler in Linux, using sched_yield might not always do what you expect.

Also, from your blog post at least, it seems like futexes are perfect for what you need. In the uncontended case, they are completely in userspace. They can be acquired on one thread and released on another thread. Any reasons why futexes don't work?

http://people.redhat.com/drepper/futex.pdf http://en.wikipedia.org/wiki/Futex

comment:4 Changed 7 years ago by simonmar

I tried using futexes today, and so far haven't managed to beat the performance of the sched_yield spinlocks, but I might not be using them right. I fully expect futexes to be better. (Also, futexes are not exactly a real API: the syscall isn't even exposed by glibc, so you have to define it manually, as I discovered with some help from folks on #ghc.)

I can entirely believe that sched_yield is never the absolute right thing, but it might be the best portable solution we have.
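
For anyone reading along, the hand-rolled definition amounts to something like this (a sketch; the wrapper names here are made up):

  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/futex.h>

  static long futex_wait(int *uaddr, int expected)
  {
      /* sleep only if *uaddr still equals 'expected' */
      return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected, NULL, NULL, 0);
  }

  static long futex_wake(int *uaddr, int nwake)
  {
      /* wake up to 'nwake' threads sleeping on uaddr */
      return syscall(SYS_futex, uaddr, FUTEX_WAKE, nwake, NULL, NULL, 0);
  }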

Changed 7 years ago by simonmar

Attachment: futex-spinlocks.patch.bz2 added

comment:5 Changed 7 years ago by simonmar

Patch to use futexes attached. This is significantly slower than the sched_yield version currently in use. I don't know why - as far as I can tell I'm using futexes correctly. The protocol I'm using is from Drepper's paper, and I tried it with and without some user-space spinning in the acquire case.
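
The protocol in question is roughly Drepper's three-state lock, something like the following (a sketch using GCC atomics and futex_wait()/futex_wake() wrappers over the raw syscall, as sketched earlier; it is not the code in the attached patch):

  /* 0 = unlocked, 1 = locked with no waiters, 2 = locked, possibly waiters */
  typedef int FutexLock;

  static void futex_lock(FutexLock *f)
  {
      int c = 0;
      /* fast path: 0 -> 1 without entering the kernel */
      if (!__atomic_compare_exchange_n(f, &c, 1, 0,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
          /* contended: advertise that there are waiters, then sleep */
          if (c != 2)
              c = __atomic_exchange_n(f, 2, __ATOMIC_ACQUIRE);
          while (c != 0) {
              futex_wait(f, 2);      /* sleeps only if *f is still 2 */
              c = __atomic_exchange_n(f, 2, __ATOMIC_ACQUIRE);
          }
      }
  }

  static void futex_unlock(FutexLock *f)
  {
      /* may be called by a thread other than the one that locked it */
      if (__atomic_fetch_sub(f, 1, __ATOMIC_RELEASE) != 1) {
          /* there may be waiters: reset and wake one */
          __atomic_store_n(f, 0, __ATOMIC_RELEASE);
          futex_wake(f, 1);
      }
  }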

nofib/parallel/ray on 8 cores, first with futexes and then with yield:

$ ./ray 1000 +RTS -N8 -s >/dev/null
  14,784,695,584 bytes allocated in the heap
     246,403,264 bytes copied during GC
         108,232 bytes maximum residency (169 sample(s))
         310,000 bytes maximum slop
               6 MB total memory in use (0 MB lost due to fragmentation)

  Generation 0:  4606 collections,  4605 parallel,  4.31s,  2.16s elapsed
  Generation 1:   169 collections,   169 parallel,  0.22s,  0.08s elapsed

  Parallel GC work balance: 1.56 (30214391 / 19430130, ideal 8)

  SPARKS: 1000000 (978174 converted, 21636 pruned)

  INIT  time    0.00s  (  0.00s elapsed)
  MUT   time    8.96s  (  1.86s elapsed)
  GC    time    4.53s  (  2.24s elapsed)
  EXIT  time    0.00s  (  0.00s elapsed)
  Total time   13.49s  (  4.11s elapsed)


$ ./ray-yield 1000 +RTS -N8 -s >/dev/null 
  14,834,802,304 bytes allocated in the heap
     237,105,080 bytes copied during GC
          97,736 bytes maximum residency (158 sample(s))
         299,160 bytes maximum slop
               6 MB total memory in use (0 MB lost due to fragmentation)

  Generation 0:  4515 collections,  4514 parallel,  7.73s,  1.65s elapsed
  Generation 1:   158 collections,   158 parallel,  0.39s,  0.06s elapsed

  Parallel GC work balance: 1.53 (29092959 / 18954020, ideal 8)

  SPARKS: 1000000 (980491 converted, 19356 pruned)

  INIT  time    0.00s  (  0.00s elapsed)
  MUT   time   10.74s  (  1.93s elapsed)
  GC    time    8.11s  (  1.71s elapsed)
  EXIT  time    0.00s  (  0.00s elapsed)
  Total time   18.86s  (  3.64s elapsed)

The extra CPU time in the yield version is due to the higher spin threshold, but I've tried different spin thresholds in the futex case and it didn't help.

comment:6 Changed 7 years ago by igloo

Owner: changed from igloo to simonmar
Type: changed from merge to bug

Patch merged. Simon, I'm bouncing it back to you in case you want it left open for the futex investigation.

comment:7 Changed 7 years ago by simonmar

Resolution: fixed
Status: changed from new to closed

I'm not going to investigate this any more.

comment:8 Changed 7 years ago by wuzzeb

I know you said you weren't going to investigate further and the ticket is closed, so I am just adding a pointer to some info for future investigation if someone decides to look into it again (I might myself if I become more familiar with GHC internals).

There is a recent patch to the Linux kernel to implement a futex which spins in the kernel; see http://thread.gmane.org/gmane.linux.kernel/970412

You can spin in the kernel until the lock is released or you are scheduled out, which might be closer to what GHC wants.

comment:9 Changed 7 years ago by Baughn

Cc: sveina@… added

comment:10 Changed 4 years ago by parcs

Cc: simonmar added

I noticed that if I change nofib/parallel/ray to do its work in a separate thread (see diff), this "last core slowdown" completely vanishes. Can anybody explain this massive variance?

diff --git a/parallel/ray/Main.lhs b/parallel/ray/Main.lhs
index a1a72e6..bcd020f 100644
--- a/parallel/ray/Main.lhs
+++ b/parallel/ray/Main.lhs
@@ -5,10 +5,15 @@ Michaelson for SML, converted to (parallel) Haskell by Kevin Hammond!
 > import Control.Parallel
 > import Control.Parallel.Strategies (Strategy, withStrategy, rseq, parBuffer)
 > import System.Environment
+> import Control.Concurrent

 > main = do
->   [detail] <- fmap (map read) getArgs
->   putStr (top detail 10.0 7.0 6.0 sc)
+>   v <- newEmptyMVar
+>   forkIO $ do
+>       [detail] <- fmap (map read) getArgs
+>       putStr (top detail 10.0 7.0 6.0 sc)
+>       putMVar v ()
+>   takeMVar v

 > type Coord = (Double,Double,Double)

Before patch:

parcs@wolfgang:~/ghc/nofib/parallel/ray$ perf stat ./ray 3000 +RTS -N > /dev/null

 Performance counter stats for './ray 3000 +RTS -N':

      94255.765778 task-clock                #    6.343 CPUs utilized
         2,686,596 context-switches          #    0.029 M/sec
             6,592 CPU-migrations            #    0.000 M/sec
             2,243 page-faults               #    0.000 M/sec
   338,901,646,069 cycles                    #    3.596 GHz
   <not supported> stalled-cycles-frontend
   <not supported> stalled-cycles-backend
   264,852,020,580 instructions              #    0.78  insns per cycle
    48,676,712,628 branches                  #  516.432 M/sec
     1,114,612,869 branch-misses             #    2.29% of all branches

      14.859603166 seconds time elapsed

After patch:

parcs@wolfgang:~/ghc/nofib/parallel/ray$ perf stat ./ray 3000 +RTS -N > /dev/null

 Performance counter stats for './ray 3000 +RTS -N':

      65701.145514 task-clock                #    7.919 CPUs utilized
            41,217 context-switches          #    0.001 M/sec
               101 CPU-migrations            #    0.000 M/sec
             2,274 page-faults               #    0.000 M/sec
   242,921,371,383 cycles                    #    3.697 GHz
   <not supported> stalled-cycles-frontend
   <not supported> stalled-cycles-backend
   216,526,115,297 instructions              #    0.89  insns per cycle
    39,079,392,369 branches                  #  594.805 M/sec
       840,462,837 branch-misses             #    2.15% of all branches

       8.296947901 seconds time elapsed


Last edited 4 years ago by tibbe

comment:11 Changed 4 years ago by tibbe

I noticed that if I change nofib/parallel/ray to do its work in a separate thread (see diff), this "last core slowdown" completely vanishes. Can anybody explain this massive variance?

It could be because main runs in a bound thread. Why that makes a huge difference I'm not sure (more context switching?), but that's a likely cause.

comment:12 Changed 4 years ago by parcs

I also think it's worth noting that with the above change to nofib/parallel/ray applied, yieldThread() can be made a no-op without changing the runtime of the program.

comment:13 Changed 4 years ago by simonmar

Interesting - I hadn't realised that the main thread being bound could affect the performance of ordinary parallel programs too.

I'm very tempted to change the main thread to be non-bound.

comment:14 in reply to:  13 ; Changed 4 years ago by parcs

Replying to simonmar:

I'm very tempted to change the main thread to be non-bound.

If so, would the earlier commits that helped mitigate this particular slowdown be undone (like the one that makes ACQUIRE_SPIN_LOCK() call yieldThread() every 1000 spins)?

Last edited 4 years ago by parcs

comment:15 in reply to:  14 Changed 4 years ago by simonmar

Replying to parcs:

If so, would the earlier commits that helped mitigate this particular slowdown be undone (like the one that makes ACQUIRE_SPIN_LOCK() call yieldThread() every 1000 spins)?

No, the problem is still very real, and the yieldThread() helps a lot. Making the main thread non-bound would just make it less likely to manifest in some cases.
