float-out causes extra allocation
The Haskell fannkuchredux (included in nofib as fannkuch-redux) contribution of Louis Wasserman to "The Computer Language Benchmarks Game" at shootout.alioth.debian.org times out on the amd64 machines, but not on the i386. I can reproduce it on my Debian amd64 machine.
It turns out that compiling without optimisation or with a simple -O produces a fast program, but with enormously large heap space allocated (10G compared with 67k on a virtual i386 machine) and also more garbage collector activity.
The source is below (because I don't find a way to attach the file). At the end of the source I copied my make command line, run command line and output produced with option -sstderr.
{- The Computer Language Benchmarks Game
http://shootout.alioth.debian.org/
contributed by Louis Wasserman
This should be compiled with:
-threaded -O2 -fexcess-precision -fasm
and run with:
+RTS -N<number of cores> -RTS <input>
-}
import Control.Concurrent
import Control.Monad
import System.Environment
import Foreign hiding (rotate)
import Data.Monoid
type Perm = Ptr Word8
data F = F {-# UNPACK #-} !Int {-# UNPACK #-} !Int
instance Monoid F where
mempty = F 0 0
F s1 m1 `mappend` F s2 m2 = F (s1 + s2) (max m1 m2)
incPtr = (`advancePtr` 1)
decPtr = (`advancePtr` (-1))
flop :: Int -> Perm -> IO ()
flop k xs = flopp xs (xs `advancePtr` k)
where flopp i j = when (i < j) $ swap i j >> flopp (incPtr i) (decPtr j)
swap i j = do
a <- peek i
b <- peek j
poke j a
poke i b
flopS :: Perm -> (Int -> IO a) -> IO a
flopS !xs f = do
let go !acc = do
k <- peekElemOff xs 0
if k == 0 then f acc else flop (fromIntegral k) xs >> go (acc+1)
go 0
increment :: Ptr Word8 -> Ptr Word8 -> IO ()
increment !p !ct = do
first <- peekElemOff p 1
pokeElemOff p 1 =<< peekElemOff p 0
pokeElemOff p 0 first
let go !i !first = do
ci <- peekElemOff ct i
if fromIntegral ci < i then pokeElemOff ct i (ci+1) else do
pokeElemOff ct i 0
let !i' = i + 1
moveArray p (incPtr p) i'
pokeElemOff p i' first
go i' =<< peekElemOff p 0
go 1 first
genPermutations :: Int -> Int -> Int -> Ptr Word8 -> Ptr Word8 -> IO F
genPermutations !n !l !r !perm !count = allocaArray n $ \ destF -> do
let upd j !f run = do
p0 <- peekElemOff perm 0
if p0 == 0 then increment perm count >> run f else do
copyArray destF perm n
increment perm count
flopS destF $ \ flops ->
run (f `mappend` F (checksum j flops) flops)
let go j !f = if j >= r then return f else upd j f (go (j+1))
go l mempty
where checksum i f = if i .&. 1 == 0 then f else -f
facts :: [Int]
facts = scanl (*) 1 [1..12]
unrank :: Int -> Int -> (Ptr Word8 -> Ptr Word8 -> IO a) -> IO a
unrank !idx !n f = allocaArray n $ \ p -> allocaArray n $ \ count ->
allocaArray n $ \ pp -> do
mapM_ (\ i -> pokeElemOff p i (fromIntegral i)) [0..n-1]
let go i !idx = when (i >= 0) $ do
let fi = facts !! i
let (q, r) = idx `quotRem` fi
pokeElemOff count i (fromIntegral q)
copyArray pp p (i+1)
let go' j = when (j <= i) $ do
let jq = j + q
pokeElemOff p j =<< peekElemOff pp (if jq <= i then jq else jq - i - 1)
go' (j+1)
go' 0
go (i-1) r
go (n-1) idx
f p count
main = do
n <- fmap (read.head) getArgs
let fact = product [1..n]
let bk = fact `quot` 4
vars <- forM [0,bk..fact-1] $ \ ix -> do
var <- newEmptyMVar
forkIO (unrank ix n $ \ p -> genPermutations n ix (min fact (ix + bk)) p >=> putMVar var)
return var
F chksm mflops <- liftM mconcat (mapM takeMVar vars)
putStrLn $ (show chksm) ++ "\nPfannkuchen(" ++ (show n) ++ ") = " ++ (show $ mflops)
{-
wurmli@noah-nofen:~/hpw/haskell/work/fannkuch$ ghc --make
-XBangPatterns -O -threaded -fllvm -rtsopts fannkuchredux.ghc-3.hs
[1 of 1] Compiling Main ( fannkuchredux.ghc-3.hs, fannkuchredux.ghc-3.o )
Linking fannkuchredux.ghc-3 ...
wurmli@noah-nofen:~/hpw/haskell/work/fannkuch$ ./fannkuchredux.ghc-3 +RTS -N4 -sstderr -RTS 12
3968050
Pfannkuchen(12) = 65
10,538,122,952 bytes allocated in the heap
359,512 bytes copied during GC
47,184 bytes maximum residency (2 sample(s))
51,120 bytes maximum slop
4 MB total memory in use (1 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 6053 colls, 6053 par 0.16s 0.04s 0.0000s 0.0001s
Gen 1 2 colls, 1 par 0.00s 0.00s 0.0001s 0.0001s
Parallel GC work balance: 40.82% (serial 0%, perfect 100%)
TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4)
SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
INIT time 0.00s ( 0.00s elapsed)
MUT time 44.73s ( 11.51s elapsed)
GC time 0.16s ( 0.04s elapsed)
EXIT time 0.00s ( 0.00s elapsed)
Total time 44.89s ( 11.55s elapsed)
Alloc rate 235,589,887 bytes per MUT second
Productivity 99.6% of total user, 387.3% of total elapsed
gc_alloc_block_sync: 31024
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0
-}