Change in specialisation(?) behaviour since 8.0.2 causes 6x slowdown

changed weight to 5

pacak could reproduce this as well, here is the difference in numbers. http://lpaste.net/356380

Here's a version with no dependencies:

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE UndecidableInstances #-}
module Main (main) where

-- | Benchmarks for various effect system implementations

-- import Criterion.Main
import Data.Bits
import Data.Int
import Data.IORef
import Data.Ratio
import Data.Time ( getCurrentTime, utctDayTime )

import Control.Exception
import Control.Monad
import Control.Monad.Trans.Class
import qualified Control.Monad.Trans.State.Strict as S
import Control.Monad.Trans.Reader

import System.CPUTime   ( getCPUTime )
import System.IO.Unsafe

-- Use only state, lift variable number of effects over/under
--------------------------------------------------------------------------------

test1mtl :: MonadState Int m => Int -> m Int
test1mtl n = foldM f 1 [0..n] where
  f acc x | x `rem` 5 == 0 = do
              s <- get
              put $! (s + 1)
              pure $! max acc x
          | otherwise = pure $! max acc x

main = do

  -- Used to definitively disable bench argument inlining
  -- !n <- randomRIO (1000000, 1000000) :: IO Int
  !m <- randomRIO (0, 0) :: IO Int

  let runRT = (`runReaderT`  (m :: Int))
  let runS  = (`S.runState`  (m :: Int))

  replicateM_ 100 $ do
    !n <- randomRIO (1000000, 1000000) :: IO Int
    evaluate $ (runS . runRT . test1mtl) n
  replicateM_ 100 $ do
    !n <- randomRIO (1000000, 1000000) :: IO Int
    evaluate $ (runS . runRT . test1mtl) n
  replicateM_ 100 $ do
    !n <- randomRIO (1000000, 1000000) :: IO Int
    evaluate $ (runS . runRT . runRT . test1mtl) n

-----
-- Auxiliary
----

class Monad m => MonadState s m | m -> s where
    get :: m s
    get = state (\s -> (s, s))

    put :: s -> m ()
    put s = state (\_ -> ((), s))

    state :: (s -> (a, s)) -> m a
    state f = do
      s <- get
      let ~(a, s') = f s
      put s'
      return a
    {-# MINIMAL state | get, put #-}

instance MonadState s m => MonadState s (ReaderT r m) where
    get = lift get
    put = lift . put
    state = lift . state

instance Monad m => MonadState s (S.StateT s m) where
    get = S.get
    put = S.put
    state = S.state

class Random a where
  randomR :: RandomGen g => (a,a) -> g -> (a,g)
  -- random  :: RandomGen g => g -> (a, g)
  randomRIO :: (a,a) -> IO a
  randomRIO range  = getStdRandom (randomR range)

instance Random Int        where randomR = randomIvalIntegral -- ; random = randomBounded

randomIvalIntegral :: (RandomGen g, Integral a) => (a, a) -> g -> (a, g)
randomIvalIntegral (l,h) = randomIvalInteger (toInteger l, toInteger h)

{-# SPECIALIZE randomIvalInteger :: (Num a) =>
    (Integer, Integer) -> StdGen -> (a, StdGen) #-}

randomIvalInteger :: (RandomGen g, Num a) => (Integer, Integer) -> g -> (a, g)
randomIvalInteger (l,h) rng
 | l > h     = randomIvalInteger (h,l) rng
 | otherwise = case (f 1 0 rng) of (v, rng') -> (fromInteger (l + v `mod` k), rng')
     where
       (genlo, genhi) = genRange rng
       b = fromIntegral genhi - fromIntegral genlo + 1

       q = 1000
       k = h - l + 1
       magtgt = k * q

       f mag v g | mag >= magtgt = (v, g)
                 | otherwise = v' `seq`f (mag*b) v' g' where
                        (x,g') = next g
                        v' = (v * b + (fromIntegral x - fromIntegral genlo))

class RandomGen g where
   next     :: g -> (Int, g)
   genRange :: g -> (Int,Int)
   genRange _ = (minBound, maxBound)

data StdGen
 = StdGen !Int32 !Int32

instance RandomGen StdGen where
  next  = stdNext
  genRange _ = stdRange

stdRange :: (Int,Int)
stdRange = (1, 2147483562)

stdNext :: StdGen -> (Int, StdGen)
stdNext (StdGen s1 s2) = (fromIntegral z', StdGen s1'' s2'')
 where  z'   = if z < 1 then z + 2147483562 else z
        z    = s1'' - s2''

        k    = s1 `quot` 53668
        s1'  = 40014 * (s1 - k * 53668) - k * 12211
        s1'' = if s1' < 0 then s1' + 2147483563 else s1'

        k'   = s2 `quot` 52774
        s2'  = 40692 * (s2 - k' * 52774) - k' * 3791
        s2'' = if s2' < 0 then s2' + 2147483399 else s2'

getStdRandom :: (StdGen -> (a,StdGen)) -> IO a
getStdRandom f = atomicModifyIORef' theStdGen (swap . f)
  where swap (v,g) = (g,v)

theStdGen :: IORef StdGen
theStdGen  = unsafePerformIO $ do
   rng <- mkStdRNG 0
   newIORef rng

mkStdRNG :: Integer -> IO StdGen
mkStdRNG o = do
    ct          <- getCPUTime
    (sec, psec) <- getTime
    return (createStdGen (sec * 12345 + psec + ct + o))

createStdGen :: Integer -> StdGen
createStdGen s = mkStdGen32 $ fromIntegral s

mkStdGen32 :: Int32 -> StdGen
mkStdGen32 sMaybeNegative = StdGen (s1+1) (s2+1)
  where
    s       = sMaybeNegative .&. maxBound
    (q, s1) = s `divMod` 2147483562
    s2      = q `mod` 2147483398

getTime :: IO (Integer, Integer)
getTime = do
  utc <- getCurrentTime
  let daytime = toRational $ utctDayTime utc
  return $ quotRem (numerator daytime) (denominator daytime)

$ /opt/ghc/8.0.2/bin/ghc MultiBench2.hs -O2 -fforce-recomp
[1 of 1] Compiling Main             ( MultiBench2.hs, MultiBench2.o )
Linking MultiBench2 ...
$ time ./MultiBench2 

real	0m2.954s
user	0m2.952s
sys	0m0.000s
$ /opt/ghc/8.2.1/bin/ghc MultiBench2.hs -O2 -fforce-recomp
[1 of 1] Compiling Main             ( MultiBench2.hs, MultiBench2.o )
Linking MultiBench2 ...
$ time ./MultiBench2 

real	0m12.335s
user	0m12.292s
sys	0m0.048s

The bulk of the increase in runtime is due to 2effe18a (The Early Inline Patch):

Commit 55efc9718b520ef354e32c15c4b49cdfecce412f (Combine identical case alternatives in CSE)
-----
$ time ./MultiBench2 

real	0m2.786s
user	0m2.784s
sys	0m0.000s

Commit 2effe18ab51d66474724d38b20e49cc1b8738f60 (The Early Inline Patch)
-----
$ time ./MultiBench2 

real	0m11.861s
user	0m11.816s
sys	0m0.052s

(I'm not sure yet what contributes to the other 0.5 seconds in runtime increase.)

changed milestone to %8.2.1

changed weight to 7

Given the magnitude of the change I think we should at least understand this before releasing.

Trac metadata

Trac field	Value
TypeOfFailure	OtherFailure → RuntimePerformance
Priority	normal → high

added runtime perf label

Here is what is happening

Before float-out we have

  $stest1mtl = \eta. ...foldr (\x k z. blah) z e...

Since the first arg of the foldr has no free vars, we float it out to give

  lvl = \x y z. blah
  $stest1mtl = \eta. ...foldr lvl z e...

That makes $stest1mtl small, so it is inlined at its two call sites (the first two test case in main).
So now there are two calls to lvl, and it is quite big, so it doesn't get inlined.
But actually it is much better not to inline $stest1mtl, and instead (after the foldr/build stuff has happened) to inline lvl back into it.

This kind of thing not new; I trip over it quite often. Generally, given

  f = e
  g = ...f..
  h = ...g...g..f...

should we inline f into g, thereby making g big, so it doesn't inline into h? Or should we instead inline g into h? Sometimes one is better, sometimes the other; I don't know any systematic way of doing The Right Thing all the time. It turned out that the early-inline patch changed the choice, which resulted in the changed performance.

However I did spot several things worth trying out

In CoreArity.rhsEtaExpandArity we carefully do not eta-expand thunks. But I saw some thunks like

        lvl_s621
          = case z_a4NJ of wild_a4OF { GHC.Types.I# x1_a4OH ->
            case x_a4NH of wild1_a4OJ { GHC.Types.I# y1_a4OL ->
            case GHC.Prim.<=# x1_a4OH y1_a4OL of {
              __DEFAULT -> (\ _ (eta_B1 :: Int) -> (wild_a4OF, eta_B1))
              1# ->        (\ _ (eta_B1 :: Int) -> (wild1_a4OJ, eta_B1))

Here it really would be good to eta-expand; then that particular lvl could be inlined at its call sites. Here's a change to CoreArity.rhsEtaExpandArity that did the job:

-        | isOneShotInfo os || has_lam e -> 1 + length oss
+        | isOneShotInfo os || not (is_app e) -> 1 + length oss

-    has_lam (Tick _ e) = has_lam e
-    has_lam (Lam b e)  = isId b || has_lam e
-    has_lam _          = False
+    is_app (Tick _ e) = is_app e
+    is_app (App f _)  = is_app f
+    is_app (Var _)    = True
+    is_app _          = False

Worth trying.

Now the offending top-level lvl function is still not inlined; but it has a function argument that is applied, so teh call sites look like

      lvl ... (\ab. blah) ...

When considering inining we do get a discount for the application of the argument inside lvl's rhs, but it was only a discout of 60, which seems small considering how great it is to inline a function. Boosting it to 150 with -funfolding-fun-discount=150 make the function inline, and we get good code all round. Maybe we should just up the default.

All the trouble is caused by the early float-out. I think we could try just elminating it.

Based on this diagnosis, I don't think we should hold up the release. It's not a bug in pass X that can readily be fixed; it's the (very difficult) challenge of making correct inlining decisions.

changed milestone to %8.4.1

added inlining label

changed milestone to %8.6.1

Is this still reproducible? If so this won't be fixed for 8.4.

changed weight to 5

In light of ticket:13851#comment:138297 I'm going to downgrade the priority and demilestone this.

Trac metadata

Trac field	Value
Priority	high → normal

Removing milestone as no one has stepped up to carry this on.

added simplifier label and removed inlining label

added Pnormal label

Trac field	Value
Version	8.2.1-rc2
Type	Bug
TypeOfFailure	OtherFailure
Priority	normal
Resolution	Unresolved
Component	Compiler
Test case
Differential revisions
BlockedBy
Related
Blocking
CC
Operating system
Architecture

Change in specialisation(?) behaviour since 8.0.2 causes 6x slowdown

Child items 0

Activity