Changes between Version 11 and Version 12 of SIMDVectorExampleInLLVM


Ignore:
Timestamp:
Apr 16, 2012 1:02:41 PM (2 years ago)
Author:
gmainland
Comment:

Add redirection

Legend:

Unmodified
Added
Removed
Modified
  • SIMDVectorExampleInLLVM

    v11 v12  
    1 It is useful to see the vector instructions "in action" in LLVM human readable form (a ".ll" file) prior to implementing the Cmm -> LLVM backend (within the ./compiler/llvmGen section of the code).  LLVM code is somewhere between Java byte code authoring and direct assembly language authoring.  Here is the process: 
    2  
    3  - Generate or Create a human readable file (a ".ll" file), for example, create "add_floats.ll" 
    4  - Compile this file to byte code using the LLVM compiler:  llvm-as add_floats.ll.  This generates a ".bc" file, in this case, add_floats.bc.  The byte code is unreadable. 
    5  - Now there are a few options once byte code is available 
    6    - Generate native machine code:  llc add_floats.bc will create a native assembler instruction set in a ".s" file (add_floats.s) 
    7    - Run the byte codes on the JIT compiler:  lli add_floats.bc should run the instructions and produce the result 
    8  
    9 To demonstrate the vector instructions, we can start with a basic C program (just to illustrate ... remember, LLVM is not functional so starting in an imperative language makes a lot of sense): 
    10 {{{ 
    11 #include <stdio.h> 
    12  
    13 int main() 
    14 { 
    15    float x[4], y[4], z[4]; 
    16    x[0] = 1.0; 
    17    x[1] = 2.0; 
    18    x[2] = 3.0; 
    19    x[3] = 4.0; 
    20    y[0] = 10.0; 
    21    y[1] = 20.0; 
    22    y[2] = 30.0; 
    23    y[3] = 40.0; 
    24  
    25    z[0] = x[0] + y[0];  
    26    z[1] = x[1] + y[1];  
    27    z[2] = x[2] + y[2];  
    28    z[3] = x[3] + y[3]; 
    29    printf("%f %f %f %f\n", z[0], z[1], z[2], z[3]); 
    30 } 
    31 }}} 
    32  
    33 Compiling and running this in C is easy and left to the user. 
    34  
    35 This converts easily to LLVM human readable format (use the [http://llvm.org/demo/index.cgi online generator] if you'd like): 
    36 {{{ 
    37 ; ModuleID = '/tmp/webcompile/_21191_0.bc' 
    38 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" 
    39 target triple = "x86_64-unknown-linux-gnu" 
    40  
    41 @.str = private unnamed_addr constant [13 x i8] c"%f %f %f %f\0A\00" 
    42  
    43 define i32 @main() nounwind { 
    44   %1 = alloca i32, align 4 
    45   %x = alloca [4 x float], align 16 
    46   %y = alloca [4 x float], align 16 
    47   %z = alloca [4 x float], align 16 
    48   store i32 0, i32* %1 
    49   %2 = getelementptr inbounds [4 x float]* %x, i32 0, i64 0 
    50   store float 1.000000e+00, float* %2 
    51   %3 = getelementptr inbounds [4 x float]* %x, i32 0, i64 1 
    52   store float 2.000000e+00, float* %3 
    53   %4 = getelementptr inbounds [4 x float]* %x, i32 0, i64 2 
    54   store float 3.000000e+00, float* %4 
    55   %5 = getelementptr inbounds [4 x float]* %x, i32 0, i64 3 
    56   store float 4.000000e+00, float* %5 
    57   %6 = getelementptr inbounds [4 x float]* %y, i32 0, i64 0 
    58   store float 1.000000e+01, float* %6 
    59   %7 = getelementptr inbounds [4 x float]* %y, i32 0, i64 1 
    60   store float 2.000000e+01, float* %7 
    61   %8 = getelementptr inbounds [4 x float]* %y, i32 0, i64 2 
    62   store float 3.000000e+01, float* %8 
    63   %9 = getelementptr inbounds [4 x float]* %y, i32 0, i64 3 
    64   store float 4.000000e+01, float* %9 
    65   %10 = getelementptr inbounds [4 x float]* %x, i32 0, i64 0 
    66   %11 = load float* %10 
    67   %12 = getelementptr inbounds [4 x float]* %y, i32 0, i64 0 
    68   %13 = load float* %12 
    69   %14 = fadd float %11, %13 
    70   %15 = getelementptr inbounds [4 x float]* %z, i32 0, i64 0 
    71   store float %14, float* %15 
    72   %16 = getelementptr inbounds [4 x float]* %x, i32 0, i64 1 
    73   %17 = load float* %16 
    74   %18 = getelementptr inbounds [4 x float]* %y, i32 0, i64 1 
    75   %19 = load float* %18 
    76   %20 = fadd float %17, %19 
    77   %21 = getelementptr inbounds [4 x float]* %z, i32 0, i64 1 
    78   store float %20, float* %21 
    79   %22 = getelementptr inbounds [4 x float]* %x, i32 0, i64 2 
    80   %23 = load float* %22 
    81   %24 = getelementptr inbounds [4 x float]* %y, i32 0, i64 2 
    82   %25 = load float* %24 
    83   %26 = fadd float %23, %25 
    84   %27 = getelementptr inbounds [4 x float]* %z, i32 0, i64 2 
    85   store float %26, float* %27 
    86   %28 = getelementptr inbounds [4 x float]* %x, i32 0, i64 3 
    87   %29 = load float* %28 
    88   %30 = getelementptr inbounds [4 x float]* %y, i32 0, i64 3 
    89   %31 = load float* %30 
    90   %32 = fadd float %29, %31 
    91   %33 = getelementptr inbounds [4 x float]* %z, i32 0, i64 3 
    92   store float %32, float* %33 
    93   %34 = getelementptr inbounds [4 x float]* %z, i32 0, i64 0 
    94   %35 = load float* %34 
    95   %36 = fpext float %35 to double 
    96   %37 = getelementptr inbounds [4 x float]* %z, i32 0, i64 1 
    97   %38 = load float* %37 
    98   %39 = fpext float %38 to double 
    99   %40 = getelementptr inbounds [4 x float]* %z, i32 0, i64 2 
    100   %41 = load float* %40 
    101   %42 = fpext float %41 to double 
    102   %43 = getelementptr inbounds [4 x float]* %z, i32 0, i64 3 
    103   %44 = load float* %43 
    104   %45 = fpext float %44 to double 
    105   %46 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double %36, double %39, double %42, double %45) 
    106   %47 = load i32* %1 
    107   ret i32 %47 
    108 } 
    109  
    110 declare i32 @printf(i8*, ...) 
    111 }}} 
    112  
    113 This is easy enough to run using the JIT compiler:  lli add_floats.ll 
    114 {{{ 
    115 [root@pg155-n19 pgms]# lli add_floats.ll  
    116 11.000000 22.000000 33.000000 44.000000 
    117 [root@pg155-n19 pgms]#  
    118 }}} 
    119  
    120 The core of the instructions can be replaced with vectorization (obviously, optimizing this program will result in very little code and vectorization is not necessary, but this is an exercise. 
    121  
    122 Here is the .ll code rewritten with vectorization: 
    123 {{{ 
    124 ; ModuleID = '/tmp/webcompile/_21191_0.bc' 
    125 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" 
    126 target triple = "x86_64-unknown-linux-gnu" 
    127  
    128 @.str = private unnamed_addr constant [13 x i8] c"%f %f %f %f\0A\00" 
    129  
    130 define i32 @main() nounwind { 
    131   %1 = alloca i32, align 4 
    132  
    133   ; allocate three vectors 
    134   %x = alloca <4 x float>, align 16 
    135   %y = alloca <4 x float>, align 16 
    136   %z = alloca <4 x float>, align 16 
    137  
    138   store i32 0, i32* %1 
    139  
    140   ; store initial values to the x and y vectors 
    141   %2 = getelementptr inbounds <4 x float>* %x, i32 0, i64 0 
    142   store float 1.000000e+00, float* %2 
    143   %3 = getelementptr inbounds <4 x float>* %x, i32 0, i64 1 
    144   store float 2.000000e+00, float* %3 
    145   %4 = getelementptr inbounds <4 x float>* %x, i32 0, i64 2 
    146   store float 3.000000e+00, float* %4 
    147   %5 = getelementptr inbounds <4 x float>* %x, i32 0, i64 3 
    148   store float 4.000000e+00, float* %5 
    149   %6 = getelementptr inbounds <4 x float>* %y, i32 0, i64 0 
    150   store float 1.000000e+01, float* %6 
    151   %7 = getelementptr inbounds <4 x float>* %y, i32 0, i64 1 
    152   store float 2.000000e+01, float* %7 
    153   %8 = getelementptr inbounds <4 x float>* %y, i32 0, i64 2 
    154   store float 3.000000e+01, float* %8 
    155   %9 = getelementptr inbounds <4 x float>* %y, i32 0, i64 3 
    156   store float 4.000000e+01, float* %9 
    157  
    158   ; load the vectors 
    159   %xs = load <4 x float>* %x 
    160   %ys = load <4 x float>* %y 
    161  
    162   ; add the vectors 
    163   %zs = fadd <4 x float> %xs, %ys 
    164  
    165   ; store the result vector back to z 
    166   store <4 x float> %zs, <4 x float>* %z 
    167  
    168   ; get the elements out of the vector for printing 
    169   %10 = getelementptr inbounds <4 x float>* %z, i32 0, i64 0 
    170   %11 = load float* %10 
    171   %12 = fpext float %11 to double 
    172   %13 = getelementptr inbounds <4 x float>* %z, i32 0, i64 1 
    173   %14 = load float* %13 
    174   %15 = fpext float %14 to double 
    175   %16 = getelementptr inbounds <4 x float>* %z, i32 0, i64 2 
    176   %17 = load float* %16 
    177   %18 = fpext float %17 to double 
    178   %19 = getelementptr inbounds <4 x float>* %z, i32 0, i64 3 
    179   %20 = load float* %19 
    180   %21 = fpext float %20 to double 
    181  
    182   ; print the components of z that were extracted above 
    183   %22 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double %12, double %15, double %18, double %21) 
    184  
    185   ; return 
    186   %23 = load i32* %1 
    187   ret i32 %23 
    188 } 
    189  
    190 declare i32 @printf(i8*, ...) 
    191 }}} 
    192  
    193 Rerunning the program above yields the same results as the original, non-vectorized LLVM program. 
    194  
    195 {{{ 
    196 [root@pg155-n19 pgms]# lli add_floats_vec.ll  
    197 11.000000 22.000000 33.000000 44.000000 
    198 }}} 
    199  
    200 Let's take a look at a program with a substantially larger array of floats to add.  Again, optimizing this would yield very little code and, certainly, lazy evaluation would turn this program into basically a no-op.  Nonetheless, this should speed up by almost a factor of 4 when vectorized.  Here is the C code that helps guide the imperative implementation: 
    201 {{{ 
    202 #include <stdio.h> 
    203  
    204 int main() 
    205 { 
    206    int sz = 128; 
    207    float x[sz], y[sz], z[sz]; 
    208    int i; 
    209    for (i = 0; i < sz; i++) { 
    210       x[i] = i; 
    211       y[i] = i + sz; 
    212    } 
    213  
    214    for (i = 0; i < sz; i += 4) { 
    215      z[i] = x[i] + y[i]; 
    216      z[i+1] = x[i+1] + y[i+1]; 
    217      z[i+2] = x[i+2] + y[i+2]; 
    218      z[i+3] = x[i+3] + y[i+3]; 
    219    } 
    220  
    221    printf("%f %f %f %f\n", z[0], z[1], z[2], z[3]); 
    222 } 
    223 }}} 
    224  
    225 In this case, the LLVM code is more compact and easier to work with if we use the optimized version of it.  You will note in the LLVM code that this optimizer does not automatically vectorize and, instead, still has 4 "fadd" instructions at the heart of the addition loop. 
    226  
    227 The resulting optimized LLVM code is as follows: 
    228 {{{ 
    229 ; ModuleID = '/tmp/webcompile/_15374_0.bc' 
    230 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" 
    231 target triple = "x86_64-unknown-linux-gnu" 
    232  
    233 @.str2 = private unnamed_addr constant [13 x i8] c"%f %f %f %f\0A\00" 
    234 @str = internal constant [24 x i8] c"Entering initialization\00" 
    235 @str3 = internal constant [18 x i8] c"Entering addition\00" 
    236  
    237 define i32 @main() nounwind { 
    238 ; <label>:0 
    239   %1 = alloca [40000 x float], align 16 
    240   %2 = alloca [40000 x float], align 16 
    241   %3 = alloca [40000 x float], align 16 
    242   %puts = call i32 @puts(i8* getelementptr inbounds ([24 x i8]* @str, i64 0, i64 0)) 
    243   br label %4 
    244  
    245 ; <label>:4                                       ; preds = %4, %0 
    246   %indvar21 = phi i64 [ 0, %0 ], [ %indvar.next22, %4 ] 
    247   %i.06 = trunc i64 %indvar21 to i32 
    248   %scevgep25 = getelementptr [40000 x float]* %2, i64 0, i64 %indvar21 
    249   %scevgep26 = getelementptr [40000 x float]* %1, i64 0, i64 %indvar21 
    250   %tmp27 = add i64 %indvar21, 40000 
    251   %tmp28 = trunc i64 %tmp27 to i32 
    252   %5 = sitofp i32 %i.06 to float 
    253   store float %5, float* %scevgep26, align 4, !tbaa !0 
    254   %6 = sitofp i32 %tmp28 to float 
    255   store float %6, float* %scevgep25, align 4, !tbaa !0 
    256   %indvar.next22 = add i64 %indvar21, 1 
    257   %exitcond23 = icmp eq i64 %indvar.next22, 40000 
    258   br i1 %exitcond23, label %7, label %4 
    259  
    260 ; <label>:7                                       ; preds = %4 
    261   %.sub3 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 0 
    262   %puts4 = call i32 @puts(i8* getelementptr inbounds ([18 x i8]* @str3, i64 0, i64 0)) 
    263   br label %8 
    264  
    265 ; <label>:8                                       ; preds = %8, %7 
    266   %indvar = phi i64 [ 0, %7 ], [ %indvar.next, %8 ] 
    267   %tmp = shl i64 %indvar, 2 
    268   %scevgep = getelementptr [40000 x float]* %3, i64 0, i64 %tmp 
    269   %scevgep7 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp 
    270   %scevgep8 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp 
    271   %tmp929 = or i64 %tmp, 1 
    272   %scevgep10 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp929 
    273   %scevgep11 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp929 
    274   %scevgep12 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp929 
    275   %tmp1330 = or i64 %tmp, 2 
    276   %scevgep14 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp1330 
    277   %scevgep15 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp1330 
    278   %scevgep16 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp1330 
    279   %tmp1731 = or i64 %tmp, 3 
    280   %scevgep18 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp1731 
    281   %scevgep19 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp1731 
    282   %scevgep20 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp1731 
    283   %9 = load float* %scevgep8, align 16, !tbaa !0 
    284   %10 = load float* %scevgep7, align 16, !tbaa !0 
    285   %11 = fadd float %9, %10 
    286   store float %11, float* %scevgep, align 16, !tbaa !0 
    287   %12 = load float* %scevgep12, align 4, !tbaa !0 
    288   %13 = load float* %scevgep11, align 4, !tbaa !0 
    289   %14 = fadd float %12, %13 
    290   store float %14, float* %scevgep10, align 4, !tbaa !0 
    291   %15 = load float* %scevgep16, align 8, !tbaa !0 
    292   %16 = load float* %scevgep15, align 8, !tbaa !0 
    293   %17 = fadd float %15, %16 
    294   store float %17, float* %scevgep14, align 8, !tbaa !0 
    295   %18 = load float* %scevgep20, align 4, !tbaa !0 
    296   %19 = load float* %scevgep19, align 4, !tbaa !0 
    297   %20 = fadd float %18, %19 
    298   store float %20, float* %scevgep18, align 4, !tbaa !0 
    299   %indvar.next = add i64 %indvar, 1 
    300   %exitcond = icmp eq i64 %indvar.next, 10000 
    301   br i1 %exitcond, label %21, label %8 
    302  
    303 ; <label>:21                                      ; preds = %8 
    304   %22 = load float* %.sub3, align 16, !tbaa !0 
    305   %23 = fpext float %22 to double 
    306   %24 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 1 
    307   %25 = load float* %24, align 4, !tbaa !0 
    308   %26 = fpext float %25 to double 
    309   %27 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 2 
    310   %28 = load float* %27, align 8, !tbaa !0 
    311   %29 = fpext float %28 to double 
    312   %30 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 3 
    313   %31 = load float* %30, align 4, !tbaa !0 
    314   %32 = fpext float %31 to double 
    315   %33 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str2, i64 0, i64 0), double %23, double %26, double %29, double %32) nounwind 
    316   ret i32 0 
    317 } 
    318  
    319 declare i32 @printf(i8* nocapture, ...) nounwind 
    320  
    321 declare i32 @puts(i8* nocapture) nounwind 
    322  
    323 !0 = metadata !{metadata !"float", metadata !1} 
    324 !1 = metadata !{metadata !"omnipotent char", metadata !2} 
    325 !2 = metadata !{metadata !"Simple C/C++ TBAA", null} 
    326 }}} 
    327  
    328 Instead of hand-optimizing the entire sequence, the exercise will merely convert the types to vectors and then alter the loop starting with "label 44" to use vector addition rather then the sequence of adds that is currently being used.  The resulting program is as follows: 
    329  
    330 {{{ 
    331 ; ModuleID = '/tmp/webcompile/_15374_0.bc' 
    332 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" 
    333 target triple = "x86_64-unknown-linux-gnu" 
    334  
    335 @.str2 = private unnamed_addr constant [13 x i8] c"%f %f %f %f\0A\00" 
    336 @str = internal constant [24 x i8] c"Entering initialization\00" 
    337 @str3 = internal constant [18 x i8] c"Entering addition\00" 
    338  
    339 define i32 @main() nounwind { 
    340 ; <label>:0 
    341   %1 = alloca <128 x float>, align 16 
    342   %2 = alloca <128 x float>, align 16 
    343   %3 = alloca <128 x float>, align 16 
    344   %puts = call i32 @puts(i8* getelementptr inbounds ([24 x i8]* @str, i64 0, i64 0)) 
    345   br label %4 
    346  
    347 ; <label>:4                                       ; preds = %4, %0 
    348   %indvar21 = phi i64 [ 0, %0 ], [ %indvar.next22, %4 ] 
    349   %i.06 = trunc i64 %indvar21 to i32 
    350   %scevgep25 = getelementptr <128 x float>* %2, i64 0, i64 %indvar21 
    351   %scevgep26 = getelementptr <128 x float>* %1, i64 0, i64 %indvar21 
    352   %tmp27 = add i64 %indvar21, 128 
    353   %tmp28 = trunc i64 %tmp27 to i32 
    354   %5 = sitofp i32 %i.06 to float 
    355   store float %5, float* %scevgep26, align 4, !tbaa !0 
    356   %6 = sitofp i32 %tmp28 to float 
    357   store float %6, float* %scevgep25, align 4, !tbaa !0 
    358   %indvar.next22 = add i64 %indvar21, 1 
    359   %exitcond23 = icmp eq i64 %indvar.next22, 128 
    360   br i1 %exitcond23, label %7, label %4 
    361  
    362 ; <label>:7                                       ; preds = %4 
    363   %.sub3 = getelementptr inbounds <128 x float>* %3, i64 0, i64 0 
    364   %puts4 = call i32 @puts(i8* getelementptr inbounds ([18 x i8]* @str3, i64 0, i64 0)) 
    365   br label %8 
    366  
    367 ; <label>:8                                       ; preds = %8, %7 
    368 ;  %indvar = phi i64 [ 0, %7 ], [ %indvar.next, %8 ] 
    369  
    370  
    371 ;  %indvar.next = add i64 %indvar, 1 
    372 ;  %exitcond = icmp eq i64 %indvar.next, 10000 
    373 ;  br i1 %exitcond, label %9, label %8 
    374    br label %9 
    375  
    376 ; <label>:9                                      ; preds = %8 
    377   %xs = load <128 x float>* %1 
    378   %ys = load <128 x float>* %2 
    379   %zs = fadd <128 x float> %xs, %ys 
    380   store <128 x float> %zs, <128 x float>* %3 
    381  
    382   %10 = load float* %.sub3, align 16, !tbaa !0 
    383   %11 = fpext float %10 to double 
    384   %12 = getelementptr inbounds <128 x float>* %3, i64 0, i64 1 
    385   %13 = load float* %12, align 4, !tbaa !0 
    386   %14 = fpext float %13 to double 
    387   %15 = getelementptr inbounds <128 x float>* %3, i64 0, i64 2 
    388   %16 = load float* %15, align 8, !tbaa !0 
    389   %17 = fpext float %16 to double 
    390   %18 = getelementptr inbounds <128 x float>* %3, i64 0, i64 3 
    391   %19 = load float* %18, align 4, !tbaa !0 
    392   %20 = fpext float %19 to double 
    393   %21 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str2, i64 0, i64 0), double %11, double %14, double %17, double %20) nounwind 
    394   ret i32 0 
    395 } 
    396  
    397 declare i32 @printf(i8* nocapture, ...) nounwind 
    398  
    399 declare i32 @puts(i8* nocapture) nounwind 
    400  
    401 !0 = metadata !{metadata !"float", metadata !1} 
    402 !1 = metadata !{metadata !"omnipotent char", metadata !2} 
    403 !2 = metadata !{metadata !"Simple C/C++ TBAA", null} 
    404 }}} 
    405  
    406 Timing the execution of the optimized vs. non-optimized bytecodes yields: 
    407  
    408 Finally, a note on converting from arrays to vectors and subsequently optimizing to use vector adds.  The simplest way to do this was to: 
    409 - convert the code to multiply 4 of the array values at a time (depends on the register sizes, data sizes, etc...) 
    410  - convert the array types to vector ([128 x float] becomes <128 x float>), the program will work AS-IS with this simple conversion 
    411  - work through the loop again to move to a load of the proper location in the vector to a packed vector, then do the fadd of the vectors 
    412  - vector sizes MUST be the size of a power of 2 (1, 2, 4, 8, 16, ....) 
    413  - vector sizes seem to be limited, 32768 definitely did NOT work, 128 is working 
    414  
    415  
     1This page has been moved to [wiki:SIMD/LlvmExample]. The main page describing the effort to add SIMD support to GHC is [wiki:SIMD].