From: Jeronimo Pellegrini
Subject: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <fbh8ot$1au$1@aioe.org>
Hi.

I am trying to understand how Lisp code is optimized (particularly in
SBCL/CMUCL and GCL, which produce native x86 code).

So, I have written this function:

(defun doit (a factor)
  (declare
   (optimize (speed 3) (safety 0) (debug 0) (space 0))
   (double-float factor)
   (type (simple-array double-float *) a)
   (ftype (function ((simple-array double-float *) double-float)
                     double-float) doit))
  (let ((sum 0d0))
    (declare (double-float sum))
    (dotimes (i 9999999)
      (setf (aref a i) (the double-float (/ i factor))))
    (setq sum 0d0)
    (dotimes (i 9999999)
      (incf sum (aref a i)))
    sum))

and ran this code:

(let ((a (make-array '(10000000)
                     :element-type 'double-float
                     :adjustable nil)))
  (declare
   (ftype (function ((simple-array double-float *) double-float)
                     double-float) doit))
  (time (doit a 2d0)))

(I think I included more declarations than necessary)

See that I am *not* timing allocation of the array.

Then I compared to this (slightly different) C version:

#include<stdio.h>
#include<malloc.h>

double doit(double factor) {
        double* a = (double*) malloc(10000000 * sizeof(double));
        double sum;
        unsigned int i;

        for(i=0; i<1000000; i++)
                a[i] = ((double) i) / factor;
        sum = 0.0;
        for(i=0; i<1000000; i++)
                sum += a[i];
        return sum;
}

int main() {
        printf ("%g\n", doit(2.0));
        return 0;
}

I just ran:

$ gcc -o prog prog.c
$ time prog           <== 15 times

So I *did* time both the malloc and the printf.

This is what I got after a number of trials (it's the average user
time after 15 runs, truncated to n.nnn):

C     0.079 s
GCL   0.360 s
CMUCL 0.398 s
SBCL  0.401 s

I would expect some difference in running time, but 4x is a lot...
(And I timed more code in the C version)

I have disassembled the Lisp function and also checked the assembly code
that gcc generates for the program, and they're not too different in
size or strategy, but there are some differences on what instructions
are used to add, divide, etc.

So... Why is there such a large difference in running time between C and
the Lisps?

I have two guesses:

- The assembly generated by the Lisps is not as optimized as that
  generated by gcc (then I wonder if SBCL/CMUCL wouldn't do better
  generating RTL or GIMPLE instead of plain assembly?);
- The Lisps do some more bookkeeping besides running the generated code
  (but it's not GC, because the code does no significant consing, and
  the GC time is zero on all tests).

Or is there something I'm missing?

I have included the assembly output from gcc and SBCL. The others aren't
too different, and I didn't include them because this post is already
too long. 

BTW, I ran all tests on Linux and the machine is 32-bit x86.

J.

P.S.: I would have tested ECL, but it seems that its time function is
 hanging here: when I run (time (princ nil)) on an ECL prompt it waits
 forever. This is with the REPL binary that comes precompiled on
 Debian.

The output from the C version (gcc 4.1.3):

doit:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $56, %esp
        movl    8(%ebp), %eax
        movl    %eax, -40(%ebp)
        movl    12(%ebp), %eax
        movl    %eax, -36(%ebp)
        movl    $80000000, (%esp)
        call    malloc
        movl    %eax, -20(%ebp)
        movl    $0, -4(%ebp)
        jmp     .L2
.L3:
        movl    -4(%ebp), %eax
        sall    $3, %eax
        movl    %eax, %ecx
        addl    -20(%ebp), %ecx
        movl    -4(%ebp), %eax
        movl    $0, %edx
        pushl   %edx
        pushl   %eax
        fildll  (%esp)
        leal    8(%esp), %esp
        fdivl   -40(%ebp)
        fstpl   (%ecx)
        addl    $1, -4(%ebp)
.L2:
        cmpl    $999999, -4(%ebp)
        jbe     .L3
        fldz
        fstpl   -16(%ebp)
        movl    $0, -4(%ebp)
        jmp     .L5
.L6:
        movl    -4(%ebp), %eax
        sall    $3, %eax
        addl    -20(%ebp), %eax
        fldl    (%eax)
        fldl    -16(%ebp)
        faddp   %st, %st(1)
        fstpl   -16(%ebp)
        addl    $1, -4(%ebp)
.L5:
        cmpl    $999999, -4(%ebp)
        jbe     .L6
        fldl    -16(%ebp)
        leave
        ret
        .size   doit, .-doit


SBCL 1.0.9 generates:

; 0A9F1281:       DDD8             FSTPD FR0                  ; no-arg-parsing entry point
;      283:       D9EE             FLDZ
;      285:       31C0             XOR EAX, EAX
;      287:       EB17             JMP L1
;      289: L0:   8BC8             MOV ECX, EAX
;      28B:       C1F902           SAR ECX, 2
;      28E:       894DF4           MOV [EBP-12], ECX
;      291:       DDD8             FSTPD FR0
;      293:       DB45F4           FILD [EBP-12]
;      296:       D8F1             FDIVD FR1
;      298:       9B               WAIT
;      299:       DD544201         FSTD [EDX+EAX*2+1]
;      29D:       83C004           ADD EAX, 4
;      2A0: L1:   3DFC596202       CMP EAX, 39999996
;      2A5:       7CE2             JL L0
;      2A7:       DDD8             FSTPD FR0
;      2A9:       D9EE             FLDZ
;      2AB:       31C0             XOR EAX, EAX
;      2AD:       EB0E             JMP L3
;      2AF: L2:   DDD9             FSTPD FR1
;      2B1:       DD444201         FLDD [EDX+EAX*2+1]
;      2B5:       D9C9             FXCH FR1
;      2B7:       D8C1             FADDD FR1
;      2B9:       9B               WAIT
;      2BA:       83C004           ADD EAX, 4
;      2BD: L3:   3DFC596202       CMP EAX, 39999996
;      2C2:       7CEB             JL L2
;      2C4:       64               FS-SEGMENT-PREFIX
;      2C5:       800D4800000004   OR BYTE PTR [#x48], 4
;      2CC:       BA10000000       MOV EDX, 16
;      2D1:       64               FS-SEGMENT-PREFIX
;      2D2:       031520000000     ADD EDX, [#x20]
;      2D8:       64               FS-SEGMENT-PREFIX
;      2D9:       3B1524000000     CMP EDX, [#x24]
;      2DF:       7607             JBE L4
;      2E1:       E8162767FD       CALL #x80639FC             ; alloc_overflow_edx
;      2E6:       EB0A             JMP L5
;      2E8: L4:   64               FS-SEGMENT-PREFIX
;      2E9:       891520000000     MOV [#x20], EDX
;      2EF:       83EA10           SUB EDX, 16
;      2F2: L5:   C70216030000     MOV DWORD PTR [EDX], 790
;      2F8:       8D5207           LEA EDX, [EDX+7]
;      2FB:       DD5201           FSTD [EDX+1]
;      2FE:       64               FS-SEGMENT-PREFIX
;      2FF:       80354800000004   XOR BYTE PTR [#x48], 4
;      306:       7402             JEQ L6
;      308:       CC09             BREAK 9                    ; pending interrupt trap
;      30A: L6:   8D65F8           LEA ESP, [EBP-8]
;      30D:       F8               CLC
;      30E:       8B6DFC           MOV EBP, [EBP-4]
;      311:       C20400           RET 4

From: Matthias Benkard
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <1188841916.652742.92620@r29g2000hsg.googlegroups.com>
Hi,

-- Lisp:
> (dotimes (i 9999999)
>   (setf (aref a i) (the double-float (/ i factor))))

-- C:
> for(i=0; i<1000000; i++)
>   a[i] = ((double) i) / factor;

You know, umm...  9'999'999 isn't really close enough to 1'000'000 to
warrant a fair comparison. ;)

Bye-bye,
Matthias
From: Jeronimo Pellegrini
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <fbhjcj$vkt$1@aioe.org>
On 2007-09-03, Matthias Benkard <··········@gmail.com> wrote:
> Hi,
>
> -- Lisp:
>> (dotimes (i 9999999)
>>   (setf (aref a i) (the double-float (/ i factor))))
>
> -- C:
>> for(i=0; i<1000000; i++)
>>   a[i] = ((double) i) / factor;
>
> You know, umm...  9'999'999 isn't really close enough to 1'000'000 to
> warrant a fair comparison. ;)

Aargh!

OK, I counted the number of zeroes in the malloc line, and thought I had
typed them the same in the for lines.

Updated times:

C     0.761 s
GCL   0.360 s
CMUCL 0.398 s
SBCL  0.401 s

Hm, that makes much more sense. It may be that the C version is now
spending a long time in malloc and printf. So I have added a timer
around the call to doit, making it equivalent to the Lisp version:

#include<stdio.h>
#include<malloc.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/times.h>
#include <math.h>

double doit(double* a, double factor) {
        double sum;
        unsigned int i;

        for(i=0; i<10000000; i++)
                a[i] = ((double) i) / factor;
        sum = 0.0L;
        for(i=0; i<10000000; i++)
                sum += a[i];
        return sum;
}

int main() {
        double* a = (double*) malloc(10000000 * sizeof(double));
        double sum = 0.0;
        struct tms time;
        clock_t current_t;
        long double tot_secs;

        times( &time );
        clock_t  start_t = time.tms_utime;
        sum = doit(a, 2.0);
        times( &time );

        current_t = time.tms_utime;
        tot_secs = (long double) (current_t - start_t) 
                   / (long double) sysconf(_SC_CLK_TCK);

        printf("Sum is %f\n", sum);
        printf("Time: %Lf\n", tot_secs);
        return 0;
}


And... Well, this time Lisp is faster than C (by a factor of 2, which
surprises me):

C     0.735 s 
GCL   0.360 s
CMUCL 0.398 s
SBCL  0.401 s

So, there it is! Lisp 2x faster than C on one-dimensional array
traversal. (I think things get more complicated for higher dimensions,
particularly for SBCL).

J.
From: Rayiner Hashem
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <1188852675.378711.296010@w3g2000hsg.googlegroups.com>
> C     0.735 s
> GCL   0.360 s
> CMUCL 0.398 s
> SBCL  0.401 s
>
> So, there it is! Lisp 2x faster than C on one-dimensional array
> traversal. (I think things get more complicated for higher dimensions,
> particularly for SBCL).

You might want to run GCC with optimization turned on :-/
From: Rayiner Hashem
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <1188855902.944357.3120@k79g2000hse.googlegroups.com>
On Sep 3, 4:51 pm, Rayiner Hashem <·······@gmail.com> wrote:
> > C     0.735 s
> > GCL   0.360 s
> > CMUCL 0.398 s
> > SBCL  0.401 s
>
> > So, there it is! Lisp 2x faster than C on one-dimensional array
> > traversal. (I think things get more complicated for higher dimensions,
> > particularly for SBCL).
>
> You might want to run GCC with optimization turned on :-/

Hmm. On my machine, SBCL is still about twice as fast as GCC -O2 with
that particular benchmark. This is even though GCC definitely
generates better code for the second loop (SBCL does a lot of FP stack
manipulation, GCC does none).

Interestingly, changing the i / factor to factor * 2 in the first loop
makes GCC faster again (0.05 sec versus 0.085 sec). GCC is quite
sensitive to the computation in that loop, while SBCL is almost
completely insensitive to it.

It should also be noted that your loops are tight enough that simple
things like the code-alignment of the loop will give you major (factor
of 2) differences in performance.
From: Juho Snellman
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <slrnfdp197.760.jsnell@sbz-30.cs.Helsinki.FI>
Rayiner Hashem <·······@gmail.com> wrote:
> Hmm. On my machine, SBCL is still about twice as fast as GCC -O2 with
> that particular benchmark. This is even though GCC definitely
> generates better code for the second loop (SBCL does a lot of FP stack
> manipulation, GCC does none).
>
> Interestingly, changing the i / factor to factor * 2 in the first loop
> makes GCC faster again (0.05 sec versus 0.085 sec). 

Note that gcc inlines doit into main, and specializes it on the
(constant) parameter passed to doit. So for example the original code
would be compiled to a fp multiplication with the reciprocal of 2.0
rather than an fp division. The benchmarks might be less sensitive to
the exact operations done in the inner loop if you force doit to be
uninlined. (Though factor * 2 doesn't feel like a good operation,
since it can just be hoisted out of the loop completely).

-- 
Juho Snellman
From: Rayiner Hashem
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <1188858327.218125.144450@r29g2000hsg.googlegroups.com>
> (constant) parameter passed to doit. So for example the original code
> would be compiled to a fp multiplication with the reciprocal of 2.0
> rather than an fp division.

Couldn't you apply that optimization regardless of whether doit is
inlined? Since you're applying the same divisor to each operation, you
could compute the reciprocal outside the loop and use a multiply
inside it.

>  (Though factor * 2 doesn't feel like a good operation,
> since it can just be hoisted out of the loop completely).

Doh, of course. At the same time, I don't get why the SBCL version
with the division isn't any slower than the SBCL version with the
multiplication in there.
From: Juho Snellman
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <slrnfdp3lf.9rk.jsnell@sbz-30.cs.Helsinki.FI>
Rayiner Hashem <·······@gmail.com> wrote:
>> (constant) parameter passed to doit. So for example the original code
>> would be compiled to a fp multiplication with the reciprocal of 2.0
>> rather than an fp division.
>
> Couldn't you apply that optimization regardless of whether doit is
> inlined? Since you're applying the same divisor to each operation, you
> could compute the reciprocal outside the loop and use a multiply
> inside it.

My gut feeling is that you can't do it for arbitrary divisors if you
want the results to be accurate -- (A * B) / C is not equivalent to A
* (B / C) for floating point numbers. Since both 2.0 and 1/2.0
have exact representations, the optimization can be applied for this
specific case. 
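
A quick illustration at the REPL (the exact printed values may vary,
but the two results typically differ in the last bit):

  (/ 10d0 3d0)           ; => 3.3333333333333335d0
  (* 10d0 (/ 1d0 3d0))   ; => 3.333333333333333d0

With 2.0 as the divisor, (/ x 2d0) and (* x 0.5d0) should agree for
every x, because 0.5d0 is exact.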

But don't take this as gospel, it's a bit too late at night for me to
reliably reason about fp...

>>  (Though factor * 2 doesn't feel like a good operation,
>> since it can just be hoisted out of the loop completely).
>
> Doh, of course. At the same time, I don't get why the SBCL version
> with the division isn't any slower than the SBCL version with the
> multiplication in there.

SBCL doesn't do loop invariant hoisting.
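
If it matters for the benchmark, the hoisting can be done by hand. A
rough, untested sketch using the names from the original doit (only
equivalent to the division when the reciprocal is exact, as it is
for 2d0):

  (let ((r (/ 1d0 factor)))
    (declare (double-float r))
    (dotimes (i 9999999)
      (declare (type fixnum i))
      (setf (aref a i) (* (coerce i 'double-float) r))))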

-- 
Juho Snellman
From: Rayiner Hashem
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <1188863350.591676.165390@k79g2000hse.googlegroups.com>
> My gut feeling is that you can't do it for arbitrary divisors if you
> want the results to be accurate -- (A * B) / C is not equivalent to A
> * (B / C) for floating point numbers. Since both 2.0 and 1/2.0
> have exact reprentations, the optimization can be applied for this
> specific case.

I assumed it was one of those "unsafe" FP optimizations -- but I
didn't expect that GCC would actually try to figure out if it was
safe to apply based on the value of the divisor.
From: Matthias Buelow
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <5k3eajF1v3o6U1@mid.dfncis.de>
Rayiner Hashem wrote:

> Hmm. On my machine, SBCL is still about twice as fast as GCC -O2 with
> that particular benchmark. This is even though GCC definitely
> generates better code for the second loop (SBCL does a lot of FP stack
> manipulation, GCC does none).

It should be noted that gcc isn't the best compiler for high-performance
code... even the old OpenWatcom compiler generated (on Windoze) about
30% faster code in some samples I tried it on a year or so ago. And I
don't think it has been given much attention over the last 10 years
(similar to CMUCL), while gcc is (supposedly) continuously improved. I
could imagine that the Intel or M$FT compilers are much closer to the
CMUCL/SBCL timings than gcc. Still, ace performance from the CMU
compiler and from gcl as well.
From: Rayiner Hashem
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <1188858183.366464.252500@22g2000hsm.googlegroups.com>
> so. And I don't think it has been given much attention over the last 10
> years (similar to CMUCL), while gcc is (supposedly) continuously
> improved. I could imagine that the Intel or M$FT compilers are much
> closer to the CMUCL/SBCL timings than gcc. Still, ace performance of the
> CMU compiler and of gcl as well.

I don't think the GCC backend is to blame here, since GCL performs
faster than GCC here too, using the same backend.
From: Jeronimo Pellegrini
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <fbhks3$43r$1@aioe.org>
On 2007-09-03, Jeronimo Pellegrini <···@aleph0.info> wrote:
> C     0.735 s 
> GCL   0.360 s
> CMUCL 0.398 s
> SBCL  0.401 s
>
> So, there it is! Lisp 2x faster than C on one-dimensional array
> traversal. (I think things get more complicated for higher dimensions,
> particularly for SBCL).

But as Rainer pointed out, the data types in the arrays are not
equivalent...

But still, it shows how fast we can get with a good Lisp compiler.

J.
From: Rayiner Hashem
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <1188851746.106593.289640@19g2000hsx.googlegroups.com>
> But as Rainer pointed out, the data types in the arrays are not
> equivalent...

They should be, as far as I can tell. I believe SBCL unboxes arrays of
double-floats like the one he's using.
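
A quick way to check (results from SBCL; other implementations may
upgrade the element type differently):

  (upgraded-array-element-type 'double-float)
  ;; => DOUBLE-FLOAT, i.e. there is a specialized array type that
  ;;    stores raw, unboxed doubles

  (typep (make-array 10 :element-type 'double-float)
         '(simple-array double-float (*)))
  ;; => T, so the array in the original code is such a simple array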
From: Rainer Joswig
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <joswig-E7F815.20230103092007@news-europe.giganews.com>
In article <·······················@r29g2000hsg.googlegroups.com>,
 Matthias Benkard <··········@gmail.com> wrote:

> Hi,
> 
> -- Lisp:
> > (dotimes (i 9999999)
> >   (setf (aref a i) (the double-float (/ i factor))))
> 
> -- C:
> > for(i=0; i<1000000; i++)
> >   a[i] = ((double) i) / factor;
> 
> You know, umm...  9'999'999 isn't really close enough to 1'000'000 to
> warrant a fair comparison. ;)
> 
> Bye-bye,
> Matthias

good catch

-- 
http://lispm.dyndns.org
From: Rainer Joswig
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <joswig-3B0A62.19333703092007@news-europe.giganews.com>
In article <············@aioe.org>,
 Jeronimo Pellegrini <···@aleph0.info> wrote:

> Hi.
> 
> I am trying to understand how Lisp code is optimized (particularly in
> SBCL/CMUCL and GCL, which produce native x86 code).
> 
> So, I have written this function:
> 
> (defun doit (a factor)
>   (declare
>    (optimize (speed 3) (safety 0) (debug 0) (space 0))
>    (double-float factor)
>    (type (simple-array double-float *) a)
>    (ftype (function ((simple-array double-float *) double-float)
>                      double-float) doit))
>   (let ((sum 0d0))
>     (declare (double-float sum))
>     (dotimes (i 9999999)
>       (setf (aref a i) (the double-float (/ i factor))))
>     (setq sum 0d0)
>     (dotimes (i 9999999)
>       (incf sum (aref a i)))
>     sum))
> 
> and ran this code:
> 
> (let ((a (make-array '(10000000)
>                      :element-type 'double-float
>                      :adjustable nil)))
>   (declare
>    (ftype (function ((simple-array double-float *) double-float)
>                      double-float) doit))
>   (time (doit a 2d0)))
> 
> (I think I included more declarations than necessary)

Actually for some Lisps you need more declarations.
Only SBCL / CMUCL are better at figuring things out.
You may even need to declare the fixnum loops (DOTIMES)
and the INCFs.
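
For example, something like this (an untested sketch; SBCL/CMUCL infer
most of it, but other Lisps may need the declarations spelled out):

  (let ((sum 0d0))
    (declare (double-float sum))
    (dotimes (i 9999999)
      (declare (type fixnum i))
      (incf sum (the double-float (aref a i))))
    sum)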

I guess with SBCL you can get better performance with some
more tweaks.

In addition to what you wrote, the following problems exist:

* data types in Lisp are not the same as in C.
  Often data is tagged. That means for a 32bit word,
  a few of the bits are tag + other stuff.
  So a small float in 32bits will not be 32bit wide. It will be less.
  A large float in 64bits will not be 64bit wide. It will be less.

  Often in computations you need to remove the tags, do some
  operation and create a result with a tag.

  Some data will not fit in the 'natural' boundary. In a 32bit
  Lisp, a 32bit integer will not be a fixnum, but a bignum.
  That means it takes more than a 32bit word, and computation
  with those values takes even more instructions (see the small
  check after this list).


  There are two ways to deal with that:

  * use Lisp data with smaller ranges (fixnums) and
    declare the types; the compiler then won't do
    type checks and may create better instructions
    (fixnum vs. general numeric type).

  * use implementation-dependent facilities, for example
    to do real 32bit arithmetic.

* destructive float routines

  A typical problem is that Lisp compilers
  often create code that conses large amounts of floats.

  (let ((a 0d0))
     (setq a (+ a 1d0)))

  One may want to use an FP instruction that destructively
  updates the variable a. But in many Lisps you
  get a new float from the addition, which is then
  referenced by a.

  Some compilers may do better, but the MCL compiler,
  for example, is totally stupid about this. Users
  have written a utility that allows writing
  code directly for the FPU. That code was
  much, much faster, and consing was no longer
  an issue.

  http://mypage.iu.edu/~rdbeer/
  FPC-PPC (Version 0.21)
  A floating-point compiler for MCL and OpenMCL. This compiles Lisp double-float expressions directly
  into PPC assembly language, producing code that is usually faster and that allocates much less
  memory than the Lisp compiler.
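
Back to the fixnum point above, a small check (values from a 32-bit
SBCL; other 32-bit Lisps reserve a similar number of tag bits):

  most-positive-fixnum              ; => 536870911, i.e. 2^29 - 1
  (typep (1- (expt 2 31)) 'fixnum)  ; => NIL, even a 32-bit C int's
                                    ;    maximum value is a bignum here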


For more background see here:

http://openmap.bbn.com/~kanderso/performance/postscript/lispfloat.ps
Fast floating-point processing in Common Lisp, Fateman, R.J., Broughan, K.A., Willcock,D.K., and Rettig, D.,
1995. ACM Trans. Math. Softw. 21, 1(Mar.), 26-62. Speed of floating-point intensive algorithms written in Lisp can
be comparable to (within 25% of) the same algorithm written in C or Fortran. See also: Reid, J.K., 1996. Remark on "Fast Floating-point Processing in Common Lisp", ACM Trans. Math. Softw. 22, 4(Dec.), p.496-497.

Performing Lisp Tutorial - LUV'94 Slides Part 1 Part 2
http://openmap.bbn.com/~kanderso/performance/postscript/performing-lisp-1-ppt.ps
http://openmap.bbn.com/~kanderso/performance/postscript/performing-lisp-2-ppt.ps

-- 
http://lispm.dyndns.org
From: Dimiter "malkia" Stanev
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <5k48o2F21eq6U1@mid.individual.net>
Thanks, Rainer!

Really good stuff in the links :)

From: Dimiter "malkia" Stanev
Subject: Re: Optimized, disassembled and checked asm, *should* be fast, but...
Date: 
Message-ID: <5k6ir0F2bcdtU1@mid.individual.net>
In Lispworks, one thing optimized it most:

Changing

   (setf (aref a i) (the double-float (/ i factor)))

to

   (setf (aref a i) (the double-float (/ (coerce i 'double-float) factor)))

But also a couple of other things:

(defun doit (a factor)
   (declare
    (optimize (speed 3) (safety 0) (debug 0) (space 0)
              #+lispworks (float 0)) ; In LispWorks this means: don't do any floating-point checks
    (double-float factor)
    (type (simple-array double-float *) a)
    (ftype (function ((simple-array double-float *) double-float)
                      double-float) doit))
   (let ((sum 0d0))
     (declare (double-float sum))
     (dotimes (i 9999999)
       (declare (type fixnum i))
       (setf (aref a i)
             (the double-float (/ (coerce i 'double-float) factor))))
     (setq sum 0d0)
     (dotimes (i 9999999)
       (declare (type fixnum i))
       (incf sum (aref a i)))
     sum))

(let ((a (make-array '(10000000)
                      :element-type 'double-float
                      :adjustable nil)))
   (declare
    (ftype (function ((simple-array double-float *) double-float)
                      double-float) doit))
   (time (doit a 2d0)))

Timing the evaluation of (DOIT A 2.0D0)

User time    =        0.062
System time  =        0.031
Elapsed time =        0.094
Allocation   = 16 bytes


Disassembly:

CL-USER 12 > (disassemble 'doit)
200A92CA:
        0:      55               push  ebp
        1:      89E5             move  ebp, esp
        3:      83EC14           sub   esp, 14
        6:      C7042486140000   move  [esp], 1486
       13:      8B7D08           move  edi, [ebp+8]
       16:      DD4005           fldl  [eax+5]
       19:      DD5DF0           fstpl [ebp-10]
       22:      33D2             xor   edx, edx
       24:      33DB             xor   ebx, ebx
L1:   26:      89D0             move  eax, edx
       28:      89D1             move  ecx, edx
       30:      C1F902           sar   ecx, 2
       33:      51               push  ecx
       34:      DB0424           fild  [esp]
       37:      83C404           add   esp, 4
       40:      DC75F0           fdivl [ebp-10]
       43:      DD5DF8           fstpl [ebp-8]
       46:      8B4FFD           move  ecx, [edi-3]
       49:      89CE             move  esi, ecx
       51:      81E6E0000000     and   esi, E0
       57:      83FE60           cmp   esi, 60
       60:      751C             jne   L3
       62:      89F9             move  ecx, edi
L2:   64:      C1E001           shl   eax, 1
       67:      03C8             add   ecx, eax
       69:      DD45F8           fldl  [ebp-8]
       72:      DD5905           fstpl [ecx+5]
       75:      81FBF8596202     cmp   ebx, 26259F8
       81:      7D0C             jge   L4
       83:      83C304           add   ebx, 4
       86:      89DA             move  edx, ebx
       88:      EBC0             jmp   L1
L3:   90:      8B4F05           move  ecx, [edi+5]
       93:      EBE1             jmp   L2
L4:   95:      DD05C0010E20     fldl  [200E01C0]       ; 0.0D0
      101:      DD5DF8           fstpl [ebp-8]
      104:      33C0             xor   eax, eax
      106:      33DB             xor   ebx, ebx
L5:  108:      89C1             move  ecx, eax
      110:      8B57FD           move  edx, [edi-3]
      113:      89D6             move  esi, edx
      115:      81E6E0000000     and   esi, E0
      121:      83FE60           cmp   esi, 60
      124:      7525             jne   L7
      126:      89FA             move  edx, edi
L6:  128:      C1E101           shl   ecx, 1
      131:      03D1             add   edx, ecx
      133:      DD4205           fldl  [edx+5]
      136:      DD5DF0           fstpl [ebp-10]
      139:      DD45F8           fldl  [ebp-8]
      142:      DC45F0           faddl [ebp-10]
      145:      DD5DF8           fstpl [ebp-8]
      148:      81FBF8596202     cmp   ebx, 26259F8
      154:      7D0C             jge   L8
      156:      83C304           add   ebx, 4
      159:      89D8             move  eax, ebx
      161:      EBC9             jmp   L5
L7:  163:      8B5705           move  edx, [edi+5]
      166:      EBD8             jmp   L6
L8:  168:      DD45F8           fldl  [ebp-8]
      171:      DD5DF8           fstpl [ebp-8]
      174:      83EC08           sub   esp, 8
      177:      8B75F8           move  esi, [ebp-8]
      180:      8975F0           move  [ebp-10], esi
      183:      8B75FC           move  esi, [ebp-4]
      186:      8975F4           move  [ebp-C], esi
      189:      8B7500           move  esi, [ebp]
      192:      8975F8           move  [ebp-8], esi
      195:      83ED08           sub   ebp, 8
      198:      8B750C           move  esi, [ebp+C]
      201:      897504           move  [ebp+4], esi
      204:      DD45F8           fldl  [ebp-8]
      207:      DD5D0C           fstpl [ebp+C]
      210:      C74508860C0000   move  [ebp+8], C86
      217:      B501             moveb ch, 1
      219:      C9               leave
      220:      FF2554DB1220     jmp   [2012DB54]       ; SYSTEM::RAW-FAST-BOX-DOUBLE
NIL

