From: ···········@alcoa.com
Subject: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <72g76b$4fb$1@nnrp1.dejanews.com>
We work with very large 2D tables of single-float data and have
stumbled on the fact that using Common Lisp 2D arrays, as in
(make-array (list isize jsize) ...), is not the best way to go
if you are concerned with performance. In fact, with two popular
implementations from Franz and Harlequin our experience is that
storing a 2D table in a 1D array (vector) and managing the index
arithmetic explicitly yields a significant performance gain. Further,
storing a 2D table as a vector of vectors has proven faster
still. The following two tables show times to execute a
test function for each 2D table technique. The test functions
create a table (500 X 500) and perform 100 iterations of filling
the table with a value and then accessing each entry. The first
table shows times for test functions that access the values by
row and the second by column.

Times are in seconds.

Machine/                  vect-of-vect 1D-array 2D-array
Compiler                        by row   by row   by row
--------------------------------------------------------
HPC180-HPUX10.2/Franz ACL5.0final  1.2     14.2     29.1
P90-NT4/Franz ACL5.0beta           6.9     17.5     54.7
P90-NT4/Harlequin LWW4.1personal  16.3     29.3    382.7

Machine/                  vect-of-vect 1D-array 2D-array
Compiler                        by col   by col   by col
--------------------------------------------------------
HPC180-HPUX10.2/Franz ACL5.0final 11.0     35.8     38.4
P90-NT4/Franz ACL5.0beta          29.4     38.0     73.5
P90-NT4/Harlequin LWW4.1personal  39.7     50.6    324.1

Notes:
1) As expected, by-row access is faster.
2) With both by-row and by-col access, the vector-of-vectors
approach is fastest.
3) With both compilers on NT, the vector of vectors is faster.
4) An underpowered 90 MHz Pentium laptop holds its own
surprisingly well against the HPC180. Superior integer arithmetic?
5) Harlequin informs me that 2D floating-point arrays have yet
to be optimized. This makes the other two approaches all the
more valuable for the time being.
6) One downside to the vector of vectors is that it creates
many more objects for the garbage collector to manage. Our
experience with ACL5.0 was that full GCs really suffered. Our
solution was to use the non-ANSI ACL make-array extension
":allocation :static". This puts the vector of vectors into space
that is never GC'ed. Our arrays were sticking around anyway, so for
us this was a great solution (a minimal sketch follows below).
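
A minimal sketch of what this looks like for us (:allocation is a Franz
ACL extension to make-array, not ANSI, so it is guarded with a feature
test; the exact keyword value may differ between ACL versions):

(defun make-static-table (isize jsize)
  ;; Sketch only: each row is allocated in ACL's static (non-GC'ed) space
  ;; via the non-ANSI :allocation argument; other implementations just
  ;; get normally allocated rows.
  (let ((table (make-array isize)))
    (dotimes (i isize table)
      (setf (aref table i)
            (make-array jsize
                        :element-type 'single-float
                        :initial-element 0.0
                        #+:allegro :allocation #+:allegro :static)))))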

John Watton
Technical Specialist
Aluminum Company of America

-------------------------------------------------------------------

(in-package :USER)

;;overcomes bug in early LWW4.1 personal
#+Lispworks (eval-when (compile) (require "arraymac"))

(defun test-vector-of-vectors-by-row (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  (let ((table (make-array isize :element-type '(simple-array t (*)))))
    (declare (type (simple-array t (*)) table))
    ;; create table
    (dotimes (i isize)
      (declare (fixnum i))
      (setf (aref table i) (make-array jsize :element-type 'single-float)))
    (dotimes (k iters)
      ;; fill table
      (dotimes (i isize)
	(declare (fixnum i))
	(let ((vector (aref table i)))
	  (declare (type (simple-array single-float (*)) vector))
	  (dotimes (j jsize)
	    (declare (fixnum j))
	    (setf (aref vector j) 3.6))))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (let ((vector (aref table i)))
	    (declare (type (simple-array single-float (*)) vector))
	    (dotimes (j jsize)
	      (declare (fixnum j))
	      (incf sum (aref vector j)))))))
    ))

(defun test-2d-array-by-row (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  ;; create table
  (let ((table (make-array (list isize jsize) :element-type 'single-float)))
    (declare (type (simple-array single-float (* *)) table))
    (dotimes (k iters)
      ;; fill table
      (dotimes (i isize)
	(declare (fixnum i))
	(dotimes (j jsize)
	  (declare (fixnum j))
	  (setf (aref table i j) 3.6)))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (dotimes (j jsize)
	    (declare (fixnum j))
	    (incf sum (aref table i j))))))
    ))

(defun test-1d-array-by-row (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  ;; create table
  (let ((table (make-array (* isize jsize) :element-type 'single-float))
	(temp 0))
    (declare (type (simple-array single-float (*)) table)
	     (fixnum temp))
    (dotimes (k iters)
      ;; fill table
      (dotimes (j jsize)
	(declare (fixnum j))
	(setq temp (* j isize))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (setf (aref table (+ temp i)) 3.6)))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (j isize)
	  (declare (fixnum j))
	  (dotimes (i isize)
	    (declare (fixnum i))
	    (setq temp (* j isize))
	    (incf sum (aref table (+ temp i)))))))
    ))

(defun test-vector-of-vectors-by-col (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  (let ((table (make-array isize :element-type '(simple-array t (*))
			   #+:allegro :allocation #+:allegro :old)))
    (declare (type (simple-array t (*)) table))
    ;; create table
    (dotimes (i isize)
      (declare (fixnum i))
      (setf (aref table i) (make-array jsize :element-type 'single-float)))
    (dotimes (k iters)
      ;; fill table
      (dotimes (j jsize)
	(declare (fixnum j))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (let ((vector (aref table i)))
	    (declare (type (simple-array single-float (*)) vector))
	    (setf (aref vector j) 3.6))))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (j jsize)
	  (declare (fixnum j))
	  (dotimes (i isize)
	    (declare (fixnum i))
	    (let ((vector (aref table i)))
	      (declare (type (simple-array single-float (*)) vector))
	      (incf sum (aref vector j)))))))
    ))

(defun test-2d-array-by-col (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  ;; create table
  (let ((table (make-array (list isize jsize) :element-type 'single-float)))
    (declare (type (simple-array single-float (* *)) table))
    (dotimes (k iters)
      ;; fill table
      (dotimes (j jsize)
	(declare (fixnum j))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (setf (aref table i j) 3.6)))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (j jsize)
	  (declare (fixnum j))
	  (dotimes (i isize)
	    (declare (fixnum i))
	    (incf sum (aref table i j))))))
    ))

(defun test-1d-array-by-col (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  ;; create table
  (let ((table (make-array (* isize jsize) :element-type 'single-float))
	(temp 0))
    (declare (type (simple-array single-float (*)) table)
	     (fixnum temp))
    (dotimes (k iters)
      ;; fill table
      (dotimes (i isize)
	(declare (fixnum i))
	(dotimes (j jsize)
	  (declare (fixnum j))
	  (setq temp (* j isize))
	  (setf (aref table (+ temp i)) 3.6)))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (dotimes (j isize)
	    (declare (fixnum j))
	    (setq temp (* j isize))
	    (incf sum (aref table (+ temp i)))))))
    ))

(defun test (&optional (isize 500) (jsize 500) (iters 100))
  (print "test vector of vectors by row")
  (time (test-vector-of-vectors-by-row isize jsize iters))
  (print "test vector of vectors by col")
  (time (test-vector-of-vectors-by-col isize jsize iters))
  (print "test 1d array by-row")
  (time (test-1d-array-by-row isize jsize iters))
  (print "test 1d array by col")
  (time (test-1d-array-by-col isize jsize iters))
  (print "test 2d array by row")
  (time (test-2d-array-by-row isize jsize iters))
  (print "test 2d array by col")
  (time (test-2d-array-by-col isize jsize iters)))


From: Duane Rettig
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <4u304jhpb.fsf@beta.franz.com>
···········@alcoa.com writes:

> The test functions
> create a table (500 X 500) and perform 100 iterations of filling
==================^^^===^^^
> the table with a value and then accessing each entry.

 [ ... ]

> (defun test-2d-array-by-row (isize jsize iters)
>   (declare (fixnum isize jsize iters)
> 	   #+:allegro (:explain :boxing :calls)
>            #+Lispworks (optimize (float 0) (fixnum-safety 3))
> 	   (optimize (speed 3) (safety 0)))
>   ;; create table
>   (let ((table (make-array (list isize jsize) :element-type 'single-float)))
>     (declare (type (simple-array single-float (* *)) table))
=================================================^=^

If there is any way you can hold the table size constant, and thus specify
it as (simple-array single-float (500 500)) here, then the Allegro CL
compiler can infer more at compile time.  We don't do "strength reduction"
in array loops yet, but we do a pretty good job with constant-sized
arrays.

The same should go for the by-column iteration.
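
Concretely, something like this (a sketch only, assuming the 500 x 500
size really can be wired in at compile time):

(defun fill-and-sum-constant-2d-table (table)
  ;; Declaring the exact dimensions lets the compiler do more of the
  ;; row-major index arithmetic at compile time.
  (declare (type (simple-array single-float (500 500)) table)
           (optimize (speed 3) (safety 0)))
  (let ((sum 0.0))
    (declare (single-float sum))
    (dotimes (i 500)
      (declare (fixnum i))
      (dotimes (j 500)
        (declare (fixnum j))
        (setf (aref table i j) 3.6)
        (incf sum (aref table i j))))
    sum))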

-- 
Duane Rettig          Franz Inc.            http://www.franz.com/ (www)
1995 University Ave Suite 275  Berkeley, CA 94704
Phone: (510) 548-3600; FAX: (510) 548-8253   ·····@Franz.COM (internet)
From: Raymond Toy
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <4nbtmbsm7p.fsf@rtp.ericsson.se>
>>>>> "john" == john watton <···········@alcoa.com> writes:

    john> Machine/                  vect-of-vect 1D-array 2D-array
    john> Compiler                        by row   by row   by row
    john> --------------------------------------------------------
    john> HPC180-HPUX10.2/Franz ACL5.0final  1.2     14.2     29.1
    john> P90-NT4/Franz ACL5.0beta           6.9     17.5     54.7
    john> P90-NT4/Harlequin LWW4.1personal  16.3     29.3    382.7 sec

    john> Machine/                  vect-of-vect 1D-array 2D-array
    john> Compiler                        by col   by col   by col
    john> --------------------------------------------------------
    john> HPC180-HPUX10.2/Franz ACL5.0final 11.0     35.8     38.4
    john> P90-NT4/Franz ACL5.0beta          29.4     38.0     73.5
    john> P90-NT4/Harlequin LWW4.1personal  39.7     50.6    324.1 sec

I'm always interested in these kinds of simple benchmarks.  Here are
the results from CMUCL with the latest sources running on a 300 MHz
Ultra 30:

Machine/                  vect-of-vect 1D-array 2D-array
Compiler                        by row   by row   by row
--------------------------------------------------------
Ultra30-300/CMUCL latest          1.15     2.33     3.58 

Machine/                  vect-of-vect 1D-array 2D-array
Compiler                        by col   by col   by col
--------------------------------------------------------
Ultra30-300/CMUCL latest          4.23     6.12     6.50

So CMUCL handles 2D-array access much better than the others,
but the 1D access still wins.  Clearly, there's room for improvement.

Ray
From: Sunil Mishra
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <efy67cjxyvf.fsf@hustle.cc.gatech.edu>
···········@alcoa.com writes:

> We work with very large 2D tables of single-float data and have
> stumbled on the fact that using the Common Lisp 2D arrays as in
> (make-array (list isize jsize) ...) is not the best way to go
> if you are concerned with performance. In fact, with two popular
> implementations from Franz and Harlequin our experience is that
> storing a 2D table in a 1D array (vector) and managing the index
> arithmetic explicitly yields a significant performance gain. Further,

Hmmm, I find it really odd that you have that result with LispWorks. At
least with version 3.2.2, the 2D arrays have an internal vector
representation. So, you can ask for the underlying vector that stores the
elements of the array. And the function that does that is...

system:data-vector$array

I'd be willing to bet the cause is some overhead involved when aref is
invoked.
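
Something along these lines, perhaps. A sketch only, and note the
assumptions: SYSTEM:DATA-VECTOR$ARRAY is the LispWorks-internal accessor
named above (not ANSI), and I'm assuming it returns the specialized
row-major data vector with no header offset:

#+Lispworks
(defun sum-2d-via-data-vector (table isize jsize)
  (declare (type (simple-array single-float (* *)) table)
           (fixnum isize jsize)
           (optimize (speed 3) (safety 0)))
  (let ((data (system:data-vector$array table))  ; implementation-specific
        (sum 0.0))
    (declare (type (simple-array single-float (*)) data)
             (single-float sum))
    (dotimes (i isize sum)
      (declare (fixnum i))
      (let ((offset (* i jsize)))   ; one multiply per row, done by hand
        (declare (fixnum offset))
        (dotimes (j jsize)
          (declare (fixnum j))
          (incf sum (aref data (+ offset j))))))))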

Sunil
From: Barry Margolin
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <am332.44$3I3.497296@burlma1-snr1.gtei.net>
In article <···············@hustle.cc.gatech.edu>,
Sunil Mishra  <·······@hustle.cc.gatech.edu> wrote:
>I'd be willing to bet the cause is some overhead involved when aref is
>invoked.

Notice that his 1D test did (setq temp (* i jsize)) once per row.  This
multiplication is normally done as part of a 2D aref.  While it's possible
for a compiler to optimize this out (I think this type of strength
reduction was one of Fortran's earliest optimizations), someone posted that
Harlequin doesn't do it.  So the 2D case is doing lots more multiplications
than the 1D case.
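
For what it's worth, here is a sketch of what the fully strength-reduced
by-row loop looks like when done by hand: no multiplication per element
or even per row, just a running row-major index.

(defun sum-1d-strength-reduced (table isize jsize)
  (declare (type (simple-array single-float (*)) table)
           (fixnum isize jsize)
           (optimize (speed 3) (safety 0)))
  (let ((sum 0.0)
        (index 0))
    (declare (single-float sum) (fixnum index))
    (dotimes (i isize sum)
      (declare (fixnum i))
      (dotimes (j jsize)
        (declare (fixnum j))
        ;; a compiler could derive this running index from (+ (* i jsize) j)
        (incf sum (aref table index))
        (incf index)))))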

-- 
Barry Margolin, ······@bbnplanet.com
GTE Internetworking, Powered by BBN, Burlington, MA
*** DON'T SEND TECHNICAL QUESTIONS DIRECTLY TO ME, post them to newsgroups.
Don't bother cc'ing followups to me.
From: Lyman S. Taylor
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <72i3sp$4mr@pravda.cc.gatech.edu>
In article <············@nnrp1.dejanews.com>,  <···········@alcoa.com> wrote:

>if you are concerned with performance. In fact, with two popular
>implementations from Franz and Harlequin our experience is that
>storing a 2D table in a 1D array (vector) and managing the index
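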
>arithmetic explicitly yields a significant performance gain.

   I'm not sure how "synthetic" the benchmarks below are, but if
   you are striding through a whole array one element at a time,
   doesn't ROW-MAJOR-AREF already do this?

   [ Err, never mind -- I just finished running a version with
     ROW-MAJOR-AREF and it is downright slower than molasses in
     January.  Is it the implicit range checking slowing things down?
     Or that the array displacement cannot be lifted out of the
     enclosing loop?  Inlined and optimized, it seems like this would
     be as fast as treating it as a single vector.  ]
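
   For reference, the kind of ROW-MAJOR-AREF striding I mean is just
   walking the whole array by its row-major index, roughly like this
   (a sketch only):

(defun sum-2d-row-major (table)
  (declare (type (simple-array single-float (* *)) table)
           (optimize (speed 3) (safety 0)))
  ;; Stride through the whole 2D array one element at a time via its
  ;; row-major index.
  (let ((sum 0.0)
        (n (array-total-size table)))
    (declare (single-float sum) (fixnum n))
    (dotimes (k n sum)
      (declare (fixnum k))
      (incf sum (row-major-aref table k)))))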
 


   In benchmarking it does help to compare "apples" to "apples" 
    [ doesn't seem to dramatically affect the numbers though. ]


>(defun test-1d-array-by-row (isize jsize iters)

>    (dotimes (k iters)
>      ;; fill table
>      (dotimes (j jsize)
...
>	(dotimes (i isize)
...
>      ;; access table
>	(dotimes (j isize)
....
>	  (dotimes (i isize)
...
>    ))

  Errrr, why do the first two row benchmarks nest a "j loop" inside an
  "i loop" while this one does the opposite?  It's supposed to be by
  rows, right?  Secondly, the table access is indexed by isize for both
  I and J.  "As is", the 1d-array-by-row function stores and strides
  through a column-major matrix.

  ( Ditto for the column-striding equivalent function: loop nesting
    reversed, and filling/striding a column-major matrix. )

  It doesn't make too much of a difference since the given numbers were
  run on a square matrix, though.  But the following seems more apropos.

      ;;; fill table
      (dotimes (i isize)
        (declare (fixnum i))
        (setq temp (* i jsize))
        (dotimes (j jsize)
          (declare (fixnum j))
          (setf (aref table (+ temp j)) 3.6)))

      ;;; access table
      (dotimes (i isize)
        (declare (fixnum i))
        (setq temp (* i jsize))
        (dotimes (j jsize)
          (declare (fixnum j))
          (incf sum (aref table (+ temp j)))))))


  and later when striding by column.... 

      ;; fill table
      (dotimes (j jsize)
        (declare (fixnum j))
        (dotimes (i isize)
          (declare (fixnum i))
          (setq temp (* i jsize))
          (setf (aref table (+ temp j)) 3.6)))

      ;;; access table
      (dotimes (j jsize)
        (declare (fixnum j))
        (dotimes (i isize)
          (declare (fixnum i))
          (setq temp (* i jsize))
          (incf sum (aref table (+ temp j)))))))



  



-- 
					
Lyman S. Taylor                "Twinkie Cream; food of the Gods" 
(·····@cc.gatech.edu)                     Jarod, "The Pretender" 
From: Barry Margolin
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <2p332.45$3I3.497296@burlma1-snr1.gtei.net>
In article <··········@pravda.cc.gatech.edu>,
Lyman S. Taylor <·····@cc.gatech.edu> wrote:
>   [ Err, never mind -- I just finished running a version with
>     ROW-MAJOR-AREF and it is downright slower than molasses in
>     January.  Is it the implicit range checking slowing things down?
>     Or that the array displacement cannot be lifted out of the
>     enclosing loop?  Inlined and optimized, it seems like this would
>     be as fast as treating it as a single vector.  ]

Displaced arrays tend to have more overhead -- AREF has to check the array
header, notice that it's displaced, and indirect to the real array.

And ROW-MAJOR-AREF still has the problem that it must do the multiplication
by the row size each time through the loop, where the 1D function was able
to save that in a per-row temporary.
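
One standard-CL way to get that per-row temporary back while still going
through the row-major view is to compute each row's starting index once
with ARRAY-ROW-MAJOR-INDEX. A sketch only; whether a given compiler
turns this into fast code is another question:

(defun sum-2d-row-major-hoisted (table isize jsize)
  (declare (type (simple-array single-float (* *)) table)
           (fixnum isize jsize)
           (optimize (speed 3) (safety 0)))
  (let ((sum 0.0))
    (declare (single-float sum))
    (dotimes (i isize sum)
      (declare (fixnum i))
      ;; one multiply per row, hidden inside ARRAY-ROW-MAJOR-INDEX
      (let ((start (array-row-major-index table i 0)))
        (declare (fixnum start))
        (dotimes (j jsize)
          (declare (fixnum j))
          (incf sum (row-major-aref table (+ start j))))))))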

-- 
Barry Margolin, ······@bbnplanet.com
GTE Internetworking, Powered by BBN, Burlington, MA
*** DON'T SEND TECHNICAL QUESTIONS DIRECTLY TO ME, post them to newsgroups.
Don't bother cc'ing followups to me.
From: ···········@alcoa.com
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <72l3od$3mg$1@nnrp1.dejanews.com>
In article <··········@pravda.cc.gatech.edu>,
  ·····@cc.gatech.edu (Lyman S. Taylor) wrote:
>    In benchmarking it does help to compare "apples" to "apples"
>     [ doesn't seem to dramatically affect the numbers though. ]

Sorry, I goofed. Below are corrected versions of test-1d-array-by-row and
test-1d-array-by-col (hopefully). The times I reported for
test-1d-array-by-row should be adjusted down by 20-30%. Fortunately, as
pointed out, using a square matrix saves me and makes the times reported
for test-1d-array-by-col valid.

(defun test-1d-array-by-row (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  ;; create table
  (let ((table (make-array (* isize jsize) :element-type 'single-float))
	(temp 0))
    (declare (type (simple-array single-float (*)) table)
	     (fixnum temp))
    (dotimes (k iters)
      ;; fill table
      (dotimes (i isize)
	(declare (fixnum i))
	(setq temp (* i jsize))
	(dotimes (j jsize)
	  (declare (fixnum j))
	  (setf (aref table (+ temp j)) 3.6)))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (setq temp (* i jsize))
	  (dotimes (j jsize)
	    (declare (fixnum j))
	    (incf sum (aref table (+ temp j)))))))
    ))

(defun test-1d-array-by-col (isize jsize iters)
  (declare (fixnum isize jsize iters)
	   #+:allegro (:explain :boxing :calls)
           #+Lispworks (optimize (float 0) (fixnum-safety 3))
	   (optimize (speed 3) (safety 0)))
  ;; create table
  (let ((table (make-array (* isize jsize) :element-type 'single-float)))
    (declare (type (simple-array single-float (*)) table))
    (dotimes (k iters)
      ;; fill table
      (dotimes (j jsize)
	(declare (fixnum j))
	(dotimes (i isize)
	  (declare (fixnum i))
	  (setf (aref table (+ (the fixnum (* i jsize)) j)) 3.6)))
      ;; access table
      (let ((sum 0.0))
	(declare (single-float sum))
	(dotimes (j jsize)
	  (declare (fixnum j))
	  (dotimes (i isize)
	    (declare (fixnum i))
	    (incf sum (aref table (+ (the fixnum (* i jsize)) j)))))))
    ))

From: Larry Hunter
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <rbogq3eh3b.fsf@work.nlm.nih.gov>
I also use large 2D arrays of single floats.  (And agree that ACL's
":Allocation :Static" is a big win, btw).  So I tried John Watton's
benchmark on my SGI Octane running ACL 5.0 (using the corrected versions of
test-1d-array-by-row and test-1d-array-by-col).

It has almost never been the case in my experience that the SGI port of much
of anything is as carefully done as the Sun/HP ports, but there's always a
first time.  John's problem does not seem to happen on the SGI port, at
least not the order of magnitude slowdown in the by row case.   [It's also
nice to see the SGI's speed shows up, too -- it sure feels like a fast
machine compared to the HP's I've dealt with.]



   Machine/                  vect-of-vect 1D-array 2D-array
   Compiler                        by row   by row   by row
   --------------------------------------------------------
   SGI Octane/Irix 6.5.1m/ACL 5.0    1.24     1.43     3.62
>  HPC180-HPUX10.2/Franz ACL5.0final  1.2     14.2     29.1
>  P90-NT4/Franz ACL5.0beta           6.9     17.5     54.7
>  P90-NT4/Harlequin LWW4.1personal  16.3     29.3    382.7 sec

   Machine/                  vect-of-vect 1D-array 2D-array
   Compiler                        by col   by col   by col
   --------------------------------------------------------
   SGI Octane/Irix 6.5.1m/ACL 5.0    1.50     2.31     3.84
>  HPC180-HPUX10.2/Franz ACL5.0final 11.0     35.8     38.4
>  P90-NT4/Franz ACL5.0beta          29.4     38.0     73.5
>  P90-NT4/Harlequin LWW4.1personal  39.7     50.6    324.1 sec

I also tried it using Duane Rettig's advice about putting the size of the
array in the declarations, but it made no significant difference. 

Go figure.

Larry

-- 
Lawrence Hunter, PhD.
National Library of Medicine               phone: +1 (301) 496-9303
Bldg. 38A, 9th fl, MS-54                   fax:   +1 (301) 496-0673
Bethesda. MD 20894 USA                     email: ······@nlm.nih.gov
From: Duane Rettig
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <4k90l6jpg.fsf@beta.franz.com>
Larry Hunter <······@nlm.nih.gov> writes:
> I also use large 2D arrays of single floats.  (And agree that ACL's
> ":Allocation :Static" is a big win, btw).  So I tried John Watton's
> benchmark on my SGI Octane running ACL 5.0 (using the corrected versions of
> test-1d-array-by-row and test-1d-array-by-col).
> 
> It has almost never been the case in my experience that the SGI port of much
> of anything is as carefully done as the Sun/HP ports, but there's always a
> first time.  John's problem does not seem to happen on the SGI port, at
> least not the order of magnitude slowdown in the by row case.   [It's also
> nice to see the SGI's speed shows up, too -- it sure feels like a fast
> machine compared to the HP's I've dealt with.]

[ late post; back from LUGM conference and a small vacation]

Due to the differences in availability of hardware multiply on each of
SGI, sparc, and HP, and due in part to their upgrade policy (although we
may have missed an opportunity on the HP, depending on whether they still
support PA-RISC 1.1) we use small subroutines on sparc and HP to do
integer multiply, whereas on the SGI we compile a multiply instruction
directly.

I don't have my PA-RISC 2.0 manual handy, so I don't remember if there is
a hardware integer multiply.  If so, and if the 1.1 is no longer supported,
then it may be that this platform can take an inline multiply instead of a
call to a subroutine.  It may be, however, that a call to the $$mulI
millicode on the PA-RISC 2.0 automatically executes such a hardware
instruction, but I've never checked it out.

For the sparc, our biggest problem has been that there is no way to detect
the exact sparc chip that is running, and thus we have to run a small timing
loop at lisp startup time to determine whether the chip has hardware integer
multiply or requires a subroutine.  Being wrong on either side usually means
a 10x slowdown in multiplication: if a subroutine is called on an ultra, it
is 10x slower than executing the smul instruction - if an smul instruction
is trapped and emulated on a sparc 2 or sparc 10, then the trap overhead added
to the emulation (in software) of the instruction is about 10x slower than the
subroutine call.  In short, we must call it correctly, and the timing loop
sets up the primitive call "%mul" to vector to one of two routines; one that
calls the .mul subroutine or one that executes the smul instruction.  This
has been effective in mitigating the problem of the hardware-vs-software 
integer multiply problem that has existed on sparc, but admittedly it is not
as fast as compiling smul instructions directly.
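
Purely as an illustration of the idea (the real %mul dispatch lives
inside the runtime and is not user-visible like this; all names below
are hypothetical), the startup decision amounts to timing two candidate
routines and installing the faster one:

(defvar *mul-routine* nil
  "Hypothetical dispatch slot; stands in for the internal %mul vector.")

(defun time-candidate (fn &optional (reps 100000))
  ;; Crude timing loop: how long does FN take to do REPS multiplies?
  (declare (fixnum reps))
  (let ((start (get-internal-real-time)))
    (dotimes (i reps)
      (declare (fixnum i))
      (funcall fn 12345 6789))
    (- (get-internal-real-time) start)))

(defun install-best-multiplier (subroutine-version instruction-version)
  ;; Pick whichever candidate runs faster on this particular chip.
  (setq *mul-routine*
        (if (< (time-candidate instruction-version)
               (time-candidate subroutine-version))
            instruction-version
            subroutine-version)))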

For both sparc and HP, if and when we can verify that all of the older chips
have been purged from our user base, we can start generating direct multiply
instructions, which will tend to nudge up the performance on tight loops
such as those in the benchmark.

> I also tried it using Duane Rettig's advice about putting the size of the
> array in the declarations, but it made no significant difference. 
> 
> Go figure.

Heh, heh, sorry about that.  I made the classic mistake that I preach
about, and even taught about last week in an optimization tutorial
in the LUGM conference, which is "never assume you know where the time is
being spent; always profile first".  I gave the answer that I gave
because I had no time before my presentation to actually look at it
deeply, and while either declaring indices or accessing with constant
indices (or both) will increase the speed of the code, it is dwarfed
by other things; on the sparc and HP, the multiply call is obviously
part of the bottleneck, but even on the SGI, the floating point aref
and its setf, while only one instruction each, play a significant role
in such a tight loop that is not unrolled and not strength-reduced.  Other
architectures make matters even worse by performing conversions on the
single-float memory representation to get it into a float register and
vice-versa.

-- 
Duane Rettig          Franz Inc.            http://www.franz.com/ (www)
1995 University Ave Suite 275  Berkeley, CA 94704
Phone: (510) 548-3600; FAX: (510) 548-8253   ·····@Franz.COM (internet)
From: Raymond Toy
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <4n4sribh3z.fsf@rtp.ericsson.se>
>>>>> "Duane" == Duane Rettig <·····@franz.com> writes:

    Duane> For the sparc, our biggest problem has been that there is no way to detect
    Duane> the exact sparc chip that is running, and thus we have to run a small timing
    Duane> loop at lisp startup time to determine whether the chip has hardware integer

Doesn't the info given by fpversion tell you what sparc chip you have?

For example, on an Ultra30, I get

 Sun-4 floating-point controller version 0 found.
 An UltraSPARC chip is available.
 FPU's frequency appears to be approximately 277.1 MHz.

 Use "-xtarget=ultra -xcache=16/32/1:2048/64/1" code-generation option.

On a Sparc 20, I get

 Sun-4 floating-point controller version 0 found.
 A Fujitsu/Ross RT620 hyperSPARC chip is available.
 FPU's frequency appears to be approximately 144.4 MHz.

 Use "-xtarget=ss20/hs11" code-generation option.


Ray
From: Duane Rettig
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <4af19g0bh.fsf@beta.franz.com>
Raymond Toy <···@rtp.ericsson.se> writes:

> >>>>> "Duane" == Duane Rettig <·····@franz.com> writes:
> 
>     Duane> For the sparc, our biggest problem has been that there is no way to detect
>     Duane> the exact sparc chip that is running, and thus we have to run a small timing
>     Duane> loop at lisp startup time to determine whether the chip has hardware integer
> 
> Doesn't the info given by fpversion tell you what sparc chip you have?

Not necessarily.  When I asked the people at Sun about any way to
programmatically identify Sparc chips (with the cpu identification
fields in mind), they said that these fields were optional and that
no one had any control over whether they were correct (or whether they
represented the actual, also optional, hardware instructions that
were available).  We finally settled on the timing loop to do the
determination, which, from what I read in the man page for fpversion,
is exactly how they do it as well.

> For example, on an Ultra30, I get
> 
>  Sun-4 floating-point controller version 0 found.
>  An UltraSPARC chip is available.
>  FPU's frequency appears to be approximately 277.1 MHz.
> 
>  Use "-xtarget=ultra -xcache=16/32/1:2048/64/1" code-generation option.
> 
> On a Sparc 20, I get
> 
>  Sun-4 floating-point controller version 0 found.
>  A Fujitsu/Ross RT620 hyperSPARC chip is available.
>  FPU's frequency appears to be approximately 144.4 MHz.
> 
>  Use "-xtarget=ss20/hs11" code-generation option.
> 
> 
> Ray

Now try running fpversion on a machine with Solaris1 installed :-)
(No fair; we still have an old Solaris1, which does not have fpversion).
My guess is that Sun created fpversion to answer many requests like
mine to identify the cpu as best as possible.  The man page is dated
1992, so I assume that is when the command was created.  When I asked
for it a year earlier than that, they suggested a different command which
I do not remember, but which did a much worse job of guessing than
fpversion apparently does.

I think a better way to go would have been to cache values at Solaris
startup which identify the cpu by trying out instructions with no
emulation trap handlers installed;  if the instruction executes without
a trap, the hardware feature exists.  Then a programmatic interface
could be made available for runtime examination of the results.
Perhaps fpversion even does it this way, but I don't see any
programmatic interface to such info.

BTW, the x86 chips have exactly the same identification problem; Intel
didn't add a CPUID instruction until 486.  But they did give an
assembler routine that identifies that you have at least a 486, after
which the cpuid instruction can be used to further identify the chip.
So there is a programmatic way to tell exactly what Intel chip you
have, even though nowadays most people are using at least 486's (which
are even themselves fading fast from the scene).

-- 
Duane Rettig          Franz Inc.            http://www.franz.com/ (www)
1995 University Ave Suite 275  Berkeley, CA 94704
Phone: (510) 548-3600; FAX: (510) 548-8253   ·····@Franz.COM (internet)
From: Duane Rettig
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <490gtg006.fsf@beta.franz.com>
Duane Rettig <·····@franz.com> writes:
> Raymond Toy <···@rtp.ericsson.se> writes:
> 
> > Doesn't the info given by fpversion tell you what sparc chip you have?
> 
> Not necessarily.  When I asked the people at Sun about any way to
> programmatically identify Sparc chips (with the cpu identification
> fields in mind) they said that these fields were optional and that
> noone had any control over whether they were correct (or if they
> represented the actual (also optional) hardware instructions that
> were available.

I realized after I wrote this that part of my explanation had not been
written down and might thus cause confusion:  the reason there was
no control was the many manufacturers of Sparc Architecture
chips, not any lack of control within Sun's own design/manufacturing
process.

-- 
Duane Rettig          Franz Inc.            http://www.franz.com/ (www)
1995 University Ave Suite 275  Berkeley, CA 94704
Phone: (510) 548-3600; FAX: (510) 548-8253   ·····@Franz.COM (internet)
From: Hannu Koivisto
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <t2www4dm9h1.fsf@lehtori.cc.tut.fi>
Duane Rettig <·····@franz.com> writes:

| BTW, the x86 chips have exactly the same identification problem; Intel
| didn't add a CPUID instruction until 486.  But they did give an
| assembler routine that identifies that you have at least a 486, after
| which the cpuid instruction can be used to further identify the chip.
| So there is a programmatic way to tell exactly what Intel chip you
| have, even though nowadays most people are using at least 486's (which

I find this a bit confusing. On the one hand you say "the x86
chips have exactly the same identification problem", but then
"there is a programmatic way to tell exactly what Intel chip you
have". Anyway, I interpreted this as you saying that
there is an identification problem with < 486 chips. If I
understood you wrong, ignore the rest. If not, then what you say
is incorrect.

It's completely possible to identify whether you have 8086,
80286, 80386, ... There's even a routine from Intel to do that
but you could easily do that yourself if you have the databooks.
It's also possible to distinguish between 386sx and 386dx, 80186,
8088 vs. 8086, and different stepping levels of chips (even those
that don't have the cpuid instruction), but _that_ is harder (how
much depends on the case).

//Hannu
From: Duane Rettig
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <4emqlp0bo.fsf@beta.franz.com>
Hannu Koivisto <·····@iki.fi.ns> writes:

> Duane Rettig <·····@franz.com> writes:
> 
> | BTW, the x86 chips have exactly the same identification problem; Intel
> | didn't add a CPUID instruction until 486.  But they did give an
> | assembler routine that identifies that you have at least a 486, after
> | which the cpuid instruction can be used to further identify the chip.
> | So there is a programmatic way to tell exactly what Intel chip you
> | have, even though nowadays most people are using at least 486's (which
> 
> I find this a bit confusing. On the other hand you say "the x86
> chips have exactly the same identification problem" but then
> "there is a programmatic way to tell exactly what Intel chip you
> have". Anyway, I interpreted this so that you are saying that
> there is an identification problem with < 486 chips. If I
> understood you wrong, ignore the rest. If not, then what you say
> is incorrect.

I think the confusion might lie in the definition of the term
"identify".  (And, of course, I shouldn't have said "exactly the
same").  Of course all chips are ultimately identified by their
behaviors, and can thus be distinguished from each other, even if
it requires accurate timing loops in order to do so.  But if a chip
has a programmatic way to identify itself (such as a cpuid on the
486/pentium chips, or the purported identification fields in some
later sparc architectures), then the identification process is
more straightforward.

> It's completely possible to identify whether you have 8086,
> 80286, 80386, ... There's even a routine from Intel to do that
> but you could easily do that yourself if you have the databooks.
> It's also possible to identify between 386sx and 386dx, 80186,
> 8088 vs. 8086 and different stepping levels of chips (even those
> that don't have cpuid instruction), but _that_ is (depending on
> the case how much) harder.

Yes, this is exactly the programmatic way to do this identification.
In my Pentium manual, it is listed in Chapter 5, "Feature Determination".
However, you may notice that this routine uses some obscure bit twiddling
instead of the obvious "try the instruction to see if it doesn't trap"
technique. The reason for this is to avoid too much interaction between
the identification routine and any emulation trap handlers that may be
installed which would try to emulate the instruction instead.

For the Sparc, when running under Solaris2, the emulation of (for example)
the smul instruction is too perfect; an illegal instruction trap on older
chips is vectored to an emulation routine, which then performs the
multiplication and then restores the environment, leaving the program
in the same state as if the instruction had been done in hardware.  No
other bits in any registers (according to conversations with the hardware
vendor) are reliable to determine the chip's identity; you must use a
timing loop.

The tension between identifiability and complete compatibility is a hard
one; on the one hand you want your older chips to run correctly on software
that takes advantage of new hardware (even if it runs more slowly), but
on the other hand you want to know when you are getting performance
degradation because of excessive traps, due to new software on old hardware.

-- 
Duane Rettig          Franz Inc.            http://www.franz.com/ (www)
1995 University Ave Suite 275  Berkeley, CA 94704
Phone: (510) 548-3600; FAX: (510) 548-8253   ·····@Franz.COM (internet)
From: rusty craine
Subject: Re: Optimizing 2D tables for speed - 2D array vs. 1D array vs. vector-of-vectors
Date: 
Message-ID: <73vdli$36q$1@excalibur.flash.net>
Duane Rettig wrote in message <·············@beta.franz.com>...
>Hannu Koivisto <·····@iki.fi.ns> writes:
>
>> Duane Rettig <·····@franz.com> writes:
>>
>> | BTW, the x86 chips have exactly the same identification problem; Intel
>> | didn't add a CPUID instruction until 486.  But they did give an
>> | assembler routine that identifies that you have at least a 486, after
>> | which the cpuid instruction can be used to further identify the chip.
>> | So there is a programmatic way to tell exactly what Intel chip you
>> | have, even though nowadays most people are using at least 486's (which

Below is the proc that identifies older CPUs. I don't have a routine to
identify the 586.  I wish I could give credit to the source of the
algorithm; I borrowed it long ago.  [I am not even sure this code is
pertinent today at all.]  If you are not an assembler type, you need a
stack, data segment, and main proc to run it; if it would be of any use,
let me know and I'll send it to ya.  I use assembler often, but in the
16-bit set; sorry, I haven't used the 32-bit set.  If it doesn't mov a
few things around, load up a register or two, call an int, test a bit or
two, then jump somewhere and do it all over again when ya get there, I
most likely don't know how it is done.  {Where do the int's go in
Win32?... geeez}

rusty
;
 MyProg   SEGMENT
          assume CS:MyProg,DS:MyData
;;; CPUID -- determines the installed CPU
;
;    Returns a value in AX that has the following meaning
;      0 - 8086 or 8088
;      1 - 80286
;      2 - 80386, DX or SX
;      3 - 80486
CPUID     PROC
.386
        ; Assume we have an 8086/8088 for now
        xor     DX,DX           ; clear DX to 0
        push    DX              ; Push 16 bits of 0's on the stack
        popf                    ; Pop 16 bits of 0's off the stack & into flags
        pushf                   ; Push the resulting flags back onto the stack
        pop     AX              ; so that we can pop them off & look at them
        and     AX,0F000H       ; Keep only the high 4 bits of what was in flags
        cmp     AX,0F000H       ; On an 8086/8088 these bits always read as 1's
        je      DONE            ; If the top 4 bits are set, it's an 8086/8088
        inc     DX              ; Otherwise we have at least a 286, so inc DX
        push    0F000H          ; And keep testing -> push value on stack
        popf                    ; Pop value off stack into flags
        pushf                   ; Push the flags state back onto the stack
        pop     AX              ; so we can pop them off again
        and     AX,0F000H       ; See if they still contain 0F000H
        jz      DONE            ; If the bits did not stick, the CPU is a 286
        inc     DX              ; otherwise inc DX and keep testing
; testing for 486 -- algorithm from HUMMEL
        mov     ECX,ESP         ; Save stack pointer in ECX
        and     ESP,NOT 3       ; Force stack pointer to align on a DWORD
        pushfd                  ; Push extended flags register onto stack
        pop     EAX             ; Pop extended flags value off stack into EAX
        mov     EBX,EAX         ; Save copy of extended flags for later use
        xor     EAX,00040000H   ; Try to toggle the AC bit (bit 18) in extended flags
        push    EAX             ; Push flags test value from EAX onto stack
        popfd                   ; and pop it back into the extended flags reg.
        pushfd                  ; Push extended flags value back onto stack
        pop     EAX             ; and pop it back into EAX for inspection
        xor     EAX,EBX         ; Compare altered against unaltered
        jz      FixESP          ; If the toggle of AC didn't take, it's a 386
        inc     DX              ; but if it did, we have a 486, so inc DX to 3
FixESP: mov     ESP,ECX         ; put the original stack pointer back into ESP
DONE:   mov     AX,DX           ; return CPU code in AX
        ret                     ; go back to caller
CPUID           ENDP

