From: Szymon
Subject: [long] fast (?) string splitting.
Date: 
Message-ID: <87sm7vhsj3.fsf@eva.rplacd.net>
Hi.

I just finished a (trivial) string utility.

It can split strings on set of characters.

(my-split-string "foo:bar:baz" #\:)

===> ("foo" "bar" "baz")

(my-split-string "foo:bar:baz" #(#\o #\a))

===> ("f" ":b" "r:b" "z")

First argument must be type of STRING.

Second one can be STRING/VECTOR (not BIT-VECTOR), CHARACTER or LIST.


Enjoy.


Some benchmarks (CMUCL):


first argument:  STRINGs, and SIMPLE-STRINGs (all possible).

second argument: all possible (exept single chars[**]).

(do-tests '(my-split-string cllib:split-string) 20000)

===>

   operator      avg. no of cpu cycles.   avg. no of bytes consed.
-------------------------------------------------------------------
MY-SPLIT-STRING          117435                    1260
SPLIT-STRING    [*]      822833                    1702
-------------------------------------------------------------------



first argument:  STRINGs (no simple ones).

second argument: all possible (exept single chars[**]).

(do-tests '(my-split-string cllib:split-string) 20000)

===>

   operator      avg. no of cpu cycles.   avg. no of bytes consed.
-------------------------------------------------------------------
MY-SPLIT-STRING          179054                    1259
SPLIT-STRING    [*]      861348                    1700
-------------------------------------------------------------------



first argument:  only SIMPLE-STRINGs.

second argument: all possible (exept single chars[**]).

(do-tests '(my-split-string cllib:split-string) 20000)

===>

   operator      avg. no of cpu cycles.   avg. no of bytes consed.
-------------------------------------------------------------------
MY-SPLIT-STRING           50171                    1266
SPLIT-STRING    [*]      714784                    1713
-------------------------------------------------------------------

[*] [ http://cvs.sourceforge.net/viewcvs.py/*checkout*/clocc/clocc/src/cllib/string.lisp]

[**] because CLLIB:SPLIT-STRING cannot handle single characters.

Code:

;;; -*- Mode: Lisp -*-
;;;
;;;
;;; function:    MY-SPLIT-STRING
;;;
;;;
;;; first arg:   SIMPLE-STRING/STRING
;;; 
;;; second arg:  CHARACTER, LIST, SIMPLE-STRING/STRING, SIMPLE-VECTOR/VECTOR
;;;              not BIT-VECTOR.
;;;
;;; returns:     LIST
;;;
;;;
;;; example:
;;;
;;; (my-split-string "foo:bar:baz,quux,foshmoo" ":,")
;;;
;;; ==> ("foo" "bar" "baz" "quux" "foshmoo")
;;;

(declaim
 (inline is-in-simple-string-p is-in-string-p is-in-vector-p)
 (ftype (function (character simple-string) boolean) is-in-simple-string-p)
 (ftype (function (character string) boolean) is-in-string-p)
 (ftype (function (character vector) boolean) is-in-vector-p)
 (ftype (function (simple-string character) list) split-simple-string-on-char)
 (ftype (function (string character) list) split-string-on-char)
 (ftype (function (simple-string list) list) split-simple-string-on-list-of-chars)
 (ftype (function (string list) list) split-string-on-list-of-chars)
 (ftype (function (simple-string simple-string) list) split-simple-string-on-simple-string)
 (ftype (function (string simple-string) list) split-string-on-simple-string)
 (ftype (function (simple-string vector) list) split-simple-string-on-vector)
 (ftype (function (string vector) list) split-string-on-vector))

(defun is-in-simple-string-p (ch str)
  (declare
   (optimize (speed 3) (safety 0))
   (type character ch)
   (type simple-string str))
  (loop for current-char of-type character across (the simple-string str)
	if (eql current-char ch) return T))

(defun is-in-string-p (ch str)
  (declare
   (optimize (speed 3) (safety 0))
   (type character ch)
   (type string str))
  (loop for current-char of-type character across (the string str)
	if (eql current-char ch) return T))

(defun is-in-vector-p (ch vector)
  (declare
   (optimize (speed 3) (safety 0))
   (type character ch)
   (type vector vector))
  (loop for current-char of-type character across (the vector vector)
	if (eql current-char ch) return T))

(defmacro make-splitter (&key fname first-arg-type second-arg-type)
  (let ((acc-func-1
	 (ecase first-arg-type
	   (string 'char)
	   (simple-string 'schar)))
	(acc-func-2
	 (ecase second-arg-type
	   (character 'eql)
	   (list 'member)
	   (simple-string 'is-in-simple-string-p)
	   (string 'is-in-string-p)
	   (vector 'is-in-vector-p))))
    `(defun ,fname (s c)
       (declare
	(optimize (speed 3) (safety 0))
	(,first-arg-type s)
	(,second-arg-type c))
       (prog ((head-index 0) (tail-index 0) (i 0) (j 0) (l 0) (s-len 0) (slen-1 0)
	      (result-string "") (result (list nil)) result-pointer)
	     (declare
	      (type fixnum i j l s-len slen-1 head-index tail-index)
	      (type simple-string result-string)
	      (type cons result)
	      (type list result-pointer))
	     (setq s-len (array-dimension s 0)
		   slen-1 (1- s-len)
		   tail-index slen-1
		   result-pointer result)
	     |1| (when (,acc-func-2 (,acc-func-1 s head-index) c)
		   (if (eql head-index slen-1)
		       (return-from ,fname NIL))
		   (incf head-index)
		   (go |1|))
	     |2| (when (,acc-func-2 (,acc-func-1 s tail-index) c)
		   (if (eql tail-index head-index)
		       (return-from ,fname NIL))
		   (decf tail-index)
		   (go |2|))
	     (if (eql head-index tail-index)
		 (return-from ,fname
		   (prog1 result
		     (rplaca result (string (,acc-func-1 s tail-index))))))
	     (setq i head-index
		   j head-index
		   slen-1 tail-index
		   s-len (1+ slen-1))
	     |3| (if (,acc-func-2 (,acc-func-1 s (incf i)) c)
		     (if (eql i j)
			 (tagbody |-| (if (,acc-func-2 (,acc-func-1 s (incf i)) c) (go |-|)) (setq j i))
		       (setq result-pointer
			     (cdr
			      (rplacd result-pointer
				      (list
				       (prog1 (setq l -1 result-string (make-string (- i j)))
					 (tagbody |-|
						  (setf (schar result-string (incf l))
							(,acc-func-1 s j))
						  (unless (eql (incf j) i) (go |-|)))
					 (incf j))))))))
	     (unless (eql i slen-1) (go |3|))
	     (rplacd result-pointer
		     (list
		      (prog1 (setq l -1 result-string (make-string (- s-len j)))
			(tagbody |-|
				 (setf (schar result-string (incf l))
				       (,acc-func-1 s j))
				 (unless (eql (incf j) s-len) (go |-|))))))
	     (return-from ,fname (cdr result))))))

;; make splitters on single character.

(make-splitter :fname split-simple-string-on-char
	       :first-arg-type simple-string
	       :second-arg-type character)

(make-splitter :fname split-string-on-char
	       :first-arg-type string
	       :second-arg-type character)

;; make splitters on list of characters.

(make-splitter :fname split-simple-string-on-list-of-chars
	       :first-arg-type simple-string
	       :second-arg-type list)

(make-splitter :fname split-string-on-list-of-chars
	       :first-arg-type string
	       :second-arg-type list)

;; make splitters on simple string.

(make-splitter :fname split-simple-string-on-simple-string
	       :first-arg-type simple-string
	       :second-arg-type simple-string)

(make-splitter :fname split-string-on-simple-string
	       :first-arg-type string
	       :second-arg-type simple-string)

;; make splitters on vector.

(make-splitter :fname split-simple-string-on-vector
	       :first-arg-type simple-string
	       :second-arg-type vector)

(make-splitter :fname split-string-on-vector
	       :first-arg-type string
	       :second-arg-type vector)

;; main.

(let ((cons-cell-pool (make-list 60))
      (pool-tail nil)
      (temporary-pool-tail nil)
      (seq-length 0))
  (declare
   (type cons cons-cell-pool)
   (type list pool-tail temporary-pool-tail)
   (type fixnum seq-length))
  (let ((pool-length (list-length cons-cell-pool)))
    (declare
     (type fixnum pool-length))
    (defun my-split-string (s seq/char)
      (declare (optimize (speed 3) (safety 0))
	       (inline
		 split-simple-string-on-char
		 split-string-on-char
		 split-simple-string-on-list-of-chars
		 split-string-on-list-of-chars
		 split-simple-string-on-simple-string
		 split-string-on-simple-string
		 split-simple-string-on-vector
		 split-string-on-vector)
	       (type string s)
	       (type (or character list simple-string string vector (not bit-vector)) seq/char))
      (unless seq/char
	(return-from my-split-string s))
      (if (typep s 'simple-string)
	  (if (typep seq/char 'character)
	      (split-simple-string-on-char s seq/char)
	    (if (typep seq/char 'list)
		(if (eq (cdr seq/char) nil)
		    (split-simple-string-on-char s (car seq/char))
		  (split-simple-string-on-list-of-chars s seq/char))	 
	      (if (= (setq seq-length (length seq/char)) 1)
		  (split-simple-string-on-char s (aref seq/char 0))
		(if (typep seq/char 'simple-string)
		    (split-simple-string-on-simple-string s seq/char)
		  (if (< seq-length pool-length)
		      (progn
			(loop for i of-type t across seq/char
			      for j of-type list on cons-cell-pool
			      do (rplaca j i)
			      finally (setq temporary-pool-tail j pool-tail (cdr j))
			      (rplacd j NIL))
			(prog1 (split-simple-string-on-list-of-chars s cons-cell-pool)
			  (rplacd temporary-pool-tail pool-tail)))
		    (split-simple-string-on-vector s seq/char))))))
	(if (typep seq/char 'character)
	    (split-string-on-char s seq/char)
	  (if (typep seq/char 'list)
	      (if (eq (cdr seq/char) nil)
		  (split-string-on-char s (car seq/char))
		(split-string-on-list-of-chars s seq/char))
	    (if (= (setq seq-length (length seq/char)) 1)
		(split-string-on-char s (aref seq/char 0))
	      (if (typep seq/char 'simple-string)
		  (split-string-on-simple-string s seq/char)
		(if (< seq-length pool-length)
		    (progn
		      (loop for i of-type t across seq/char
			    for j of-type list on cons-cell-pool
			    do (rplaca j i)
			    finally (setq temporary-pool-tail j pool-tail (cdr j))
			    (rplacd j NIL))
		      (prog1 (split-string-on-list-of-chars s cons-cell-pool)
			(rplacd temporary-pool-tail pool-tail)))
		  (split-string-on-vector s seq/char))))))))))
;;; MY-SPLIT-STRING ends here.

Regards, Szymon.