From: Szymon
Subject: [long] fast (?) string splitting.
Date:
Message-ID: <87sm7vhsj3.fsf@eva.rplacd.net>
Hi.
I just finished a (trivial) string utility.
It can split strings on set of characters.
(my-split-string "foo:bar:baz" #\:)
===> ("foo" "bar" "baz")
(my-split-string "foo:bar:baz" #(#\o #\a))
===> ("f" ":b" "r:b" "z")
First argument must be type of STRING.
Second one can be STRING/VECTOR (not BIT-VECTOR), CHARACTER or LIST.
Enjoy.
Some benchmarks (CMUCL):
first argument: STRINGs, and SIMPLE-STRINGs (all possible).
second argument: all possible (exept single chars[**]).
(do-tests '(my-split-string cllib:split-string) 20000)
===>
operator avg. no of cpu cycles. avg. no of bytes consed.
-------------------------------------------------------------------
MY-SPLIT-STRING 117435 1260
SPLIT-STRING [*] 822833 1702
-------------------------------------------------------------------
first argument: STRINGs (no simple ones).
second argument: all possible (exept single chars[**]).
(do-tests '(my-split-string cllib:split-string) 20000)
===>
operator avg. no of cpu cycles. avg. no of bytes consed.
-------------------------------------------------------------------
MY-SPLIT-STRING 179054 1259
SPLIT-STRING [*] 861348 1700
-------------------------------------------------------------------
first argument: only SIMPLE-STRINGs.
second argument: all possible (exept single chars[**]).
(do-tests '(my-split-string cllib:split-string) 20000)
===>
operator avg. no of cpu cycles. avg. no of bytes consed.
-------------------------------------------------------------------
MY-SPLIT-STRING 50171 1266
SPLIT-STRING [*] 714784 1713
-------------------------------------------------------------------
[*] [ http://cvs.sourceforge.net/viewcvs.py/*checkout*/clocc/clocc/src/cllib/string.lisp]
[**] because CLLIB:SPLIT-STRING cannot handle single characters.
Code:
;;; -*- Mode: Lisp -*-
;;;
;;;
;;; function: MY-SPLIT-STRING
;;;
;;;
;;; first arg: SIMPLE-STRING/STRING
;;;
;;; second arg: CHARACTER, LIST, SIMPLE-STRING/STRING, SIMPLE-VECTOR/VECTOR
;;; not BIT-VECTOR.
;;;
;;; returns: LIST
;;;
;;;
;;; example:
;;;
;;; (my-split-string "foo:bar:baz,quux,foshmoo" ":,")
;;;
;;; ==> ("foo" "bar" "baz" "quux" "foshmoo")
;;;
(declaim
(inline is-in-simple-string-p is-in-string-p is-in-vector-p)
(ftype (function (character simple-string) boolean) is-in-simple-string-p)
(ftype (function (character string) boolean) is-in-string-p)
(ftype (function (character vector) boolean) is-in-vector-p)
(ftype (function (simple-string character) list) split-simple-string-on-char)
(ftype (function (string character) list) split-string-on-char)
(ftype (function (simple-string list) list) split-simple-string-on-list-of-chars)
(ftype (function (string list) list) split-string-on-list-of-chars)
(ftype (function (simple-string simple-string) list) split-simple-string-on-simple-string)
(ftype (function (string simple-string) list) split-string-on-simple-string)
(ftype (function (simple-string vector) list) split-simple-string-on-vector)
(ftype (function (string vector) list) split-string-on-vector))
(defun is-in-simple-string-p (ch str)
(declare
(optimize (speed 3) (safety 0))
(type character ch)
(type simple-string str))
(loop for current-char of-type character across (the simple-string str)
if (eql current-char ch) return T))
(defun is-in-string-p (ch str)
(declare
(optimize (speed 3) (safety 0))
(type character ch)
(type string str))
(loop for current-char of-type character across (the string str)
if (eql current-char ch) return T))
(defun is-in-vector-p (ch vector)
(declare
(optimize (speed 3) (safety 0))
(type character ch)
(type vector vector))
(loop for current-char of-type character across (the vector vector)
if (eql current-char ch) return T))
(defmacro make-splitter (&key fname first-arg-type second-arg-type)
(let ((acc-func-1
(ecase first-arg-type
(string 'char)
(simple-string 'schar)))
(acc-func-2
(ecase second-arg-type
(character 'eql)
(list 'member)
(simple-string 'is-in-simple-string-p)
(string 'is-in-string-p)
(vector 'is-in-vector-p))))
`(defun ,fname (s c)
(declare
(optimize (speed 3) (safety 0))
(,first-arg-type s)
(,second-arg-type c))
(prog ((head-index 0) (tail-index 0) (i 0) (j 0) (l 0) (s-len 0) (slen-1 0)
(result-string "") (result (list nil)) result-pointer)
(declare
(type fixnum i j l s-len slen-1 head-index tail-index)
(type simple-string result-string)
(type cons result)
(type list result-pointer))
(setq s-len (array-dimension s 0)
slen-1 (1- s-len)
tail-index slen-1
result-pointer result)
|1| (when (,acc-func-2 (,acc-func-1 s head-index) c)
(if (eql head-index slen-1)
(return-from ,fname NIL))
(incf head-index)
(go |1|))
|2| (when (,acc-func-2 (,acc-func-1 s tail-index) c)
(if (eql tail-index head-index)
(return-from ,fname NIL))
(decf tail-index)
(go |2|))
(if (eql head-index tail-index)
(return-from ,fname
(prog1 result
(rplaca result (string (,acc-func-1 s tail-index))))))
(setq i head-index
j head-index
slen-1 tail-index
s-len (1+ slen-1))
|3| (if (,acc-func-2 (,acc-func-1 s (incf i)) c)
(if (eql i j)
(tagbody |-| (if (,acc-func-2 (,acc-func-1 s (incf i)) c) (go |-|)) (setq j i))
(setq result-pointer
(cdr
(rplacd result-pointer
(list
(prog1 (setq l -1 result-string (make-string (- i j)))
(tagbody |-|
(setf (schar result-string (incf l))
(,acc-func-1 s j))
(unless (eql (incf j) i) (go |-|)))
(incf j))))))))
(unless (eql i slen-1) (go |3|))
(rplacd result-pointer
(list
(prog1 (setq l -1 result-string (make-string (- s-len j)))
(tagbody |-|
(setf (schar result-string (incf l))
(,acc-func-1 s j))
(unless (eql (incf j) s-len) (go |-|))))))
(return-from ,fname (cdr result))))))
;; make splitters on single character.
(make-splitter :fname split-simple-string-on-char
:first-arg-type simple-string
:second-arg-type character)
(make-splitter :fname split-string-on-char
:first-arg-type string
:second-arg-type character)
;; make splitters on list of characters.
(make-splitter :fname split-simple-string-on-list-of-chars
:first-arg-type simple-string
:second-arg-type list)
(make-splitter :fname split-string-on-list-of-chars
:first-arg-type string
:second-arg-type list)
;; make splitters on simple string.
(make-splitter :fname split-simple-string-on-simple-string
:first-arg-type simple-string
:second-arg-type simple-string)
(make-splitter :fname split-string-on-simple-string
:first-arg-type string
:second-arg-type simple-string)
;; make splitters on vector.
(make-splitter :fname split-simple-string-on-vector
:first-arg-type simple-string
:second-arg-type vector)
(make-splitter :fname split-string-on-vector
:first-arg-type string
:second-arg-type vector)
;; main.
(let ((cons-cell-pool (make-list 60))
(pool-tail nil)
(temporary-pool-tail nil)
(seq-length 0))
(declare
(type cons cons-cell-pool)
(type list pool-tail temporary-pool-tail)
(type fixnum seq-length))
(let ((pool-length (list-length cons-cell-pool)))
(declare
(type fixnum pool-length))
(defun my-split-string (s seq/char)
(declare (optimize (speed 3) (safety 0))
(inline
split-simple-string-on-char
split-string-on-char
split-simple-string-on-list-of-chars
split-string-on-list-of-chars
split-simple-string-on-simple-string
split-string-on-simple-string
split-simple-string-on-vector
split-string-on-vector)
(type string s)
(type (or character list simple-string string vector (not bit-vector)) seq/char))
(unless seq/char
(return-from my-split-string s))
(if (typep s 'simple-string)
(if (typep seq/char 'character)
(split-simple-string-on-char s seq/char)
(if (typep seq/char 'list)
(if (eq (cdr seq/char) nil)
(split-simple-string-on-char s (car seq/char))
(split-simple-string-on-list-of-chars s seq/char))
(if (= (setq seq-length (length seq/char)) 1)
(split-simple-string-on-char s (aref seq/char 0))
(if (typep seq/char 'simple-string)
(split-simple-string-on-simple-string s seq/char)
(if (< seq-length pool-length)
(progn
(loop for i of-type t across seq/char
for j of-type list on cons-cell-pool
do (rplaca j i)
finally (setq temporary-pool-tail j pool-tail (cdr j))
(rplacd j NIL))
(prog1 (split-simple-string-on-list-of-chars s cons-cell-pool)
(rplacd temporary-pool-tail pool-tail)))
(split-simple-string-on-vector s seq/char))))))
(if (typep seq/char 'character)
(split-string-on-char s seq/char)
(if (typep seq/char 'list)
(if (eq (cdr seq/char) nil)
(split-string-on-char s (car seq/char))
(split-string-on-list-of-chars s seq/char))
(if (= (setq seq-length (length seq/char)) 1)
(split-string-on-char s (aref seq/char 0))
(if (typep seq/char 'simple-string)
(split-string-on-simple-string s seq/char)
(if (< seq-length pool-length)
(progn
(loop for i of-type t across seq/char
for j of-type list on cons-cell-pool
do (rplaca j i)
finally (setq temporary-pool-tail j pool-tail (cdr j))
(rplacd j NIL))
(prog1 (split-string-on-list-of-chars s cons-cell-pool)
(rplacd temporary-pool-tail pool-tail)))
(split-string-on-vector s seq/char))))))))))
;;; MY-SPLIT-STRING ends here.
Regards, Szymon.