(in-package :kira)
(eval-always
(defconstant +char-class-bits+ 257 "8-bit + catch-all bit.")
(deftype char-class () `(simple-bit-vector ,+char-class-bits+))
(defvar *char-class-table* (make-hash-table :test 'equal))
(defun char-class-p (object) (typep object 'char-class))
(defun make-char-class (&optional (initial-element 0))
(make-bit-vector +char-class-bits+ initial-element)))
(defmacro define-char-class (name lambda-list &body body)
(multiple-value-bind (lambda-expression documentation)
(parse-macro-1 name lambda-list body)
`(eval-always
(prog1 (progn (clrhash *char-class-table*) ',name)
(setf (documentation ',name 'char-class) ,documentation)
(setf (get ',name 'char-class) #',lambda-expression)))))
(define-char-class digit (&optional (radix 10))
(check-type radix (integer 2 36) "an integer radix")
(let ((upper-case-alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
(cond ((<= radix 10) (subseq "0123456789" 0 radix))
(t (let ((upper-case-alphabet (subseq upper-case-alphabet 0 (- radix 10))))
`(or digit ,upper-case-alphabet ,(string-downcase upper-case-alphabet)))))))
(define-char-class satisfies (predicate-name)
(loop with class = (make-char-class) for code fixnum from 0 below (1- +char-class-bits+)
when (funcall predicate-name (code-char code)) do (setf (sbit class code) 1)
finally (return class)))
(eval-always
(defun ascii-char-p (char)
(when (characterp char)
(let ((code (char-code char)))
(declare (type fixnum code))
(< code 128)))))
(eval-always
(deftype ascii-char ()
'(satisfies ascii-char-p)))
(eval-always
(defun whitespacep (thing)
(cond ((sequencep thing) (every #'whitespacep thing))
((case thing ((#\Return #\Linefeed #\Page #\Space
#\PageUp #\Tab #\No-Break_Space) t))))))
(define-char-class ascii () '(satisfies ascii-char-p))
(define-char-class graphic () '(satisfies graphic-char-p))
(define-char-class whitespace () '(satisfies whitespacep))
(define-char-class horizontal-whitespace () '(or #\Tab #\Space))
(define-char-class upper-case () "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(define-char-class lower-case () "abcdefghijklmnopqrstuvwxyz")
(define-char-class alpha () '(or upper-case lower-case))
(define-char-class alphanumeric () '(or alpha digit))
(define-char-class t ()
"Set of all characters."
(make-char-class 1))
(define-char-class nil ()
"Empty set of characters."
(make-char-class 0))
(define-char-class vertical-whitespace ()
'(and whitespace (not horizontal-whitespace)))
(eval-always
(defun char-class-expand-1 (form &optional env)
(let ((x (typecase form (null '(nil)) (symbol (list form)) (otherwise form))))
(let ((expander (and (typep x '(cons symbol t)) (get (car x) 'char-class))))
(if expander (values (funcall expander x env) t) (values form nil))))))
(eval-always
(defun char-class-expand (form &optional environment)
(loop with expanded-p and any-expanded-p = nil
do (multiple-value-setq (form expanded-p)
(char-class-expand-1 form environment))
while expanded-p do (setq any-expanded-p t)
finally (return (values form any-expanded-p)))))
(define-char-class not (specifier)
(bit-not (char-class specifier)))
(eval-always
(defun and-2 (spec-x spec-y)
(let ((class-x (char-class spec-x))
(class-y (char-class spec-y)))
(declare (type char-class class-x))
(declare (type char-class class-y))
(bit-and class-x class-y))))
(eval-always
(defun or-2 (spec-x spec-y)
(let ((class-x (char-class spec-x))
(class-y (char-class spec-y)))
(declare (type char-class class-x))
(declare (type char-class class-y))
(bit-ior class-x class-y))))
(eval-always
(defun make-char-class-from-string (string)
(loop with char-class = (make-char-class)
for char of-type character across string
for char-code fixnum = (char-code char)
do (setf (bit char-class char-code) 1)
finally (return char-class))))
(eval-always
(defun char-class (specifier)
(setq specifier (char-class-expand specifier))
(cond ((char-class-p specifier) specifier)
(t (orf (gethash specifier *char-class-table*)
(ctypecase specifier
(string (make-char-class-from-string specifier))
((cons (eql and)) (reduce #'and-2 (cdr specifier) :initial-value (char-class t)))
((cons (eql or)) (reduce #'or-2 (cdr specifier) :initial-value (char-class nil)))
(ascii-char (let ((char-class (make-char-class)) (char-code (char-code specifier)))
(setf (sbit char-class char-code) 1) char-class))))))))
(define-compiler-macro char-class (&whole char-class-form specifier)
(multiple-value-bind (specifier specifier-evaluated-p) (macro-eval specifier)
(unless specifier-evaluated-p (return-from char-class char-class-form))
(char-class specifier)))
(eval-always
(defmacro %char-of-class-p (char class)
(let ((index `(min (char-code ,char) ,(1- +char-class-bits+))))
`(not (zerop (sbit ,class (the array-index ,index)))))))
(eval-always
(defun char-of-class-p (char specifier)
(let ((class (char-class specifier)))
(%char-of-class-p char class))))
(define-compiler-macro char-of-class-p (char specifier &aux class position)
(multiple-value-bind (specifier specifier-evaluated-p) (macro-eval specifier)
(when specifier-evaluated-p
(setq class (char-class specifier))
(cond ((not (setq position (position 1 class))) (return-from char-of-class-p nil))
((and (= position (position 1 class :from-end t)) (/= position (1- +char-class-bits+)))
(return-from char-of-class-p `(eql ,char ,(code-char position))))
((not (setq position (position 0 class))) (return-from char-of-class-p t))
((and (= position (position 0 class :from-end t)) (/= position (1- +char-class-bits+)))
(return-from char-of-class-p `(not (eql ,char ,(code-char position)))))))
(orf class `(char-class ,specifier))
`(%char-of-class-p ,char ,class)))
(define-condition lexer-error (error)
((thing :initarg :thing :reader lexer-error-thing)
(string :initarg :string :reader lexer-error-string)
(position :initarg :position :reader lexer-error-position))
(:report (lambda (condition stream)
(with-accessors ((lexer-error-position lexer-error-position)) condition
(format stream "Invalid ~A:~%~S~%~V@T^" (lexer-error-thing condition)
(lexer-error-string condition) (1+ lexer-error-position))))))
(defmacro with-lexer-error
((thing &optional junk-allowed) &body body)
(with-gensyms (normal-return-block lexer-error-tag)
`(macrolet ((lexer-error () `(go ,',lexer-error-tag)))
(block ,normal-return-block
(tagbody
(return-from
,normal-return-block
(locally ,@body))
,lexer-error-tag
(unless ,junk-allowed
(error 'lexer-error
:thing ,thing
:position lexer-position
:string (subseq lexer-string
lexer-start
lexer-end))))))))
(defmacro with-lexer ((string &optional (start 0) end) &body body)
"Set up lexer comprising: LEXER-STRING, LEXER-START, LEXER-END, and LEXER-POSITION."
`(with-array-data ((lexer-string ,string) (lexer-start ,start) (lexer-end ,end))
(check-type lexer-string simple-string)
(let ((lexer-position lexer-start))
(declare (type array-index lexer-position))
(declare (type simple-string lexer-string))
,@body)))
(defmacro lexer-case ((char &rest final) &rest clauses)
`(cond ((>= lexer-position lexer-end) ,@(or final '(nil)))
(t (let ((,char (schar lexer-string lexer-position)))
(declare (type character ,char) (ignorable ,char))
(cond ,.(loop for clause in clauses
for class = (pop clause) do (orf clause '(nil))
when (eq class 'otherwise) do (setq class 't)
collect `((char-of-class-p ,char ',class)
(incf lexer-position) ,@clause)))))))
(defmacro lexer-match (class &rest classes)
(loop with char = (gensym (string 'char)) for class in (list* class classes)
collect `(lexer-case (,char) (,class ,char)) into lexer-match-sequence
finally (return (if (rest lexer-match-sequence)
`(lexer-advance-if ,.lexer-match-sequence)
(first lexer-match-sequence)))))
(defmacro lexer-advance-if (test . tests)
(with-gensyms (previous-lexer-position)
`(let ((,previous-lexer-position lexer-position))
(or (and ,test ,@tests) (multiple-value-prog1 (values nil lexer-position)
(setq lexer-position ,previous-lexer-position))))))
(defmacro lexer-find (class)
(with-gensyms (position char)
`(when (< lexer-position lexer-end)
(loop with ,position of-type array-index = lexer-position
for ,char of-type character = (schar lexer-string ,position)
do (cond ((char-of-class-p ,char ',class) (return ,position))
((eql (incf ,position) lexer-end) (return nil)))))))
(defmacro lexer-skip (class)
(with-gensyms (char again)
`(the array-index
(prog1 lexer-position
(tagbody ,again
(lexer-case (,char)
(,class (go ,again))))))))
(defmacro get-token (class)
(cond ((eq class t) '(subseq lexer-string lexer-position lexer-end))
(t `(subseq lexer-string (lexer-skip ,class) lexer-position))))
(defmacro lexer-relative-position ()
`(the array-index (- lexer-position
lexer-start)))
(defstatic +digits+
(loop with digit-weight-type = '(integer 0 (36))
with max = (reduce #'max (map 'vector #'char-code "9Zz"))
with digits = (make-array (1+ max) :element-type digit-weight-type)
for i to max as weight = (or (digit-char-p (code-char i) 36) 0)
do (setf (aref digits i) weight) finally (return digits)))
(defmacro lexer-unsigned (&optional (radix 10))
(with-gensyms (unsigned-value digit-char digit-weight)
`(loop with ,unsigned-value finally (return ,unsigned-value)
while (lexer-case (,digit-char)
((digit ,radix)
(setq ,unsigned-value (if ,unsigned-value (* ,radix ,unsigned-value) 0))
(let ((,digit-weight (aref (load-time-value +digits+) (char-code ,digit-char))))
(declare (type (mod 36) ,digit-weight)) (incf ,unsigned-value ,digit-weight)))))))
(defmacro lexer-integer (&rest options)
(with-gensyms (sign minusp unsigned-value)
`(lexer-advance-if (let ((,minusp (lexer-case (,sign) (#\+ nil) (#\- t))))
(whereas ((,unsigned-value (lexer-unsigned ,@options)))
(if ,minusp (- ,unsigned-value) ,unsigned-value))))))
(defun %trim-name (buffer &optional (name-length-limit 80))
(cond ((<= (fill-pointer buffer) name-length-limit) buffer)
((let* ((type-marker-position (position #\. buffer :from-end t))
(type (when type-marker-position (subseq buffer (1+ type-marker-position))))
(fill-pointer (position #\- buffer :from-end t :end name-length-limit)))
(setf (fill-pointer buffer) (or fill-pointer name-length-limit))
(when (< 0 (length type) 7) (format buffer ".~A" type))
buffer))))
(defun collapse (string &optional allow-dot-p &key (start 0) end)
"Ensure STRING is valid as the name + type of a logical pathname."
(with-lexer (string start end)
(loop with state = nil and dot-position = nil
with buffer-size fixnum = (- lexer-end lexer-start)
with buffer = (make-buffer buffer-size 'standard-char)
do (lexer-case (char (return (%trim-name buffer)))
(#\_ (vector-push-extend #\_ buffer)
(setq state :hard-separator))
(alphanumeric
(when (eq state :soft-separator)
(vector-push-extend #\- buffer))
(vector-push-extend char buffer)
(setq state :alphanumeric-sequence))
(otherwise
(cond ((eql char #\'))
((and allow-dot-p (eql char #\.))
(when dot-position
(setf (char buffer dot-position) #\_))
(setq dot-position (fill-pointer buffer))
(vector-push-extend #\. buffer)
(setq state :hard-separator))
((>= (char-code char) 128))
((eq state :alphanumeric-sequence)
(setq state :soft-separator))))))))
(defun matcher-pattern (symbol terminated-by)
(loop for char across (symbol-name symbol)
collect char into symbol-characters
finally (return (append symbol-characters
(list terminated-by)))))
(defun make-matcher (symbols terminated-by &aux (states (make-buffer 50 t)))
(loop with matcher = (cons states nil) initially (vector-push-extend (list nil) states)
for symbol in symbols as pair = (cons (matcher-pattern symbol terminated-by) symbol)
do (push pair (cdr matcher)) finally (compute-states '() matcher) (return matcher)))
(defun compute-possible-matches (state matcher)
(loop with index fixnum = (length state) and alist = (cdr matcher)
for (pattern . symbol) in alist as length = (list-length pattern)
unless (or (<= length index) (mismatch pattern state :end1 index))
collect (cons (nthcdr index pattern) symbol) into pruned-alist
finally (return (delete-duplicates pruned-alist :key #'caar))))
(defun compute-transition (state pattern symbol matcher)
(destructuring-bind (char . remaining-pattern) pattern
(cond ((endp remaining-pattern) `((,char) (return ',symbol)))
(t (let ((next-state (append state (list char))) (states (car matcher)))
(let ((next-state-number (vector-push-extend (cons next-state nil) states)))
(compute-states next-state matcher) `(,char (setq state ,next-state-number))))))))
(defun compute-states (state matcher &aux (states (car matcher)))
(loop for (pattern . symbol) in (compute-possible-matches state matcher)
collect (compute-transition state pattern symbol matcher) into char-clauses
finally (let ((transition-form `(case char ,.char-clauses (otherwise (loop-finish)))))
(setf (cdr (aref states (position state states :key #'car :test #'equal)))
transition-form))))
(defmacro match-symbol (symbols &key case-sensitive-p terminated-by if-none-match)
(loop with match-failure-forms = (when (eq if-none-match :error) '((lexer-error)))
with char-upcase-form = '(when char (setq char (char-upcase char)))
with states = (car (make-matcher (ensure-list symbols) terminated-by))
for state-number of-type array-index from 0 below (fill-pointer states)
collect `(,state-number ,(cdr (aref states state-number))) into clauses
finally (return (with-gensyms (saved-position)
(let ((state-forms `((case state ,.clauses))))
(unless case-sensitive-p (push char-upcase-form state-forms))
`(loop with state of-type (integer 0 (,(fill-pointer states)))
with ,saved-position of-type fixnum = lexer-position
do (let ((char (lexer-match t))) ,@state-forms)
finally (setq lexer-position ,saved-position)
,@match-failure-forms))))))