;;;; -*- Mode: Lisp; Base: 10; Syntax: ANSI-Common-Lisp; Package: kira; Coding: utf-8 -*-
;;;; Copyright © 2021 David Mullen. All Rights Reserved. Origin: <https://cl-pdx.com/kira/>

(in-package :kira)

  (defconstant +char-class-bits+ 129 "ASCII + catch-all bit.")
  (deftype char-class () `(simple-bit-vector ,+char-class-bits+))
  (defvar *char-class-table* (make-hash-table :test 'equal))
  (defun char-class-p (object) (typep object 'char-class))
  (defun make-char-class (&optional (initial-element 0))
    (make-sequence 'simple-bit-vector +char-class-bits+
                   :initial-element initial-element)))

(defmacro define-char-class (macro-name lambda-list &body body)
  (multiple-value-bind (lambda-list whole-macro-form environment)
      (normalize-lambda-list lambda-list t t)
    (orf environment (gensym (string 'environment)))
    (orf whole-macro-form (gensym (string 'whole-macro-form)))
    (let ((macro-arguments `(if (atom ,whole-macro-form) '() (rest ,whole-macro-form))))
      (multiple-value-bind (body declarations documentation) (parse-body body nil t)
           (prog1 ',macro-name
             (clrhash *char-class-table*) ; Wipe out cached expansions.
             (setf (documentation ',macro-name 'char-class) ',documentation)
             (setf (get ',macro-name 'char-class)
                   #'(lambda (,whole-macro-form ,environment)
                       (declare (ignorable ,environment))
                       (destructuring-bind ,lambda-list ,macro-arguments
                         ,.declarations (block ,macro-name ,@body))))))))))

(define-char-class digit (&optional (radix 10))
  (check-type radix (integer 2 36) "an integer radix")
  (let ((upper-case-alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
    (cond ((<= radix 10) (subseq "0123456789" 0 radix))
          (t (let ((upper-case-alphabet (subseq upper-case-alphabet 0 (- radix 10))))
               `(or digit ,upper-case-alphabet ,(string-downcase upper-case-alphabet)))))))

(define-char-class satisfies (predicate-name)
  (loop with class = (make-char-class) for code fixnum from 0 below (1- +char-class-bits+)
        when (funcall predicate-name (code-char code)) do (setf (sbit class code) 1)
        finally (return class)))

  (defun ascii-char-p (char)
    (< (char-code char) 128)))

  (deftype ascii-char ()
    '(satisfies ascii-char-p)))

(define-char-class ascii () '(satisfies ascii-char-p))
(define-char-class graphic () '(satisfies graphic-char-p))
(define-char-class whitespace () '(satisfies whitespacep))
(define-char-class horizontal-whitespace () '(or #\Tab #\Space))
(define-char-class upper-case () "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(define-char-class lower-case () "abcdefghijklmnopqrstuvwxyz")
(define-char-class alpha () '(or upper-case lower-case))
(define-char-class alphanumeric () '(or alpha digit))
(define-char-class t () (make-char-class 1))
(define-char-class nil () (make-char-class))

(define-char-class vertical-whitespace ()
  '(and whitespace (not horizontal-whitespace)))

  (defun char-class-expand-1 (form &optional env)
    (let ((name (if (consp form) (car form) form)))
      (whereas ((expander (if (or (symbolp form) (consp form)) (get name 'char-class))))
        (return-from char-class-expand-1 (values (funcall expander form env) t)))
      (values form nil))))

  (defun char-class-expand (form &optional environment)
    (loop with expanded-p and any-expanded-p = nil
	  do (multiple-value-setq (form expanded-p)
               (char-class-expand-1 form environment))
	  while expanded-p do (setq any-expanded-p t)
	  finally (return (values form any-expanded-p)))))

(define-char-class not (specifier)
  (bit-not (char-class specifier)))

  (defun and-2 (spec-x spec-y)
    (let ((class-x (char-class spec-x))
          (class-y (char-class spec-y)))
      (declare (type char-class class-x))
      (declare (type char-class class-y))
      (bit-and class-x class-y))))

  (defun or-2 (spec-x spec-y)
    (let ((class-x (char-class spec-x))
          (class-y (char-class spec-y)))
      (declare (type char-class class-x))
      (declare (type char-class class-y))
      (bit-ior class-x class-y))))

  (defun make-char-class-from-string (string)
    (loop with char-class = (make-char-class)
          for char of-type character across string
          for char-code fixnum = (char-code char)
          finally (return char-class)
          do (require-type char 'ascii-char)
             (setf (sbit char-class char-code)

  (defun char-class (specifier)
    (setq specifier (char-class-expand specifier))
    (cond ((char-class-p specifier) specifier)
          (t (orf (gethash specifier *char-class-table*)
                  (ctypecase specifier
                    (string (make-char-class-from-string specifier))
                    ((cons (eql and)) (reduce #'and-2 (cdr specifier) :initial-value (char-class t)))
                    ((cons (eql or)) (reduce #'or-2 (cdr specifier) :initial-value (char-class nil)))
                    (ascii-char (let ((char-class (make-char-class)) (char-code (char-code specifier)))
                                  (setf (sbit char-class char-code) 1) char-class))))))))

(define-compiler-macro char-class (&whole char-class-form specifier)
  (multiple-value-bind (specifier specifier-evaluated-p) (macro-eval specifier)
    (unless specifier-evaluated-p (return-from char-class char-class-form))
    (char-class specifier)))

  (defmacro %char-of-class-p (char class)
    (let ((index `(min (char-code ,char) ,(1- +char-class-bits+))))
      `(not (zerop (sbit ,class (the array-index ,index)))))))

  (defun char-of-class-p (char specifier)
    (let ((class (char-class specifier)))
      (%char-of-class-p char class))))

(define-compiler-macro char-of-class-p (char specifier &aux class position)
  (multiple-value-bind (specifier specifier-evaluated-p) (macro-eval specifier)
    (when specifier-evaluated-p
      (setq class (char-class specifier))
      (cond ((not (setq position (position 1 class))) (return-from char-of-class-p nil))
            ((and (= position (position 1 class :from-end t)) (/= position (1- +char-class-bits+)))
             (return-from char-of-class-p `(eql ,char ,(code-char position))))
            ((not (setq position (position 0 class))) (return-from char-of-class-p t))
            ((and (= position (position 0 class :from-end t)) (/= position (1- +char-class-bits+)))
             (return-from char-of-class-p `(not (eql ,char ,(code-char position)))))))
    (orf class `(char-class ,specifier))
    `(%char-of-class-p ,char ,class)))

(define-condition lexer-error (error)
  ((thing :initarg :thing :reader lexer-error-thing)
   (string :initarg :string :reader lexer-error-string)
   (position :initarg :position :reader lexer-error-position))
  (:report (lambda (condition stream)
             (with-accessors ((lexer-error-position lexer-error-position)) condition
               (format stream "Invalid ~A:~%~S~%~V@T^" (lexer-error-thing condition)
                       (lexer-error-string condition) (1+ lexer-error-position))))))

(defmacro with-lexer-error
    ((thing &optional junk-allowed) &body body)
  (with-gensyms (normal-return-block lexer-error-tag)
    `(macrolet ((lexer-error () `(go ,',lexer-error-tag)))
       (block ,normal-return-block
             (locally ,@body))
           (unless ,junk-allowed
             (error 'lexer-error
                    :thing ,thing
                    :position lexer-position
                    :string (subseq lexer-string

(defmacro with-lexer ((string &optional (start 0) end) &body body)
  "Set up lexer comprising: LEXER-STRING, LEXER-START, LEXER-END, and LEXER-POSITION."
  `(with-array-data ((lexer-string ,string) (lexer-start ,start) (lexer-end ,end))
     (check-type lexer-string simple-string)
     (let ((lexer-position lexer-start))
       (declare (type array-index lexer-position))
       (declare (type simple-string lexer-string))

(defmacro lexer-case ((char &rest final) &rest clauses)
  `(cond ((>= lexer-position lexer-end) ,@(or final '(nil)))
         (t (let ((,char (schar lexer-string lexer-position)))
              (declare (type character ,char) (ignorable ,char))
              (cond ,.(loop for clause in clauses
                            for class = (pop clause) do (orf clause '(nil))
                            when (eq class 'otherwise) do (setq class 't)
                            collect `((char-of-class-p ,char ',class)
                                      (incf lexer-position) ,@clause)))))))

(defmacro lexer-match (class &rest classes)
  (loop with char = (gensym (string 'char)) for class in (list* class classes)
        collect `(lexer-case (,char) (,class ,char)) into lexer-match-sequence
        finally (return (if (rest lexer-match-sequence)
                            `(lexer-advance-if ,.lexer-match-sequence)
                            (first lexer-match-sequence)))))

(defmacro lexer-advance-if (test . tests)
  (with-gensyms (previous-lexer-position)
    `(let ((,previous-lexer-position lexer-position))
       (or (and ,test ,@tests) (prog1 nil ; Restore original LEXER-POSITION.
                                 (setq lexer-position ,previous-lexer-position))))))

(defmacro lexer-find (class)
  (with-gensyms (position)
    `(loop for ,position of-type fixnum from lexer-position below lexer-end
           when (char-of-class-p (schar lexer-string ,position) ',class)
           return ,position)))

(defmacro lexer-skip (class)
  (with-gensyms (char again)
    `(the array-index
       (prog1 lexer-position
         (tagbody ,again
           (lexer-case (,char)
             (,class (go ,again))))))))

(defmacro get-token (class)
  (cond ((eq class t) '(subseq lexer-string lexer-position lexer-end))
        (t `(subseq lexer-string (lexer-skip ,class) lexer-position))))

(defstatic +digits+
  (loop with digit-weight-type = '(integer 0 (36))
        with max = (reduce #'max (map 'vector #'char-code "9Zz"))
        with digits = (make-array (1+ max) :element-type digit-weight-type)
        for i to max as weight = (or (digit-char-p (code-char i) 36) 0)
        do (setf (aref digits i) weight) finally (return digits)))

(defmacro lexer-unsigned (&optional (radix 10))
  (with-gensyms (unsigned-value digit-char digit-weight)
    `(loop with ,unsigned-value finally (return ,unsigned-value)
           while (lexer-case (,digit-char)
                   ((digit ,radix)
                    (setq ,unsigned-value (if ,unsigned-value (* ,radix ,unsigned-value) 0))
                    (let ((,digit-weight (aref (load-time-value +digits+) (char-code ,digit-char))))
                      (declare (type (mod 36) ,digit-weight)) (incf ,unsigned-value ,digit-weight)))))))

(defmacro lexer-integer (&rest options)
  (with-gensyms (sign minusp unsigned-value)
    `(lexer-advance-if (let ((,minusp (lexer-case (,sign) (#\+ nil) (#\- t))))
                         (whereas ((,unsigned-value (lexer-unsigned ,@options)))
                           (if ,minusp (- ,unsigned-value) ,unsigned-value))))))

(defun %trim-name (buffer &optional (name-length-limit 80))
  (cond ((<= (fill-pointer buffer) name-length-limit) buffer)
        ((let* ((type-marker-position (position #\. buffer :from-end t))
                (type (when type-marker-position (subseq buffer (1+ type-marker-position))))
                (fill-pointer (position #\- buffer :from-end t :end name-length-limit)))
           (setf (fill-pointer buffer) (or fill-pointer name-length-limit))
           (when (< 0 (length type) 7) (format buffer ".~A" type))

(defun collapse (string &optional allow-dots-p &key (start 0) end)
  "Make STRING acceptable as the filename part of a logical namestring."
  (with-lexer (string start end)
    (loop with state = nil
          with buffer-size fixnum = (- lexer-end lexer-start)
          with buffer = (make-buffer buffer-size 'standard-char)
          do (lexer-case (char (return (%trim-name buffer)))
               (#\_ (vector-push-extend #\_ buffer)
                    (setq state :hard-separator))
                (when (eq state :soft-separator)
                  (vector-push-extend #\- buffer))
                (vector-push-extend char buffer)
                (setq state :alphanumeric-sequence))
                (cond ((eql char #\')) ; Skip apostrophes.
                      ((and allow-dots-p (eql char #\.))
                       (vector-push-extend #\. buffer)
                       (setq state :hard-separator))
                      ((>= (char-code char) 128))
                      ((eq state :alphanumeric-sequence)
                       (setq state :soft-separator))))))))

(defun make-matcher (symbols terminated-by &aux matcher)
  (loop with states = (make-array (* 7 (length symbols)) :fill-pointer 1 :adjustable t)
        initially (setf (aref states 0) (list nil)) ; The initial state is the null prefix.
        for symbol in symbols as string = (format nil "~:@(~A~)~@[~A~]" symbol terminated-by)
        for pattern = (coerce string 'list) collect (cons pattern symbol) into alist
        finally (compute-states '() (setq matcher (cons states alist))) (return matcher)))

(defun compute-possible-matches (state matcher)
  (loop with index fixnum = (length state) and alist = (cdr matcher)
        for (pattern . symbol) in alist as length = (list-length pattern)
        unless (or (<= length index) (mismatch pattern state :end1 index))
        collect (cons (nthcdr index pattern) symbol) into pruned-alist
        finally (return (delete-duplicates pruned-alist :key #'caar))))

(defun compute-transition (state pattern symbol matcher)
  (destructuring-bind (char . remaining-pattern) pattern
    (cond ((endp remaining-pattern) `(,char (return ',symbol)))
          (t (let ((next-state (append state (list char))) (states (car matcher)))
               (let ((next-state-number (vector-push-extend (cons next-state nil) states)))
                 (compute-states next-state matcher) `(,char (setq state ,next-state-number))))))))

(defun compute-states (state matcher &aux (states (car matcher)))
  (loop for (pattern . symbol) in (compute-possible-matches state matcher)
        collect (compute-transition state pattern symbol matcher) into char-clauses
        finally (let ((transition-form `(case char ,.char-clauses (otherwise (loop-finish)))))
                  (setf (cdr (aref states (position state states :key #'car :test #'equal)))

(defmacro match-symbol (symbols &key case-sensitive-p terminated-by if-none-match)
  (loop with match-failure-forms = (when (eq if-none-match :error) '((lexer-error)))
        with states = (car (make-matcher (ensure-list symbols) terminated-by))
        for state-number of-type array-index from 0 below (fill-pointer states)
        collect `(,state-number ,(cdr (aref states state-number))) into clauses
        finally (return (with-gensyms (saved-position)
                          (let ((state-forms `((case state ,.clauses))))
                            (unless case-sensitive-p (push '(setq char (char-upcase char)) state-forms))
                            `(loop with state fixnum = 0 and ,saved-position fixnum = lexer-position
                                   do (lexer-case (char (loop-finish)) (otherwise ,@state-forms))
                                   finally (setq lexer-position ,saved-position) ,@match-failure-forms))))))