;;;; -*- Mode: Lisp; Base: 10; Syntax: ANSI-Common-Lisp; Package: kira; Coding: utf-8 -*-
;;;; Copyright © 2024 David Mullen. All Rights Reserved. Origin: <https://cl-pdx.com/kira/>

(in-package :kira)

(eval-always
  (defconstant +char-class-bits+ 257 "8-bit + catch-all bit.")
  (deftype char-class () `(simple-bit-vector ,+char-class-bits+))
  (defvar *char-class-table* (make-hash-table :test 'equal))
  (defun char-class-p (object) (typep object 'char-class))
  (defun make-char-class (&optional (initial-element 0))
    (make-bit-vector +char-class-bits+ initial-element)))

(defmacro define-char-class (name lambda-list &body body)
  (multiple-value-bind (lambda-expression documentation)
      (parse-macro-1 name lambda-list body)
    `(eval-always
       (prog1 (progn (clrhash *char-class-table*) ',name)
         (setf (documentation ',name 'char-class) ,documentation)
         (setf (get ',name 'char-class) #',lambda-expression)))))

(define-char-class digit (&optional (radix 10))
  (check-type radix (integer 2 36) "an integer radix")
  (let ((upper-case-alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
    (cond ((<= radix 10) (subseq "0123456789" 0 radix))
          (t (let ((upper-case-alphabet (subseq upper-case-alphabet 0 (- radix 10))))
               `(or digit ,upper-case-alphabet ,(string-downcase upper-case-alphabet)))))))

(define-char-class satisfies (predicate-name)
  (loop with class = (make-char-class) for code fixnum from 0 below (1- +char-class-bits+)
        when (funcall predicate-name (code-char code)) do (setf (sbit class code) 1)
        finally (return class)))

(eval-always
  (defun ascii-char-p (char)
    (when (characterp char)
      (let ((code (char-code char)))
        (declare (type fixnum code))
        (< code 128)))))

(eval-always
  (deftype ascii-char ()
    '(satisfies ascii-char-p)))

(eval-always
  (defun whitespacep (thing)
    (cond ((sequencep thing) (every #'whitespacep thing))
          ((case thing ((#\Return #\Linefeed #\Page #\Space
                         #\PageUp #\Tab #\No-Break_Space) t))))))

(define-char-class ascii () '(satisfies ascii-char-p))
(define-char-class graphic () '(satisfies graphic-char-p))
(define-char-class whitespace () '(satisfies whitespacep))
(define-char-class horizontal-whitespace () '(or #\Tab #\Space))
(define-char-class upper-case () "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(define-char-class lower-case () "abcdefghijklmnopqrstuvwxyz")
(define-char-class alpha () '(or upper-case lower-case))
(define-char-class alphanumeric () '(or alpha digit))

(define-char-class t ()
  "Set of all characters."
  (make-char-class 1))

(define-char-class nil ()
  "Empty set of characters."
  (make-char-class 0))

(define-char-class vertical-whitespace ()
  '(and whitespace (not horizontal-whitespace)))

(eval-always
  (defun char-class-expand-1 (form &optional env)
    (let ((x (typecase form (null '(nil)) (symbol (list form)) (otherwise form))))
      (let ((expander (and (typep x '(cons symbol t)) (get (car x) 'char-class))))
        (if expander (values (funcall expander x env) t) (values form nil))))))

(eval-always
  (defun char-class-expand (form &optional environment)
    (loop with expanded-p and any-expanded-p = nil
	  do (multiple-value-setq (form expanded-p)
               (char-class-expand-1 form environment))
	  while expanded-p do (setq any-expanded-p t)
	  finally (return (values form any-expanded-p)))))

(define-char-class not (specifier)
  (bit-not (char-class specifier)))

(eval-always
  (defun and-2 (spec-x spec-y)
    (let ((class-x (char-class spec-x))
          (class-y (char-class spec-y)))
      (declare (type char-class class-x))
      (declare (type char-class class-y))
      (bit-and class-x class-y))))

(eval-always
  (defun or-2 (spec-x spec-y)
    (let ((class-x (char-class spec-x))
          (class-y (char-class spec-y)))
      (declare (type char-class class-x))
      (declare (type char-class class-y))
      (bit-ior class-x class-y))))

(eval-always
  (defun make-char-class-from-string (string)
    (loop with char-class = (make-char-class)
          for char of-type character across string
          for char-code fixnum = (char-code char)
          do (setf (bit char-class char-code) 1)
          finally (return char-class))))

(eval-always
  (defun char-class (specifier)
    (setq specifier (char-class-expand specifier))
    (cond ((char-class-p specifier) specifier)
          (t (orf (gethash specifier *char-class-table*)
                  (ctypecase specifier
                    (string (make-char-class-from-string specifier))
                    ((cons (eql and)) (reduce #'and-2 (cdr specifier) :initial-value (char-class t)))
                    ((cons (eql or)) (reduce #'or-2 (cdr specifier) :initial-value (char-class nil)))
                    (ascii-char (let ((char-class (make-char-class)) (char-code (char-code specifier)))
                                  (setf (sbit char-class char-code) 1) char-class))))))))

(define-compiler-macro char-class (&whole char-class-form specifier)
  (multiple-value-bind (specifier specifier-evaluated-p) (macro-eval specifier)
    (unless specifier-evaluated-p (return-from char-class char-class-form))
    (char-class specifier)))

(eval-always
  (defmacro %char-of-class-p (char class)
    (let ((index `(min (char-code ,char) ,(1- +char-class-bits+))))
      `(not (zerop (sbit ,class (the array-index ,index)))))))

(eval-always
  (defun char-of-class-p (char specifier)
    (let ((class (char-class specifier)))
      (%char-of-class-p char class))))

(define-compiler-macro char-of-class-p (char specifier &aux class position)
  (multiple-value-bind (specifier specifier-evaluated-p) (macro-eval specifier)
    (when specifier-evaluated-p
      (setq class (char-class specifier))
      (cond ((not (setq position (position 1 class))) (return-from char-of-class-p nil))
            ((and (= position (position 1 class :from-end t)) (/= position (1- +char-class-bits+)))
             (return-from char-of-class-p `(eql ,char ,(code-char position))))
            ((not (setq position (position 0 class))) (return-from char-of-class-p t))
            ((and (= position (position 0 class :from-end t)) (/= position (1- +char-class-bits+)))
             (return-from char-of-class-p `(not (eql ,char ,(code-char position)))))))
    (orf class `(char-class ,specifier))
    `(%char-of-class-p ,char ,class)))

(define-condition lexer-error (error)
  ((thing :initarg :thing :reader lexer-error-thing)
   (string :initarg :string :reader lexer-error-string)
   (position :initarg :position :reader lexer-error-position))
  (:report (lambda (condition stream)
             (with-accessors ((lexer-error-position lexer-error-position)) condition
               (format stream "Invalid ~A:~%~S~%~V@T^" (lexer-error-thing condition)
                       (lexer-error-string condition) (1+ lexer-error-position))))))

(defmacro with-lexer-error
    ((thing &optional junk-allowed) &body body)
  (with-gensyms (normal-return-block lexer-error-tag)
    `(macrolet ((lexer-error () `(go ,',lexer-error-tag)))
       (block ,normal-return-block
         (tagbody
           (return-from
               ,normal-return-block
             (locally ,@body))
          ,lexer-error-tag
           (unless ,junk-allowed
             (error 'lexer-error
                    :thing ,thing
                    :position lexer-position
                    :string (subseq lexer-string
                                    lexer-start
                                    lexer-end))))))))

(defmacro with-lexer ((string &optional (start 0) end) &body body)
  "Set up lexer comprising: LEXER-STRING, LEXER-START, LEXER-END, and LEXER-POSITION."
  `(with-array-data ((lexer-string ,string) (lexer-start ,start) (lexer-end ,end))
     (check-type lexer-string simple-string)
     (let ((lexer-position lexer-start))
       (declare (type array-index lexer-position))
       (declare (type simple-string lexer-string))
       ,@body)))

(defmacro lexer-case ((char &rest final) &rest clauses)
  `(cond ((>= lexer-position lexer-end) ,@(or final '(nil)))
         (t (let ((,char (schar lexer-string lexer-position)))
              (declare (type character ,char) (ignorable ,char))
              (cond ,.(loop for clause in clauses
                            for class = (pop clause) do (orf clause '(nil))
                            when (eq class 'otherwise) do (setq class 't)
                            collect `((char-of-class-p ,char ',class)
                                      (incf lexer-position) ,@clause)))))))

(defmacro lexer-match (class &rest classes)
  (loop with char = (gensym (string 'char)) for class in (list* class classes)
        collect `(lexer-case (,char) (,class ,char)) into lexer-match-sequence
        finally (return (if (rest lexer-match-sequence)
                            `(lexer-advance-if ,.lexer-match-sequence)
                            (first lexer-match-sequence)))))

(defmacro lexer-advance-if (test . tests)
  (with-gensyms (previous-lexer-position)
    `(let ((,previous-lexer-position lexer-position))
       (or (and ,test ,@tests) (multiple-value-prog1 (values nil lexer-position)
                                 (setq lexer-position ,previous-lexer-position))))))

(defmacro lexer-find (class)
  (with-gensyms (position char)
    `(when (< lexer-position lexer-end)
       (loop with ,position of-type array-index = lexer-position
           for ,char of-type character = (schar lexer-string ,position)
           do (cond ((char-of-class-p ,char ',class) (return ,position))
                    ((eql (incf ,position) lexer-end) (return nil)))))))

(defmacro lexer-skip (class)
  (with-gensyms (char again)
    `(the array-index
       (prog1 lexer-position
         (tagbody ,again
           (lexer-case (,char)
             (,class (go ,again))))))))

(defmacro get-token (class)
  (cond ((eq class t) '(subseq lexer-string lexer-position lexer-end))
        (t `(subseq lexer-string (lexer-skip ,class) lexer-position))))

(defmacro lexer-relative-position ()
  `(the array-index (- lexer-position
                       lexer-start)))

(defstatic +digits+
  (loop with digit-weight-type = '(integer 0 (36))
        with max = (reduce #'max (map 'vector #'char-code "9Zz"))
        with digits = (make-array (1+ max) :element-type digit-weight-type)
        for i to max as weight = (or (digit-char-p (code-char i) 36) 0)
        do (setf (aref digits i) weight) finally (return digits)))

(defmacro lexer-unsigned (&optional (radix 10))
  (with-gensyms (unsigned-value digit-char digit-weight)
    `(loop with ,unsigned-value finally (return ,unsigned-value)
           while (lexer-case (,digit-char)
                   ((digit ,radix)
                    (setq ,unsigned-value (if ,unsigned-value (* ,radix ,unsigned-value) 0))
                    (let ((,digit-weight (aref (load-time-value +digits+) (char-code ,digit-char))))
                      (declare (type (mod 36) ,digit-weight)) (incf ,unsigned-value ,digit-weight)))))))

(defmacro lexer-integer (&rest options)
  (with-gensyms (sign minusp unsigned-value)
    `(lexer-advance-if (let ((,minusp (lexer-case (,sign) (#\+ nil) (#\- t))))
                         (whereas ((,unsigned-value (lexer-unsigned ,@options)))
                           (if ,minusp (- ,unsigned-value) ,unsigned-value))))))

(defun %trim-name (buffer &optional (name-length-limit 80))
  (cond ((<= (fill-pointer buffer) name-length-limit) buffer)
        ((let* ((type-marker-position (position #\. buffer :from-end t))
                (type (when type-marker-position (subseq buffer (1+ type-marker-position))))
                (fill-pointer (position #\- buffer :from-end t :end name-length-limit)))
           (setf (fill-pointer buffer) (or fill-pointer name-length-limit))
           (when (< 0 (length type) 7) (format buffer ".~A" type))
           buffer))))

(defun collapse (string &optional allow-dot-p &key (start 0) end)
  "Ensure STRING is valid as the name + type of a logical pathname."
  (with-lexer (string start end)
    (loop with state = nil and dot-position = nil
          with buffer-size fixnum = (- lexer-end lexer-start)
          with buffer = (make-buffer buffer-size 'standard-char)
          do (lexer-case (char (return (%trim-name buffer)))
               (#\_ (vector-push-extend #\_ buffer)
                    (setq state :hard-separator))
               (alphanumeric
                (when (eq state :soft-separator)
                  (vector-push-extend #\- buffer))
                (vector-push-extend char buffer)
                (setq state :alphanumeric-sequence))
               (otherwise
                (cond ((eql char #\')) ; Skip apostrophes.
                      ((and allow-dot-p (eql char #\.))
                       (when dot-position
                         ;; Only one dot is allowed, since there's
                         ;; no version and having dots in the name
                         ;; (escaped) doesn't seem to work in CCL.
                         ;; An underscore is allowed in CCL logical
                         ;; pathnames, though it is outside of ANSI.
                         (setf (char buffer dot-position) #\_))
                       (setq dot-position (fill-pointer buffer))
                       (vector-push-extend #\. buffer)
                       (setq state :hard-separator))
                      ((>= (char-code char) 128))
                      ((eq state :alphanumeric-sequence)
                       (setq state :soft-separator))))))))

(defun matcher-pattern (symbol terminated-by)
  (loop for char across (symbol-name symbol)
        collect char into symbol-characters
        finally (return (append symbol-characters
                                (list terminated-by)))))

(defun make-matcher (symbols terminated-by &aux (states (make-buffer 50 t)))
  (loop with matcher = (cons states nil) initially (vector-push-extend (list nil) states)
        for symbol in symbols as pair = (cons (matcher-pattern symbol terminated-by) symbol)
        do (push pair (cdr matcher)) finally (compute-states '() matcher) (return matcher)))

(defun compute-possible-matches (state matcher)
  (loop with index fixnum = (length state) and alist = (cdr matcher)
        for (pattern . symbol) in alist as length = (list-length pattern)
        unless (or (<= length index) (mismatch pattern state :end1 index))
        collect (cons (nthcdr index pattern) symbol) into pruned-alist
        finally (return (delete-duplicates pruned-alist :key #'caar))))

(defun compute-transition (state pattern symbol matcher)
  (destructuring-bind (char . remaining-pattern) pattern
    (cond ((endp remaining-pattern) `((,char) (return ',symbol)))
          (t (let ((next-state (append state (list char))) (states (car matcher)))
               (let ((next-state-number (vector-push-extend (cons next-state nil) states)))
                 (compute-states next-state matcher) `(,char (setq state ,next-state-number))))))))

(defun compute-states (state matcher &aux (states (car matcher)))
  (loop for (pattern . symbol) in (compute-possible-matches state matcher)
        collect (compute-transition state pattern symbol matcher) into char-clauses
        finally (let ((transition-form `(case char ,.char-clauses (otherwise (loop-finish)))))
                  (setf (cdr (aref states (position state states :key #'car :test #'equal)))
                        transition-form))))

(defmacro match-symbol (symbols &key case-sensitive-p terminated-by if-none-match)
  (loop with match-failure-forms = (when (eq if-none-match :error) '((lexer-error)))
        with char-upcase-form = '(when char (setq char (char-upcase char)))
        with states = (car (make-matcher (ensure-list symbols) terminated-by))
        for state-number of-type array-index from 0 below (fill-pointer states)
        collect `(,state-number ,(cdr (aref states state-number))) into clauses
        finally (return (with-gensyms (saved-position)
                          (let ((state-forms `((case state ,.clauses))))
                            (unless case-sensitive-p (push char-upcase-form state-forms))
                            `(loop with state of-type (integer 0 (,(fill-pointer states)))
                                   with ,saved-position of-type fixnum = lexer-position
                                   do (let ((char (lexer-match t))) ,@state-forms)
                                   finally (setq lexer-position ,saved-position)
                                           ,@match-failure-forms))))))