;;;; -*- Mode: Lisp; Base: 10; Syntax: ANSI-Common-Lisp; Package: kira; Coding: utf-8 -*-
;;;; Copyright © 2023 David Mullen. All Rights Reserved. Origin: <https://cl-pdx.com/kira/>

(in-package :kira)

(defstruct* archive
  message-id-database
  article-count
  text-stream
  index-stream
  thread-stream
  line-buffer)

(defvar *archive*
  (make-archive))

(defun get-text-stream ()
  (orf (archive-text-stream *archive*)
       (open #p"kira:cll;cll.txt"
             :sharing :external)))

(defun get-index-pathname ()
  (let ((text-pathname (pathname (get-text-stream))))
    (merge-pathnames #p"kira:;.idx" text-pathname)))

(defun get-index-stream ()
  (orf (archive-index-stream *archive*)
       (let ((index-pathname (get-index-pathname)))
         (open index-pathname :sharing :external
               :element-type '(unsigned-byte 32)))))

(defun article-count ()
  (orf (archive-article-count *archive*)
       (let ((pathname (get-index-pathname)))
         (ash (file-data-size pathname) -3))))

(defun get-article-record (article-id)
  (let ((index-stream (get-index-stream)))
    (file-position index-stream (* 2 article-id))
    (let  ((text-position (read-byte index-stream nil))
           (universal-time (read-byte index-stream nil)))
      (declare (type (unsigned-byte 32) text-position universal-time))
      (unless text-position (error "No such article: ~D." article-id))
      (values text-position (local-time :universal universal-time)))))

(defun get-thread-pathname ()
  (let ((text-pathname (pathname (get-text-stream))))
    (merge-pathnames #p"kira:;.spl" text-pathname)))

(defun get-thread-stream ()
  (orf (archive-thread-stream *archive*)
       (let ((thread-pathname (get-thread-pathname)))
         (open thread-pathname :sharing :external
               :element-type '(unsigned-byte 32)))))

(defun get-replies (article-id)
  (let ((thread-stream (get-thread-stream)))
    (file-position thread-stream article-id)
    (let ((list-position (read-byte thread-stream)))
      (cond ((eql list-position 0) nil) ; No replies.
            (t (file-position thread-stream list-position)
               (loop repeat (read-byte thread-stream)
                     collect (read-byte thread-stream)))))))

(defun get-line-buffer ()
  (orf (archive-line-buffer *archive*)
       (make-buffer 1024 'character)))

(defun %trim-header-line (buffer)
  (loop for end of-type array-index = (fill-pointer buffer)
        for char = (when (plusp end) (char buffer (1- end)))
        while (or (eql char #\Space) (eql char #\Tab))
        do (decf (fill-pointer buffer))))

(defun read-header-line* (&optional (stream (get-text-stream)))
  (loop with buffer = (get-line-buffer) and state = :reading-header-line
        initially (reset-buffer buffer) for char = (read-char stream nil)
        unless char do (%trim-header-line buffer) (return buffer)
        do (if (eq state :possible-continuation)
               (cond ((or (eql char #\Space)
                          (eql char #\Tab))
                      (setq state :reading-header-line)
                      (vector-push-extend char buffer))
                     ;; Not a continuation line.
                     (t (unread-char char stream)
                        (%trim-header-line buffer)
                        (return buffer)))
               ;; State :READING-HEADER-LINE.
               (if (eql char #\Newline)
                   (if (plusp (fill-pointer buffer))
                       (setq state :possible-continuation)
                       (return-from read-header-line* ""))
                   (vector-push-extend char buffer)))))

(defmacro %get-header (article-id header-name)
  `(loop with text-stream = (get-text-stream)
         with position = (get-article-record ,article-id)
         initially (file-position text-stream position)
         for line = (read-header-line* text-stream)
         do (with-lexer ((if (plusp (length line)) line (loop-finish)))
              (when (match-symbol ,header-name :terminated-by #\:)
                (lexer-skip whitespace) (return (get-token t))))))

(defun get-date (article-id)
  "Get the raw \"Date:\" header."
  (%get-header article-id :date))

(defstatic *zone-plist*
  ;; This isn't any kind of exhaustive list,
  ;; just the zones seen in comp.lang.lisp.
  (list :pst  (parse-zone "-0800")
        :pdt  (parse-zone "-0700")
        :mst  (parse-zone "-0700")
        :mdt  (parse-zone "-0600")
        :cst  (parse-zone "-0600")
        :cdt  (parse-zone "-0500")
        :est  (parse-zone "-0500")
        :edt  (parse-zone "-0400")
        :gmt  (parse-zone "+0000")
        :ut   (parse-zone "+0000")
        :utc  (parse-zone "+0000")
        :wet  (parse-zone "+0000")
        :bst  (parse-zone "+0100")
        :cet  (parse-zone "+0100")
        :cez  (parse-zone "+0100")
        :met  (parse-zone "+0100")
        :mez  (parse-zone "+0100")
        :west (parse-zone "+0100")
        :cest (parse-zone "+0200")
        :mest (parse-zone "+0200")
        :eet  (parse-zone "+0200")
        :eez  (parse-zone "+0200")
        :kst  (parse-zone "+0900")
        :nzst (parse-zone "+1200")
        :nzdt (parse-zone "+1300")))

(defstatic *article-id->message-id*
  (make-hash-table :weak :value))

(defun get-message-id (article-id)
  (orf (gethash article-id *article-id->message-id*)
       (with-lexer ((%get-header article-id :message-id))
         (when (lexer-match #\<) (get-token (not #\>))))))

(defun get-archive-cdb ()
  (orf (archive-message-id-database *archive*)
       (let ((text-pathname (pathname (get-text-stream))))
         (cdb-open (merge-pathnames ".cdb" text-pathname)))))

(defun find-article (message-id)
  (let ((message-id-length (length message-id)))
    (rletz ((key :cdb-datum) (value :cdb-datum))
      (with-cstrs ((message-id-cstring message-id))
        (setf (pref key :cdb-datum.size) message-id-length)
        (setf (pref key :cdb-datum.data) message-id-cstring)
        (multiple-value-bind (value.data value.size)
            (progn (cdb-get (get-archive-cdb) key value)
                   (values (pref value :cdb-datum.data)
                           (pref value :cdb-datum.size)))
          ;; The data value is a vector of 32-bit IDs.
          (cond ((%null-ptr-p value.data) (values nil 0))
                (t (values (%get-unsigned-long value.data)
                           (the fixnum (ash value.size -2))))))))))

(defmacro get-reference-from-end ()
  "Get last reference, narrowing the lexer window."
  `(loop with lexer-position of-type array-index = lexer-end
         while (> lexer-position lexer-start) do (decf lexer-position)
         when (lexer-match #\<) do (let ((new-lexer-end (1- lexer-position))
                                         (parent-id (get-token (not #\>))))
                                     (declare (type array-index new-lexer-end))
                                     (setq lexer-end new-lexer-end)
                                     (return parent-id))))

(defmacro get-viable-reference ()
  `(loop for last-reference = (or (get-reference-from-end) (loop-finish))
         do (whereas ((reference-article-id (find-article last-reference)))
              (return reference-article-id))))

(defstruct* article
  id from subject time
  message-id text replies
  in-reply-to x-no-archive
  encoding charset)

(defstatic *article-cache*
  (make-hash-table :weak :value)
  "Map IDs to ARTICLE structures.")

(defmacro match-eol ()
  '(or (lexer-match #\Newline)
       (lexer-case (eof t))))

(defun q->octets (string &optional (start 0) end underscore-to-space-p)
  "Used for both quoted-printable and encoded-word sequences."
  (with-lexer (string start end)
    (loop with buffer-size = (- lexer-end lexer-start)
          with buffer = (make-buffer buffer-size 'octet)
          for char = (or (lexer-match t) (return buffer))
          do (or (when (eql char #\=)
                   (or (match-eol)
                       (let (x y)
                         (lexer-advance-if
                          (setq x (lexer-match (digit 16)))
                          (setq y (lexer-match (digit 16))))
                         (when (and x y)
                           (locally (declare (type character x y))
                             (let* ((hi (aref +digits+ (char-code x)))
                                    (lo (aref +digits+ (char-code y)))
                                    (hex-code (logior (ash hi 4) lo)))
                               (declare (type (unsigned-byte 4) hi))
                               (declare (type (unsigned-byte 4) lo))
                               (declare (type (unsigned-byte 8) hex-code))
                               (vector-push-extend hex-code buffer)))))))
                 ;; Regular character, or skipping
                 ;; a bad quoted-printable sequence.
                 (if (and underscore-to-space-p (eql char #\_))
                     (vector-push-extend (char-code #\ ) buffer)
                     (vector-push-extend (char-code char) buffer))))))

(define-symbol-macro +windows-1252->unicode+
  #.(coerce '(#\U+20AC #\U+FFFD #\U+201A #\U+0192
              #\U+201E #\U+2026 #\U+2020 #\U+2021
              #\U+02C6 #\U+2030 #\U+0160 #\U+2039
              #\U+0152 #\U+FFFD #\U+017D #\U+FFFD
              #\U+FFFD #\U+2018 #\U+2019 #\U+201C
              #\U+201D #\U+2022 #\U+2013 #\U+2014
              #\U+02DC #\U+2122 #\U+0161 #\U+203A
              #\U+0153 #\U+FFFD #\U+017E #\U+0178)
            'simple-string))

(defun char-range (lo hi)
  (loop with string = (make-string (1+ (- hi lo)))
        for i of-type array-index from lo to hi
        for c of-type character = (code-char i)
        do (setf (schar string (- i lo)) c)
        finally (return string)))

(defstatic +windows-1252+
  (concatenate 'simple-string
               (char-range #x00 #x7F)
               +windows-1252->unicode+
               (char-range #xA0 #xFF)))

(defun %decode-windows-1252 (octet-vector)
  (loop with string = (make-string (length octet-vector))
        for octet of-type octet across octet-vector
        for i fixnum from 0 finally (return string)
        for char = (schar +windows-1252+ octet)
        do (setf (schar string i) char)))

(defun decode-quoted-printable (string &key (start 0) end (external-format :utf-8) (qp t))
  ;; The :windows-1252 encoding is special-cased because CCL doesn't handle it out of the box.
  (cond ((eq external-format :windows-1252) (%decode-windows-1252 (q->octets string start end (not qp))))
        ((decode-string-from-octets (q->octets string start end (not qp)) :external-format external-format))))

(defun decode-text (text charset content-transfer-encoding)
  (if (string-equal content-transfer-encoding :quoted-printable)
      (decode-quoted-printable text :external-format charset)
      text))

(eval-always
  (define-symbol-macro +basic-charset-names+
    '(:iso-8859-1 :us-ascii :utf-8 :windows-1252)))

(defmacro match-encoded-charset ()
  `(lexer-advance-if (lexer-match #\= #\?)
                     (match-symbol #.+basic-charset-names+
                                   :terminated-by #\?)))

(defmacro %remember-whitespace ()
  `(multiple-value-setq (whitespace-start whitespace-end)
     (values (lexer-skip whitespace) lexer-position)))

(defmacro %decode-encoded-word (string start end charset)
  `(decode-quoted-printable ,string :start ,start :end ,end
                            :qp nil :external-format ,charset))

(defun decode-encoded-words (string &optional (start 0) end)
  "Decode encoded-word sequences (for the +BASIC-CHARSET-NAMES+)."
  (with-output-to-string (output-stream)
    (with-lexer (string start end)
      (loop with whitespace-start of-type array-index = lexer-start
            with whitespace-end of-type array-index = lexer-start
            for position of-type array-index = lexer-position
            do (or (whereas ((charset (match-encoded-charset)))
                     (when (match-symbol :q :terminated-by #\?)
                       (loop with start of-type array-index = lexer-position
                             for end of-type array-index = lexer-position
                             until (lexer-match #\? #\=) do (or (lexer-match t) (return nil))
                             finally (let ((result (%decode-encoded-word lexer-string start end charset)))
                                       (write-string result output-stream) (return (%remember-whitespace))))))
                   ;; Regular character, or skipping
                   ;; an unknown encoded-word sequence.
                   (progn (setq lexer-position position)
                          (lexer-case (char (loop-finish))
                            (t (when (> whitespace-end whitespace-start)
                                 (write-string lexer-string output-stream
                                               :start whitespace-start
                                               :end whitespace-end)
                                 (setq whitespace-start lexer-position)
                                 (setq whitespace-end whitespace-start))
                               (write-char char output-stream)))))))))

(defmacro find-charset ()
  `(loop with original-position = lexer-position
         for lexer-position of-type array-index
         from original-position below lexer-end
         when (match-symbol :charset :terminated-by #\=)
         return (with-lexer-error ("content-type parameter")
                  (return (find (parse-http-value)
                                +basic-charset-names+
                                :test #'string-equal)))))

(defun get-article (article-id)
  (orf (gethash article-id *article-cache*)
       (let ((text-stream (get-text-stream))
             encoding (initargs '()))
         (multiple-value-bind
               (text-position local-time)
             (get-article-record article-id)
           (file-position text-stream text-position)
           (setf (getf initargs :time) local-time))
         (loop for line = (read-header-line* text-stream)
               while (and line (plusp (length line)))
               do (with-lexer (line)
                    (whereas ((important-header-name
                               (match-symbol
                                (:from
                                 :subject
                                 :message-id
                                 :references
                                 :x-no-archive
                                 :content-type
                                 :content-transfer-encoding)
                                :terminated-by #\:)))
                      (lexer-skip whitespace)
                      (case important-header-name
                        (:message-id
                         (loop while (> lexer-end lexer-position)
                               finally (setf (getf initargs :message-id) (get-token t))
                               ;; Locate the closing bracket (i.e. skip trailing "#1/1").
                               until (char= (schar lexer-string (1- lexer-end)) #\>)
                               do (decf lexer-end)))
                        (:content-type (setf (getf initargs :charset) (find-charset)))
                        (:references (setf (getf initargs :in-reply-to) (get-viable-reference)))
                        (:content-transfer-encoding (setq encoding (get-token t)))
                        (otherwise (setf (getf initargs important-header-name)
                                         (decode-encoded-words lexer-string
                                                               lexer-position
                                                               lexer-end)))))))
         ;; End of article headers. Get the content.
         ;; This may involve handling "quoted-printable."
         (let ((text (read-article-text text-stream))
               (charset (getf initargs :charset :iso-8859-1)))
           (setq text (decode-text text charset encoding))
           (setq text (string-trim '(#\Newline) text))
           (apply #'make-article :id article-id
                  :text text :encoding encoding
                  :replies (get-replies article-id)
                  initargs)))))

(defun read-article-text (stream)
  (loop with pattern = #.(format nil "~%From ...~%")
        with pattern-length fixnum = (length pattern)
        with match-count of-type array-index = 0
        with buffer = (make-buffer 1024 'character)
        for char = (read-char stream nil)
        while char finally (return buffer)
        do (vector-push-extend char buffer)
           (cond ((eql char (char pattern match-count))
                  (when (= (incf match-count) pattern-length)
                    (decf (fill-pointer buffer) pattern-length)
                    (loop-finish)))
                 ((eql char #\Newline)
                  (setq match-count 1))
                 ((setq match-count 0)))))

(defmacro write-marked-inclusive (&optional (stream 'html-output))
  "Write from MARK including the character behind LEXER-POSITION."
  (let ((end-including-matched-character 'lexer-position))
    `(when mark
       (write-string lexer-string ,stream
                     :start (prog1 mark (setq mark nil))
                     :end ,end-including-matched-character))))

(defmacro write-marked-exclusive (&optional (stream 'html-output))
  (let ((end-excluding-matched-character '(1- lexer-position)))
    `(when mark
       (write-string lexer-string ,stream
                     :start (prog1 mark (setq mark nil))
                     :end ,end-excluding-matched-character))))

(defun render-pre (string &optional (start 0) end)
  "Like ESCAPE-HTML but also redact email addresses."
  (with-lexer (string start end)
    (loop with mark = nil and html-output = *html-output*
          do (lexer-case (char (return (write-marked-inclusive)))
               (#\& (write-marked-exclusive) (write-string "&amp;" html-output))
               (#\< (write-marked-exclusive) (write-string "&lt;" html-output))
               (#\> (write-marked-exclusive) (write-string "&gt;" html-output))
               (#\@ (loop with start fixnum = (or mark (1- lexer-position))
                          with dot-count fixnum = (- (1- lexer-position) start)
                          repeat dot-count do (write-char #\Middle_Dot html-output)
                          finally (write-char #\@ html-output) (setq mark nil)))
               ((or "()[]," whitespace)
                (write-marked-exclusive)
                (write-char char html-output))
               (t (orf mark (1- lexer-position)))))))

(defun render-from (from-line)
  (with-lexer (from-line)
    (unless (= lexer-end lexer-start)
      (case (schar lexer-string (1- lexer-end))
        (#\> (setq lexer-end (lexer-find #\<)))
        (#\) (whereas ((open-position (lexer-find #\() array-index))
               (progn (decf lexer-end) (setq lexer-start (1+ open-position)))))))
    (symbol-macrolet
        ((first-char
           (unless (= lexer-end lexer-start)
             (schar lexer-string lexer-start)))
         (last-char
           (unless (= lexer-end lexer-start)
             (schar lexer-string (1- lexer-end)))))
      ;; Remove surrounding spaces and quotes.
      (loop finally (setq lexer-position lexer-start)
            while (or (if (eql last-char #\Space) (decf lexer-end))
                      (if (eql last-char #\Tab) (decf lexer-end))
                      (if (eql first-char #\") (incf lexer-start))
                      (if (eql last-char #\") (decf lexer-end)))))
    ;; Get rid of any backslash escapes, which will cons up a new string.
    ;; Not a common case in "From:" lines, but it might be better to go
    ;; whole hog and parse every parenthesized and quoted string properly.
    (loop for backslash-position = (or (lexer-find #\\) (loop-finish))
          for before-backslash = (subseq lexer-string lexer-start backslash-position)
          for after-backslash = (subseq lexer-string (1+ backslash-position) lexer-end)
          do (setq lexer-string (concatenate 'simple-string before-backslash after-backslash))
             (setq lexer-start 0 lexer-end (length lexer-string) lexer-position lexer-start))
    ;; Is something displayable?
    (if (= lexer-end lexer-start)
        (html "[Invalid-From-Line]")
        (render-pre lexer-string
                    lexer-start
                    lexer-end))))

(define-html-macro :article-id (article)
  (let ((article-id `(article-id ,article)))
    `(:progn "article-" (:princ ,article-id))))

(defun render-usenet-time (local-time)
  (html ((:time :datetime local-time)
         (822-time *html-output*
                   local-time
                   nil t))))

(define-html-macro :message-id-link (article)
  `((:a :href (:progn #\# (:article-id ,article)))
    (escape-html (article-message-id ,article))))

(defun render-article (article)
  (unless (article-x-no-archive article)
    (html ((:pre :class "header" :id (:article-id article))
           "From: " (render-from (article-from article)) #\Newline
           "Subject: " (render-pre (article-subject article)) #\Newline
           "Date: " (render-usenet-time (article-time article)) #\Newline
           "Message-ID: " (:message-id-link article))
          (:pre (render-pre (article-text article))))))

(defun render-article-link (article)
  (if (article-x-no-archive article)
      (html (:i "This article excluded per X-No-Archive"))
      (html (:let ((article-from (article-from article)))
              ((:a :href (:progn #\# (:article-id article)))
               (escape-html (article-subject article)))
              #\Space (:small (:b (render-from article-from)))))))

(defun render-reply-links (article)
  (whereas ((replies (article-replies article)))
    (html (:ul (dolist (reply-id replies replies)
                 (let ((reply (get-article reply-id)))
                   (html (:li (render-article-link reply)
                              (render-reply-links reply)))))))))

(defun render-thread (article)
  (loop initially (render-article article)
        for reply-id in (article-replies article)
        do (render-thread (get-article reply-id))))

(defun multiple-replies-p (article)
  (whereas ((replies (article-replies article)))
    (destructuring-bind (first . rest) replies
      (or (article-replies (get-article first))
          rest))))

(defun article-root (article)
  (loop for in-reply-to = (article-in-reply-to article)
        do (cond ((not in-reply-to) (return article))
                 ((setq article (get-article in-reply-to))))))

(defun article-pathname (article &aux (article-time (article-time article)))
  "Pathname where the rendered article is stored. This only applies to original (non-reply) articles."
  (multiple-value-call #'format nil "kira:www;comp.lang.lisp;~D;~(~:/kira:print-month-name/~);~D.html"
                       (multiple-value-bind (msec sec minute hour day month year) (decode-local-time article-time)
                         (declare (ignore msec sec minute hour day)) (values year month (article-id article)))))

(defun article-uri (article &aux (article-id (article-id article)))
  (let ((root-uri (pathname-uri (article-pathname (article-root article)))))
    (%make-uri-from-string (format nil "~A#article-~D" root-uri article-id))))

(defun spam-log->bit-vector ()
  (let ((bit-vector (make-bit-vector (article-count))))
    (with-open-file (stream "kira:www;comp.lang.lisp;spam.log")
      (loop for line = (or (read-line stream nil) (loop-finish))
            do (setf (sbit bit-vector (find-article line)) 1)
            finally (return (values bit-vector (count 1 bit-vector)))))))