(in-package :kira)
(defstruct* archive
message-id-database
article-count
text-stream
index-stream
thread-stream
line-buffer)
(defvar *archive*
(make-archive))
(defun get-text-stream ()
(orf (archive-text-stream *archive*)
(open #p"kira:cll;cll.txt"
:sharing :external)))
(defun get-index-pathname ()
(let ((text-pathname (pathname (get-text-stream))))
(merge-pathnames #p"kira:;.idx" text-pathname)))
(defun get-index-stream ()
(orf (archive-index-stream *archive*)
(let ((index-pathname (get-index-pathname)))
(open index-pathname :sharing :external
:element-type '(unsigned-byte 32)))))
(defun article-count ()
(orf (archive-article-count *archive*)
(let ((pathname (get-index-pathname)))
(ash (file-data-size pathname) -3))))
(defun get-article-record (article-id)
(let ((index-stream (get-index-stream)))
(file-position index-stream (* 2 article-id))
(let ((text-position (read-byte index-stream nil))
(universal-time (read-byte index-stream nil)))
(declare (type (unsigned-byte 32) text-position universal-time))
(unless text-position (error "No such article: ~D." article-id))
(values text-position (local-time :universal universal-time)))))
(defun get-thread-pathname ()
(let ((text-pathname (pathname (get-text-stream))))
(merge-pathnames #p"kira:;.spl" text-pathname)))
(defun get-thread-stream ()
(orf (archive-thread-stream *archive*)
(let ((thread-pathname (get-thread-pathname)))
(open thread-pathname :sharing :external
:element-type '(unsigned-byte 32)))))
(defun get-replies (article-id)
(let ((thread-stream (get-thread-stream)))
(file-position thread-stream article-id)
(let ((list-position (read-byte thread-stream)))
(cond ((eql list-position 0) nil)
(t (file-position thread-stream list-position)
(loop repeat (read-byte thread-stream)
collect (read-byte thread-stream)))))))
(defun get-line-buffer ()
(orf (archive-line-buffer *archive*)
(make-buffer 1024 'character)))
(defun (buffer)
(loop for end of-type array-index = (fill-pointer buffer)
for char = (when (plusp end) (char buffer (1- end)))
while (or (eql char #\Space) (eql char #\Tab))
do (decf (fill-pointer buffer))))
(defun (&optional (stream (get-text-stream)))
(loop with buffer = (get-line-buffer) and state = :reading-header-line
initially (reset-buffer buffer) for char = (read-char stream nil)
unless char do (%trim-header-line buffer) (return buffer)
do (if (eq state :possible-continuation)
(cond ((or (eql char #\Space)
(eql char #\Tab))
(setq state :reading-header-line)
(vector-push-extend char buffer))
(t (unread-char char stream)
(%trim-header-line buffer)
(return buffer)))
(if (eql char #\Newline)
(if (plusp (fill-pointer buffer))
(setq state :possible-continuation)
(return-from read-header-line* ""))
(vector-push-extend char buffer)))))
(defmacro (article-id header-name)
`(loop with text-stream = (get-text-stream)
with position = (get-article-record ,article-id)
initially (file-position text-stream position)
for line = (read-header-line* text-stream)
do (with-lexer ((if (plusp (length line)) line (loop-finish)))
(when (match-symbol ,header-name :terminated-by #\:)
(lexer-skip whitespace) (return (get-token t))))))
(defun get-date (article-id)
"Get the raw \"Date:\" header."
(%get-header article-id :date))
(defstatic *zone-plist*
(list :pst (parse-zone "-0800")
:pdt (parse-zone "-0700")
:mst (parse-zone "-0700")
:mdt (parse-zone "-0600")
:cst (parse-zone "-0600")
:cdt (parse-zone "-0500")
:est (parse-zone "-0500")
:edt (parse-zone "-0400")
:gmt (parse-zone "+0000")
:ut (parse-zone "+0000")
:utc (parse-zone "+0000")
:wet (parse-zone "+0000")
:bst (parse-zone "+0100")
:cet (parse-zone "+0100")
:cez (parse-zone "+0100")
:met (parse-zone "+0100")
:mez (parse-zone "+0100")
:west (parse-zone "+0100")
:cest (parse-zone "+0200")
:mest (parse-zone "+0200")
:eet (parse-zone "+0200")
:eez (parse-zone "+0200")
:kst (parse-zone "+0900")
:nzst (parse-zone "+1200")
:nzdt (parse-zone "+1300")))
(defstatic *article-id->message-id*
(make-hash-table :weak :value))
(defun get-message-id (article-id)
(orf (gethash article-id *article-id->message-id*)
(with-lexer ((%get-header article-id :message-id))
(when (lexer-match #\<) (get-token (not #\>))))))
(defun get-archive-cdb ()
(orf (archive-message-id-database *archive*)
(let ((text-pathname (pathname (get-text-stream))))
(cdb-open (merge-pathnames ".cdb" text-pathname)))))
(defun find-article (message-id)
(let ((message-id-length (length message-id)))
(rletz ((key :cdb-datum) (value :cdb-datum))
(with-cstrs ((message-id-cstring message-id))
(setf (pref key :cdb-datum.size) message-id-length)
(setf (pref key :cdb-datum.data) message-id-cstring)
(multiple-value-bind (value.data value.size)
(progn (cdb-get (get-archive-cdb) key value)
(values (pref value :cdb-datum.data)
(pref value :cdb-datum.size)))
(cond ((%null-ptr-p value.data) (values nil 0))
(t (values (%get-unsigned-long value.data)
(the fixnum (ash value.size -2))))))))))
(defmacro get-reference-from-end ()
"Get last reference, narrowing the lexer window."
`(loop with lexer-position of-type array-index = lexer-end
while (> lexer-position lexer-start) do (decf lexer-position)
when (lexer-match #\<) do (let ((new-lexer-end (1- lexer-position))
(parent-id (get-token (not #\>))))
(declare (type array-index new-lexer-end))
(setq lexer-end new-lexer-end)
(return parent-id))))
(defmacro get-viable-reference ()
`(loop for last-reference = (or (get-reference-from-end) (loop-finish))
do (whereas ((reference-article-id (find-article last-reference)))
(return reference-article-id))))
(defstruct* article
id from subject time
message-id text replies
in-reply-to x-no-archive
encoding charset)
(defstatic *article-cache*
(make-hash-table :weak :value)
"Map IDs to ARTICLE structures.")
(defmacro match-eol ()
'(or (lexer-match #\Newline)
(lexer-case (eof t))))
(defun q->octets (string &optional (start 0) end underscore-to-space-p)
"Used for both quoted-printable and encoded-word sequences."
(with-lexer (string start end)
(loop with buffer-size = (- lexer-end lexer-start)
with buffer = (make-buffer buffer-size 'octet)
for char = (or (lexer-match t) (return buffer))
do (or (when (eql char #\=)
(or (match-eol)
(let (x y)
(lexer-advance-if
(setq x (lexer-match (digit 16)))
(setq y (lexer-match (digit 16))))
(when (and x y)
(locally (declare (type character x y))
(let* ((hi (aref +digits+ (char-code x)))
(lo (aref +digits+ (char-code y)))
(hex-code (logior (ash hi 4) lo)))
(declare (type (unsigned-byte 4) hi))
(declare (type (unsigned-byte 4) lo))
(declare (type (unsigned-byte 8) hex-code))
(vector-push-extend hex-code buffer)))))))
(if (and underscore-to-space-p (eql char #\_))
(vector-push-extend (char-code #\ ) buffer)
(vector-push-extend (char-code char) buffer))))))
(define-symbol-macro +windows-1252->unicode+
#.(coerce '(#\U+20AC #\U+FFFD #\U+201A #\U+0192
#\U+201E #\U+2026 #\U+2020 #\U+2021
#\U+02C6 #\U+2030 #\U+0160 #\U+2039
#\U+0152 #\U+FFFD #\U+017D #\U+FFFD
#\U+FFFD #\U+2018 #\U+2019 #\U+201C
#\U+201D #\U+2022 #\U+2013 #\U+2014
#\U+02DC #\U+2122 #\U+0161 #\U+203A
#\U+0153 #\U+FFFD #\U+017E #\U+0178)
'simple-string))
(defun char-range (lo hi)
(loop with string = (make-string (1+ (- hi lo)))
for i of-type array-index from lo to hi
for c of-type character = (code-char i)
do (setf (schar string (- i lo)) c)
finally (return string)))
(defstatic +windows-1252+
(concatenate 'simple-string
(char-range #x00 #x7F)
+windows-1252->unicode+
(char-range #xA0 #xFF)))
(defun %decode-windows-1252 (octet-vector)
(loop with string = (make-string (length octet-vector))
for octet of-type octet across octet-vector
for i fixnum from 0 finally (return string)
for char = (schar +windows-1252+ octet)
do (setf (schar string i) char)))
(defun decode-quoted-printable (string &key (start 0) end (external-format :utf-8) (qp t))
(cond ((eq external-format :windows-1252) (%decode-windows-1252 (q->octets string start end (not qp))))
((decode-string-from-octets (q->octets string start end (not qp)) :external-format external-format))))
(defun decode-text (text charset content-transfer-encoding)
(if (string-equal content-transfer-encoding :quoted-printable)
(decode-quoted-printable text :external-format charset)
text))
(eval-always
(define-symbol-macro +basic-charset-names+
'(:iso-8859-1 :us-ascii :utf-8 :windows-1252)))
(defmacro match-encoded-charset ()
`(lexer-advance-if (lexer-match #\= #\?)
(match-symbol #.+basic-charset-names+
:terminated-by #\?)))
(defmacro %remember-whitespace ()
`(multiple-value-setq (whitespace-start whitespace-end)
(values (lexer-skip whitespace) lexer-position)))
(defmacro %decode-encoded-word (string start end charset)
`(decode-quoted-printable ,string :start ,start :end ,end
:qp nil :external-format ,charset))
(defun decode-encoded-words (string &optional (start 0) end)
"Decode encoded-word sequences (for the +BASIC-CHARSET-NAMES+)."
(with-output-to-string (output-stream)
(with-lexer (string start end)
(loop with whitespace-start of-type array-index = lexer-start
with whitespace-end of-type array-index = lexer-start
for position of-type array-index = lexer-position
do (or (whereas ((charset (match-encoded-charset)))
(when (match-symbol :q :terminated-by #\?)
(loop with start of-type array-index = lexer-position
for end of-type array-index = lexer-position
until (lexer-match #\? #\=) do (or (lexer-match t) (return nil))
finally (let ((result (%decode-encoded-word lexer-string start end charset)))
(write-string result output-stream) (return (%remember-whitespace))))))
(progn (setq lexer-position position)
(lexer-case (char (loop-finish))
(t (when (> whitespace-end whitespace-start)
(write-string lexer-string output-stream
:start whitespace-start
:end whitespace-end)
(setq whitespace-start lexer-position)
(setq whitespace-end whitespace-start))
(write-char char output-stream)))))))))
(defmacro find-charset ()
`(loop with original-position = lexer-position
for lexer-position of-type array-index
from original-position below lexer-end
when (match-symbol :charset :terminated-by #\=)
return (with-lexer-error ("content-type parameter")
(return (find (parse-http-value)
+basic-charset-names+
:test #'string-equal)))))
(defun get-article (article-id)
(orf (gethash article-id *article-cache*)
(let ((text-stream (get-text-stream))
encoding (initargs '()))
(multiple-value-bind
(text-position local-time)
(get-article-record article-id)
(file-position text-stream text-position)
(setf (getf initargs :time) local-time))
(loop for line = (read-header-line* text-stream)
while (and line (plusp (length line)))
do (with-lexer (line)
(whereas ((important-header-name
(match-symbol
(:from
:subject
:message-id
:references
:x-no-archive
:content-type
:content-transfer-encoding)
:terminated-by #\:)))
(lexer-skip whitespace)
(case important-header-name
(:message-id
(loop while (> lexer-end lexer-position)
finally (setf (getf initargs :message-id) (get-token t))
until (char= (schar lexer-string (1- lexer-end)) #\>)
do (decf lexer-end)))
(:content-type (setf (getf initargs :charset) (find-charset)))
(:references (setf (getf initargs :in-reply-to) (get-viable-reference)))
(:content-transfer-encoding (setq encoding (get-token t)))
(otherwise (setf (getf initargs important-header-name)
(decode-encoded-words lexer-string
lexer-position
lexer-end)))))))
(let ((text (read-article-text text-stream))
(charset (getf initargs :charset :iso-8859-1)))
(setq text (decode-text text charset encoding))
(setq text (string-trim '(#\Newline) text))
(apply #'make-article :id article-id
:text text :encoding encoding
:replies (get-replies article-id)
initargs)))))
(defun read-article-text (stream)
(loop with pattern = #.(format nil "~%From ...~%")
with pattern-length fixnum = (length pattern)
with match-count of-type array-index = 0
with buffer = (make-buffer 1024 'character)
for char = (read-char stream nil)
while char finally (return buffer)
do (vector-push-extend char buffer)
(cond ((eql char (char pattern match-count))
(when (= (incf match-count) pattern-length)
(decf (fill-pointer buffer) pattern-length)
(loop-finish)))
((eql char #\Newline)
(setq match-count 1))
((setq match-count 0)))))
(defmacro write-marked-inclusive (&optional (stream 'html-output))
"Write from MARK including the character behind LEXER-POSITION."
(let ((end-including-matched-character 'lexer-position))
`(when mark
(write-string lexer-string ,stream
:start (prog1 mark (setq mark nil))
:end ,end-including-matched-character))))
(defmacro write-marked-exclusive (&optional (stream 'html-output))
(let ((end-excluding-matched-character '(1- lexer-position)))
`(when mark
(write-string lexer-string ,stream
:start (prog1 mark (setq mark nil))
:end ,end-excluding-matched-character))))
(defun render-pre (string &optional (start 0) end)
"Like ESCAPE-HTML but also redact email addresses."
(with-lexer (string start end)
(loop with mark = nil and html-output = *html-output*
do (lexer-case (char (return (write-marked-inclusive)))
(#\& (write-marked-exclusive) (write-string "&" html-output))
(#\< (write-marked-exclusive) (write-string "<" html-output))
(#\> (write-marked-exclusive) (write-string ">" html-output))
(#\@ (loop with start fixnum = (or mark (1- lexer-position))
with dot-count fixnum = (- (1- lexer-position) start)
repeat dot-count do (write-char #\Middle_Dot html-output)
finally (write-char #\@ html-output) (setq mark nil)))
((or "()[]," whitespace)
(write-marked-exclusive)
(write-char char html-output))
(t (orf mark (1- lexer-position)))))))
(defun render-from (from-line)
(with-lexer (from-line)
(unless (= lexer-end lexer-start)
(case (schar lexer-string (1- lexer-end))
(#\> (setq lexer-end (lexer-find #\<)))
(#\) (whereas ((open-position (lexer-find #\() array-index))
(progn (decf lexer-end) (setq lexer-start (1+ open-position)))))))
(symbol-macrolet
((first-char
(unless (= lexer-end lexer-start)
(schar lexer-string lexer-start)))
(last-char
(unless (= lexer-end lexer-start)
(schar lexer-string (1- lexer-end)))))
(loop finally (setq lexer-position lexer-start)
while (or (if (eql last-char #\Space) (decf lexer-end))
(if (eql last-char #\Tab) (decf lexer-end))
(if (eql first-char #\") (incf lexer-start))
(if (eql last-char #\") (decf lexer-end)))))
(loop for backslash-position = (or (lexer-find #\\) (loop-finish))
for before-backslash = (subseq lexer-string lexer-start backslash-position)
for after-backslash = (subseq lexer-string (1+ backslash-position) lexer-end)
do (setq lexer-string (concatenate 'simple-string before-backslash after-backslash))
(setq lexer-start 0 lexer-end (length lexer-string) lexer-position lexer-start))
(if (= lexer-end lexer-start)
(html "[Invalid-From-Line]")
(render-pre lexer-string
lexer-start
lexer-end))))
(define-html-macro :article-id (article)
(let ((article-id `(article-id ,article)))
`(:progn "article-" (:princ ,article-id))))
(defun render-usenet-time (local-time)
(html ((:time :datetime local-time)
(822-time *html-output*
local-time
nil t))))
(define-html-macro :message-id-link (article)
`((:a :href (:progn #\# (:article-id ,article)))
(escape-html (article-message-id ,article))))
(defun render-article (article)
(unless (article-x-no-archive article)
(html ((:pre :class "header" :id (:article-id article))
"From: " (render-from (article-from article)) #\Newline
"Subject: " (render-pre (article-subject article)) #\Newline
"Date: " (render-usenet-time (article-time article)) #\Newline
"Message-ID: " (:message-id-link article))
(:pre (render-pre (article-text article))))))
(defun render-article-link (article)
(if (article-x-no-archive article)
(html (:i "This article excluded per X-No-Archive"))
(html (:let ((article-from (article-from article)))
((:a :href (:progn #\# (:article-id article)))
(escape-html (article-subject article)))
#\Space (:small (:b (render-from article-from)))))))
(defun render-reply-links (article)
(whereas ((replies (article-replies article)))
(html (:ul (dolist (reply-id replies replies)
(let ((reply (get-article reply-id)))
(html (:li (render-article-link reply)
(render-reply-links reply)))))))))
(defun render-thread (article)
(loop initially (render-article article)
for reply-id in (article-replies article)
do (render-thread (get-article reply-id))))
(defun multiple-replies-p (article)
(whereas ((replies (article-replies article)))
(destructuring-bind (first . rest) replies
(or (article-replies (get-article first))
rest))))
(defun article-root (article)
(loop for in-reply-to = (article-in-reply-to article)
do (cond ((not in-reply-to) (return article))
((setq article (get-article in-reply-to))))))
(defun article-pathname (article &aux (article-time (article-time article)))
"Pathname where the rendered article is stored. This only applies to original (non-reply) articles."
(multiple-value-call #'format nil "kira:www;comp.lang.lisp;~D;~(~:/kira:print-month-name/~);~D.html"
(multiple-value-bind (msec sec minute hour day month year) (decode-local-time article-time)
(declare (ignore msec sec minute hour day)) (values year month (article-id article)))))
(defun article-uri (article &aux (article-id (article-id article)))
(let ((root-uri (pathname-uri (article-pathname (article-root article)))))
(%make-uri-from-string (format nil "~A#article-~D" root-uri article-id))))
(defun spam-log->bit-vector ()
(let ((bit-vector (make-bit-vector (article-count))))
(with-open-file (stream "kira:www;comp.lang.lisp;spam.log")
(loop for line = (or (read-line stream nil) (loop-finish))
do (setf (sbit bit-vector (find-article line)) 1)
finally (return (values bit-vector (count 1 bit-vector)))))))