;; From: Pilch
;; Subject: sgmls2lisp
;; Date: 
;; Message-ID: <ucc02aa.750294286@sun3>
;; sgml2lisp  -- sgml output formatting tool using SGMLS, EMACS and Lisp
 
; PURPOSE

 ; Generate a lisp program that acts as a filter
 ; in converting SGML text to any user-specified format.
 ; The generated converter operates on the output of
 ; the SGML parser SGMLS (copyleft by J. Clark) and
 ; performs the same task as SGMLSASP.
 ; Conversion algorithms, however, no longer need to conform to
 ; the restricted code of the Amsterdam Parser (ASP); 
 ; they are free to draw on the vast resources of a
 ; leading artificial intelligence language.

;; SOFTWARE DEPENDENCY

 ; Both SGMLS and EMACS are needed for generating the lisp-data.
 ; The executable file sgmls V 1.0 must be accessible via the PATH. 
 ; EMACS generates an optional dummy converter and performs the conversion.
 ; It can be used interactively in editing mode, or be run in batch mode
 ; as a command-line e-lisp interpreter.

 ; Any LISP interpreter should be able to output the lisp-data to the 
 ; user-specified format.  Therefore interpreters of other Lisp 
 ; dialects than E-Lisp can be used to write the converter.

;; HOW IT WORKS

 ; 1. run sgm-to-lisp on your sgml-document, save the output in lisp-data.el
 ; 2. run dtd-to-lisp on your dtd, save the output in converter.el
 ; 3. do M-x load-file on converter.el and lisp-data.el in sequence. 
 ;    Now you have performed your first dummy conversion generating
 ;    the empty string as output.
 ; 4. Make a copy of converter.el for each application for which you
 ;    want to write a converter, e.g. converter-LaTeX.el, converter-lout.el,
 ;    converter-nroff.el, converter-ps.el.  Modify these files until you
 ;    get the wanted output.  
 ; 
 ;    For a converter-LaTeX.el you may write something like this: 
 ;
 ;    (defun DOC (arg)
 ;      (insert 
 ;        "\\documentstyle[" APTSIZE "," ALANGUAGE "]{" AFORMAT "}" (newline)
 ;        "\\begin{document}" (newline)
 ;        arg 
 ;        (newline) 
 ;        "\\end{document}" (newline)
 ;     ))
 ;
 ;    or, for a converter-bourneshell.el, a syntagm such as 
 ;
 ;     <ftpalias>
 ;       <name> ostasien
 ;       <adr> ftp.lrz-muenchen.de
 ;       <comment> major ftp site for East-Asian software applications, 
 ;         administered by a group of German scholars 
 ;      </ftpalias>
 ;
 ;    may be formatted by the following e-lisp functions:
 ;
 ;    (defun FTPALIAS (arg)         ; compound
 ;      (setq SNAME "nosite")       ; initialize components
 ;      (setq SADR "site.nowhere") 
 ;      (setq SCOMMENT "")  
 ;      (arg)                       ; read component values
 ;	(concat                     ; format compound
 ;	  (concat (newline) SNAME "=\"" SADR "\";export " SNAME)
 ;	  (if (not (equal SCOMMENT "")) 
 ;	    (concat (newline) "# " (remove-linebreaks SCOMMENT))
 ;	    "")
 ;     ))
 ;    (defun NAME (arg) (setq SNAME arg))       
 ;    (defun ADR (arg) (setq SADR arg))         
 ;    (defun COMMENT (arg) (setq SCOMMENT arg)) 
 ;
 ;   so as to produce the shell-script entry
 ;
 ;   ostasien="ftp.lrz-muenchen.de";export ostasien
 ;   # major ftp site for East-Asian software applications, administered by a group of German scholars 
 ;
 ;   Some basic principles to be inferred from the examples are:
 ;
 ;     1.  "(insert (concat .. arg ..))" 
 ;         is used in the topmost GI node and only there,
 ;         as in the above example DOC.
 ;     2.  "(setq ..) (arg) (concat *template*)" 
 ;         is used in complex (i.e. non-#pcdata) elements.  
 ;         The lower level GIs are initialized, then read in, then
 ;         formatted according to the *template*,
 ;         as in the above example FTPALIAS
 ;     3.  "(concat arg)" can be simplified to "(arg)" in simple (i.e.
 ;         #pcdata) elements.  The lisp functions for these elements 
 ;         have no other form than that of NAME and ADR above.

;; BUGS / TO-DO-LIST

;   The dummy converter that you have to start with is rather
;   primitive.  It would not be very difficult to generate a more 
;   sophisticated dummy converter, that would already fully apply 
;   the above principles.
;
;   The macros invoke regexp replacement commands over and over again
;   rather than doing an optimized replacement at a lower
;   programming level.  That makes them easy to write but time-consuming
;   to execute.  The best way to solve this problem will be to discard
;   the present tool and incorporate its functions in sgmls itself, 
;   i.e. to allow sgmls to be invoked with a commandline syntax like
;
;       sgmls [--lispprog] [--lispdata] [sgmlfile]
;
;   where "--lispprog" would produce the output of function dtd-to-lisp,
;         "--lispdata"                          of function sgm-to-lisp.

;; AUTHOR

; <adr id=PilchH>
; <person mf=m>
;  <surn>Pilch<givn>Hartmut
;  <titles>M.A., staatl.gepr. Dolmetscher f&ue;r Chinesisch
; <place>
;  <pmail><country>D<zip>80687<str>Von-der-Pfordten-Str.<nr>9
;  <tele><country>49<area>89<phone>5804845<fax>567642
;  <email>·······@lrz.lrz-muenchen.de
; </adr>

;;PROGRAM TEXT

;; Turn off case conversion of replacement text in the replace commands
;; used by the functions below, so replacement strings are inserted
;; exactly as written.
;; NOTE(review): this is a plain top-level `setq', so it changes the
;; user's `case-replace' setting for the whole session once this file
;; is loaded -- confirm that this is intended.
(setq case-replace nil)
(defun replace-regexp-all (a b)
  "Replace every match of regexp A in the current buffer with B.
The search always starts at the beginning of the accessible portion of
the buffer, wherever point was before the call.  B is a replacement
template in the sense of `replace-match', so it may refer to the whole
match and to numbered groups of A.  The replacement is inserted with
its letter case exactly as written.  Point is left after the last
replacement.  Always returns nil."
  ;; `beginning-of-buffer' and `replace-regexp' are interactive commands
  ;; (the former also pushes the mark); use the Lisp-level primitives.
  (goto-char (point-min))
  (let ((more t))
    (while (and more (re-search-forward a nil t))
      (let ((empty-match (= (match-beginning 0) (match-end 0))))
        ;; FIXEDCASE = t: never adapt the case of B to the matched text,
        ;; independently of the global value of `case-replace'.
        (replace-match b t)
        ;; An empty match would be found again at the same position;
        ;; step over one character, or stop at the end of the buffer.
        (if empty-match
            (if (eobp)
                (setq more nil)
              (forward-char 1))))))
  nil)
(defun shell-command-on-buffer (kmd)
  "Run shell command KMD with the whole current buffer as its input.
Interactively, KMD is read from the minibuffer.  The text of the
current buffer is left untouched; the command's output is collected in
the default shell output buffer (`*Shell Command Output*'), which is
where `sgm-to-lisp' looks for it."
  (interactive "scommand: ")
  ;; `beginning-of-buffer' / `end-of-buffer' move point and do not
  ;; return buffer positions, so they cannot serve as region bounds;
  ;; `point-min' / `point-max' are the positions wanted here.
  ;; No optional arguments are passed: in current Emacs the fifth
  ;; argument means REPLACE, which would overwrite this buffer instead
  ;; of filling the shell output buffer.
  (shell-command-on-region (point-min) (point-max) kmd))
(defun convert-simple-functions ()
  "Rewrite data lines and simple elements of the current buffer as Lisp.
Three buffer-wide regexp passes are made with `replace-regexp-all',
in this order:
  1. put a backslash in front of each double quote that follows a
     character other than a backslash;
  2. turn each line starting with a hyphen into a quoted string;
  3. collapse an opening line, one quoted string line and the matching
     closing line into a single call with the string as argument."
  (interactive)
  (let ((bare-quote "\\([^\\\\]\\)\"")
        (protected-quote "\\1\\\\\"")
        (data-line "^-\\(.*\\)$")
        (data-string "\"\\1\"")
        ;; This pattern spans three lines; its line breaks are part of
        ;; the regexp and must stay at column 0.
        (simple-element "^(\\(\\w+\\)$
^\"\\(.*\\)\"$
^)\\1$")
        (simple-call "(\\1 \"\\2\")"))
    (replace-regexp-all bare-quote protected-quote)
    (replace-regexp-all data-line data-string)
    (replace-regexp-all simple-element simple-call)))
(defun convert-tokens ()
  "Turn each `NAME TOKEN VALUE' line of the buffer into a `setq' form.
NAME and VALUE must each be a single run of word characters; the
result assigns the string VALUE to the variable NAME."
  (let ((token-line "^\\(\\w+\\) TOKEN \\(\\w+\\)$")
        (assignment "(setq \\1 \"\\2\")"))
    (replace-regexp-all token-line assignment)))
(defun convert-endmark ()
  "Replace the trailing `C' marker with a call to `sgmls-output-end'.
Only the last three lines of the buffer are examined: every `C' found
at the beginning of a line there is replaced by the text
\"(sgmls-output-end)\".  Buffers shorter than three lines are handled
without signalling an error.  Returns nil.
NOTE(review): this presumably targets the final `C' line that sgmls
writes at the end of its output -- confirm against the sgmls manual."
  (goto-char (point-max))
  ;; `previous-line' is an interactive command that signals an error
  ;; when it hits the start of the buffer; `forward-line' simply stops
  ;; there and always lands at the beginning of a line.
  (forward-line -3)
  (while (re-search-forward "^C" nil t)
    ;; FIXEDCASE and LITERAL: insert the call text exactly as written.
    (replace-match "(sgmls-output-end)" t t)))

(defun convert-remaining-functions ()
  "Convert the remaining opening and closing lines into nested calls.
A line consisting of an open parenthesis and a name becomes the start
of a call to that name whose single argument is an open `concat' form.
A line consisting of a close parenthesis and a name becomes the two
closing parentheses, followed by the name as a trailing comment."
  (let ((open-line "^(\\(\\w+\\)$")
        (open-form "(\\1 (concat ")
        (close-line "^)\\(\\w+\\)$")
        (close-form " )) ;\\1"))
    (replace-regexp-all open-line open-form)
    (replace-regexp-all close-line close-form)))

(defun sgmls-to-lisp ()
  "Convert sgmls output in the current buffer into a series of Lisp calls.
The functions being called have no meaning of their own: each one must
be given an application-specific definition in a `defun' before the
result can generate input for the intended application.

The work is done by four passes, run in this order:
`convert-simple-functions', `convert-remaining-functions',
`convert-tokens' and `convert-endmark'."
  (interactive)
  (progn
    (convert-simple-functions)
    (convert-remaining-functions)
    (convert-tokens)
    (convert-endmark)))

(defun sgm-to-lisp ()
  "Parse the SGML document in the current buffer and produce e-lisp code.
The buffer is handed to the external parser `sgmls' by means of
`shell-command-on-buffer'; the buffer `*Shell Command Output*' is then
selected and converted in place with `sgmls-to-lisp'."
  (interactive)
  (let ((sgml-parser-command "sgmls")
        (sgml-parser-output "*Shell Command Output*"))
    (shell-command-on-buffer sgml-parser-command)
    (switch-to-buffer sgml-parser-output)
    (sgmls-to-lisp)))
(defun dtd-to-lisp ()
  "Generate dummy `defun' statements from the DTD in the current buffer.
Every line containing \"!element\" is collected with
`list-matching-lines'.  The resulting `*Occur*' buffer is selected and
rewritten in place: each collected line is reduced to the element
name, the text is upcased, and each name becomes
  (defun NAME (arg) (concat arg))
Finally a definition of `sgmls-output-end' is appended.  The generated
code is left in the `*Occur*' buffer."
  (interactive)
  (list-matching-lines "!element" nil)
  (switch-to-buffer "*Occur*")
  ;; occur leaves its buffer read-only; the edits below would otherwise
  ;; signal `buffer-read-only'.
  (setq buffer-read-only nil)
  ;; Drop the header line that occur puts first.  `delete-region' keeps
  ;; the kill ring clean and, unlike `kill-line', does not signal an
  ;; error when the buffer is empty.
  (goto-char (point-min))
  (delete-region (point) (progn (forward-line 1) (point)))
  (replace-regexp-all "^.*!element \\(\\w*\\) .*$" "\\1")
  ;; Upcase the whole buffer directly instead of going through
  ;; `mark-whole-buffer', which would clobber the user's mark.
  (upcase-region (point-min) (point-max))
  (replace-regexp-all "^\\(\\w+\\)$" "(defun \\1 (arg) (concat arg))")
  (goto-char (point-max))
  (insert "(defun sgmls-output-end () (setq ok \"ok\"))"))