From: mac
Subject: Efficient way to create strings? (was:  How to Write a Spelling Corrector)
Date: 
Message-ID: <1177738691.379826.320650@h2g2000hsg.googlegroups.com>
There was a discussion sometime ago regarding a lisp version of
Norvig's spell checking program.

http://groups.google.com/group/comp.lang.lisp/browse_thread/thread/788d1e428dad54a9/13cd97ddb1ce3c6f

However, the main concern at the time was to minimize line count.

I was curious and wrote my version that optimizes for readability.

However, when I tried it out I noticed that it is _very_ slow. Norvig
claimed that the first test case (270 words) took 13 seconds.

Here's my result (P4 2.8Ghz SBCL 1.0.3 linux)

SPELLCHECK> (time (spelltest *tests1* :verbose t))
...
User time    =  0:04:57.094
System time  =        0.840
Elapsed time =  0:05:14.309
Allocation   = 1581329808 bytes
0 Page faults

That is simply unacceptable. So I tried a few examples in the above
thread and the results were either much worst or _slightly_ better
(within 10% range).

Slime profiler indicated that the culprits were conc and edits1.

Is there a better way to create string other than (apply #'concatenate
'string args)?

Or maybe I'm using the wrong approach to translate the following
python code?

set([word[0:i]+word[i+1:] for i in range(n)] +                     #
deletion
    [word[0:i]+word[i+1]+word[i]+word[i+2:] for i in range(n-1)] + #
transposition
    [word[0:i]+c+word[i+1:] for i in range(n) for c in alphabet] + #
alteration
    [word[0:i]+c+word[i:] for i in range(n+1) for c in alphabet])  #
insertion

Suggestions welcomed.


SPELLCHECK> (swank:profile-package (find-package :spellcheck) t t)
SPELLCHECK> (time (spelltest *tests0* :verbose t))
accesing => accusing (1); expected accessing (0)
Total test cases:5; failed:1; unknown words:1; correct:80.00%
Evaluation took:
  10.577 seconds of real time
  8.012501 seconds of user run time
  2.488156 seconds of system run time

SPELLCHECK> (swank:profile-report)
  seconds  |    consed   |  calls  |  sec/call  |  name
-----------------------------------------------------------
     4.057 |  80,701,616 | 369,679 |   0.000011 | CONC
     2.276 |  44,497,896 |     599 |   0.003799 | EDITS1
     0.012 |      28,760 |      10 |   0.001196 | KNOWN
     0.000 |      19,760 |       1 |   0.000000 | SPELLTEST
     0.000 |     541,288 |       1 |   0.000000 | KNOWN-EDITS2
     0.000 |           0 |       5 |   0.000000 | CORRECT
-----------------------------------------------------------
     6.345 | 125,789,320 | 370,295 |            | Total

estimated total profiling overhead: 3.45 seconds
overhead estimation parameters:
  4.8e-8s/call, 9.32e-6s total profiling, 4.024e-6s internal profiling



;; spellcheck.lisp
;; http://norvig.com/spell-correct.html
;; this is fun

(in-package #:cl-user)

#+asdf
(eval-when (:compile-toplevel :load-toplevel :execute)
  (asdf:oos 'asdf:load-op :cl-ppcre)
  (asdf:oos 'asdf:load-op :iterate))

(defpackage #:spellcheck
  (:use #:cl #:cl-ppcre #:iterate))

(in-package #:spellcheck)

(defparameter *this-file*
  (load-time-value (or #.*compile-file-pathname* *load-pathname*)))

(defvar *words* (make-hash-table :test 'equalp)
  "Hash table of known words")

(defvar *alphabets*
  (loop for i from (char-code #\a) to (char-code #\z) collect
        (code-char i))
  "Used in edits1 insertion")

(defun slurp-file (filespec &key
                   (element-type 'character)
                   (fill-char #\space)
                   (external-format :latin-1))
  "Read file content into an array"
  (with-open-file (stream filespec :direction :input
                          :element-type element-type
                          :external-format external-format)
    (let ((seq (make-string (file-length stream)
                            :element-type element-type
                            :initial-element fill-char)))
      (read-sequence seq stream)
      seq)))

(defun read-file-into-words (filespec)
  "Parse words in file to build a lookup hashtable"
  (loop for word in (ppcre:all-matches-as-strings
                     "(?i)[a-z]+" (slurp-file filespec))
        do (incf (gethash (string-downcase word) *words* 0))))

(defun conc (&rest args)
  "Concat words"
  (apply #'concatenate 'string (mapcar #'string args)))

(defun edits1 (word)
  "Return words that are 1 edit distance away from word"
  (let ((word (string-downcase word))
        (length (length word)))
    (nconc
     ;; delete
     (loop for i from 0 below length collect
           (conc (subseq word 0 i) (subseq word (1+ i))))
     ;; insertion
     (iter (for i from 0 below length)
           (dolist (c *alphabets*)
             (collect (conc (subseq word 0 i) c (subseq word i)))))
     ;; there will be overlapping words in transposition & alteration,
     ;; and we only want the unique ones
     (nunion
      ;; transposition
      (loop for i from 0 below (1- length) collect
            (conc (subseq word 0 i)
                  (char word (1+ i))
                  (char word i)
                  (subseq word (+ 2 i))))
      ;; alteration
      (iter (for i from 0 below length)
            (dolist (c *alphabets*)
              (collect
                  (conc (subseq word 0 i) c
                        (subseq word (1+ i))))))
      :test #'string=))))

(defun known-edits2 (word &optional pre-computed-edits1)
  "Return known words that are 2 edit distance away from word"
  (iter (for edit1 in (or pre-computed-edits1 (edits1 word)))
        (dolist (edit2 (edits1 edit1))
          (when (gethash edit2 *words*)
            (collect edit2)))))

(defun known (&rest words)
  "Filter and return only known words"
  (loop for word in words
        if (gethash word *words*)
        collect word))

(defun correct (word)
  "Correct word spelling"
  (iter (for candidate in
             (or (known word)
                 (let ((pre-computed-edits1 (edits1 word)))
                   (or (apply #'known pre-computed-edits1)
                       (known-edits2 word pre-computed-edits1)))
                 (list word)))
        (finding candidate maximizing
                 (gethash candidate *words* 0))))

(defun spelltest (test &key verbose)
  "Run testcase `test'"
  (let ((n 0) (bad 0) (unknown 0))
    (loop for (target wrongs) on test by #'cddr do
          (loop for wrong in (cl-ppcre:split " " wrongs) do
                (incf n)
                (let ((word (correct wrong)))
                  (unless (string= target word)
                    (incf bad)
                    (unless (gethash target *words*)
                      (incf unknown))
                    (when verbose
                      (format t "~a => ~a (~d); expected ~a (~d)~%"
                              wrong word (gethash word *words* 0)
                              target (gethash target *words* 0)))))))
    (when verbose
      (format t "Total test cases:~d; failed:~d; unknown words:~d;
correct:~$%~%"
              n bad unknown (or (and (plusp n) (* (- 1 (/ bad n))
100)) 0)))
    (values n bad unknown (or (and (plusp n) (float (- 1 (/ bad n))))
0))))


;; We need to build a database first, get one sample file here
;; http://norvig.com/big.txt

#-(and)
(read-file-into-words (merge-pathnames "big.txt" *this-file*))

;; first try

#-(and)
(correct "aboadr")

;; run these test cases

#-(and)
(time (spelltest *tests0* :verbose t))

;; the following will take a while...
#-(and)
(time (spelltest *tests1* :verbose t))
#-(and)
(time (spelltest *tests2* :verbose t))

(defparameter *tests0*
  '("access" "acess" "accessing" "accesing"
    "accommodation" "accomodation acommodation acomodation"))

(defparameter *tests1*
  '("access" "acess" "accessing" "accesing" "accommodation"
    "accomodation acommodation acomodation" "account" "acount"
"address"
    "adress adres" "addressable" "addresable" "arranged" "aranged
arrainged"
    "arrangeing" "aranging" "arrangement" "arragment" "articles"
"articals"
    "aunt" "annt anut arnt" "auxiliary" "auxillary" "available"
"avaible"
    "awful" "awfall afful" "basically" "basicaly" "beginning"
"begining"
    "benefit" "benifit" "benefits" "benifits" "between" "beetween"
"bicycle"
    "bicycal bycicle bycycle" "biscuits"
    "biscits biscutes biscuts bisquits buiscits buiscuts" "built"
"biult"
    "cake" "cak" "career" "carrer"
    "cemetery" "cemetary semetary" "centrally" "centraly" "certain"
"cirtain"
    "challenges" "chalenges chalenges" "chapter" "chaper chaphter
chaptur"
    "choice" "choise" "choosing" "chosing" "clerical" "clearical"
    "committee" "comittee" "compare" "compair" "completely"
"completly"
    "consider" "concider" "considerable" "conciderable" "contented"
    "contenpted contende contended contentid" "curtains"
    "cartains certans courtens cuaritains curtans curtians curtions"
"decide" "descide" "decided"
    "descided" "definitely" "definately difinately" "definition"
"defenition"
    "definitions" "defenitions" "description" "discription"
"desiccate"
    "desicate dessicate dessiccate" "diagrammatically"
"diagrammaticaally"
    "different" "diffrent" "driven" "dirven" "ecstasy" "exstacy
ecstacy"
    "embarrass" "embaras embarass" "establishing" "astablishing
establising"
    "experience" "experance experiance" "experiences" "experances"
"extended"
    "extented" "extremely" "extreamly" "fails" "failes" "families"
"familes"
    "february" "febuary" "further" "futher" "gallery" "galery gallary
gallerry gallrey"
    "hierarchal" "hierachial" "hierarchy" "hierchy" "inconvenient"
    "inconvienient inconvient inconvinient" "independent" "independant
independant"
    "initial" "intial" "initials" "inetials inistals initails initals
intials"
    "juice" "guic juce jucie juise juse" "latest" "lates latets
latiest latist"
    "laugh" "lagh lauf laught lugh" "level" "leval"
    "levels" "levals" "liaison" "liaision liason" "lieu" "liew"
"literature"
    "litriture" "loans" "lones" "locally" "localy" "magnificent"
    "magnificnet magificent magnifcent magnifecent magnifiscant
magnifisent magnificant"
    "management" "managment" "meant" "ment" "minuscule" "miniscule"
    "minutes" "muinets" "monitoring" "monitering" "necessary"
    "neccesary necesary neccesary necassary necassery neccasary"
"occurrence"
    "occurence occurence" "often" "ofen offen offten ofton"
"opposite"
    "opisite oppasite oppesite oppisit oppisite opposit oppossite
oppossitte" "parallel"
    "paralel paralell parrallel parralell parrallell" "particular"
"particulaur"
    "perhaps" "perhapse" "personnel" "personnell" "planned" "planed"
"poem"
    "poame" "poems" "poims pomes" "poetry" "poartry poertry poetre
poety powetry"
    "position" "possition" "possible" "possable" "pretend"
    "pertend protend prtend pritend" "problem" "problam proble
promblem proplen"
    "pronunciation" "pronounciation" "purple" "perple perpul poarple"
    "questionnaire" "questionaire" "really" "realy relley relly"
"receipt"
    "receit receite reciet recipt" "receive" "recieve" "refreshment"
    "reafreshment refreshmant refresment refressmunt" "remember"
"rember remeber rememmer rermember"
    "remind" "remine remined" "scarcely" "scarcly scarecly scarely
scarsely"
    "scissors" "scisors sissors" "separate" "seperate"
    "singular" "singulaur" "someone" "somone" "sources" "sorces"
"southern"
    "southen" "special" "speaical specail specal speical" "splendid"
    "spledid splended splened splended" "standardizing" "stanerdizing"
"stomach"
    "stomac stomache stomec stumache" "supersede" "supercede
superceed" "there" "ther"
    "totally" "totaly" "transferred" "transfred" "transportability"
    "transportibility" "triangular" "triangulaur" "understand"
"undersand undistand"
    "unexpected" "unexpcted unexpeted unexspected" "unfortunately"
    "unfortunatly" "unique" "uneque" "useful" "usefull" "valuable"
"valubale valuble"
    "variable" "varable" "variant" "vairiant" "various" "vairious"
    "visited" "fisited viseted vistid vistied" "visitors" "vistors"
    "voluntary" "volantry" "voting" "voteing" "wanted" "wantid wonted"
    "whether" "wether" "wrote" "rote wote"))

(defparameter *tests2*
  '("forbidden" "forbiden" "decisions" "deciscions descisions"
    "supposedly" "supposidly" "embellishing" "embelishing" "technique"
    "tecnique" "permanently" "perminantly" "confirmation"
"confermation"
    "appointment" "appoitment" "progression" "progresion"
"accompanying"
    "acompaning" "applicable" "aplicable" "regained" "regined"
"guidelines"
    "guidlines" "surrounding" "serounding" "titles" "tittles"
"unavailable"
    "unavailble" "advantageous" "advantageos" "brief" "brif" "appeal"
    "apeal" "consisting" "consisiting" "clerk" "cleark clerck"
"component"
    "componant" "favourable" "faverable" "separation" "seperation"
"search"
    "serch" "receive" "recieve" "employees" "emploies" "prior" "piror"
    "resulting" "reulting" "suggestion" "sugestion" "opinion"
"oppinion"
    "cancellation" "cancelation" "criticism" "citisum" "useful"
"usful"
    "humour" "humor" "anomalies" "anomolies" "would" "whould" "doubt"
    "doupt" "examination" "eximination" "therefore" "therefoe"
"recommend"
    "recomend" "separated" "seperated" "successful" "sucssuful
succesful"
    "apparent" "apparant" "occurred" "occureed" "particular"
"paerticulaur"
    "pivoting" "pivting" "announcing" "anouncing" "challenge"
"chalange"
    "arrangements" "araingements" "proportions" "proprtions"
"organized"
    "oranised" "accept" "acept" "dependence" "dependance" "unequalled"
    "unequaled" "numbers" "numbuers" "sense" "sence" "conversely"
    "conversly" "provide" "provid" "arrangement" "arrangment"
    "responsibilities" "responsiblities" "fourth" "forth" "ordinary"
    "ordenary" "description" "desription descvription desacription"
    "inconceivable" "inconcievable" "data" "dsata" "register"
"rgister"
    "supervision" "supervison" "encompassing" "encompasing"
"negligible"
    "negligable" "allow" "alow" "operations" "operatins" "executed"
    "executted" "interpretation" "interpritation" "hierarchy"
"heiarky"
    "indeed" "indead" "years" "yesars" "through" "throut" "committee"
    "committe" "inquiries" "equiries" "before" "befor" "continued"
    "contuned" "permanent" "perminant" "choose" "chose" "virtually"
    "vertually" "correspondence" "correspondance" "eventually"
"eventully"
    "lonely" "lonley" "profession" "preffeson" "they" "thay" "now"
"noe"
    "desperately" "despratly" "university" "unversity" "adjournment"
    "adjurnment" "possibilities" "possablities" "stopped" "stoped"
"mean"
    "meen" "weighted" "wagted" "adequately" "adequattly" "shown"
"hown"
    "matrix" "matriiix" "profit" "proffit" "encourage" "encorage"
"collate"
    "colate" "disaggregate" "disaggreagte disaggreaget" "receiving"
    "recieving reciving" "proviso" "provisoe" "umbrella" "umberalla"
"approached"
    "aproached" "pleasant" "plesent" "difficulty" "dificulty"
"appointments"
    "apointments" "base" "basse" "conditioning" "conditining"
"earliest"
    "earlyest" "beginning" "begining" "universally" "universaly"
    "unresolved" "unresloved" "length" "lengh" "exponentially"
    "exponentualy" "utilized" "utalised" "set" "et" "surveys"
"servays"
    "families" "familys" "system" "sysem" "approximately"
"aproximatly"
    "their" "ther" "scheme" "scheem" "speaking" "speeking"
"repetitive"
    "repetative" "inefficient" "ineffiect" "geneva" "geniva" "exactly"
    "exsactly" "immediate" "imediate" "appreciation" "apreciation"
"luckily"
    "luckeley" "eliminated" "elimiated" "believe" "belive"
"appreciated"
    "apreciated" "readjusted" "reajusted" "were" "wer where" "feeling"
    "fealing" "and" "anf" "false" "faulse" "seen" "seeen"
"interrogating"
    "interogationg" "academically" "academicly" "relatively"
"relativly relitivly"
    "traditionally" "traditionaly" "studying" "studing"
    "majority" "majorty" "build" "biuld" "aggravating" "agravating"
    "transactions" "trasactions" "arguing" "aurguing" "sheets"
"sheertes"
    "successive" "sucsesive sucessive" "segment" "segemnt"
"especially"
    "especaily" "later" "latter" "senior" "sienior" "dragged" "draged"
    "atmosphere" "atmospher" "drastically" "drasticaly" "particularly"
    "particulary" "visitor" "vistor" "session" "sesion" "continually"
    "contually" "availability" "avaiblity" "busy" "buisy" "parameters"
    "perametres" "surroundings" "suroundings seroundings" "employed"
    "emploied" "adequate" "adiquate" "handle" "handel" "means" "meens"
    "familiar" "familer" "between" "beeteen" "overall" "overal"
"timing"
    "timeing" "committees" "comittees commitees" "queries" "quies"
    "econometric" "economtric" "erroneous" "errounous" "decides"
"descides"
    "reference" "refereence refference" "intelligence" "inteligence"
    "edition" "ediion ediition" "are" "arte" "apologies" "appologies"
    "thermawear" "thermawere thermawhere" "techniques" "tecniques"
    "voluntary" "volantary" "subsequent" "subsequant subsiquent"
"currently"
    "curruntly" "forecast" "forcast" "weapons" "wepons" "routine"
"rouint"
    "neither" "niether" "approach" "aproach" "available" "availble"
    "recently" "reciently" "ability" "ablity" "nature" "natior"
    "commercial" "comersial" "agencies" "agences" "however" "howeverr"
    "suggested" "sugested" "career" "carear" "many" "mony" "annual"
    "anual" "according" "acording" "receives" "recives recieves"
    "interesting" "intresting" "expense" "expence" "relevant"
    "relavent relevaant" "table" "tasble" "throughout" "throuout"
"conference"
    "conferance" "sensible" "sensable" "described" "discribed
describd"
    "union" "unioun" "interest" "intrest" "flexible" "flexable"
"refered"
    "reffered" "controlled" "controled" "sufficient" "suficient"
    "dissension" "desention" "adaptable" "adabtable" "representative"
    "representitive" "irrelevant" "irrelavent" "unnecessarily"
"unessasarily"
    "applied" "upplied" "apologised" "appologised" "these" "thees
thess"
    "choices" "choises" "will" "wil" "procedure" "proceduer"
"shortened"
    "shortend" "manually" "manualy" "disappointing" "dissapoiting"
    "excessively" "exessively" "comments" "coments" "containing"
"containg"
    "develop" "develope" "credit" "creadit" "government" "goverment"
    "acquaintances" "aquantences" "orientated" "orentated" "widely"
"widly"
    "advise" "advice" "difficult" "dificult" "investigated"
"investegated"
    "bonus" "bonas" "conceived" "concieved" "nationally" "nationaly"
    "compared" "comppared compased" "moving" "moveing" "necessity"
    "nessesity" "opportunity" "oppertunity oppotunity opperttunity"
"thoughts"
    "thorts" "equalled" "equaled" "variety" "variatry" "analysis"
    "analiss analsis analisis" "patterns" "pattarns" "qualities"
"quaties" "easily"
    "easyly" "organization" "oranisation oragnisation" "the" "thw hte
thi"
    "corporate" "corparate" "composed" "compossed" "enormously"
"enomosly"
    "financially" "financialy" "functionally" "functionaly"
"discipline"
    "disiplin" "announcement" "anouncement" "progresses" "progressess"
    "except" "excxept" "recommending" "recomending" "mathematically"
    "mathematicaly" "source" "sorce" "combine" "comibine" "input"
"inut"
    "careers" "currers carrers" "resolved" "resoved" "demands"
"diemands"
    "unequivocally" "unequivocaly" "suffering" "suufering"
"immediately"
    "imidatly imediatly" "accepted" "acepted" "projects" "projeccts"
    "necessary" "necasery nessasary nessisary neccassary" "journalism"
    "journaism" "unnecessary" "unessessay" "night" "nite" "output"
    "oputput" "security" "seurity" "essential" "esential" "beneficial"
    "benificial benficial" "explaining" "explaning" "supplementary"
    "suplementary" "questionnaire" "questionare" "employment"
"empolyment"
    "proceeding" "proceding" "decision" "descisions descision" "per"
"pere"
    "discretion" "discresion" "reaching" "reching" "analysed"
"analised"
    "expansion" "expanion" "although" "athough" "subtract" "subtrcat"
    "analysing" "aalysing" "comparison" "comparrison" "months"
"monthes"
    "hierarchal" "hierachial" "misleading" "missleading" "commit"
"comit"
    "auguments" "aurgument" "within" "withing" "obtaining" "optaning"
    "accounts" "acounts" "primarily" "pimarily" "operator" "opertor"
    "accumulated" "acumulated" "extremely" "extreemly" "there" "thear"
    "summarys" "sumarys" "analyse" "analiss" "understandable"
    "understadable" "safeguard" "safegaurd" "consist" "consisit"
    "declarations" "declaratrions" "minutes" "muinutes muiuets"
"associated"
    "assosiated" "accessibility" "accessability" "examine" "examin"
    "surveying" "servaying" "politics" "polatics" "annoying" "anoying"
    "again" "agiin" "assessing" "accesing" "ideally" "idealy"
"scrutinized"
    "scrutiniesed" "simular" "similar" "personnel" "personel"
"whereas"
    "wheras" "when" "whn" "geographically" "goegraphicaly" "gaining"
    "ganing" "requested" "rquested" "separate" "seporate" "students"
    "studens" "prepared" "prepaired" "generated" "generataed"
"graphically"
    "graphicaly" "suited" "suted" "variable" "varible vaiable"
"building"
    "biulding" "required" "reequired" "necessitates" "nessisitates"
    "together" "togehter" "profits" "proffits"))