2010年3月17日水曜日

正規表現からNFAを作る

ドラゴンブックの字句解析の項目を参考に、正規表現を表す文字列からNFAを作る。

(defvar *label-count* 0)

;;遷移図で特殊な入力記号として用いるもの
;; :epsilon イプシロン遷移
;; :all 任意の1文字

(defun mklist (obj)
(if (listp obj) obj (list obj)))

(defmacro do-hash ((key val) table &body body)
`(maphash
#'(lambda (,key ,val)
,@body)
,table))

(defun make-new-state ()
(incf *label-count*)
(intern (format nil "STATE-~A" *label-count*) :keyword))

(defun get-states (input move-table)
(or (cdr (assoc input move-table))
(if (eq input :epsilon)
nil
(cdr (assoc :all move-table)))))

;;tableはハッシュテーブルで、状態をキーとして、
;;入力とそれに対する遷移後の状態をalistで保存する
(defstruct NFA
start ;初期状態
table ;遷移図
f) ;受理状態の集合

(defun make-nfa-char (ch)
(let ((i (make-new-state))
(f (make-new-state)))
(let ((table (make-hash-table)))
(setf
(gethash i table)
(acons ch (list f) nil))
(make-nfa
:start i
:table table
:f (list f)))))

;;任意の一文字を受理する
(defun make-nfa-all ()
(let ((i (make-new-state))
(f (list (make-new-state)))
(table (make-hash-table)))
(setf (gethash i table)
(acons :all f nil))
(make-nfa
:start i
:f f
:table table)))


;;正規表現abを、a,bを表すNFA1,NFA2を連結して作成
(defun merge-nfa-concat (nfa1 nfa2)
(let ((i (nfa-start nfa1))
(nfa1f (mklist (nfa-f nfa1)))
(nfa2i (nfa-start nfa2))
(f (mklist (nfa-f nfa2))))
(let ((table (make-hash-table)))
(do-hash (s val) (nfa-table nfa1)
(setf
(gethash s table)
val))
(do-hash (s val) (nfa-table nfa2)
(dolist (move val)
;;nfa2の初期状態とnfa1の終了状態をくっつける
(if (eq s nfa2i)
(dolist (f nfa1f)
(push
move
(gethash f table)))
(push
move
(gethash s table)))))
(make-nfa
:start i
:f f
:table table))))

;;a|bをあらわすNFAを合成する
(defun merge-nfa-or (nfa1 nfa2)
(let ((newi (make-new-state))
(newf (make-new-state))
(table (make-hash-table)))
;;遷移図をコピー
(do-hash (key val) (nfa-table nfa1)
(setf (gethash key table) val))
(do-hash (key val) (nfa-table nfa2)
(setf (gethash key table) val))
;;新しい初期状態(newi)からのイプシロン遷移
(push
(list :epsilon (nfa-start nfa1) (nfa-start nfa2))
(gethash newi table))
;;新しい終了状態(newf)へのイプシロン遷移を追加
(dolist (f (mklist (nfa-f nfa1)))
(let ((old (gethash f table)))
(push
`(:epsilon ,newf ,@(get-states :epsilon old))
(gethash f table))))
(dolist (f (mklist (nfa-f nfa2)))
(let ((old (gethash f table)))
(push
`(:epsilon ,newf ,@(get-states :epsilon old))
(gethash f table))))
(make-nfa
:start newi
:f (list newf)
:table table)))

(defun make-nfa-loop (nfa)
(let ((newi (make-new-state))
(newf (make-new-state))
(table (make-hash-table)))
;;nfaのテーブルをコピー
(do-hash (k v) (nfa-table nfa)
(setf (gethash k table) v))
;;newiからnewfへのイプシロン遷移
;;newiから(nfa-start nfa)へのイプシロン遷移
(setf
(gethash newi table)
`((:epsilon ,(nfa-start nfa) ,newf)))
;;(nfa-f nfa)から(nfa-start nfa),newfへのイプシロン遷移
(dolist (f (mklist (nfa-f nfa)))
(let ((old (gethash f table)))
(push
`(:epsilon ,newf ,(nfa-start nfa) ,@(get-states :epsilon old))
(gethash f table))))
(make-nfa
:start newi
:f (list newf)
:table table)))

(defun nfa-start-states (nfa)
(move-epsilon (nfa-table nfa)
(nfa-start nfa)))

(defun move (nfa states input)
(let ((states (mklist states))
(table (nfa-table nfa)))
(move-epsilon
table
(move-inner table states input))))

(defun move-inner (table states input)
(if (not (listp states))
(get-states input (gethash states table))
(let ((result nil))
(dolist (s states)
(dolist (next (get-states input (gethash s table)))
(pushnew next result)))
result)))

(defun move-epsilon (table states)
(let ((unchecked (if (listp states) states (list states)))
(checked nil))
(do ((s (pop unchecked) (pop unchecked)))
((null s) checked)
(push s checked)
(dolist (next (move-inner table s :epsilon))
(unless (or (member next checked)
(member next unchecked))
(push next unchecked))))))

(defun regexp->nfa (str &optional (start 0))
(let ((len (length str))
(result nil))
(do ((i start (1+ i)))
((>= i len) (values
(reduce #'merge-nfa-concat
(nreverse result))
i))
(case (char str i)
((#\()
(multiple-value-bind (nfa next)
(regexp->nfa str (1+ i))
(push nfa result)
(setf i next)))
((#\))
(return-from regexp->nfa
(values
(reduce #'merge-nfa-concat
(nreverse result))
i)))
((#\*)
(let ((prev (pop result)))
(push (make-nfa-loop prev) result)))
((#\|)
(multiple-value-bind (nfa next)
(regexp->nfa str (1+ i))
(let ((prev
(reduce #'merge-nfa-concat
(nreverse result))))
(setf result nil)
(push
(merge-nfa-or prev nfa)
result))
(setf i next)))
((#\.)
(push (make-nfa-all) result))
(T
(push (make-nfa-char (char str i)) result))))))

(defun match (nfa str)
(let ((is (nfa-start-states nfa))
(f (nfa-f nfa))
(path nil)
(strlen (length str)))
(do ((begin 0 (1+ begin)))
((>= begin strlen) nil)
(setf path nil)
(do ((i 0 (1+ i))
(crr is crr))
((>= (+ begin i) strlen) nil)
(setf crr
(move nfa crr (char str (+ begin i))))
(if crr
(push crr path)
(setf i strlen))) ;次のループで終了する
(loop
:for sts in path
:for rest = path then (cdr rest)
:when (intersection sts f)
:do
(let ((end (+ begin (length rest))))
(return-from match
(values (subseq str begin end)
begin end)))))))

;;テスト用
(defun grep-file (reg file &optional (num nil))
(with-open-file (s file :direction :input)
(let ((nfa (regexp->nfa reg)))
(loop
:for line = (read-line s nil nil)
:for n from 1
:while line
:when (match nfa line)
:do
(if num
(format t "~A:~A~%" n line)
(format t "~A~%" line))))))

いちおう.*|の3種類を特別扱いしてくれるはず。今回のメインなのに、読み取りがうまく出来ないなぁ。

使ってみる。

>(match (regexp->nfa "'.*((lisp|scheme)|c++).*'")
"I like 'common lisp'")
"'common lisp'"
7
20
>(match (regexp->nfa "'.*((lisp|scheme)|c++).*'")
"I like 'white space'")
NIL

0 件のコメント:

コメントを投稿