Coverage report: /home/jsnell/.sbcl/site/cl-ppcre-1.2.13/parser.lisp
Kind | Covered | All | % |
expression | 270 | 285 | 94.7 |
branch | 41 | 44 | 93.2 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2
;;; $Header: /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp $
4
;;; The parser will - with the help of the lexer - parse a regex
5
;;; string and convert it into a "parse tree" (see docs for details
6
;;; about the syntax of these trees). Note that the lexer might return
7
;;; illegal parse trees. It is assumed that the conversion process
8
;;; later on will track them down.
10
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
12
;;; Redistribution and use in source and binary forms, with or without
13
;;; modification, are permitted provided that the following conditions
16
;;; * Redistributions of source code must retain the above copyright
17
;;; notice, this list of conditions and the following disclaimer.
19
;;; * Redistributions in binary form must reproduce the above
20
;;; copyright notice, this list of conditions and the following
21
;;; disclaimer in the documentation and/or other materials
22
;;; provided with the distribution.
24
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
25
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
28
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
30
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
32
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
33
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36
(in-package #:cl-ppcre)
39
(declare #.*standard-optimize-settings*)
40
"Parses and consumes a <group>.
41
The productions are: <group> -> \"(\"<regex>\")\"
44
\"(?<flags>:\"<regex>\")\"
49
\"(?(\"<num>\")\"<regex>\")\"
50
\"(?(\"<regex>\")\"<regex>\")\"
52
where <flags> is parsed by the lexer function MAYBE-PARSE-FLAGS.
53
Will return <parse-tree> or (<grouping-type> <parse-tree>) where
54
<grouping-type> is one of six keywords - see source for details."
55
(multiple-value-bind (open-token flags)
57
(cond ((eq open-token :open-paren-paren)
58
;; special case for conditional regular expressions; note
59
;; that at this point we accept a couple of illegal
60
;; combinations which'll be sorted out later by the
62
(let* ((open-paren-pos (car (lexer-last-pos lexer)))
63
;; check if what follows "(?(" is a number
64
(number (try-number lexer :no-whitespace-p t))
65
;; make changes to extended-mode-p local
66
(*extended-mode-p* *extended-mode-p*))
67
(declare (type fixnum open-paren-pos))
69
;; condition is a number (i.e. refers to a
71
(let* ((inner-close-token (get-token lexer))
72
(reg-expr (reg-expr lexer))
73
(close-token (get-token lexer)))
74
(unless (eq inner-close-token :close-paren)
75
(signal-ppcre-syntax-error*
77
"Opening paren has no matching closing paren"))
78
(unless (eq close-token :close-paren)
79
(signal-ppcre-syntax-error*
81
"Opening paren has no matching closing paren"))
82
(list :branch number reg-expr)))
84
;; condition must be a full regex (actually a
85
;; look-behind or look-ahead); and here comes a
86
;; terrible kludge: instead of being cleanly
87
;; separated from the lexer, the parser pushes
88
;; back the lexer by one position, thereby
89
;; landing in the middle of the 'token' "(?(" -
91
(decf (lexer-pos lexer))
92
(let* ((inner-reg-expr (group lexer))
93
(reg-expr (reg-expr lexer))
94
(close-token (get-token lexer)))
95
(unless (eq close-token :close-paren)
96
(signal-ppcre-syntax-error*
98
"Opening paren has no matching closing paren"))
99
(list :branch inner-reg-expr reg-expr))))))
100
((member open-token '(:open-paren
104
:open-paren-exclamation
105
:open-paren-less-equal
106
:open-paren-less-exclamation)
108
;; make changes to extended-mode-p local
109
(let ((*extended-mode-p* *extended-mode-p*))
110
;; we saw one of the six token representing opening
112
(let* ((open-paren-pos (car (lexer-last-pos lexer)))
113
(reg-expr (reg-expr lexer))
114
(close-token (get-token lexer)))
115
(when (eq open-token :open-paren)
116
;; if this is the "("<regex>")" production we have to
117
;; increment the register counter of the lexer
118
(incf (lexer-reg lexer)))
119
(unless (eq close-token :close-paren)
120
;; the token following <regex> must be the closing
121
;; parenthesis or this is a syntax error
122
(signal-ppcre-syntax-error*
124
"Opening paren has no matching closing paren"))
126
;; if the lexer has returned a list of flags this must
127
;; have been the "(?:"<regex>")" production
128
(cons :group (nconc flags (list reg-expr)))
129
(list (case open-token
134
((:open-paren-greater)
138
((:open-paren-exclamation)
140
((:open-paren-less-equal)
141
:positive-lookbehind)
142
((:open-paren-less-exclamation)
143
:negative-lookbehind))
146
;; this is the <legal-token> production; <legal-token> is
147
;; any token which passes START-OF-SUBEXPR-P (otherwise
148
;; parsing had already stopped in the SEQ method)
151
(defun greedy-quant (lexer)
152
(declare #.*standard-optimize-settings*)
153
"Parses and consumes a <greedy-quant>.
154
The productions are: <greedy-quant> -> <group> | <group><quantifier>
155
where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
156
Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
157
(let* ((group (group lexer))
158
(token (get-quantifier lexer)))
160
;; if GET-QUANTIFIER returned a non-NIL value it's the
161
;; two-element list (<min> <max>)
162
(list :greedy-repetition (first token) (second token) group)
166
(declare #.*standard-optimize-settings*)
167
"Parses and consumes a <quant>.
168
The productions are: <quant> -> <greedy-quant> | <greedy-quant>\"?\".
169
Will return the <parse-tree> returned by GREEDY-QUANT and optionally
170
change :GREEDY-REPETITION to :NON-GREEDY-REPETITION."
171
(let* ((greedy-quant (greedy-quant lexer))
172
(pos (lexer-pos lexer))
173
(next-char (next-char lexer)))
175
(if (char= next-char #\?)
176
(setf (car greedy-quant) :non-greedy-repetition)
177
(setf (lexer-pos lexer) pos)))
181
(declare #.*standard-optimize-settings*)
182
"Parses and consumes a <seq>.
183
The productions are: <seq> -> <quant> | <quant><seq>.
184
Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
185
(flet ((make-array-from-two-chars (char1 char2)
186
(let ((string (make-array 2
187
:element-type 'character
190
(setf (aref string 0) char1)
191
(setf (aref string 1) char2)
193
;; Note that we're calling START-OF-SUBEXPR-P before we actually try
194
;; to parse a <seq> or <quant> in order to catch empty regular
196
(if (start-of-subexpr-p lexer)
197
(let ((quant (quant lexer)))
198
(if (start-of-subexpr-p lexer)
199
(let* ((seq (seq lexer))
200
(quant-is-char-p (characterp quant))
201
(seq-is-sequence-p (and (consp seq)
202
(eq (first seq) :sequence))))
203
(cond ((and quant-is-char-p
205
(make-array-from-two-chars seq quant))
206
((and quant-is-char-p
208
(vector-push-extend quant seq)
210
((and quant-is-char-p
212
(characterp (second seq)))
216
(make-array-from-two-chars (second seq)
220
(t (make-array-from-two-chars (second seq) quant))))
221
((and quant-is-char-p
223
(stringp (second seq)))
228
(vector-push-extend quant (second seq))
233
(vector-push-extend quant (second seq))
236
;; if <seq> is also a :SEQUENCE parse tree we merge
237
;; both lists into one to avoid unnecessary consing
239
(cons quant (cdr seq)))
241
(t (list :sequence quant seq))))
245
(defun reg-expr (lexer)
246
(declare #.*standard-optimize-settings*)
247
"Parses and consumes a <regex>, a complete regular expression.
248
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
249
Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
250
(let ((pos (lexer-pos lexer)))
251
(case (next-char lexer)
253
;; if we didn't get any token we return :VOID which stands for
254
;; "empty regular expression"
257
;; now check whether the expression started with a vertical
258
;; bar, i.e. <seq> - the left alternation - is empty
259
(list :alternation :void (reg-expr lexer)))
261
;; otherwise un-read the character we just saw and parse a
262
;; <seq> plus the character following it
263
(setf (lexer-pos lexer) pos)
264
(let* ((seq (seq lexer))
265
(pos (lexer-pos lexer)))
266
(case (next-char lexer)
268
;; no further character, just a <seq>
271
;; if the character was a vertical bar, this is an
272
;; alternation and we have the second production
273
(let ((reg-expr (reg-expr lexer)))
274
(cond ((and (consp reg-expr)
275
(eq (first reg-expr) :alternation))
276
;; again we try to merge as above in SEQ
278
(cons seq (cdr reg-expr)))
280
(t (list :alternation seq reg-expr)))))
282
;; a character which is not a vertical bar - this is
283
;; either a syntax error or we're inside of a group and
284
;; the next character is a closing parenthesis; so we
285
;; just un-read the character and let another function
287
(setf (lexer-pos lexer) pos)
290
(defun reverse-strings (parse-tree)
291
(declare #.*standard-optimize-settings*)
292
(cond ((stringp parse-tree)
293
(nreverse parse-tree))
295
(loop for parse-tree-rest on parse-tree
296
while parse-tree-rest
297
do (setf (car parse-tree-rest)
298
(reverse-strings (car parse-tree-rest))))
302
(defun parse-string (string)
303
(declare #.*standard-optimize-settings*)
304
"Translate the regex string STRING into a parse tree."
305
(let* ((lexer (make-lexer string))
306
(parse-tree (reverse-strings (reg-expr lexer))))
307
;; check whether we've consumed the whole regex string
308
(if (end-of-string-p lexer)
310
(signal-ppcre-syntax-error*
312
"Expected end of string"))))