Coverage report: /home/jsnell/.sbcl/site/cl-ppcre-1.2.13/lexer.lisp
Kind | Covered | All | % |
expression | 493 | 556 | 88.7 |
branch | 70 | 76 | 92.1 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp $
4
;;; The lexer's responsibility is to convert the regex string into a
5
;;; sequence of tokens which are in turn consumed by the parser.
7
;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'
8
;;; (with a little help from the parser) how many register groups it
9
;;; has opened so far. (The latter is necessary for interpreting
10
;;; strings like "\\10" correctly.)
12
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
14
;;; Redistribution and use in source and binary forms, with or without
15
;;; modification, are permitted provided that the following conditions
18
;;; * Redistributions of source code must retain the above copyright
19
;;; notice, this list of conditions and the following disclaimer.
21
;;; * Redistributions in binary form must reproduce the above
22
;;; copyright notice, this list of conditions and the following
23
;;; disclaimer in the documentation and/or other materials
24
;;; provided with the distribution.
26
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
27
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
30
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
32
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
34
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
(in-package #:cl-ppcre)
40
(declaim (inline map-char-to-special-class))
41
(defun map-char-to-special-char-class (chr)
42
(declare #.*standard-optimize-settings*)
43
"Maps escaped characters like \"\\d\" to the tokens which represent
44
their associated character classes."
55
:whitespace-char-class)
57
:non-whitespace-char-class)))
60
(declare #.*standard-optimize-settings*)
61
(defstruct (lexer (:constructor make-lexer-internal))
62
"LEXER structures are used to hold the regex string which is
63
currently lexed and to keep track of the lexer's state."
77
(defun make-lexer (string)
78
(declare (inline make-lexer-internal)
79
#-genera (type string string))
80
(make-lexer-internal :str (maybe-coerce-to-simple-string string)
81
:len (length string)))
83
(declaim (inline end-of-string-p))
84
(defun end-of-string-p (lexer)
85
(declare #.*standard-optimize-settings*)
86
"Tests whether we're at the end of the regex string."
90
(declaim (inline looking-at-p))
91
(defun looking-at-p (lexer chr)
92
(declare #.*standard-optimize-settings*)
93
"Tests whether the next character the lexer would see is CHR.
94
Does not respect extended mode."
95
(and (not (end-of-string-p lexer))
96
(char= (schar (lexer-str lexer) (lexer-pos lexer))
99
(declaim (inline next-char-non-extended))
100
(defun next-char-non-extended (lexer)
101
(declare #.*standard-optimize-settings*)
102
"Returns the next character which is to be examined and updates the
103
POS slot. Does not respect extended mode."
104
(cond ((end-of-string-p lexer)
108
(schar (lexer-str lexer) (lexer-pos lexer))
109
(incf (lexer-pos lexer))))))
111
(defun next-char (lexer)
112
(declare #.*standard-optimize-settings*)
113
"Returns the next character which is to be examined and updates the
114
POS slot. Respects extended mode, i.e. whitespace, comments, and also
115
nested comments are skipped if applicable."
116
(let ((next-char (next-char-non-extended lexer))
119
;; remember where we started
120
(setq last-loop-pos (lexer-pos lexer))
121
;; first we look for nested comments like (?#foo)
123
(char= next-char #\()
124
(looking-at-p lexer #\?))
125
(incf (lexer-pos lexer))
126
(cond ((looking-at-p lexer #\#)
127
;; must be a nested comment - so we have to search for
128
;; the closing parenthesis
129
(let ((error-pos (- (lexer-pos lexer) 2)))
131
;; loop 'til ')' or end of regex string and
132
;; return NIL if ')' wasn't encountered
133
(loop for skip-char = next-char
134
then (next-char-non-extended lexer)
136
(char/= skip-char #\)))
137
finally (return skip-char))
138
(signal-ppcre-syntax-error*
140
"Comment group not closed")))
141
(setq next-char (next-char-non-extended lexer)))
143
;; undo effect of previous INCF if we didn't see a #
144
(decf (lexer-pos lexer)))))
145
(when *extended-mode-p*
146
;; now - if we're in extended mode - we skip whitespace and
147
;; comments; repeat the following loop while we look at
149
(loop while (and next-char
150
(or (char= next-char #\#)
151
(whitespacep next-char)))
153
(if (char= next-char #\#)
154
;; if we saw a comment marker skip until
155
;; we're behind #\Newline...
156
(loop for skip-char = next-char
157
then (next-char-non-extended lexer)
159
(char/= skip-char #\Newline))
160
finally (return (next-char-non-extended lexer)))
161
;; ...otherwise (whitespace) skip until we
162
;; see the next non-whitespace character
163
(loop for skip-char = next-char
164
then (next-char-non-extended lexer)
166
(whitespacep skip-char))
167
finally (return skip-char))))))
168
;; if the position has moved we have to repeat our tests
169
;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which
170
;; would be equivalent to /^a{3}c/ in Perl
171
(unless (> (lexer-pos lexer) last-loop-pos)
172
(return next-char)))))
174
(declaim (inline fail))
176
(declare #.*standard-optimize-settings*)
177
"Moves (LEXER-POS LEXER) back to the last position stored in
178
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
179
(unless (lexer-last-pos lexer)
180
(signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
181
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
184
(defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
185
(declare #.*standard-optimize-settings*)
186
"Read and consume the number the lexer is currently looking at and
187
return it. Returns NIL if no number could be identified.
188
RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
189
at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
190
we don't tolerate whitespace in front of the number."
191
(when (or (end-of-string-p lexer)
193
(whitespacep (schar (lexer-str lexer) (lexer-pos lexer)))))
194
(return-from get-number nil))
195
(multiple-value-bind (integer new-pos)
196
(parse-integer (lexer-str lexer)
197
:start (lexer-pos lexer)
199
(let ((end-pos (+ (lexer-pos lexer)
200
(the fixnum max-length)))
201
(lexer-len (lexer-len lexer)))
202
(if (< end-pos lexer-len)
208
(cond ((and integer (>= (the fixnum integer) 0))
209
(setf (lexer-pos lexer) new-pos)
213
(declaim (inline try-number))
214
(defun try-number (lexer &key (radix 10) max-length no-whitespace-p)
215
(declare #.*standard-optimize-settings*)
216
"Like GET-NUMBER but won't consume anything if no number is seen."
217
;; remember current position
218
(push (lexer-pos lexer) (lexer-last-pos lexer))
219
(let ((number (get-number lexer
221
:max-length max-length
222
:no-whitespace-p no-whitespace-p)))
223
(or number (fail lexer))))
225
(declaim (inline make-char-from-code))
226
(defun make-char-from-code (number error-pos)
227
(declare #.*standard-optimize-settings*)
228
"Create character from char-code NUMBER. NUMBER can be NIL
229
which is interpreted as 0. ERROR-POS is the position where
230
the corresponding number started within the regex string."
231
;; only look at rightmost eight bits in compliance with Perl
232
(let ((code (logand #o377 (the fixnum (or number 0)))))
233
(or (and (< code char-code-limit)
235
(signal-ppcre-syntax-error*
237
"No character for hex-code ~X"
240
(defun unescape-char (lexer)
241
(declare #.*standard-optimize-settings*)
242
"Convert the characters(s) following a backslash into a token
243
which is returned. This function is to be called when the backslash
244
has already been consumed. Special character classes like \\W are
246
(when (end-of-string-p lexer)
247
(signal-ppcre-syntax-error "String ends with backslash"))
248
(let ((chr (next-char-non-extended lexer)))
251
;; if \Q quoting is on this is ignored, otherwise it's just an
257
;; \cx means control-x in Perl
258
(let ((next-char (next-char-non-extended lexer)))
260
(signal-ppcre-syntax-error*
262
"Character missing after '\\c' at position ~A"))
263
(code-char (logxor #x40 (char-code (char-upcase next-char))))))
265
;; \x should be followed by a hexadecimal char code,
266
;; two digits or less
267
(let* ((error-pos (lexer-pos lexer))
268
(number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t)))
269
;; note that it is OK if \x is followed by zero digits
270
(make-char-from-code number error-pos)))
271
((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
272
;; \x should be followed by an octal char code,
273
;; three digits or less
274
(let* ((error-pos (decf (lexer-pos lexer)))
275
(number (get-number lexer :radix 8 :max-length 3)))
276
(make-char-from-code number error-pos)))
277
;; the following five character names are 'semi-standard'
278
;; according to the CLHS but I'm not aware of any implementation
279
;; that doesn't implement them
291
(code-char 7)) ; ASCII bell
293
(code-char 27)) ; ASCII escape
295
;; all other characters aren't affected by a backslash
298
(defun collect-char-class (lexer)
299
(declare #.*standard-optimize-settings*)
300
"Reads and consumes characters from regex string until a right
301
bracket is seen. Assembles them into a list \(which is returned) of
302
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
303
tokens representing special character classes."
304
(let ((start-pos (lexer-pos lexer)) ; remember start for error message
308
(flet ((handle-char (c)
309
"Do the right thing with character C depending on whether
310
we're inside a range or not."
311
(cond ((and hyphen-seen last-char)
312
(setf (car list) (list :range last-char c)
317
(setq hyphen-seen nil)))
318
(loop for first = t then nil
319
for c = (next-char-non-extended lexer)
320
;; leave loop if at end of string
324
;; we've seen a backslash
325
(let ((next-char (next-char-non-extended lexer)))
327
((#\d #\D #\w #\W #\s #\S)
328
;; a special character class
329
(push (map-char-to-special-char-class next-char) list)
330
;; if the last character was a hyphen
331
;; just collect it literally
334
;; if the next character is a hyphen do the same
335
(when (looking-at-p lexer #\-)
337
(incf (lexer-pos lexer)))
338
(setq hyphen-seen nil))
340
;; if \Q quoting is on we ignore \E,
341
;; otherwise it's just a plain #\E
342
(unless *allow-quoting*
345
;; otherwise unescape the following character(s)
346
(decf (lexer-pos lexer))
347
(handle-char (unescape-char lexer))))))
349
;; the first character must not be a right bracket
350
;; and isn't treated specially if it's a hyphen
353
;; end of character class
354
;; make sure we collect a pending hyphen
356
(setq hyphen-seen nil)
358
;; reverse the list to preserve the order intended
359
;; by the author of the regex string
360
(return-from collect-char-class (nreverse list)))
364
;; if the last character was 'just a character'
365
;; we expect to be in the middle of a range
366
(setq hyphen-seen t))
368
;; otherwise this is just an ordinary hyphen
371
;; default case - just collect the character
373
;; we can only exit the loop normally if we've reached the end
374
;; of the regex string without seeing a right bracket
375
(signal-ppcre-syntax-error*
377
"Missing right bracket to close character class"))))
379
(defun maybe-parse-flags (lexer)
380
(declare #.*standard-optimize-settings*)
381
"Reads a sequence of modifiers \(including #\\- to reverse their
382
meaning) and returns a corresponding list of \"flag\" tokens. The
383
\"x\" modifier is treated specially in that it dynamically modifies
384
the behaviour of the lexer itself via the special variable
388
for chr = (next-char-non-extended lexer)
390
do (signal-ppcre-syntax-error "Unexpected end of string")
391
while (find chr "-imsx" :test #'char=)
392
;; the first #\- will invert the meaning of all modifiers
396
else if (char= chr #\x)
397
do (setq *extended-mode-p* set)
405
:single-line-mode-p))
410
:not-multi-line-mode-p)
412
:not-single-line-mode-p))))
413
(decf (lexer-pos lexer))))
415
(defun get-quantifier (lexer)
416
(declare #.*standard-optimize-settings*)
417
"Returns a list of two values (min max) if what the lexer is looking
418
at can be interpreted as a quantifier. Otherwise returns NIL and
419
resets the lexer to its old position."
420
;; remember starting position for FAIL and UNGET-TOKEN functions
421
(push (lexer-pos lexer) (lexer-last-pos lexer))
422
(let ((next-char (next-char lexer)))
425
;; * (Kleene star): match 0 or more times
428
;; +: match 1 or more times
431
;; ?: match 0 or 1 times
435
;; {n}: match exactly n times
436
;; {n,}: match at least n times
437
;; {n,m}: match at least n but not more than m times
438
;; note that anything not matching one of these patterns will
439
;; be interpreted literally - even whitespace isn't allowed
440
(let ((num1 (get-number lexer :no-whitespace-p t)))
442
(let ((next-char (next-char-non-extended lexer)))
445
(let* ((num2 (get-number lexer :no-whitespace-p t))
446
(next-char (next-char-non-extended lexer)))
449
;; this is the case {n,} (NUM2 is NIL) or {n,m}
454
;; this is the case {n}
458
;; no number following left curly brace, so we treat it
459
;; like a normal character
461
;; cannot be a quantifier
465
(defun get-token (lexer)
466
(declare #.*standard-optimize-settings*)
467
"Returns and consumes the next token from the regex string (or NIL)."
468
;; remember starting position for UNGET-TOKEN function
469
(push (lexer-pos lexer)
470
(lexer-last-pos lexer))
471
(let ((next-char (next-char lexer)))
474
;; the easy cases first - the following six characters
475
;; always have a special meaning and get translated
476
;; into tokens immediately
490
;; quantifiers will always be consumend by
491
;; GET-QUANTIFIER, they must not appear here
492
(signal-ppcre-syntax-error*
493
(1- (lexer-pos lexer))
494
"Quantifier '~A' not allowed"
497
;; left brace isn't a special character in it's own
498
;; right but we must check if what follows might
499
;; look like a quantifier
500
(let ((this-pos (lexer-pos lexer))
501
(this-last-pos (lexer-last-pos lexer)))
503
(when (get-quantifier lexer)
504
(signal-ppcre-syntax-error*
506
"Quantifier '~A' not allowed"
507
(subseq (lexer-str lexer)
510
(setf (lexer-pos lexer) this-pos
511
(lexer-last-pos lexer) this-last-pos)
514
;; left bracket always starts a character class
515
(cons (cond ((looking-at-p lexer #\^)
516
(incf (lexer-pos lexer))
517
:inverted-char-class)
520
(collect-char-class lexer)))
522
;; backslash might mean different things so we have
523
;; to peek one char ahead:
524
(let ((next-char (next-char-non-extended lexer)))
527
:modeless-start-anchor)
529
:modeless-end-anchor)
531
:modeless-end-anchor-no-newline)
536
((#\d #\D #\w #\W #\s #\S)
537
;; these will be treated like character classes
538
(map-char-to-special-char-class next-char))
539
((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
541
(let* ((old-pos (decf (lexer-pos lexer)))
542
;; ...so let's get the whole number first
543
(backref-number (get-number lexer)))
544
(declare (type fixnum backref-number))
545
(cond ((and (> backref-number (lexer-reg lexer))
546
(<= 10 backref-number))
547
;; \10 and higher are treated as octal
548
;; character codes if we haven't
549
;; opened that much register groups
551
(setf (lexer-pos lexer) old-pos)
552
;; re-read the number from the old
553
;; position and convert it to its
554
;; corresponding character
555
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
558
;; otherwise this must refer to a
560
(list :back-reference backref-number)))))
562
;; this always means an octal character code
563
;; (at most three digits)
564
(let ((old-pos (decf (lexer-pos lexer))))
565
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
568
;; in all other cases just unescape the
570
(decf (lexer-pos lexer))
571
(unescape-char lexer)))))
573
;; an open parenthesis might mean different things
574
;; depending on what follows...
575
(cond ((looking-at-p lexer #\?)
576
;; this is the case '(?' (and probably more behind)
577
(incf (lexer-pos lexer))
578
;; we have to check for modifiers first
579
;; because a colon might follow
580
(let* ((flags (maybe-parse-flags lexer))
581
(next-char (next-char-non-extended lexer)))
582
;; modifiers are only allowed if a colon
583
;; or a closing parenthesis are following
585
(not (find next-char ":)" :test #'char=)))
586
(signal-ppcre-syntax-error*
587
(car (lexer-last-pos lexer))
588
"Sequence '~A' not recognized"
589
(subseq (lexer-str lexer)
590
(car (lexer-last-pos lexer))
595
(signal-ppcre-syntax-error
596
"End of string following '(?'"))
598
;; an empty group except for the flags
599
;; (if there are any)
610
;; positive look-ahead
613
;; negative look-ahead
614
:open-paren-exclamation)
616
;; non-capturing group - return flags as
618
(values :open-paren-colon flags))
620
;; might be a look-behind assertion, so
621
;; check next character
622
(let ((next-char (next-char-non-extended lexer)))
625
;; positive look-behind
626
:open-paren-less-equal)
628
;; negative look-behind
629
:open-paren-less-exclamation)
631
;; Perl allows "(?<)" and treats
632
;; it like a null string
636
(signal-ppcre-syntax-error
637
"End of string following '(?<'"))
640
(signal-ppcre-syntax-error*
641
(1- (lexer-pos lexer))
642
"Character '~A' may not follow '(?<'"
645
(signal-ppcre-syntax-error*
646
(1- (lexer-pos lexer))
647
"Character '~A' may not follow '(?'"
650
;; if next-char was not #\? (this is within
651
;; the first COND), we've just seen an opening
652
;; parenthesis and leave it like that
655
;; all other characters are their own tokens
657
;; we didn't get a character (this if the "else" branch from
658
;; the first IF), so we don't return a token but NIL
660
(pop (lexer-last-pos lexer))
663
(declaim (inline unget-token))
664
(defun unget-token (lexer)
665
(declare #.*standard-optimize-settings*)
666
"Moves the lexer back to the last position stored in the LAST-POS stack."
667
(if (lexer-last-pos lexer)
668
(setf (lexer-pos lexer)
669
(pop (lexer-last-pos lexer)))
670
(error "No token to unget \(this should not happen)")))
672
(declaim (inline start-of-subexpr-p))
673
(defun start-of-subexpr-p (lexer)
674
(declare #.*standard-optimize-settings*)
675
"Tests whether the next token can start a valid sub-expression, i.e.
676
a stand-alone regex."
677
(let* ((pos (lexer-pos lexer))
678
(next-char (next-char lexer)))
679
(not (or (null next-char)
681
(member (the character next-char)
684
(setf (lexer-pos lexer) pos))))))