Coverage report: /home/jsnell/.sbcl/site/cl-ppcre-1.2.13/scanner.lisp
Kind | Covered | All | % |
expression | 307 | 523 | 58.7 |
branch | 77 | 116 | 66.4 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.26 2005/07/19 23:18:15 edi Exp $
4
;;; Here the scanner for the actual regex as well as utility scanners
5
;;; for the constant start and end strings are created.
7
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
9
;;; Redistribution and use in source and binary forms, with or without
10
;;; modification, are permitted provided that the following conditions
13
;;; * Redistributions of source code must retain the above copyright
14
;;; notice, this list of conditions and the following disclaimer.
16
;;; * Redistributions in binary form must reproduce the above
17
;;; copyright notice, this list of conditions and the following
18
;;; disclaimer in the documentation and/or other materials
19
;;; provided with the distribution.
21
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
22
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
25
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
(in-package #:cl-ppcre)
35
(defmacro bmh-matcher-aux (&key case-insensitive-p)
36
"Auxiliary macro used by CREATE-BMH-MATCHER."
37
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
39
(declare (type fixnum start-pos))
40
(if (or (minusp start-pos)
41
(> (the fixnum (+ start-pos m)) *end-pos*))
43
(loop named bmh-matcher
44
for k of-type fixnum = (+ start-pos m -1)
45
then (+ k (max 1 (aref skip (char-code (schar *string* k)))))
47
do (loop for j of-type fixnum downfrom (1- m)
48
for i of-type fixnum downfrom k
50
(,char-compare (schar *string* i)
52
finally (if (minusp j)
53
(return-from bmh-matcher (1+ i)))))))))
55
(defun create-bmh-matcher (pattern case-insensitive-p)
56
(declare #.*standard-optimize-settings*)
57
"Returns a Boyer-Moore-Horspool matcher which searches the (special)
58
simple-string *STRING* for the first occurence of the substring
59
PATTERN. The search starts at the position START-POS within *STRING*
60
and stops before *END-POS* is reached. Depending on the second
61
argument the search is case-insensitive or not. If the special
62
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
63
instead. (BMH matchers are faster but need much more space.)"
64
;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
66
(unless *use-bmh-matchers*
67
(let ((test (if case-insensitive-p #'char-equal #'char=)))
68
(return-from create-bmh-matcher
70
(declare (type fixnum start-pos))
71
(and (not (minusp start-pos))
77
(let* ((m (length pattern))
78
(skip (make-array *regex-char-code-limit*
81
(declare (type fixnum m))
82
(loop for k of-type fixnum below m
84
do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
85
(aref skip (char-code (char-downcase (schar pattern k)))) (- m k 1))
87
do (setf (aref skip (char-code (schar pattern k))) (- m k 1)))
88
(if case-insensitive-p
89
(bmh-matcher-aux :case-insensitive-p t)
92
(defmacro char-searcher-aux (&key case-insensitive-p)
93
"Auxiliary macro used by CREATE-CHAR-SEARCHER."
94
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
96
(declare (type fixnum start-pos))
97
(and (not (minusp start-pos))
98
(loop for i of-type fixnum from start-pos below *end-pos*
99
thereis (and (,char-compare (schar *string* i) chr) i))))))
101
(defun create-char-searcher (chr case-insensitive-p)
102
(declare #.*standard-optimize-settings*)
103
"Returns a function which searches the (special) simple-string
104
*STRING* for the first occurence of the character CHR. The search
105
starts at the position START-POS within *STRING* and stops before
106
*END-POS* is reached. Depending on the second argument the search is
107
case-insensitive or not."
108
(if case-insensitive-p
109
(char-searcher-aux :case-insensitive-p t)
110
(char-searcher-aux)))
112
(declaim (inline newline-skipper))
114
(defun newline-skipper (start-pos)
115
(declare #.*standard-optimize-settings*)
116
(declare (type fixnum start-pos))
117
"Find the next occurence of a character in *STRING* which is behind
119
;; we can start with (1- START-POS) without testing for (PLUSP
120
;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
121
;; the first iteration
122
(loop for i of-type fixnum from (1- start-pos) below *end-pos*
123
thereis (and (char= (schar *string* i)
127
(defmacro insert-advance-fn (advance-fn)
128
"Creates the actual closure returned by CREATE-SCANNER-AUX by
129
replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
130
ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
132
advance-fn '(advance-fn-definition)
133
'(lambda (string start end)
135
;; initialize a couple of special variables used by the
136
;; matchers (see file specials.lisp)
137
(let* ((*string* string)
140
;; we will search forward for END-STRING if this value
141
;; isn't at least as big as POS (see ADVANCE-FN), so it
142
;; is safe to start to the left of *START-POS*; note
143
;; that this value will _never_ be decremented - this
144
;; is crucial to the scanning process
145
(*end-string-pos* (1- *start-pos*))
146
;; the next five will shadow the variables defined by
147
;; DEFPARAMETER; at this point, we don't know if we'll
148
;; actually use them, though
149
(*repeat-counters* *repeat-counters*)
150
(*last-pos-stores* *last-pos-stores*)
151
(*reg-starts* *reg-starts*)
152
(*regs-maybe-start* *regs-maybe-start*)
153
(*reg-ends* *reg-ends*)
154
;; we might be able to optimize the scanning process by
155
;; (virtually) shifting *START-POS* to the right
156
(scan-start-pos *start-pos*)
157
(starts-with-str (if start-string-test
160
;; we don't need to try further than MAX-END-POS
161
(max-end-pos (- *end-pos* min-len)))
162
(declare (type fixnum scan-start-pos)
163
(type function match-fn))
164
;; definition of ADVANCE-FN will be inserted here by macrology
165
(labels ((advance-fn-definition))
166
(declare (inline advance-fn))
167
(when (plusp rep-num)
168
;; we have at least one REPETITION which needs to count
169
;; the number of repetitions
170
(setq *repeat-counters* (make-array rep-num
172
:element-type 'fixnum)))
173
(when (plusp zero-length-num)
174
;; we have at least one REPETITION which needs to watch
175
;; out for zero-length repetitions
176
(setq *last-pos-stores* (make-array zero-length-num
177
:initial-element nil)))
178
(when (plusp reg-num)
179
;; we have registers in our regular expression
180
(setq *reg-starts* (make-array reg-num :initial-element nil)
181
*regs-maybe-start* (make-array reg-num :initial-element nil)
182
*reg-ends* (make-array reg-num :initial-element nil)))
184
;; the regular expression has a constant end string which
185
;; is anchored at the very end of the target string
186
;; (perhaps modulo a #\Newline)
187
(let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
188
(declare (type fixnum end-test-pos)
189
(type function end-string-test))
190
(unless (setq *end-string-pos* (funcall end-string-test
192
(when (and (= 1 (the fixnum end-anchored-p))
193
(> *end-pos* scan-start-pos)
194
(char= #\Newline (schar *string* (1- *end-pos*))))
195
;; if we didn't find an end string candidate from
196
;; END-TEST-POS and if a #\Newline at the end is
197
;; allowed we try it again from one position to the
199
(setq *end-string-pos* (funcall end-string-test
200
(1- end-test-pos))))))
201
(unless (and *end-string-pos*
202
(<= *start-pos* *end-string-pos*))
203
;; no end string candidate found, so give up
204
(return-from scan nil))
205
(when end-string-offset
206
;; if the offset of the constant end string from the
207
;; left of the regular expression is known we can start
208
;; scanning further to the right; this is similar to
209
;; what we might do in ADVANCE-FN
210
(setq scan-start-pos (max scan-start-pos
211
(- (the fixnum *end-string-pos*)
212
(the fixnum end-string-offset))))))
215
;; we're anchored at the start of the target string,
216
;; so no need to try again after first failure
217
(when (or (/= *start-pos* scan-start-pos)
218
(< max-end-pos *start-pos*))
219
;; if END-STRING-OFFSET has proven that we don't
220
;; need to bother to scan from *START-POS* or if the
221
;; minimal length of the regular expression is
222
;; longer than the target string we give up
223
(return-from scan nil))
224
(when starts-with-str
226
(declare (type fixnum starts-with-len))
227
(cond ((and (case-insensitive-p starts-with)
228
(not (*string*-equal starts-with-str
233
;; the regular expression has a
234
;; case-insensitive constant start string
235
;; and we didn't find it
236
(return-from scan nil))
237
((and (not (case-insensitive-p starts-with))
238
(not (*string*= starts-with-str
240
(+ *start-pos* starts-with-len)
242
;; the regular expression has a
243
;; case-sensitive constant start string
244
;; and we didn't find it
245
(return-from scan nil))
247
(when (and end-string-test
248
(not end-anchored-p))
249
;; the regular expression has a constant end string
250
;; which isn't anchored so we didn't check for it
252
(block end-string-loop
253
;; we temporarily use *END-STRING-POS* as our
254
;; starting position to look for end string
256
(setq *end-string-pos* *start-pos*)
258
(unless (setq *end-string-pos*
259
(funcall (the function end-string-test)
261
;; no end string candidate found, so give up
262
(return-from scan nil))
263
(unless end-string-offset
264
;; end string doesn't have an offset so we
265
;; can start scanning now
266
(return-from end-string-loop))
267
(let ((maybe-start-pos (- (the fixnum *end-string-pos*)
268
(the fixnum end-string-offset))))
269
(cond ((= maybe-start-pos *start-pos*)
270
;; offset of end string into regular
271
;; expression matches start anchor -
273
(return-from end-string-loop))
274
((and (< maybe-start-pos *start-pos*)
275
(< (+ *end-string-pos* end-string-len) *end-pos*))
276
;; no match but maybe we find another
277
;; one to the right - try again
278
(incf *end-string-pos*))
281
(return-from scan nil)))))))
282
;; if we got here we scan exactly once
283
(let ((next-pos (funcall match-fn *start-pos*)))
285
(values (if next-pos *start-pos* nil)
290
(loop for pos = (if starts-with-everything
291
;; don't jump to the next
292
;; #\Newline on the first
295
(advance-fn scan-start-pos))
296
then (advance-fn pos)
297
;; give up if the regular expression can't fit
298
;; into the rest of the target string
300
(<= (the fixnum pos) max-end-pos))
301
do (let ((next-pos (funcall match-fn pos)))
303
(return-from scan (values pos
307
;; not yet found, increment POS
308
#-cormanlisp (incf (the fixnum pos))
309
#+cormanlisp (incf pos)))))))))
312
(defun create-scanner-aux (match-fn
324
(declare #.*standard-optimize-settings*)
325
(declare (type fixnum min-len zero-length-num rep-num reg-num))
326
"Auxiliary function to create and return a scanner \(which is
327
actually a closure). Used by CREATE-SCANNER."
328
(let ((starts-with-len (if (typep starts-with 'str)
330
(starts-with-everything (typep starts-with 'everything)))
332
;; this COND statement dispatches on the different versions we
333
;; have for ADVANCE-FN and creates different closures for each;
334
;; note that you see only the bodies of ADVANCE-FN below - the
335
;; actual scanner is defined in INSERT-ADVANCE-FN above; (we
336
;; could have done this with closures instead of macrology but
337
;; would have consed a lot more)
338
((and start-string-test end-string-test end-string-offset)
339
;; we know that the regular expression has constant start and
340
;; end strings and we know the end string's offset (from the
344
(declare (type fixnum end-string-offset starts-with-len)
345
(type function start-string-test end-string-test))
347
(unless (setq pos (funcall start-string-test pos))
348
;; give up completely if we can't find a start string
350
(return-from scan nil))
352
;; from here we know that POS is a FIXNUM
353
(declare (type fixnum pos))
354
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
355
;; if we already found an end string candidate the
356
;; position of which matches the start string
357
;; candidate we're done
358
(return-from advance-fn pos))
359
(let ((try-pos (+ pos starts-with-len)))
360
;; otherwise try (again) to find an end string
361
;; candidate which starts behind the start string
364
(unless (setq *end-string-pos*
365
(funcall end-string-test try-pos))
366
;; no end string candidate found, so give up
367
(return-from scan nil))
368
;; NEW-POS is where we should start scanning
369
;; according to the end string candidate
370
(let ((new-pos (- (the fixnum *end-string-pos*)
372
(declare (type fixnum new-pos *end-string-pos*))
373
(cond ((= new-pos pos)
374
;; if POS and NEW-POS are equal then the
375
;; two candidates agree so we're fine
376
(return-from advance-fn pos))
378
;; if NEW-POS is further to the right we
379
;; advance POS and try again, i.e. we go
380
;; back to the start of the outer LOOP
382
;; this means "return from inner LOOP"
385
;; otherwise NEW-POS is smaller than POS,
386
;; so we have to redo the inner LOOP to
387
;; find another end string candidate
388
;; further to the right
389
(setq try-pos (1+ *end-string-pos*))))))))))))
390
((and starts-with-everything end-string-test end-string-offset)
391
;; we know that the regular expression starts with ".*" (which
392
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
393
;; with a constant end string and we know the end string's
394
;; offset (from the left)
397
(declare (type fixnum end-string-offset)
398
(type function end-string-test))
400
(unless (setq pos (newline-skipper pos))
401
;; if we can't find a #\Newline we give up immediately
402
(return-from scan nil))
404
;; from here we know that POS is a FIXNUM
405
(declare (type fixnum pos))
406
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
407
;; if we already found an end string candidate the
408
;; position of which matches the place behind the
409
;; #\Newline we're done
410
(return-from advance-fn pos))
412
;; otherwise try (again) to find an end string
413
;; candidate which starts behind the #\Newline
415
(unless (setq *end-string-pos*
416
(funcall end-string-test try-pos))
417
;; no end string candidate found, so we give up
418
(return-from scan nil))
419
;; NEW-POS is where we should start scanning
420
;; according to the end string candidate
421
(let ((new-pos (- (the fixnum *end-string-pos*)
423
(declare (type fixnum new-pos *end-string-pos*))
424
(cond ((= new-pos pos)
425
;; if POS and NEW-POS are equal then the
426
;; the end string candidate agrees with
427
;; the #\Newline so we're fine
428
(return-from advance-fn pos))
430
;; if NEW-POS is further to the right we
431
;; advance POS and try again, i.e. we go
432
;; back to the start of the outer LOOP
434
;; this means "return from inner LOOP"
437
;; otherwise NEW-POS is smaller than POS,
438
;; so we have to redo the inner LOOP to
439
;; find another end string candidate
440
;; further to the right
441
(setq try-pos (1+ *end-string-pos*))))))))))))
442
((and start-string-test end-string-test)
443
;; we know that the regular expression has constant start and
444
;; end strings; similar to the first case but we only need to
445
;; check for the end string, it doesn't provide enough
446
;; information to advance POS
449
(declare (type function start-string-test end-string-test))
450
(unless (setq pos (funcall start-string-test pos))
451
(return-from scan nil))
452
(if (<= (the fixnum pos)
453
(the fixnum *end-string-pos*))
454
(return-from advance-fn pos))
455
(unless (setq *end-string-pos* (funcall end-string-test pos))
456
(return-from scan nil))
458
((and starts-with-everything end-string-test)
459
;; we know that the regular expression starts with ".*" (which
460
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
461
;; with a constant end string; similar to the second case but we
462
;; only need to check for the end string, it doesn't provide
463
;; enough information to advance POS
466
(declare (type function end-string-test))
467
(unless (setq pos (newline-skipper pos))
468
(return-from scan nil))
469
(if (<= (the fixnum pos)
470
(the fixnum *end-string-pos*))
471
(return-from advance-fn pos))
472
(unless (setq *end-string-pos* (funcall end-string-test pos))
473
(return-from scan nil))
476
;; just check for constant start string candidate
479
(declare (type function start-string-test))
480
(unless (setq pos (funcall start-string-test pos))
481
(return-from scan nil))
483
(starts-with-everything
484
;; just advance POS with NEWLINE-SKIPPER
487
(unless (setq pos (newline-skipper pos))
488
(return-from scan nil))
491
;; just check for the next end string candidate if POS has
492
;; advanced beyond the last one
495
(declare (type function end-string-test))
496
(if (<= (the fixnum pos)
497
(the fixnum *end-string-pos*))
498
(return-from advance-fn pos))
499
(unless (setq *end-string-pos* (funcall end-string-test pos))
500
(return-from scan nil))
503
;; not enough optimization information about the regular
504
;; expression to optimize so we just return POS