Coverage report: /home/jsnell/.sbcl/site/cl-ppcre-1.2.13/optimize.lisp
Kind | Covered | All | % |
expression | 464 | 499 | 93.0 |
branch | 59 | 72 | 81.9 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.26 2005/04/13 15:35:57 edi Exp $
4
;;; This file contains optimizations which can be applied to converted
7
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
9
;;; Redistribution and use in source and binary forms, with or without
10
;;; modification, are permitted provided that the following conditions
13
;;; * Redistributions of source code must retain the above copyright
14
;;; notice, this list of conditions and the following disclaimer.
16
;;; * Redistributions in binary form must reproduce the above
17
;;; copyright notice, this list of conditions and the following
18
;;; disclaimer in the documentation and/or other materials
19
;;; provided with the distribution.
21
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
22
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
25
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
(in-package #:cl-ppcre)
35
(defgeneric flatten (regex)
36
(declare #.*standard-optimize-settings*)
37
(:documentation "Merges adjacent sequences and alternations, i.e. it
38
transforms #<SEQ #<STR \"a\"> #<SEQ #<STR \"b\"> #<STR \"c\">>> to
39
#<SEQ #<STR \"a\"> #<STR \"b\"> #<STR \"c\">>. This is a destructive
40
operation on REGEX."))
42
(defmethod flatten ((seq seq))
43
;; this looks more complicated than it is because we modify SEQ in
44
;; place to avoid unnecessary consing
45
(let ((elements-rest (elements seq)))
49
(let ((flattened-element (flatten (car elements-rest)))
50
(next-elements-rest (cdr elements-rest)))
51
(cond ((typep flattened-element 'seq)
52
;; FLATTENED-ELEMENT is a SEQ object, so we "splice"
53
;; it into out list of elements
54
(let ((flattened-element-elements
55
(elements flattened-element)))
56
(setf (car elements-rest)
57
(car flattened-element-elements)
59
(nconc (cdr flattened-element-elements)
60
(cdr elements-rest)))))
62
;; otherwise we just replace the current element with
63
;; its flattened counterpart
64
(setf (car elements-rest) flattened-element)))
65
(setq elements-rest next-elements-rest))))
66
(let ((elements (elements seq)))
67
(cond ((cadr elements)
71
(t (make-instance 'void)))))
73
(defmethod flatten ((alternation alternation))
74
;; same algorithm as above
75
(let ((choices-rest (choices alternation)))
79
(let ((flattened-choice (flatten (car choices-rest)))
80
(next-choices-rest (cdr choices-rest)))
81
(cond ((typep flattened-choice 'alternation)
82
(let ((flattened-choice-choices
83
(choices flattened-choice)))
84
(setf (car choices-rest)
85
(car flattened-choice-choices)
87
(nconc (cdr flattened-choice-choices)
88
(cdr choices-rest)))))
90
(setf (car choices-rest) flattened-choice)))
91
(setq choices-rest next-choices-rest))))
92
(let ((choices (choices alternation)))
97
(t (signal-ppcre-syntax-error
98
"Encountered alternation without choices.")))))
100
(defmethod flatten ((branch branch))
101
(with-slots ((test test)
102
(then-regex then-regex)
103
(else-regex else-regex))
109
then-regex (flatten then-regex)
110
else-regex (flatten else-regex))
113
(defmethod flatten ((regex regex))
115
((or repetition register lookahead lookbehind standalone)
116
;; if REGEX contains exactly one inner REGEX object flatten it
118
(flatten (regex regex)))
121
;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
122
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY)
126
(defgeneric gather-strings (regex)
127
(declare #.*standard-optimize-settings*)
128
(:documentation "Collects adjacent strings or characters into one
129
string provided they have the same case mode. This is a destructive
130
operation on REGEX."))
132
(defmethod gather-strings ((seq seq))
133
;; note that GATHER-STRINGS is to be applied after FLATTEN, i.e. it
134
;; expects SEQ to be flattened already; in particular, SEQ cannot be
135
;; empty and cannot contain embedded SEQ objects
136
(let* ((start-point (cons nil (elements seq)))
137
(curr-point start-point)
143
(declare (type fixnum collector-length))
145
(let ((elements-rest (cdr curr-point)))
146
(unless elements-rest
148
(let* ((element (car elements-rest))
149
(case-mode (case-mode element old-case-mode)))
150
(cond ((and case-mode
151
(eq case-mode old-case-mode))
152
;; if ELEMENT is a STR and we have collected a STR of
153
;; the same case mode in the last iteration we
154
;; concatenate ELEMENT onto COLLECTOR and remember the
155
;; value of its SKIP slot
156
(let ((old-collector-length collector-length))
157
(unless (and (adjustable-array-p collector)
158
(array-has-fill-pointer-p collector))
160
(make-array collector-length
161
:initial-contents collector
162
:element-type 'character
165
collector-start nil))
166
(adjust-array collector
167
(incf collector-length (len element))
169
(setf (subseq collector
170
old-collector-length)
172
;; it suffices to remember the last SKIP slot
173
;; because due to the way MAYBE-ACCUMULATE
174
;; works adjacent STR objects have the same
176
skip (skip element)))
177
(setf (cdr curr-point) (cdr elements-rest)))
179
(let ((collected-string
180
(cond (collector-start
183
;; if we have collected something already
184
;; we convert it into a STR
193
;; if ELEMENT is a string with a different case
194
;; mode than the last one we have either just
195
;; converted COLLECTOR into a STR or COLLECTOR
196
;; is still empty; in both cases we can now
197
;; begin to fill it anew
198
(setq collector (str element)
199
collector-start element
200
;; and we remember the SKIP value as above
202
collector-length (len element))
203
(cond (collected-string
204
(setf (car elements-rest)
209
(setf (cdr curr-point)
210
(cdr elements-rest)))))
212
;; otherwise this is not a STR so we apply
213
;; GATHER-STRINGS to it and collect it directly
215
(cond (collected-string
216
(setf (car elements-rest)
221
(cons (gather-strings element)
226
(setf (car elements-rest)
227
(gather-strings element)
230
;; we also have to empty COLLECTOR here in case
231
;; it was still filled from the last iteration
233
collector-start nil))))))
234
(setq old-case-mode case-mode))))
236
(setf (cdr curr-point)
245
(setf (elements seq) (cdr start-point))
248
(defmethod gather-strings ((alternation alternation))
249
;; loop ON the choices of ALTERNATION so we can modify them directly
250
(loop for choices-rest on (choices alternation)
252
do (setf (car choices-rest)
253
(gather-strings (car choices-rest))))
256
(defmethod gather-strings ((branch branch))
257
(with-slots ((test test)
258
(then-regex then-regex)
259
(else-regex else-regex))
264
(gather-strings test))
265
then-regex (gather-strings then-regex)
266
else-regex (gather-strings else-regex))
269
(defmethod gather-strings ((regex regex))
271
((or repetition register lookahead lookbehind standalone)
272
;; if REGEX contains exactly one inner REGEX object apply
273
;; GATHER-STRINGS to it
275
(gather-strings (regex regex)))
278
;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
279
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY)
283
;; Note that START-ANCHORED-P will be called after FLATTEN and GATHER-STRINGS.
285
(defgeneric start-anchored-p (regex &optional in-seq-p)
286
(declare #.*standard-optimize-settings*)
287
(:documentation "Returns T if REGEX starts with a \"real\" start
288
anchor, i.e. one that's not in multi-line mode, NIL otherwise. If
289
IN-SEQ-P is true the function will return :ZERO-LENGTH if REGEX is a
290
zero-length assertion."))
292
(defmethod start-anchored-p ((seq seq) &optional in-seq-p)
293
(declare (ignore in-seq-p))
294
;; note that START-ANCHORED-P is to be applied after FLATTEN and
295
;; GATHER-STRINGS, i.e. SEQ cannot be empty and cannot contain
296
;; embedded SEQ objects
297
(loop for element in (elements seq)
298
for anchored-p = (start-anchored-p element t)
299
;; skip zero-length elements because they won't affect the
300
;; "anchoredness" of the sequence
301
while (eq anchored-p :zero-length)
302
finally (return (and anchored-p (not (eq anchored-p :zero-length))))))
304
(defmethod start-anchored-p ((alternation alternation) &optional in-seq-p)
305
(declare (ignore in-seq-p))
306
;; clearly an alternation can only be start-anchored if all of its
307
;; choices are start-anchored
308
(loop for choice in (choices alternation)
309
always (start-anchored-p choice)))
311
(defmethod start-anchored-p ((branch branch) &optional in-seq-p)
312
(declare (ignore in-seq-p))
313
(and (start-anchored-p (then-regex branch))
314
(start-anchored-p (else-regex branch))))
316
(defmethod start-anchored-p ((repetition repetition) &optional in-seq-p)
317
(declare (ignore in-seq-p))
318
;; well, this wouldn't make much sense, but anyway...
319
(and (plusp (minimum repetition))
320
(start-anchored-p (regex repetition))))
322
(defmethod start-anchored-p ((register register) &optional in-seq-p)
323
(declare (ignore in-seq-p))
324
(start-anchored-p (regex register)))
326
(defmethod start-anchored-p ((standalone standalone) &optional in-seq-p)
327
(declare (ignore in-seq-p))
328
(start-anchored-p (regex standalone)))
330
(defmethod start-anchored-p ((anchor anchor) &optional in-seq-p)
331
(declare (ignore in-seq-p))
333
(not (multi-line-p anchor))))
335
(defmethod start-anchored-p ((regex regex) &optional in-seq-p)
337
((or lookahead lookbehind word-boundary void)
338
;; zero-length assertions
349
;; BACK-REFERENCE, CHAR-CLASS, EVERYTHING, and STR
352
;; Note that END-STRING-AUX will be called after FLATTEN and GATHER-STRINGS.
354
(defgeneric end-string-aux (regex &optional old-case-insensitive-p)
355
(declare #.*standard-optimize-settings*)
356
(:documentation "Returns the constant string (if it exists) REGEX
357
ends with wrapped into a STR object, otherwise NIL.
358
OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR
359
collected or :VOID if no STR has been collected yet. (This is a helper
360
function called by END-STRIN.)"))
362
(defmethod end-string-aux ((str str)
363
&optional (old-case-insensitive-p :void))
364
(declare (special last-str))
365
(cond ((and (not (skip str)) ; avoid constituents of STARTS-WITH
366
;; only use STR if nothing has been collected yet or if
367
;; the collected string has the same value for
368
;; CASE-INSENSITIVE-P
369
(or (eq old-case-insensitive-p :void)
370
(eq (case-insensitive-p str) old-case-insensitive-p)))
372
;; set the SKIP property of this STR
377
(defmethod end-string-aux ((seq seq)
378
&optional (old-case-insensitive-p :void))
379
(declare (special continuep))
380
(let (case-insensitive-p
383
(concatenated-length 0))
384
(declare (type fixnum concatenated-length))
385
(loop for element in (reverse (elements seq))
386
;; remember the case-(in)sensitivity of the last relevant
388
for loop-old-case-insensitive-p = old-case-insensitive-p
390
loop-old-case-insensitive-p
391
(case-insensitive-p element-end))
392
;; the end-string of the current element
393
for element-end = (end-string-aux element
394
loop-old-case-insensitive-p)
395
;; whether we encountered a zero-length element
396
for skip = (if element-end
397
(zerop (len element-end))
399
;; set CONTINUEP to NIL if we have to stop collecting to
400
;; alert END-STRING-AUX methods on enclosing SEQ objects
402
do (setq continuep nil)
403
;; end loop if we neither got a STR nor a zero-length
406
;; only collect if not zero-length
408
do (cond (concatenated-string
409
(when concatenated-start
410
(setf concatenated-string
411
(make-array concatenated-length
412
:initial-contents (reverse (str concatenated-start))
413
:element-type 'character
416
concatenated-start nil))
417
(let ((len (len element-end))
418
(str (str element-end)))
419
(declare (type fixnum len))
420
(incf concatenated-length len)
421
(loop for i of-type fixnum downfrom (1- len) to 0
422
do (vector-push-extend (char str i)
423
concatenated-string))))
425
(setf concatenated-string
432
(case-insensitive-p element-end))))
433
;; stop collecting if END-STRING-AUX on inner SEQ has said so
435
(cond ((zerop concatenated-length)
436
;; don't bother to return zero-length strings
442
:str (nreverse concatenated-string)
443
:case-insensitive-p case-insensitive-p)))))
445
(defmethod end-string-aux ((register register)
446
&optional (old-case-insensitive-p :void))
447
(end-string-aux (regex register) old-case-insensitive-p))
449
(defmethod end-string-aux ((standalone standalone)
450
&optional (old-case-insensitive-p :void))
451
(end-string-aux (regex standalone) old-case-insensitive-p))
453
(defmethod end-string-aux ((regex regex)
454
&optional (old-case-insensitive-p :void))
455
(declare (special last-str end-anchored-p continuep))
457
((or anchor lookahead lookbehind word-boundary void)
458
;; a zero-length REGEX object - for the sake of END-STRING-AUX
459
;; this is a zero-length string
460
(when (and (typep regex 'anchor)
462
(or (no-newline-p regex)
463
(not (multi-line-p regex)))
464
(eq old-case-insensitive-p :void))
465
;; if this is a "real" end-anchor and we haven't collected
466
;; anything so far we can set END-ANCHORED-P (where 1 or 0
467
;; indicate whether we accept a #\Newline at the end or not)
468
(setq end-anchored-p (if (no-newline-p regex) 0 1)))
471
:case-insensitive-p :void))
473
;; (ALTERNATION, BACK-REFERENCE, BRANCH, CHAR-CLASS, EVERYTHING,
474
;; REPETITION, FILTER)
477
(defgeneric end-string (regex)
478
(declare #.*standard-optimize-settings*)
479
(:documentation "Returns the constant string (if it exists) REGEX ends with wrapped
480
into a STR object, otherwise NIL."))
482
(defmethod end-string ((regex regex))
483
(declare (special end-string-offset))
484
(declare #.*standard-optimize-settings*)
485
;; LAST-STR points to the last STR object (seen from the end) that's
486
;; part of END-STRING; CONTINUEP is set to T if we stop collecting
487
;; in the middle of a SEQ
490
(declare (special continuep last-str))
492
(end-string-aux regex)
494
;; if we've found something set the START-OF-END-STRING-P of
495
;; the leftmost STR collected accordingly and remember the
496
;; OFFSET of this STR (in a special variable provided by the
497
;; caller of this function)
498
(setf (start-of-end-string-p last-str) t
499
end-string-offset (offset last-str))))))
501
(defgeneric compute-min-rest (regex current-min-rest)
502
(declare #.*standard-optimize-settings*)
503
(:documentation "Returns the minimal length of REGEX plus
504
CURRENT-MIN-REST. This is similar to REGEX-MIN-LENGTH except that it
505
recurses down into REGEX and sets the MIN-REST slots of REPETITION
508
(defmethod compute-min-rest ((seq seq) current-min-rest)
509
(loop for element in (reverse (elements seq))
510
for last-min-rest = current-min-rest then this-min-rest
511
for this-min-rest = (compute-min-rest element last-min-rest)
512
finally (return this-min-rest)))
514
(defmethod compute-min-rest ((alternation alternation) current-min-rest)
515
(loop for choice in (choices alternation)
516
minimize (compute-min-rest choice current-min-rest)))
518
(defmethod compute-min-rest ((branch branch) current-min-rest)
519
(min (compute-min-rest (then-regex branch) current-min-rest)
520
(compute-min-rest (else-regex branch) current-min-rest)))
522
(defmethod compute-min-rest ((str str) current-min-rest)
523
(+ current-min-rest (len str)))
525
(defmethod compute-min-rest ((filter filter) current-min-rest)
526
(+ current-min-rest (or (len filter) 0)))
528
(defmethod compute-min-rest ((repetition repetition) current-min-rest)
529
(setf (min-rest repetition) current-min-rest)
530
(compute-min-rest (regex repetition) current-min-rest)
531
(+ current-min-rest (* (minimum repetition) (min-len repetition))))
533
(defmethod compute-min-rest ((register register) current-min-rest)
534
(compute-min-rest (regex register) current-min-rest))
536
(defmethod compute-min-rest ((standalone standalone) current-min-rest)
537
(declare (ignore current-min-rest))
538
(compute-min-rest (regex standalone) 0))
540
(defmethod compute-min-rest ((lookahead lookahead) current-min-rest)
541
(compute-min-rest (regex lookahead) 0)
544
(defmethod compute-min-rest ((lookbehind lookbehind) current-min-rest)
545
(compute-min-rest (regex lookbehind) (+ current-min-rest (len lookbehind)))
548
(defmethod compute-min-rest ((regex regex) current-min-rest)
550
((or char-class everything)
551
(1+ current-min-rest))
553
;; zero min-len and no embedded regexes (ANCHOR,
554
;; BACK-REFERENCE, VOID, and WORD-BOUNDARY)