Coverage report: /home/jsnell/.sbcl/site/cl-ppcre-1.2.13/regex-class.lisp
Kind | Covered | All | % |
expression | 371 | 482 | 77.0 |
branch | 26 | 36 | 72.2 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class.lisp,v 1.26 2005/06/10 10:23:42 edi Exp $
4
;;; This file defines the REGEX class and some utility methods for
5
;;; this class. REGEX objects are used to represent the (transformed)
6
;;; parse trees internally
8
;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
10
;;; Redistribution and use in source and binary forms, with or without
11
;;; modification, are permitted provided that the following conditions
14
;;; * Redistributions of source code must retain the above copyright
15
;;; notice, this list of conditions and the following disclaimer.
17
;;; * Redistributions in binary form must reproduce the above
18
;;; copyright notice, this list of conditions and the following
19
;;; disclaimer in the documentation and/or other materials
20
;;; provided with the distribution.
22
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
23
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
26
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
28
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
30
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
(in-package #:cl-ppcre)
36
;; Genera need the eval-when, here, or the types created by the class
37
;; definitions aren't seen by the typep calls later in the file.
38
(eval-when (:compile-toplevel :load-toplevel :execute)
40
(declare #.*standard-optimize-settings*)
43
(:documentation "The REGEX base class. All other classes inherit
48
((elements :initarg :elements
51
:documentation "A list of REGEX objects."))
52
(:documentation "SEQ objects represents sequences of
53
regexes. (Like \"ab\" is the sequence of \"a\" and \"b\".)"))
55
(defclass alternation (regex)
56
((choices :initarg :choices
59
:documentation "A list of REGEX objects"))
60
(:documentation "ALTERNATION objects represent alternations of
61
regexes. (Like \"a|b\" ist the alternation of \"a\" or \"b\".)"))
63
(defclass lookahead (regex)
64
((regex :initarg :regex
66
:documentation "The REGEX object we're checking.")
67
(positivep :initarg :positivep
69
:documentation "Whether this assertion is positive."))
70
(:documentation "LOOKAHEAD objects represent look-ahead assertions."))
72
(defclass lookbehind (regex)
73
((regex :initarg :regex
75
:documentation "The REGEX object we're checking.")
76
(positivep :initarg :positivep
78
:documentation "Whether this assertion is positive.")
82
:documentation "The (fixed) length of the enclosed regex."))
83
(:documentation "LOOKBEHIND objects represent look-behind assertions."))
85
(defclass repetition (regex)
86
((regex :initarg :regex
88
:documentation "The REGEX that's repeated.")
89
(greedyp :initarg :greedyp
91
:documentation "Whether the repetition is greedy.")
92
(minimum :initarg :minimum
95
:documentation "The minimal number of repetitions.")
96
(maximum :initarg :maximum
98
:documentation "The maximal number of repetitions.
99
Can be NIL for unbounded.")
100
(min-len :initarg :min-len
102
:documentation "The minimal length of the enclosed regex.")
105
:documentation "The length of the enclosed regex. NIL
107
(min-rest :initform 0
110
:documentation "The minimal number of characters which must
111
appear after this repetition.")
112
(contains-register-p :initarg :contains-register-p
113
:reader contains-register-p
114
:documentation "If the regex contains a register."))
115
(:documentation "REPETITION objects represent repetitions of regexes."))
117
(defclass register (regex)
118
((regex :initarg :regex
120
:documentation "The inner regex.")
124
:documentation "The number of this register, starting from 0.
125
This is the index into *REGS-START* and *REGS-END*."))
126
(:documentation "REGISTER objects represent register groups."))
128
(defclass standalone (regex)
129
((regex :initarg :regex
131
:documentation "The inner regex."))
132
(:documentation "A standalone regular expression."))
134
(defclass back-reference (regex)
138
:documentation "The number of the register this
139
reference refers to.")
140
(case-insensitive-p :initarg :case-insensitive-p
141
:reader case-insensitive-p
142
:documentation "Whether we check
143
case-insensitively."))
144
(:documentation "BACK-REFERENCE objects represent backreferences."))
146
(defclass char-class (regex)
147
((hash :initarg :hash
149
:type (or hash-table null)
150
:documentation "A hash table the keys of which are the
151
characters; the values are always T.")
152
(case-insensitive-p :initarg :case-insensitive-p
153
:reader case-insensitive-p
154
:documentation "If the char class
156
(invertedp :initarg :invertedp
158
:documentation "Whether we mean the inverse of
160
(word-char-class-p :initarg :word-char-class-p
161
:reader word-char-class-p
162
:documentation "Whether this CHAR CLASS
163
represents the special class WORD-CHAR-CLASS."))
164
(:documentation "CHAR-CLASS objects represent character classes."))
166
(defclass str (regex)
170
:documentation "The actual string.")
174
:documentation "The length of the string.")
175
(case-insensitive-p :initarg :case-insensitive-p
176
:reader case-insensitive-p
177
:documentation "If we match case-insensitively.")
178
(offset :initform nil
180
:documentation "Offset from the left of the whole
181
parse tree. The first regex has offset 0. NIL if unknown, i.e. behind
182
a variable-length regex.")
186
:documentation "If we can avoid testing for this
187
string because the SCAN function has done this already.")
188
(start-of-end-string-p :initform nil
189
:accessor start-of-end-string-p
190
:documentation "If this is the unique
191
STR which starts END-STRING (a slot of MATCHER)."))
192
(:documentation "STR objects represent string."))
194
(defclass anchor (regex)
195
((startp :initarg :startp
197
:documentation "Whether this is a \"start anchor\".")
198
(multi-line-p :initarg :multi-line-p
200
:documentation "Whether we're in multi-line mode,
201
i.e. whether each #\\Newline is surrounded by anchors.")
202
(no-newline-p :initarg :no-newline-p
204
:documentation "Whether we ignore #\\Newline at the end."))
205
(:documentation "ANCHOR objects represent anchors like \"^\" or \"$\"."))
207
(defclass everything (regex)
208
((single-line-p :initarg :single-line-p
209
:reader single-line-p
210
:documentation "Whether we're in single-line mode,
211
i.e. whether we also match #\\Newline."))
212
(:documentation "EVERYTHING objects represent regexes matching
213
\"everything\", i.e. dots."))
215
(defclass word-boundary (regex)
216
((negatedp :initarg :negatedp
218
:documentation "Whether we mean the opposite,
219
i.e. no word-boundary."))
220
(:documentation "WORD-BOUNDARY objects represent word-boundary assertions."))
222
(defclass branch (regex)
223
((test :initarg :test
225
:documentation "The test of this branch, one of
226
LOOKAHEAD, LOOKBEHIND, or a number.")
227
(then-regex :initarg :then-regex
229
:documentation "The regex that's to be matched if the
231
(else-regex :initarg :else-regex
232
:initform (make-instance 'void)
234
:documentation "The regex that's to be matched if the
236
(:documentation "BRANCH objects represent Perl's conditional regular
239
(defclass filter (regex)
242
:type (or function symbol)
243
:documentation "The user-defined function.")
246
:documentation "The fixed length of this filter or NIL."))
247
(:documentation "FILTER objects represent arbitrary functions
248
defined by the user."))
250
(defclass void (regex)
252
(:documentation "VOID objects represent empty regular expressions."))))
254
(defmethod initialize-instance :after ((char-class char-class) &rest init-args)
255
(declare #.*standard-optimize-settings*)
256
"Make large hash tables smaller, if possible."
257
(let ((hash (getf init-args :hash)))
259
(> *regex-char-code-limit* 256)
260
(> (hash-table-count hash)
261
(/ *regex-char-code-limit* 2)))
262
(setf (slot-value char-class 'hash)
263
(merge-inverted-hash (make-hash-table)
265
(slot-value char-class 'invertedp)
266
(not (slot-value char-class 'invertedp))))))
268
;;; The following four methods allow a VOID object to behave like a
269
;;; zero-length STR object (only readers needed)
271
(defmethod initialize-instance :after ((str str) &rest init-args)
272
(declare #.*standard-optimize-settings*)
273
(declare (ignore init-args))
274
"Automatically computes the length of a STR after initialization."
275
(let ((str-slot (slot-value str 'str)))
276
(unless (typep str-slot 'simple-string)
277
(setf (slot-value str 'str) (coerce str-slot 'simple-string))))
278
(setf (len str) (length (str str))))
280
(defmethod len ((void void))
281
(declare #.*standard-optimize-settings*)
284
(defmethod str ((void void))
285
(declare #.*standard-optimize-settings*)
288
(defmethod skip ((void void))
289
(declare #.*standard-optimize-settings*)
292
(defmethod start-of-end-string-p ((void void))
293
(declare #.*standard-optimize-settings*)
296
(defgeneric case-mode (regex old-case-mode)
297
(declare #.*standard-optimize-settings*)
298
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
299
Returns a keyword denoting the case-(in)sensitivity of a STR or its
300
second argument if the STR has length 0. Returns NIL for REGEX objects
301
which are not of type STR."))
303
(defmethod case-mode ((str str) old-case-mode)
304
(cond ((zerop (len str))
306
((case-insensitive-p str)
311
(defmethod case-mode ((regex regex) old-case-mode)
312
(declare (ignore old-case-mode))
315
(defgeneric copy-regex (regex)
316
(declare #.*standard-optimize-settings*)
317
(:documentation "Implements a deep copy of a REGEX object."))
319
(defmethod copy-regex ((anchor anchor))
320
(make-instance 'anchor
321
:startp (startp anchor)
322
:multi-line-p (multi-line-p anchor)
323
:no-newline-p (no-newline-p anchor)))
325
(defmethod copy-regex ((everything everything))
326
(make-instance 'everything
327
:single-line-p (single-line-p everything)))
329
(defmethod copy-regex ((word-boundary word-boundary))
330
(make-instance 'word-boundary
331
:negatedp (negatedp word-boundary)))
333
(defmethod copy-regex ((void void))
334
(make-instance 'void))
336
(defmethod copy-regex ((lookahead lookahead))
337
(make-instance 'lookahead
338
:regex (copy-regex (regex lookahead))
339
:positivep (positivep lookahead)))
341
(defmethod copy-regex ((seq seq))
343
:elements (mapcar #'copy-regex (elements seq))))
345
(defmethod copy-regex ((alternation alternation))
346
(make-instance 'alternation
347
:choices (mapcar #'copy-regex (choices alternation))))
349
(defmethod copy-regex ((branch branch))
350
(with-slots ((test test))
352
(make-instance 'branch
353
:test (if (typep test 'regex)
356
:then-regex (copy-regex (then-regex branch))
357
:else-regex (copy-regex (else-regex branch)))))
359
(defmethod copy-regex ((lookbehind lookbehind))
360
(make-instance 'lookbehind
361
:regex (copy-regex (regex lookbehind))
362
:positivep (positivep lookbehind)
363
:len (len lookbehind)))
365
(defmethod copy-regex ((repetition repetition))
366
(make-instance 'repetition
367
:regex (copy-regex (regex repetition))
368
:greedyp (greedyp repetition)
369
:minimum (minimum repetition)
370
:maximum (maximum repetition)
371
:min-len (min-len repetition)
372
:len (len repetition)
373
:contains-register-p (contains-register-p repetition)))
375
(defmethod copy-regex ((register register))
376
(make-instance 'register
377
:regex (copy-regex (regex register))
378
:num (num register)))
380
(defmethod copy-regex ((standalone standalone))
381
(make-instance 'standalone
382
:regex (copy-regex (regex standalone))))
384
(defmethod copy-regex ((back-reference back-reference))
385
(make-instance 'back-reference
386
:num (num back-reference)
387
:case-insensitive-p (case-insensitive-p back-reference)))
389
(defmethod copy-regex ((char-class char-class))
390
(make-instance 'char-class
391
:hash (hash char-class)
392
:case-insensitive-p (case-insensitive-p char-class)
393
:invertedp (invertedp char-class)
394
:word-char-class-p (word-char-class-p char-class)))
396
(defmethod copy-regex ((str str))
399
:case-insensitive-p (case-insensitive-p str)))
401
(defmethod copy-regex ((filter filter))
402
(make-instance 'filter
406
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
407
;;; wrapped into one function. Maybe in the next release...
409
;;; Further note that this function is used by CONVERT to factor out
410
;;; complicated repetitions, i.e. cases like
411
;;; (a)* -> (?:a*(a))?
412
;;; This won't work for, say,
413
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
414
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
416
(defgeneric remove-registers (regex)
417
(declare #.*standard-optimize-settings*)
418
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
419
optionally removes embedded REGISTER objects if possible and if the
420
special variable REMOVE-REGISTERS-P is true."))
422
(defmethod remove-registers ((register register))
423
(declare (special remove-registers-p reg-seen))
424
(cond (remove-registers-p
425
(remove-registers (regex register)))
427
;; mark REG-SEEN as true so enclosing REPETITION objects
428
;; (see method below) know if they contain a register or not
430
(copy-regex register))))
432
(defmethod remove-registers ((repetition repetition))
434
(inner-regex (remove-registers (regex repetition))))
435
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
436
;; (REGEX REPETITION) contains a REGISTER
437
(declare (special reg-seen))
438
(make-instance 'repetition
440
:greedyp (greedyp repetition)
441
:minimum (minimum repetition)
442
:maximum (maximum repetition)
443
:min-len (min-len repetition)
444
:len (len repetition)
445
:contains-register-p reg-seen)))
447
(defmethod remove-registers ((standalone standalone))
448
(make-instance 'standalone
449
:regex (remove-registers (regex standalone))))
451
(defmethod remove-registers ((lookahead lookahead))
452
(make-instance 'lookahead
453
:regex (remove-registers (regex lookahead))
454
:positivep (positivep lookahead)))
456
(defmethod remove-registers ((lookbehind lookbehind))
457
(make-instance 'lookbehind
458
:regex (remove-registers (regex lookbehind))
459
:positivep (positivep lookbehind)
460
:len (len lookbehind)))
462
(defmethod remove-registers ((branch branch))
463
(with-slots ((test test))
465
(make-instance 'branch
466
:test (if (typep test 'regex)
467
(remove-registers test)
469
:then-regex (remove-registers (then-regex branch))
470
:else-regex (remove-registers (else-regex branch)))))
472
(defmethod remove-registers ((alternation alternation))
473
(declare (special remove-registers-p))
474
;; an ALTERNATION, so we can't remove REGISTER objects further down
475
(setq remove-registers-p nil)
476
(copy-regex alternation))
478
(defmethod remove-registers ((regex regex))
481
(defmethod remove-registers ((seq seq))
483
:elements (mapcar #'remove-registers (elements seq))))
485
(defgeneric everythingp (regex)
486
(declare #.*standard-optimize-settings*)
487
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
488
to this object, otherwise NIL. So, \"(.){1}\" would return true
489
(i.e. the object corresponding to \".\", for example."))
491
(defmethod everythingp ((seq seq))
492
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
493
;; due to the parsing process
494
(let ((cleaned-elements (remove-if #'(lambda (element)
495
(typep element 'void))
497
(and (= 1 (length cleaned-elements))
498
(everythingp (first cleaned-elements)))))
500
(defmethod everythingp ((alternation alternation))
501
(with-slots ((choices choices))
503
(and (= 1 (length choices))
504
;; this is unlikely to happen for human-generated regexes,
505
;; but machine-generated ones might look like this
506
(everythingp (first choices)))))
508
(defmethod everythingp ((repetition repetition))
509
(with-slots ((maximum maximum)
514
(= 1 minimum maximum)
515
;; treat "<regex>{1,1}" like "<regex>"
516
(everythingp regex))))
518
(defmethod everythingp ((register register))
519
(everythingp (regex register)))
521
(defmethod everythingp ((standalone standalone))
522
(everythingp (regex standalone)))
524
(defmethod everythingp ((everything everything))
527
(defmethod everythingp ((regex regex))
528
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
529
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
532
(defgeneric regex-length (regex)
533
(declare #.*standard-optimize-settings*)
534
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
536
(defmethod regex-length ((seq seq))
537
;; simply add all inner lengths unless one of them is NIL
538
(loop for sub-regex in (elements seq)
539
for len = (regex-length sub-regex)
540
if (not len) do (return nil)
543
(defmethod regex-length ((alternation alternation))
544
;; only return a true value if all inner lengths are non-NIL and
546
(loop for sub-regex in (choices alternation)
547
for old-len = nil then len
548
for len = (regex-length sub-regex)
550
(and old-len (/= len old-len))) do (return nil)
551
finally (return len)))
553
(defmethod regex-length ((branch branch))
554
;; only return a true value if both alternations have a length and
556
(let ((then-length (regex-length (then-regex branch))))
558
(eql then-length (regex-length (else-regex branch)))
561
(defmethod regex-length ((repetition repetition))
562
;; we can only compute the length of a REPETITION object if the
563
;; number of repetitions is fixed; note that we don't call
564
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
565
;; always set correctly
566
(with-slots ((len len)
571
(eql minimum maximum))
575
(defmethod regex-length ((register register))
576
(regex-length (regex register)))
578
(defmethod regex-length ((standalone standalone))
579
(regex-length (regex standalone)))
581
(defmethod regex-length ((back-reference back-reference))
582
;; with enough effort we could possibly do better here, but
583
;; currently we just give up and return NIL
586
(defmethod regex-length ((char-class char-class))
589
(defmethod regex-length ((everything everything))
592
(defmethod regex-length ((str str))
595
(defmethod regex-length ((filter filter))
598
(defmethod regex-length ((regex regex))
599
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
600
;; WORD-BOUNDARY (which all have zero-length)
603
(defgeneric regex-min-length (regex)
604
(declare #.*standard-optimize-settings*)
605
(:documentation "Returns the minimal length of REGEX."))
607
(defmethod regex-min-length ((seq seq))
608
;; simply add all inner minimal lengths
609
(loop for sub-regex in (elements seq)
610
for len = (regex-min-length sub-regex)
613
(defmethod regex-min-length ((alternation alternation))
614
;; minimal length of an alternation is the minimal length of the
615
;; "shortest" element
616
(loop for sub-regex in (choices alternation)
617
for len = (regex-min-length sub-regex)
620
(defmethod regex-min-length ((branch branch))
621
;; minimal length of both alternations
622
(min (regex-min-length (then-regex branch))
623
(regex-min-length (else-regex branch))))
625
(defmethod regex-min-length ((repetition repetition))
626
;; obviously the product of the inner minimal length and the minimal
627
;; number of repetitions
628
(* (minimum repetition) (min-len repetition)))
630
(defmethod regex-min-length ((register register))
631
(regex-min-length (regex register)))
633
(defmethod regex-min-length ((standalone standalone))
634
(regex-min-length (regex standalone)))
636
(defmethod regex-min-length ((char-class char-class))
639
(defmethod regex-min-length ((everything everything))
642
(defmethod regex-min-length ((str str))
645
(defmethod regex-min-length ((filter filter))
649
(defmethod regex-min-length ((regex regex))
650
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
651
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
654
(defgeneric compute-offsets (regex start-pos)
655
(declare #.*standard-optimize-settings*)
656
(:documentation "Returns the offset the following regex would have
657
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
658
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
659
slots of STR objects further down the tree."))
661
;; note that we're actually only interested in the offset of
662
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
663
;; can stop at variable-length alternations and don't need to descend
666
(defmethod compute-offsets ((seq seq) start-pos)
667
(loop for element in (elements seq)
668
;; advance offset argument for next call while looping through
670
for pos = start-pos then curr-offset
671
for curr-offset = (compute-offsets element pos)
673
finally (return curr-offset)))
675
(defmethod compute-offsets ((alternation alternation) start-pos)
676
(loop for choice in (choices alternation)
677
for old-offset = nil then curr-offset
678
for curr-offset = (compute-offsets choice start-pos)
679
;; we stop immediately if two alternations don't result in the
681
if (or (not curr-offset)
682
(and old-offset (/= curr-offset old-offset)))
684
finally (return curr-offset)))
686
(defmethod compute-offsets ((branch branch) start-pos)
687
;; only return offset if both alternations have equal value
688
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
690
(eql then-offset (compute-offsets (else-regex branch) start-pos))
693
(defmethod compute-offsets ((repetition repetition) start-pos)
694
;; no need to descend into the inner regex
695
(with-slots ((len len)
700
(eq minimum maximum))
701
;; fixed number of repetitions, so we know how to proceed
702
(+ start-pos (* minimum len))
703
;; otherwise return NIL
706
(defmethod compute-offsets ((register register) start-pos)
707
(compute-offsets (regex register) start-pos))
709
(defmethod compute-offsets ((standalone standalone) start-pos)
710
(compute-offsets (regex standalone) start-pos))
712
(defmethod compute-offsets ((char-class char-class) start-pos)
715
(defmethod compute-offsets ((everything everything) start-pos)
718
(defmethod compute-offsets ((str str) start-pos)
719
(setf (offset str) start-pos)
720
(+ start-pos (len str)))
722
(defmethod compute-offsets ((back-reference back-reference) start-pos)
723
;; with enough effort we could possibly do better here, but
724
;; currently we just give up and return NIL
725
(declare (ignore start-pos))
728
(defmethod compute-offsets ((filter filter) start-pos)
729
(let ((len (len filter)))
734
(defmethod compute-offsets ((regex regex) start-pos)
735
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
736
;; WORD-BOUNDARY (which all have zero-length)