Coverage report: /home/jsnell/.sbcl/site/cl-ppcre-1.2.13/convert.lisp

KindCoveredAll%
expression539593 90.9
branch8394 88.3
Key
Not instrumented
Conditionalized out
Executed
Not executed
 
Both branches taken
One branch taken
Neither branch taken
1
 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2
 ;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.22 2005/04/01 21:29:09 edi Exp $
3
 
4
 ;;; Here the parse tree is converted into its internal representation
5
 ;;; using REGEX objects.  At the same time some optimizations are
6
 ;;; already applied.
7
 
8
 ;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
9
 
10
 ;;; Redistribution and use in source and binary forms, with or without
11
 ;;; modification, are permitted provided that the following conditions
12
 ;;; are met:
13
 
14
 ;;;   * Redistributions of source code must retain the above copyright
15
 ;;;     notice, this list of conditions and the following disclaimer.
16
 
17
 ;;;   * Redistributions in binary form must reproduce the above
18
 ;;;     copyright notice, this list of conditions and the following
19
 ;;;     disclaimer in the documentation and/or other materials
20
 ;;;     provided with the distribution.
21
 
22
 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
23
 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24
 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
26
 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
28
 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29
 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
30
 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31
 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32
 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
 
34
 (in-package #:cl-ppcre)
35
 
36
 ;;; The flags that represent the "ism" modifiers are always kept
37
 ;;; together in a three-element list. We use the following macros to
38
 ;;; access individual elements.
39
 
40
 (defmacro case-insensitive-mode-p (flags)
41
   "Accessor macro to extract the first flag out of a three-element flag list."
42
   `(first ,flags))
43
 
44
 (defmacro multi-line-mode-p (flags)
45
   "Accessor macro to extract the second flag out of a three-element flag list."
46
   `(second ,flags))
47
 
48
 (defmacro single-line-mode-p (flags)
49
   "Accessor macro to extract the third flag out of a three-element flag list."
50
   `(third ,flags))
51
 
52
 (defun set-flag (token)
53
   (declare #.*standard-optimize-settings*)
54
   (declare (special flags))
55
   "Reads a flag token and sets or unsets the corresponding entry in
56
 the special FLAGS list."
57
   (case token
58
     ((:case-insensitive-p)
59
       (setf (case-insensitive-mode-p flags) t))
60
     ((:case-sensitive-p)
61
       (setf (case-insensitive-mode-p flags) nil))
62
     ((:multi-line-mode-p)
63
       (setf (multi-line-mode-p flags) t))
64
     ((:not-multi-line-mode-p)
65
       (setf (multi-line-mode-p flags) nil))
66
     ((:single-line-mode-p)
67
       (setf (single-line-mode-p flags) t))
68
     ((:not-single-line-mode-p)
69
       (setf (single-line-mode-p flags) nil))
70
     (otherwise
71
       (signal-ppcre-syntax-error "Unknown flag token ~A" token))))
72
 
73
 (defun add-range-to-hash (hash from to)
74
   (declare #.*standard-optimize-settings*)
75
   (declare (special flags))
76
   "Adds all characters from character FROM to character TO (inclusive)
77
 to the char class hash HASH. Does the right thing with respect to
78
 case-(in)sensitivity as specified by the special variable FLAGS."
79
   (let ((from-code (char-code from))
80
         (to-code (char-code to)))
81
     (when (> from-code to-code)
82
       (signal-ppcre-syntax-error "Invalid range from ~A to ~A in char-class"
83
                                  from to))
84
     (cond ((case-insensitive-mode-p flags)
85
             (loop for code from from-code to to-code
86
                   for chr = (code-char code)
87
                   do (setf (gethash (char-upcase chr) hash) t
88
                            (gethash (char-downcase chr) hash) t)))
89
           (t
90
             (loop for code from from-code to to-code
91
                   do (setf (gethash (code-char code) hash) t))))
92
     hash))
93
 
94
 (defun convert-char-class-to-hash (list)
95
   (declare #.*standard-optimize-settings*)
96
   "Combines all items in LIST into one char class hash and returns it.
97
 Items can be single characters, character ranges like \(:RANGE #\\A
98
 #\\E), or special character classes like :DIGIT-CLASS. Does the right
99
 thing with respect to case-\(in)sensitivity as specified by the
100
 special variable FLAGS."
101
   (loop with hash = (make-hash-table :size (ceiling (expt *regex-char-code-limit* (/ 1 4)))
102
                                      :rehash-size (float (expt *regex-char-code-limit* (/ 1 4)))
103
                                      :rehash-threshold #-genera 1.0 #+genera 0.99)
104
         for item in list
105
         if (characterp item)
106
           ;; treat a single character C like a range (:RANGE C C)
107
           do (add-range-to-hash hash item item)
108
         else if (symbolp item)
109
           ;; special character classes
110
           do (setq hash
111
                      (case item
112
                        ((:digit-class)
113
                          (merge-hash hash +digit-hash+))
114
                        ((:non-digit-class)
115
                          (merge-inverted-hash hash +digit-hash+))
116
                        ((:whitespace-char-class)
117
                          (merge-hash hash +whitespace-char-hash+))
118
                        ((:non-whitespace-char-class)
119
                          (merge-inverted-hash hash +whitespace-char-hash+))
120
                        ((:word-char-class)
121
                          (merge-hash hash +word-char-hash+))
122
                        ((:non-word-char-class)
123
                          (merge-inverted-hash hash +word-char-hash+))
124
                        (otherwise
125
                          (signal-ppcre-syntax-error
126
                           "Unknown symbol ~A in character class"
127
                           item))))
128
         else if (and (consp item)
129
                      (eq (car item) :range))
130
           ;; proper ranges
131
           do (add-range-to-hash hash
132
                                 (second item)
133
                                 (third item))
134
         else do (signal-ppcre-syntax-error "Unknown item ~A in char-class list"
135
                                            item)
136
         finally (return hash)))
137
 
138
 (defun maybe-split-repetition (regex
139
                                greedyp
140
                                minimum
141
                                maximum
142
                                min-len
143
                                length
144
                                reg-seen)
145
   (declare #.*standard-optimize-settings*)
146
   (declare (type fixnum minimum)
147
            (type (or fixnum null) maximum))
148
   "Splits a REPETITION object into a constant and a varying part if
149
 applicable, i.e. something like
150
   a{3,} -> a{3}a*
151
 The arguments to this function correspond to the REPETITION slots of
152
 the same name."
153
   ;; note the usage of COPY-REGEX here; we can't use the same REGEX
154
   ;; object in both REPETITIONS because they will have different
155
   ;; offsets
156
   (when maximum
157
     (when (zerop maximum)
158
       ;; trivial case: don't repeat at all
159
       (return-from maybe-split-repetition
160
         (make-instance 'void)))
161
     (when (= 1 minimum maximum)
162
       ;; another trivial case: "repeat" exactly once
163
       (return-from maybe-split-repetition
164
         regex)))
165
   ;; first set up the constant part of the repetition
166
   ;; maybe that's all we need
167
   (let ((constant-repetition (if (plusp minimum)
168
                                (make-instance 'repetition
169
                                               :regex (copy-regex regex)
170
                                               :greedyp greedyp
171
                                               :minimum minimum
172
                                               :maximum minimum
173
                                               :min-len min-len
174
                                               :len length
175
                                               :contains-register-p reg-seen)
176
                                ;; don't create garbage if minimum is 0
177
                                nil)))
178
     (when (and maximum
179
                (= maximum minimum))
180
       (return-from maybe-split-repetition
181
         ;; no varying part needed because min = max
182
         constant-repetition))
183
     ;; now construct the varying part
184
     (let ((varying-repetition
185
             (make-instance 'repetition
186
                            :regex regex
187
                            :greedyp greedyp
188
                            :minimum 0
189
                            :maximum (if maximum (- maximum minimum) nil)
190
                            :min-len min-len
191
                            :len length
192
                            :contains-register-p reg-seen)))
193
       (cond ((zerop minimum)
194
               ;; min = 0, no constant part needed
195
               varying-repetition)
196
             ((= 1 minimum)
197
               ;; min = 1, constant part needs no REPETITION wrapped around
198
               (make-instance 'seq
199
                              :elements (list (copy-regex regex)
200
                                              varying-repetition)))
201
             (t
202
               ;; general case
203
               (make-instance 'seq
204
                              :elements (list constant-repetition
205
                                              varying-repetition)))))))
206
 
207
 ;; During the conversion of the parse tree we keep track of the start
208
 ;; of the parse tree in the special variable STARTS-WITH which'll
209
 ;; either hold a STR object or an EVERYTHING object. The latter is the
210
 ;; case if the regex starts with ".*" which implicitely anchors the
211
 ;; regex at the start (perhaps modulo #\Newline).
212
 
213
 (defun maybe-accumulate (str)
214
   (declare #.*standard-optimize-settings*)
215
   (declare (special accumulate-start-p starts-with))
216
   (declare (ftype (function (t) fixnum) len))
217
   "Accumulate STR into the special variable STARTS-WITH if
218
 ACCUMULATE-START-P (also special) is true and STARTS-WITH is either
219
 NIL or a STR object of the same case mode. Always returns NIL."
220
   (when accumulate-start-p
221
     (etypecase starts-with
222
       (str
223
         ;; STARTS-WITH already holds a STR, so we check if we can
224
         ;; concatenate
225
         (cond ((eq (case-insensitive-p starts-with)
226
                    (case-insensitive-p str))
227
                 ;; we modify STARTS-WITH in place
228
                 (setf (len starts-with)
229
                         (+ (len starts-with) (len str)))
230
                 ;; note that we use SLOT-VALUE because the accessor
231
                 ;; STR has a declared FTYPE which doesn't fit here
232
                 (adjust-array (slot-value starts-with 'str)
233
                               (len starts-with)
234
                               :fill-pointer t)
235
                 (setf (subseq (slot-value starts-with 'str)
236
                               (- (len starts-with) (len str)))
237
                         (str str)
238
                       ;; STR objects that are parts of STARTS-WITH
239
                       ;; always have their SKIP slot set to true
240
                       ;; because the SCAN function will take care of
241
                       ;; them, i.e. the matcher can ignore them
242
                       (skip str) t))
243
               (t (setq accumulate-start-p nil))))
244
       (null
245
         ;; STARTS-WITH is still empty, so we create a new STR object
246
         (setf starts-with
247
                 (make-instance 'str
248
                                :str ""
249
                                :case-insensitive-p (case-insensitive-p str))
250
               ;; INITIALIZE-INSTANCE will coerce the STR to a simple
251
               ;; string, so we have to fill it afterwards
252
               (slot-value starts-with 'str)
253
                 (make-array (len str)
254
                             :initial-contents (str str)
255
                             :element-type 'character
256
                             :fill-pointer t
257
                             :adjustable t)
258
               (len starts-with)
259
                 (len str)
260
               ;; see remark about SKIP above
261
               (skip str) t))
262
       (everything
263
         ;; STARTS-WITH already holds an EVERYTHING object - we can't
264
         ;; concatenate
265
         (setq accumulate-start-p nil))))
266
   nil)
267
 
268
 (defun convert-aux (parse-tree)
269
   (declare #.*standard-optimize-settings*)
270
   (declare (special flags reg-num accumulate-start-p starts-with max-back-ref))
271
   "Converts the parse tree PARSE-TREE into a REGEX object and returns it.
272
 
273
 Will also
274
   - split and optimize repetitions,
275
   - accumulate strings or EVERYTHING objects into the special variable
276
     STARTS-WITH,
277
   - keep track of all registers seen in the special variable REG-NUM,
278
   - keep track of the highest backreference seen in the special
279
     variable MAX-BACK-REF,
280
   - maintain and adher to the currently applicable modifiers in the special
281
     variable FLAGS, and
282
   - maybe even wash your car..."
283
   (cond ((consp parse-tree)
284
           (case (first parse-tree)
285
             ;; (:SEQUENCE {<regex>}*)
286
             ((:sequence)
287
               (cond ((cddr parse-tree)
288
                       ;; this is essentially like
289
                       ;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE))
290
                       ;; but we don't cons a new list
291
                       (loop for parse-tree-rest on (rest parse-tree)
292
                             while parse-tree-rest
293
                             do (setf (car parse-tree-rest)
294
                                        (convert-aux (car parse-tree-rest))))
295
                       (make-instance 'seq
296
                                      :elements (rest parse-tree)))
297
                     (t (convert-aux (second parse-tree)))))
298
             ;; (:GROUP {<regex>}*)
299
             ;; this is a syntactical construct equivalent to :SEQUENCE
300
             ;; intended to keep the effect of modifiers local
301
             ((:group)
302
               ;; make a local copy of FLAGS and shadow the global
303
               ;; value while we descend into the enclosed regexes
304
               (let ((flags (copy-list flags)))
305
                 (declare (special flags))
306
                 (cond ((cddr parse-tree)
307
                         (loop for parse-tree-rest on (rest parse-tree)
308
                               while parse-tree-rest
309
                               do (setf (car parse-tree-rest)
310
                                          (convert-aux (car parse-tree-rest))))
311
                         (make-instance 'seq
312
                                        :elements (rest parse-tree)))
313
                       (t (convert-aux (second parse-tree))))))
314
             ;; (:ALTERNATION {<regex>}*)
315
             ((:alternation)
316
               ;; we must stop accumulating objects into STARTS-WITH
317
               ;; once we reach an alternation
318
               (setq accumulate-start-p nil)
319
               (loop for parse-tree-rest on (rest parse-tree)
320
                     while parse-tree-rest
321
                     do (setf (car parse-tree-rest)
322
                                (convert-aux (car parse-tree-rest))))
323
               (make-instance 'alternation
324
                              :choices (rest parse-tree)))
325
             ;; (:BRANCH <test> <regex>)
326
             ;; <test> must be look-ahead, look-behind or number;
327
             ;; if <regex> is an alternation it must have one or two
328
             ;; choices
329
             ((:branch)
330
               (setq accumulate-start-p nil)
331
               (let* ((test-candidate (second parse-tree))
332
                      (test (cond ((numberp test-candidate)
333
                                    (when (zerop (the fixnum test-candidate))
334
                                      (signal-ppcre-syntax-error
335
                                       "Register 0 doesn't exist: ~S"
336
                                       parse-tree))
337
                                    (1- (the fixnum test-candidate)))
338
                                  (t (convert-aux test-candidate))))
339
                      (alternations (convert-aux (third parse-tree))))
340
                 (when (and (not (numberp test))
341
                            (not (typep test 'lookahead))
342
                            (not (typep test 'lookbehind)))
343
                   (signal-ppcre-syntax-error
344
                    "Branch test must be look-ahead, look-behind or number: ~S"
345
                    parse-tree))
346
                 (typecase alternations
347
                   (alternation
348
                     (case (length (choices alternations))
349
                       ((0)
350
                         (signal-ppcre-syntax-error "No choices in branch: ~S"
351
                                                    parse-tree))
352
                       ((1)
353
                         (make-instance 'branch
354
                                        :test test
355
                                        :then-regex (first
356
                                                     (choices alternations))))
357
                       ((2)
358
                         (make-instance 'branch
359
                                        :test test
360
                                        :then-regex (first
361
                                                     (choices alternations))
362
                                        :else-regex (second
363
                                                     (choices alternations))))
364
                       (otherwise
365
                         (signal-ppcre-syntax-error
366
                          "Too much choices in branch: ~S"
367
                          parse-tree))))
368
                   (t
369
                     (make-instance 'branch
370
                                    :test test
371
                                    :then-regex alternations)))))
372
             ;; (:POSITIVE-LOOKAHEAD|:NEGATIVE-LOOKAHEAD <regex>)
373
             ((:positive-lookahead :negative-lookahead)
374
               ;; keep the effect of modifiers local to the enclosed
375
               ;; regex and stop accumulating into STARTS-WITH
376
               (setq accumulate-start-p nil)
377
               (let ((flags (copy-list flags)))
378
                 (declare (special flags))
379
                 (make-instance 'lookahead
380
                                :regex (convert-aux (second parse-tree))
381
                                :positivep (eq (first parse-tree)
382
                                               :positive-lookahead))))
383
             ;; (:POSITIVE-LOOKBEHIND|:NEGATIVE-LOOKBEHIND <regex>)
384
             ((:positive-lookbehind :negative-lookbehind)
385
               ;; keep the effect of modifiers local to the enclosed
386
               ;; regex and stop accumulating into STARTS-WITH
387
               (setq accumulate-start-p nil)
388
               (let* ((flags (copy-list flags))
389
                      (regex (convert-aux (second parse-tree)))
390
                      (len (regex-length regex)))
391
                 (declare (special flags))
392
                 ;; lookbehind assertions must be of fixed length
393
                 (unless len
394
                   (signal-ppcre-syntax-error
395
                    "Variable length look-behind not implemented (yet): ~S"
396
                    parse-tree))
397
                 (make-instance 'lookbehind
398
                                :regex regex
399
                                :positivep (eq (first parse-tree)
400
                                               :positive-lookbehind)
401
                                :len len)))
402
             ;; (:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>)
403
             ((:greedy-repetition :non-greedy-repetition)
404
               ;; remember the value of ACCUMULATE-START-P upon entering
405
               (let ((local-accumulate-start-p accumulate-start-p))
406
                 (let ((minimum (second parse-tree))
407
                       (maximum (third parse-tree)))
408
                   (declare (type fixnum minimum))
409
                   (declare (type (or null fixnum) maximum))
410
                   (unless (and maximum
411
                                (= 1 minimum maximum))
412
                     ;; set ACCUMULATE-START-P to NIL for the rest of
413
                     ;; the conversion because we can't continue to
414
                     ;; accumulate inside as well as after a proper
415
                     ;; repetition
416
                     (setq accumulate-start-p nil))
417
                   (let* (reg-seen
418
                          (regex (convert-aux (fourth parse-tree)))
419
                          (min-len (regex-min-length regex))
420
                          (greedyp (eq (first parse-tree) :greedy-repetition))
421
                          (length (regex-length regex)))
422
                     ;; note that this declaration already applies to
423
                     ;; the call to CONVERT-AUX above
424
                     (declare (special reg-seen))
425
                     (when (and local-accumulate-start-p
426
                                (not starts-with)
427
                                (zerop minimum)
428
                                (not maximum))
429
                       ;; if this repetition is (equivalent to) ".*"
430
                       ;; and if we're at the start of the regex we
431
                       ;; remember it for ADVANCE-FN (see the SCAN
432
                       ;; function)
433
                       (setq starts-with (everythingp regex)))
434
                     (if (or (not reg-seen)
435
                             (not greedyp)
436
                             (not length)
437
                             (zerop length)
438
                             (and maximum (= minimum maximum)))
439
                       ;; the repetition doesn't enclose a register, or
440
                       ;; it's not greedy, or we can't determine it's
441
                       ;; (inner) length, or the length is zero, or the
442
                       ;; number of repetitions is fixed; in all of
443
                       ;; these cases we don't bother to optimize
444
                       (maybe-split-repetition regex
445
                                               greedyp
446
                                               minimum
447
                                               maximum
448
                                               min-len
449
                                               length
450
                                               reg-seen)
451
                       ;; otherwise we make a transformation that looks
452
                       ;; roughly like one of
453
                       ;;   <regex>* -> (?:<regex'>*<regex>)?
454
                       ;;   <regex>+ -> <regex'>*<regex>
455
                       ;; where the trick is that as much as possible
456
                       ;; registers from <regex> are removed in
457
                       ;; <regex'>
458
                       (let* (reg-seen   ; new instance for REMOVE-REGISTERS
459
                              (remove-registers-p t)
460
                              (inner-regex (remove-registers regex))
461
                              (inner-repetition
462
                                ;; this is the "<regex'>" part
463
                                (maybe-split-repetition inner-regex
464
                                                        ;; always greedy
465
                                                        t
466
                                                        ;; reduce minimum by 1
467
                                                        ;; unless it's already 0
468
                                                        (if (zerop minimum)
469
                                                          0
470
                                                          (1- minimum))
471
                                                        ;; reduce maximum by 1
472
                                                        ;; unless it's NIL
473
                                                        (and maximum
474
                                                             (1- maximum))
475
                                                        min-len
476
                                                        length
477
                                                        reg-seen))
478
                              (inner-seq
479
                                ;; this is the "<regex'>*<regex>" part
480
                                (make-instance 'seq
481
                                               :elements (list inner-repetition
482
                                                               regex))))
483
                         ;; note that this declaration already applies
484
                         ;; to the call to REMOVE-REGISTERS above
485
                         (declare (special remove-registers-p reg-seen))
486
                         ;; wrap INNER-SEQ with a greedy
487
                         ;; {0,1}-repetition (i.e. "?") if necessary
488
                         (if (plusp minimum)
489
                           inner-seq
490
                           (maybe-split-repetition inner-seq
491
                                                   t
492
                                                   0
493
                                                   1
494
                                                   min-len
495
                                                   nil
496
                                                   t))))))))
497
             ;; (:REGISTER <regex>)
498
             ((:register)
499
               ;; keep the effect of modifiers local to the enclosed
500
               ;; regex; also, assign the current value of REG-NUM to
501
               ;; the corresponding slot of the REGISTER object and
502
               ;; increase this counter afterwards
503
               (let ((flags (copy-list flags))
504
                     (stored-reg-num reg-num))
505
                 (declare (special flags reg-seen))
506
                 (setq reg-seen t)
507
                 (incf (the fixnum reg-num))
508
                 (make-instance 'register
509
                                :regex (convert-aux (second parse-tree))
510
                                :num stored-reg-num)))
511
             ;; (:FILTER <function> &optional <length>)
512
             ((:filter)
513
               ;; stop accumulating into STARTS-WITH
514
               (setq accumulate-start-p nil)
515
               (make-instance 'filter
516
                              :fn (second parse-tree)
517
                              :len (third parse-tree)))
518
             ;; (:STANDALONE <regex>)
519
             ((:standalone)
520
               ;; stop accumulating into STARTS-WITH
521
               (setq accumulate-start-p nil)
522
               ;; keep the effect of modifiers local to the enclosed
523
               ;; regex
524
               (let ((flags (copy-list flags)))
525
                 (declare (special flags))
526
                 (make-instance 'standalone
527
                                :regex (convert-aux (second parse-tree)))))
528
             ;; (:BACK-REFERENCE <number>)
529
             ((:back-reference)
530
               (let ((backref-number (second parse-tree)))
531
                 (declare (type fixnum backref-number))
532
                 (when (or (not (typep backref-number 'fixnum))
533
                           (<= backref-number 0))
534
                   (signal-ppcre-syntax-error
535
                    "Illegal back-reference: ~S"
536
                    parse-tree))
537
                 ;; stop accumulating into STARTS-WITH and increase
538
                 ;; MAX-BACK-REF if necessary
539
                 (setq accumulate-start-p nil
540
                       max-back-ref (max (the fixnum max-back-ref)
541
                                         backref-number))
542
                 (make-instance 'back-reference
543
                                ;; we start counting from 0 internally
544
                                :num (1- backref-number)
545
                                :case-insensitive-p (case-insensitive-mode-p
546
                                                     flags))))
547
             ;; (:CHAR-CLASS|:INVERTED-CHAR-CLASS {<item>}*)
548
             ;; where item is one of
549
             ;;   - a character
550
             ;;   - a character range: (:RANGE <char1> <char2>)
551
             ;;   - a special char class symbol like :DIGIT-CHAR-CLASS
552
             ((:char-class :inverted-char-class)
553
               ;; first create the hash-table and some auxiliary values
554
               (let* (hash
555
                      hash-keys
556
                      (count most-positive-fixnum)
557
                      (item-list (rest parse-tree))
558
                      (invertedp (eq (first parse-tree) :inverted-char-class))
559
                      word-char-class-p)
560
                 (cond ((every (lambda (item) (eq item :word-char-class))
561
                               item-list)
562
                         ;; treat "[\\w]" like "\\w"
563
                         (setq word-char-class-p t))
564
                       ((every (lambda (item) (eq item :non-word-char-class))
565
                               item-list)
566
                         ;; treat "[\\W]" like "\\W"
567
                         (setq word-char-class-p t)
568
                         (setq invertedp (not invertedp)))
569
                       (t
570
                         (setq hash (convert-char-class-to-hash item-list)
571
                               count (hash-table-count hash))
572
                         (when (<= count 2)
573
                           ;; collect the hash-table keys into a list if
574
                           ;; COUNT is smaller than 3
575
                           (setq hash-keys
576
                                   (loop for chr being the hash-keys of hash
577
                                         collect chr)))))
578
                 (cond ((and (not invertedp)
579
                             (= count 1))
580
                         ;; convert one-element hash table into a STR
581
                         ;; object and try to accumulate into
582
                         ;; STARTS-WITH
583
                         (let ((str (make-instance 'str
584
                                                   :str (string
585
                                                         (first hash-keys))
586
                                                   :case-insensitive-p nil)))
587
                           (maybe-accumulate str)
588
                           str))
589
                       ((and (not invertedp)
590
                             (= count 2)
591
                             (char-equal (first hash-keys) (second hash-keys)))
592
                         ;; convert two-element hash table into a
593
                         ;; case-insensitive STR object and try to
594
                         ;; accumulate into STARTS-WITH if the two
595
                         ;; characters are CHAR-EQUAL
596
                         (let ((str (make-instance 'str
597
                                                   :str (string
598
                                                         (first hash-keys))
599
                                                   :case-insensitive-p t)))
600
                           (maybe-accumulate str)
601
                           str))
602
                       (t
603
                         ;; the general case; stop accumulating into STARTS-WITH
604
                         (setq accumulate-start-p nil)
605
                         (make-instance 'char-class
606
                                        :hash hash
607
                                        :case-insensitive-p
608
                                          (case-insensitive-mode-p flags)
609
                                        :invertedp invertedp
610
                                        :word-char-class-p word-char-class-p)))))
611
             ;; (:FLAGS {<flag>}*)
612
             ;; where flag is a modifier symbol like :CASE-INSENSITIVE-P
613
             ((:flags)
614
               ;; set/unset the flags corresponding to the symbols
615
               ;; following :FLAGS
616
               (mapc #'set-flag (rest parse-tree))
617
               ;; we're only interested in the side effect of
618
               ;; setting/unsetting the flags and turn this syntactical
619
               ;; construct into a VOID object which'll be optimized
620
               ;; away when creating the matcher
621
               (make-instance 'void))
622
             (otherwise
623
               (signal-ppcre-syntax-error
624
                "Unknown token ~A in parse-tree"
625
                (first parse-tree)))))
626
         ((or (characterp parse-tree) (stringp parse-tree))
627
           ;; turn characters or strings into STR objects and try to
628
           ;; accumulate into STARTS-WITH
629
           (let ((str (make-instance 'str
630
                                     :str (string parse-tree)
631
                                     :case-insensitive-p
632
                                       (case-insensitive-mode-p flags))))
633
             (maybe-accumulate str)
634
             str))
635
         (t
636
           ;; and now for the tokens which are symbols
637
           (case parse-tree
638
             ((:void)
639
               (make-instance 'void))
640
             ((:word-boundary)
641
               (make-instance 'word-boundary :negatedp nil))
642
             ((:non-word-boundary)
643
               (make-instance 'word-boundary :negatedp t))
644
             ;; the special character classes
645
             ((:digit-class
646
               :non-digit-class
647
               :word-char-class
648
               :non-word-char-class
649
               :whitespace-char-class
650
               :non-whitespace-char-class)
651
               ;; stop accumulating into STARTS-WITH
652
               (setq accumulate-start-p nil)
653
               (make-instance 'char-class
654
                              ;; use the constants defined in util.lisp
655
                              :hash (case parse-tree
656
                                      ((:digit-class
657
                                        :non-digit-class)
658
                                        +digit-hash+)
659
                                      ((:word-char-class
660
                                        :non-word-char-class)
661
                                        nil)
662
                                      ((:whitespace-char-class
663
                                        :non-whitespace-char-class)
664
                                        +whitespace-char-hash+))
665
                              ;; this value doesn't really matter but
666
                              ;; NIL should result in slightly faster
667
                              ;; matchers
668
                              :case-insensitive-p nil
669
                              :invertedp (member parse-tree
670
                                                 '(:non-digit-class
671
                                                   :non-word-char-class
672
                                                   :non-whitespace-char-class)
673
                                                 :test #'eq)
674
                              :word-char-class-p (member parse-tree
675
                                                         '(:word-char-class
676
                                                           :non-word-char-class)
677
                                                         :test #'eq)))
678
             ((:start-anchor             ; Perl's "^"
679
               :end-anchor               ; Perl's "$"
680
               :modeless-end-anchor-no-newline
681
                                         ; Perl's "\z"
682
               :modeless-start-anchor    ; Perl's "\A"
683
               :modeless-end-anchor)     ; Perl's "\Z"
684
               (make-instance 'anchor
685
                              :startp (member parse-tree
686
                                              '(:start-anchor
687
                                                :modeless-start-anchor)
688
                                              :test #'eq)
689
                              ;; set this value according to the
690
                              ;; current settings of FLAGS (unless it's
691
                              ;; a modeless anchor)
692
                              :multi-line-p
693
                                (and (multi-line-mode-p flags)
694
                                     (not (member parse-tree
695
                                                  '(:modeless-start-anchor
696
                                                    :modeless-end-anchor
697
                                                    :modeless-end-anchor-no-newline)
698
                                                  :test #'eq)))
699
                              :no-newline-p
700
                                (eq parse-tree
701
                                    :modeless-end-anchor-no-newline)))
702
             ((:everything)
703
               ;; stop accumulating into STARTS-WITHS
704
               (setq accumulate-start-p nil)
705
               (make-instance 'everything
706
                              :single-line-p (single-line-mode-p flags)))
707
             ;; special tokens corresponding to Perl's "ism" modifiers
708
             ((:case-insensitive-p
709
               :case-sensitive-p
710
               :multi-line-mode-p
711
               :not-multi-line-mode-p
712
               :single-line-mode-p
713
               :not-single-line-mode-p)
714
               ;; we're only interested in the side effect of
715
               ;; setting/unsetting the flags and turn these tokens
716
               ;; into VOID objects which'll be optimized away when
717
               ;; creating the matcher
718
               (set-flag parse-tree)
719
               (make-instance 'void))
720
             (otherwise
721
              (let ((translation (and (symbolp parse-tree)
722
                                      (parse-tree-synonym parse-tree))))
723
                (if translation
724
                  (convert-aux (copy-tree translation))
725
                  (signal-ppcre-syntax-error "Unknown token ~A in parse-tree"
726
                                             parse-tree))))))))
727
 
728
 (defun convert (parse-tree)
729
   (declare #.*standard-optimize-settings*)
730
   "Converts the parse tree PARSE-TREE into an equivalent REGEX object
731
 and returns three values: the REGEX object, the number of registers
732
 seen and an object the regex starts with which is either a STR object
733
 or an EVERYTHING object (if the regex starts with something like
734
 \".*\") or NIL."
735
   ;; this function basically just initializes the special variables
736
   ;; and then calls CONVERT-AUX to do all the work
737
   (let* ((flags (list nil nil nil))
738
          (reg-num 0)
739
          (accumulate-start-p t)
740
          starts-with
741
          (max-back-ref 0)
742
          (converted-parse-tree (convert-aux parse-tree)))
743
     (declare (special flags reg-num accumulate-start-p starts-with max-back-ref))
744
     ;; make sure we don't reference registers which aren't there
745
     (when (> (the fixnum max-back-ref)
746
              (the fixnum reg-num))
747
       (signal-ppcre-syntax-error
748
        "Backreference to register ~A which has not been defined"
749
        max-back-ref))
750
     (when (typep starts-with 'str)
751
       (setf (slot-value starts-with 'str)
752
               (coerce (slot-value starts-with 'str) 'simple-string)))
753
     (values converted-parse-tree reg-num starts-with)))