-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathregexp.go
909 lines (819 loc) · 23.3 KB
/
regexp.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
package sre2
// Core regexp definitions. Provides the public interface Re for use of sre2.
// Internally, this file defines the sregexp struct and its list of states.
//
// Each state is an instruction satisfying one of six modes-
// iSplit: branching operation
// iIndexCap: index capture, e.g. due to start/end parenthesis
// iBoundaryCase: non-consuming matcher for left/right runes, such as '\w'
// iRuneClass: consuming matcher for current rune
// iMatch: terminal success state
//
// This file also describes Parse() which builds the regexp as a NFA, or
// provides a human-readable error of the failure. MustParse() is a variation
// which panics on an error condition.
import (
"fmt"
"os"
"strconv"
"strings"
"unicode"
)
// sregexp struct. Just a list of states and a number of subexpressions.
type sregexp struct {
prog []*instr // List of instruction states that comprise this RE.
start int // start instr
// Number of paired subexpressions [()'s], including the outermost brackets
// (i.e. which match the entire string).
caps int
}
// DebugOut writes the given regexp to Stderr, for debugging.
func (r *sregexp) DebugOut() {
for i := 0; i < len(r.prog); i++ {
fmt.Fprintln(os.Stderr, i, r.prog[i].String())
}
}
// NumSubexps returns the number of paired subexpressions [()'s] in this regexp.
func (r *sregexp) NumSubexps() int {
// we always have an outer () to match the whole re, subtract it
return r.caps - 1
}
// instrMode describes a particular instruction type for the regexp internal
// state machine.
type instrMode byte
// Enum-style definitions for the instrMode type.
const (
iSplit instrMode = iota // proceed down out & out1
iIndexCap // capturing start/end parenthesis
iBoundaryCase // match left/right runes here
iRuneClass // if match rune, proceed down out
iMatch // success state!
)
// boundaryMode describes a boundary matcher type, for instructions of type
// iBoundaryCase.
type boundaryMode byte
// Enum-style definitions for the the boundaryMode type.
const (
bNone boundaryMode = iota
bBeginText // beginning of text
bBeginLine // beginning of text or line
bEndText // end of text
bEndLine // end of text or line
bWordBoundary // ascii word boundary
bNotWordBoundary // inverse of above, not ascii word boundary
)
// instr represents a single instruction in any regexp.
type instr struct {
idx int // index of this instr
mode instrMode // mode (as above)
out *instr // next instr to process
// alternate path, for iSplit
out1 *instr
// boundary mode, for iBoundaryCase
lr boundaryMode
// rune class to match against, for iRuneClass
rf RuneFilter
// identifier of submatch for iIndexCap
cid int // numbered index
cname string // string identifier (blank=none)
}
// Describes the given instr in a human-readable format for debugging.
func (i *instr) String() string {
// TODO: Build the output string using a bytes.Buffer.
str := fmt.Sprintf("{%d", i.idx)
out := ""
if i.out != nil {
out += fmt.Sprintf(" out=%d", i.out.idx)
}
switch i.mode {
case iSplit:
str += " iSplit"
if i.out1 != nil {
out += fmt.Sprintf(" out1=%d", i.out1.idx)
}
case iIndexCap:
str += fmt.Sprintf(" iIndexCap cid=%d", i.cid)
if len(i.cname) != 0 {
str += fmt.Sprintf(" cname=%s", i.cname)
}
case iBoundaryCase:
var mode string
switch i.lr {
case bBeginText:
mode = "bBeginText"
case bBeginLine:
mode = "bBeginLine"
case bEndText:
mode = "bEndText"
case bEndLine:
mode = "bEndLine"
case bWordBoundary:
mode = "bWordBoundary"
case bNotWordBoundary:
mode = "bNotWordBoundary"
}
str += fmt.Sprintf(" iBoundaryCase [%s]", mode)
case iRuneClass:
str += fmt.Sprint(" iRuneClass ", i.rf)
case iMatch:
str += " iMatch"
}
return str + out + "}"
}
// Matcher method for consuming runes, thus only matches iRuneClass.
func (s *instr) match(r rune) bool {
return s.mode == iRuneClass && s.rf(r)
}
// Matcher method for iBoundaryCase. If either left or right is not within the
// target string, then -1 should be provided.
func (s *instr) matchBoundaryMode(left rune, right rune) bool {
if s.mode != iBoundaryCase {
return false
}
switch s.lr {
case bBeginText:
return left == -1
case bBeginLine:
return left == -1 || left == '\n'
case bEndText:
return right == -1
case bEndLine:
return right == -1 || right == '\n'
case bWordBoundary, bNotWordBoundary:
// TODO: This is ASCII-only at this point.
word_range := perl_groups['w']
whitespace_range := perl_groups['s']
wb := (unicode.Is(word_range, left) && unicode.Is(whitespace_range, right)) || (unicode.Is(whitespace_range, left) && unicode.Is(word_range, right))
if s.lr == bWordBoundary {
return wb
} else {
return !wb
}
}
panic("unexpected lr mode")
}
// Escape constants and their mapping to actual Unicode runes.
var (
ESCAPES = map[rune]rune{
'a': 7, 't': 9, 'n': 10, 'v': 11, 'f': 12, 'r': 13,
}
)
// Transient parser state, a combination of regexp and string iterator.
type parser struct {
re *sregexp
src SafeReader
flags int64 // on/off state for flags 64-127 (subtract 64, uses bits)
}
// Generate a new instruction struct for use in regexp. By default, the instr
// will be of type 'iSplit'.
func (p *parser) instr() *instr {
pos := len(p.re.prog)
if pos == cap(p.re.prog) {
if pos == 0 {
panic("should not have cap of zero")
}
local := p.re.prog
p.re.prog = make([]*instr, pos, pos*2)
copy(p.re.prog, local)
}
p.re.prog = p.re.prog[0 : pos+1]
i := &instr{pos, iSplit, nil, nil, bNone, nil, -1, ""}
p.re.prog[pos] = i
return i
}
// Determine whether the given flag is set. Requires flag in range 64-127,
// subtracts 64 and checks for bit set in flags int64.
func (p *parser) flag(flag int) bool {
if flag < 64 || flag > 127 {
panic(fmt.Sprintf("can't check flag, out of range: %c", flag))
}
return (p.flags & (1 << byte(flag-64))) != 0
}
// Helper method to connect instr 'from' to instr 'out'.
// TODO: Use safer connection helpers.
func (p *parser) out(from *instr, to *instr) {
if from.out == nil {
from.out = to
} else if from.mode == iSplit && from.out1 == nil {
from.out1 = to
} else {
panic("can't out")
}
}
// Consume some alternate regexps. That is, (regexp[|regexp][|regexp]...).
// This method will return when it encounters an outer ')', and the cursor
// will rest on that character.
func (p *parser) alt(cname string, capture bool) (start *instr, end *instr) {
// Hold onto the current set of flags; reset after.
old_flags := p.flags
defer func() {
p.flags = old_flags
}()
end = p.instr() // shared end state for alt
alt_begin := p.instr()
// Optionally mark this as a capturing group.
if capture {
alt_begin.mode = iIndexCap
alt_begin.cid = p.re.caps * 2
alt_begin.cname = cname
end.mode = iIndexCap
end.cid = alt_begin.cid + 1
end.cname = cname
// Increment alt counter.
p.re.caps += 1
}
b_start, b_end := p.regexp()
start = b_start
p.out(b_end, end)
for p.src.curr() == '|' {
start = p.instr()
p.out(start, b_start)
p.src.nextCh()
b_start, b_end = p.regexp()
p.out(start, b_start)
p.out(b_end, end)
b_start = start
}
// Note: We don't move over this final bracket.
if p.src.curr() != ')' {
panic("alt must end with ')'")
}
// Wire up the start of this alt to the first regexp part.
p.out(alt_begin, start)
return alt_begin, end
}
// Consume a single rune; assumes this is being invoked as the last possible
// option and will panic if an invalid escape sequence is found. Will return the
// found rune (as an integer) and with cursor past the entire representation.
func (p *parser) single_rune() rune {
if r := p.src.curr(); r != '\\' {
// This is just a regular character; return it immediately.
p.src.nextCh()
return r
}
if p.src.peek() == 'x' {
// Match hex character code.
var hex string
p.src.nextCh()
if p.src.nextCh() == '{' {
hex = p.src.literal("{", "}")
} else {
hex = fmt.Sprintf("%c%c", p.src.curr(), p.src.nextCh())
p.src.nextCh() // Step over the end of the hex code.
}
// Parse and return the corresponding rune.
raw, err := strconv.ParseUint(hex, 16, 32)
if err != nil {
panic(fmt.Sprintf("couldn't parse hex: %s", hex))
}
return rune(raw)
} else if r := ESCAPES[p.src.peek()]; r != 0 {
// Literally match '\n', '\r', etc.
p.src.nextCh()
p.src.nextCh()
return r
} else if unicode.Is(posix_groups["punct"], p.src.peek()) {
// Allow punctuation to be blindly escaped.
r := p.src.nextCh()
p.src.nextCh()
return r
} else if unicode.IsDigit(p.src.peek()) {
// Match octal character code (begins with digit, up to three digits).
oct := ""
p.src.nextCh()
for i := 0; i < 3; i++ {
oct += fmt.Sprintf("%c", p.src.curr())
if !unicode.IsDigit(p.src.nextCh()) {
break
}
}
// Parse and return the corresponding rune.
raw, err := strconv.ParseUint(oct, 8, 32)
if err != nil {
panic(fmt.Sprintf("couldn't parse oct: %s", oct))
}
return rune(raw)
}
// This is an escape sequence which does not identify a single rune.
panic(fmt.Sprintf("not a valid escape sequence: \\%c", p.src.peek()))
}
// Consume a single character class and provide an implementation of the
// RuneFilter interface. Consumes the entire definition.
func (p *parser) class(within_class bool) (filter RuneFilter) {
negate := false
switch p.src.curr() {
case '.':
if p.flag('s') {
filter = func(r rune) bool {
return true
}
} else {
filter = func(r rune) bool {
return r != '\n'
}
}
p.src.nextCh()
case '[':
if p.src.peek() == ':' {
// Match an ASCII/POSIX class name.
name := p.src.literal("[:", ":]")
if name[0] == '^' {
negate = true
name = name[1:]
}
ranges, ok := posix_groups[name]
if !ok {
panic(fmt.Sprintf("could not identify ascii/posix class: %s", name))
}
filter = func(r rune) bool {
return unicode.Is(ranges, r)
}
} else {
if within_class {
panic("can't match a [...] class within another class")
}
if p.src.nextCh() == '^' {
negate = true
p.src.nextCh()
}
// Consume and merge all valid classes within this [...] block.
filters := make([]RuneFilter, 0)
for p.src.curr() != ']' {
filters = append(filters, p.class(true))
}
filter = func(r rune) bool {
for _, rf := range filters {
if rf(r) {
return true
}
}
return false
}
p.src.nextCh() // Move over final ']'.
}
case '\\':
// Match some escaped character or escaped combination.
if p.src.peek() == 'p' || p.src.peek() == 'P' {
// Match a Unicode class name.
negate = (p.src.nextCh() == 'P')
unicode_class := fmt.Sprintf("%c", p.src.nextCh())
if unicode_class[0] == '{' {
unicode_class = p.src.literal("{", "}")
} else {
p.src.nextCh() // move past the single class description
}
// Find and return the class.
if filter = matchUnicodeClass(unicode_class); filter == nil {
panic(fmt.Sprintf("could not identify unicode class: %s", unicode_class))
}
} else if ranges, ok := perl_groups[unicode.ToLower(p.src.peek())]; ok {
// We've found a Perl group.
negate = unicode.IsUpper(p.src.nextCh())
p.src.nextCh()
filter = func(r rune) bool {
return unicode.Is(ranges, r)
}
}
}
if filter == nil {
// Match a single rune literal, or a range (when inside a character class).
// Note that '-' outside a character class is treated as a literal.
rune := p.single_rune()
if p.src.curr() == '-' && within_class {
p.src.nextCh() // move over '-'
rune_high := p.single_rune()
if rune_high < rune {
panic(fmt.Sprintf("unexpected range: %c >= %c", rune, rune_high))
}
filter = matchRuneRange(rune, rune_high)
} else {
filter = matchRune(rune)
}
}
if negate {
return filter.not()
}
return filter
}
// Build a left-right matcher of the given mode.
func (p *parser) makeBoundaryInstr(mode boundaryMode) *instr {
instr := p.instr()
instr.mode = iBoundaryCase
instr.lr = mode
return instr
}
// Consume a single term at the current cursor position. This may include a
// bracketed expression. When this function returns, the cursor will have moved
// past the final rune in this term.
func (p *parser) term() (start *instr, end *instr) {
switch p.src.curr() {
case -1:
panic("EOF in term")
case '*', '+', '{', '?':
panic(fmt.Sprintf("unexpected expansion char: %c at %d", p.src.curr(), p.src.opos))
case ')', '}', ']':
panic("unexpected close element")
case '(':
// Match a bracketed expression (or modify current flags, with '?').
capture := true
alt_id := ""
old_flags := p.flags
if p.src.nextCh() == '?' {
// Do something interesting before descending into this alt.
p.src.nextCh()
if p.src.curr() == 'P' {
p.src.nextCh() // move to '<'
alt_id = p.src.literal("<", ">")
} else {
// anything but 'P' means flags (and, non-captured).
capture = false
set := true
outer:
for {
switch p.src.curr() {
case ':':
p.src.nextCh() // move past ':'
break outer // no more flags, process re
case ')':
// Return immediately: there's no instructions here, just flag sets!
p.src.nextCh()
start = p.instr()
return start, start
case '-':
// now we're clearing flags
set = false
default:
if p.src.curr() < 64 && p.src.curr() > 127 {
panic(fmt.Sprintf("flag not in range: %c", p.src.curr()))
}
flag := byte(p.src.curr() - 64)
if set {
p.flags |= (1 << flag)
} else {
p.flags &= ^(1 << flag)
}
}
p.src.nextCh()
}
}
}
// Now actually consume the bracketed expression.
start, end = p.alt(alt_id, capture)
if p.src.curr() != ')' {
panic("alt should finish on end bracket")
}
p.src.nextCh()
p.flags = old_flags
return start, end
case '$':
// Match the end of text, or (with 'm') the end of a line.
p.src.nextCh() // consume '$'
mode := bEndText
if p.flag('m') {
mode = bEndLine
}
start = p.makeBoundaryInstr(mode)
return start, start
case '^':
// Match the beginning of text, or (with 'm') the start of a line.
p.src.nextCh() // consume '^'
mode := bBeginText
if p.flag('m') {
mode = bBeginLine
}
start = p.makeBoundaryInstr(mode)
return start, start
case '\\':
// Peek forward to match backslash-escaped terms which are not character
// classes. If any of these branches trigger, they will return past the
// consumed 'term'.
switch p.src.peek() {
case 'Q':
// Match a complete string literal, contained between '\Q' and the nearest
// '\E'. Use p.src.literal() since we're not interested in interpreting any
// unique characters, such as e.g. \x00 or \] (punct).
literal := p.src.literal("\\Q", "\\E")
start = p.instr()
end = start
for _, rune := range literal {
instr := p.instr()
instr.mode = iRuneClass
instr.rf = matchRune(rune)
p.out(end, instr)
end = instr
}
return start, end
case 'A':
// Match only the beginning of text.
p.src.consume("\\A")
start = p.makeBoundaryInstr(bBeginText)
return start, start
case 'z':
// Match only the end of text.
p.src.consume("\\z")
start = p.makeBoundaryInstr(bEndText)
return start, start
case 'b':
// Match an ASCII word boundary.
p.src.consume("\\b")
start = p.makeBoundaryInstr(bWordBoundary)
return start, start
case 'B':
// Match a non-ASCII word boundary.
p.src.consume("\\B")
start = p.makeBoundaryInstr(bNotWordBoundary)
return start, start
}
}
// Try to consume a rune class.
start = p.instr()
start.mode = iRuneClass
start.rf = p.class(false)
if p.flag('i') {
// Mark this class as case-insensitive.
start.rf = start.rf.ignoreCase()
}
return start, start
}
// Safely retrieve a given term from the given position and alt count. If the
// passed first is true, then set it to false and perform a no-op. Otherwise,
// retrieve the new term.
func (p *parser) safe_term(src SafeReader, alt int, first *bool, start **instr, end **instr) {
if *first {
*first = false
return
}
p.src = src
p.re.caps = alt
*start, *end = p.term()
}
// Consume a closure, defined as (term[repitition]). When this function returns,
// the cursor will be resting past the final rune in this closure.
func (p *parser) closure() (start *instr, end *instr) {
// Store state of pos/alts in case we have to reparse term.
revert_alts := p.re.caps
revert := p.src
// Grab first term.
start = p.instr()
end = start
t_start, t_end := p.term()
first := true // While true, we have a pending term.
// Req and opt represent the number of required cases, and the number of
// optional cases, respectively. Opt may be -1 to indicate no optional limit.
var req int
var opt int
// By default, greedily choose an optional step over continuing. If 'U' is
// flagged, swap this behaviour.
greedy := true
if p.flag('U') {
greedy = false
}
switch p.src.curr() {
case '?':
p.src.nextCh()
req, opt = 0, 1
case '*':
p.src.nextCh()
req, opt = 0, -1
case '+':
p.src.nextCh()
req, opt = 1, -1
case '{':
raw := p.src.literal("{", "}")
parts := strings.SplitN(raw, ",", 2)
// TODO: handle malformed int
req, _ = strconv.Atoi(parts[0])
if len(parts) == 2 {
if len(parts[1]) > 0 {
// TODO: handle malformed int
opt, _ = strconv.Atoi(parts[1])
opt -= req // {n,x} means: between n and x matches, not n req and x opt.
if opt < 0 {
panic("{n,x}: x must be greater or equal to n")
}
} else {
opt = -1
}
}
default:
return t_start, t_end // nothing to see here
}
if p.src.curr() == '?' {
greedy = !greedy
p.src.nextCh()
}
end_src := p.src
if req < 0 || opt < -1 || req == 0 && opt == 0 {
panic("invalid req/opt combination")
}
// Generate all required steps.
for i := 0; i < req; i++ {
p.safe_term(revert, revert_alts, &first, &t_start, &t_end)
p.out(end, t_start)
end = t_end
}
// Generate all optional steps.
if opt == -1 {
helper := p.instr()
p.out(end, helper)
if greedy {
helper.out = t_start // greedily choose optional step
} else {
helper.out1 = t_start // optional step is 2nd preference
}
if end != t_end {
// This is a little kludgy, but basically only wires up the term to the
// helper iff it hasn't already been done.
p.out(t_end, helper)
}
end = helper
} else {
real_end := p.instr()
for i := 0; i < opt; i++ {
p.safe_term(revert, revert_alts, &first, &t_start, &t_end)
helper := p.instr()
p.out(end, helper)
if greedy {
helper.out = t_start // greedily choose optional step
} else {
helper.out1 = t_start // optional step is 2nd preference
}
p.out(helper, real_end)
end = p.instr()
p.out(t_end, end)
}
p.out(end, real_end)
end = real_end
}
p.src = end_src
return start, end
}
// Match a regexp (defined as ([closure]*)) from the parser until either: EOF,
// the literal '|' or the literal ')'. At return, the cursor will still rest
// on this final terminal character.
func (p *parser) regexp() (start *instr, end *instr) {
start = p.instr()
curr := start
for {
if p.src.curr() == -1 || p.src.curr() == '|' || p.src.curr() == ')' {
break
}
s, e := p.closure()
p.out(curr, s)
curr = e
}
end = p.instr()
p.out(curr, end)
return start, end
}
// Cleanup the given program. Assumes the given input is a flat slice containing
// no nil instructions. Will not clean up the first instruction, as it is always
// the canonical entry point for the regexp.
// Returns a similarly flat slice containing no nil instructions, however the
// slice may potentially be smaller.
func cleanup(prog []*instr) []*instr {
// Detect iSplit recursion. We can remove this and convert it to a single path.
// This might happen in cases where we loop over some instructions which are
// not matchers, e.g. \Q\E*.
for i := 1; i < len(prog); i++ {
states := make(map[int]bool)
pi := prog[i]
var fn func(ci *instr) bool
fn = func(ci *instr) bool {
if ci != nil && ci.mode == iSplit {
if _, exists := states[ci.idx]; exists {
// We've found a recursion.
return true
}
states[ci.idx] = true
if fn(ci.out) {
ci.out = nil
}
if fn(ci.out1) {
ci.out1 = nil
}
return false
}
return false
}
fn(pi)
}
// Iterate through the program, and remove single-instr iSplits.
// NB: Don't parse the first instr, it will always be single.
for i := 1; i < len(prog); i++ {
pi := prog[i]
if pi.mode == iSplit && (pi.out1 == nil || pi.out == pi.out1) {
for j := 0; j < len(prog); j++ {
if prog[j] == nil {
continue
}
pj := prog[j]
if pj.out == pi {
pj.out = pi.out
}
if pj.out1 == pi {
pj.out1 = pi.out
}
}
prog[i] = nil
}
}
// We may now have nil gaps: shift everything up.
last := 0
for i := 0; i < len(prog); i++ {
if prog[i] != nil {
last = i
} else {
// find next non-nil, move here
var found int
for found = i; found < len(prog); found++ {
if prog[found] != nil {
break
}
}
if found == len(prog) {
break // no more entries
}
// move found to i
prog[i] = prog[found]
prog[i].idx = i
prog[found] = nil
last = i
}
}
return prog[0 : last+1]
}
// Public interface to a compiled regexp.
type Re interface {
NumSubexps() int
Match(s string) bool
MatchIndex(s string) []int
DebugOut()
}
// Helper method that generates instructions, for this parser, that would
// match the normal input string ".*?". Returns instructions begin and final,
// both which may be used in any way the caller likes.
func (p *parser) makeDotStarOpt() (begin *instr, final *instr) {
begin = p.instr()
final = p.instr()
choice := p.instr()
p.out(begin, choice)
p.out(choice, final)
r := p.instr()
r.mode = iRuneClass
r.rf = func(r rune) bool { return true }
p.out(choice, r)
p.out(r, choice)
return begin, final
}
// Generates a simple, straight-forward NFA. Matches an entire regexp from the
// given input string. If the regexp could not be parsed, returns a non-nil
// error string: the regexp will be nil in this case.
func Parse(src string) (re Re, err *string) {
defer func() {
if r := recover(); r != nil {
re = nil // clear re so it can't be used by caller
switch x := r.(type) {
case string:
response := fmt.Sprintf("could not parse `%s`, error: %s", src, x)
err = &response
default:
panic(fmt.Sprint("unknown parse error: ", r))
}
}
}()
p := parser{&sregexp{make([]*instr, 0, 1), -1, 1}, NewSafeReader(src), 0}
// generate the prefix, ala ".*?("
// note that this has to come first, since it represents instruction zero
_, prefix := p.makeDotStarOpt()
prefix.mode = iIndexCap
prefix.cid = 0
// generate the suffix, ala ").*?" (followed by match)
suffix, match := p.makeDotStarOpt()
suffix.mode = iIndexCap
suffix.cid = 1
match.mode = iMatch
// parse and consume the regexp, placing it between prefix/suffix.
p.src.nextCh()
re_start, re_end := p.regexp()
if p.src.curr() != -1 {
panic("could not consume all of regexp!")
}
p.out(prefix, re_start)
p.out(re_end, suffix)
// cleanup and return success
p.re.prog = cleanup(p.re.prog)
if p.re.prog[0].out1 == nil {
p.re.start = p.re.prog[0].out.idx
}
return p.re, nil
}
// Generates a NFA from the given source. If the regexp could not be parsed,
// panics with a string error.
func MustParse(src string) Re {
re, err := Parse(src)
if err != nil {
panic(*err)
}
return re
}