-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathantlr-parser.rkt
287 lines (251 loc) · 8.38 KB
/
antlr-parser.rkt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#lang racket
(require rackunit
parser-tools/lex
(prefix-in : parser-tools/lex-sre)
parser-tools/yacc
syntax/readerr
racket/generator)
(provide rust-rules)
(define lines
(file->lines "/Users/clements/rust-antlr/Rust-massaged.g4")
#;(list "a" "b" "/// <foo>" ))
(define ((parse-line tags) line)
(match line
[(regexp #px"^///\\s*<\\s*(\\w+)\\s*>\\s*$"
(list _ tag))
(cond [(member (string->symbol tag) tags)
(list 'start (string->symbol tag))]
[else '(ignored-tag)])]
[(regexp #px"^///\\s*<\\s*/(\\w+)\\s*>\\s*$"
(list _ tag))
(cond [(member (string->symbol tag) tags)
(list 'end (string->symbol tag))]
[else '(ignored-tag)])]
[other (list 'line line)]))
(define line-stream
(sequence->generator (map (parse-line '(grammar)) lines)))
(define (line-parsing-loop stream in-tag)
(define first-line (stream))
(match first-line
[(? void? v)
(cond [in-tag
(error 'line-parsing-loop
"expected closing tag for ~e, found EOF"
in-tag)]
[else empty])]
[(list 'ignored-tag)
(line-parsing-loop stream in-tag)]
[(list 'line line)
(cons line
(line-parsing-loop stream in-tag))]
[(list 'start new-tag)
(cons (cons new-tag
(line-parsing-loop stream new-tag))
(line-parsing-loop stream in-tag))]
[(list 'end ending-tag)
(cond [(eq? ending-tag in-tag)
empty]
[else
(error 'line-parsing-loop
"expected ending tag ~a, found ~a"
in-tag ending-tag)])]
[else (error 'line-parsing-loop "unexpected line: ~e"
first-line)]))
(define the-grammar-text
(apply
string-append
(add-between
(match
(filter (lambda (x)
(and (list? x) (equal? (first x) 'grammar)))
(line-parsing-loop line-stream #f))
[(list g) (rest g)]
[other (error 'foo "expected just one grammar")])
"\n")))
;; okay, now we've got the grammar part...
(define-tokens data (TERM NONTERM))
(define-empty-tokens delim
(COLON OR SEMI LPAREN RPAREN HUH STAR PLUS EOF))
(define antlr-lexer
(lexer-src-pos
;; Skip comments, without accumulating extra position information
[(:or antlr-whitespace comment)
(return-without-pos (antlr-lexer input-port))]
[upper-ident (token-TERM lexeme)]
[lower-ident (token-NONTERM lexeme)]
[#\| 'OR]
[#\; 'SEMI]
[#\( 'LPAREN]
[#\) 'RPAREN]
[#\? 'HUH]
[#\* 'STAR]
[#\: 'COLON]
[#\+ 'PLUS]
[(eof) 'EOF]))
(define-lex-abbrevs
[lower-ident (:: lower-alpha (:* (:or alpha #\_ digit)))]
[upper-ident (:+ (:or (:/ #\A #\Z) #\_ digit))]
[lower-alpha (:/ #\a #\z)]
[alpha (:or lower-alpha (:/ #\A #\Z))]
[digit (:/ #\0 #\9)]
[antlr-whitespace (:or #\newline #\return #\tab #\space #\vtab)]
[comment (:or line-comment inline-comment)]
[line-comment (:: #\/ #\/ (:* (:~ #\newline)) #\newline)]
[inline-comment (:: #\/ #\*
(:* (:or (:~ #\*)
(:: #\* (:~ #\/ #\*))))
(:+ #\*) #\/)]
)
(define antlr-parser
(parser
(src-pos)
(start s)
(end EOF)
(error (lambda (a name val start end)
(error 'antlr-parser
"error-vals: ~e"
(list a name val start end))))
#;(debug "/tmp/grammar-debug")
(tokens data delim)
(grammar
(s [(rule s) (cons $1 $2)]
[() empty])
(rule [(NONTERM COLON pats SEMI)
(let ()
(define name $1)
(fence-out-name name)
(list
name
((rewrite-pat-to-or-of-seqs $1)
(cons 'or $3))))])
(pats [(pat-seq OR pats) (cons (cons 'seq $1) $3)]
[(pat-seq) (list (cons 'seq $1))])
(pat-seq [(one-pat pat-seq) (cons $1 $2)]
[() empty])
(one-pat [(term-or-nonterm) $1]
[(LPAREN pats RPAREN) (cons 'or $2)]
[(one-pat STAR) (list 'star $1)]
[(one-pat HUH) (list 'huh $1)]
[(one-pat PLUS) (list 'plus $1)])
(term-or-nonterm [(TERM) $1]
[(NONTERM) $1]))))
;; this would be perfect for monadic style... but
;; I'll just use mutation instead.
(define aux-defn-bucket (make-hash))
(define (get-aux-defn-name base)
(define new-idx
(let loop ([idx 1])
(define candidate (~a base "_" idx))
(cond [(hash-has-key? aux-defn-bucket candidate)
(loop (add1 idx))]
[else idx])))
(~a base "_" new-idx))
(define (fence-out-name name)
(cond [(hash-has-key? aux-defn-bucket name)
(error 'fence-out-name
"too late! name ~e already appears in the table"
name)]
[(hash-set! aux-defn-bucket name #f)]))
(define (reset-aux-defns!)
(set! aux-defn-bucket (make-hash)))
;; NOTE! I'VE ONLY IMPLEMENTED THE ONES I NEEDED, HERE...
(define ((rewrite-pat-to-nonterm name) orig-pat)
(define pat (maybe-simplify-pat orig-pat))
(match pat
[(? string? s) s]
;; special-case for 1-ary case makes things
;; *much* more readabale...
[(list 'star (? string? s))
(define new-name (get-aux-defn-name (~a name "_star")))
(hash-set! aux-defn-bucket
new-name
`(()
(,s ,new-name)))
new-name]
[(list 'star pat)
(define lifted-rhs-seq ((rewrite-pat-to-seq name) pat))
(define new-name (get-aux-defn-name (~a name "_star")))
(hash-set! aux-defn-bucket
new-name
`(()
(,@lifted-rhs-seq ,new-name)))
new-name]
;; special-case to improve readability
[(list 'plus (? string? s))
(define new-name (get-aux-defn-name (~a name "_plus")))
(hash-set! aux-defn-bucket
new-name
`((,s)
(,s ,new-name)))
new-name]
[(list 'huh (? string? s))
(define new-name (get-aux-defn-name (~a name "_star")))
(hash-set! aux-defn-bucket
new-name
`(()
(,s)))
new-name]
[(list 'huh pat)
(define lifted-rhs-seq ((rewrite-pat-to-seq name) pat))
(define new-name (get-aux-defn-name (~a name "_huh")))
(hash-set! aux-defn-bucket
new-name
`(()
,lifted-rhs-seq))
new-name]
[(cons 'seq pats)
(define new-name (get-aux-defn-name name))
(define rewritten ((rewrite-pat-to-or-of-seqs name)
(cons 'seq pats)))
(hash-set! aux-defn-bucket new-name rewritten)
new-name]
[(cons 'or pats)
(define new-name (get-aux-defn-name name))
(define rewritten ((rewrite-pat-to-or-of-seqs name)
(cons 'or pats)))
(hash-set! aux-defn-bucket new-name rewritten)
new-name]
[other (error 'abc "unimplemented: ~e" other)]))
(define ((rewrite-pat-to-or-of-seqs name) pat)
(match pat
[(cons 'or pats)
(map (rewrite-pat-to-seq name) pats)]
[(cons 'seq pats)
(list ((rewrite-pat-to-seq name) pat))]))
(define ((rewrite-pat-to-seq name) pat)
(match pat
[(cons 'seq pats)
(map (rewrite-pat-to-nonterm name) pats)]
[(? string? s)
(list s)]))
;; get rid of superfluous ors and seqs
(define (maybe-simplify-pat pat)
(match pat
[(? string? s) s]
[(list 'or pat) (maybe-simplify-pat pat)]
[(list 'seq pat) (maybe-simplify-pat pat)]
[(cons kind pats) (cons kind (map maybe-simplify-pat pats))]))
(check-equal? ((rewrite-pat-to-nonterm "bogo") "attrs_and_vis")
"attrs_and_vis")
(check-equal? ((rewrite-pat-to-nonterm "bogo") '(star "frongy"))
"bogo_star_1")
(check-equal? (hash-ref aux-defn-bucket "bogo_star_1")
'(() ("frongy" "bogo_star_1")))
(reset-aux-defns!)
(define (post-process-rules grammar)
(for/list ([rule grammar])
(match-define (list name rhses) rule)
(cons (string->symbol name)
(map post-process rhses))))
(define (post-process los)
(list (map string->symbol los) #f))
(define (rs ip)
(port-count-lines! ip)
(antlr-parser (lambda () (antlr-lexer ip))))
(define grammar-port (open-input-string the-grammar-text))
(define rust-rules
(post-process-rules
(append
(rs grammar-port)
(filter (lambda (rule) (not (false? (second rule))))
(hash-map aux-defn-bucket list)))))