-
Notifications
You must be signed in to change notification settings - Fork 13k
/
Copy pathpp.rs
569 lines (508 loc) · 18.8 KB
/
pp.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
//! This pretty-printer is a direct reimplementation of Philip Karlton's
//! Mesa pretty-printer, as described in the appendix to
//! Derek C. Oppen, "Pretty Printing" (1979),
//! Stanford Computer Science Department STAN-CS-79-770,
//! <http://i.stanford.edu/pub/cstr/reports/cs/tr/79/770/CS-TR-79-770.pdf>.
//!
//! The algorithm's aim is to break a stream into as few lines as possible
//! while respecting the indentation-consistency requirements of the enclosing
//! block, and avoiding breaking at silly places on block boundaries, for
//! example, between "x" and ")" in "x)".
//!
//! I am implementing this algorithm because it comes with 20 pages of
//! documentation explaining its theory, and because it addresses the set of
//! concerns I've seen other pretty-printers fall down on. Weirdly. Even though
//! it's 32 years old. What can I say?
//!
//! Despite some redundancies and quirks in the way it's implemented in that
//! paper, I've opted to keep the implementation here as similar as I can,
//! changing only what was blatantly wrong, a typo, or sufficiently
//! non-idiomatic rust that it really stuck out.
//!
//! In particular you'll see a certain amount of churn related to INTEGER vs.
//! CARDINAL in the Mesa implementation. Mesa apparently interconverts the two
//! somewhat readily? In any case, I've used usize for indices-in-buffers and
//! ints for character-sizes-and-indentation-offsets. This respects the need
//! for ints to "go negative" while carrying a pending-calculation balance, and
//! helps differentiate all the numbers flying around internally (slightly).
//!
//! I also inverted the indentation arithmetic used in the print stack, since
//! the Mesa implementation (somewhat randomly) stores the offset on the print
//! stack in terms of margin-col rather than col itself. I store col.
//!
//! I also implemented a small change in the String token, in that I store an
//! explicit length for the string. For most tokens this is just the length of
//! the accompanying string. But it's necessary to permit it to differ, for
//! encoding things that are supposed to "go on their own line" -- certain
//! classes of comment and blank-line -- where relying on adjacent
//! hardbreak-like Break tokens with long blankness indication doesn't actually
//! work. To see why, consider when there is a "thing that should be on its own
//! line" between two long blocks, say functions. If you put a hardbreak after
//! each function (or before each) and the breaking algorithm decides to break
//! there anyways (because the functions themselves are long) you wind up with
//! extra blank lines. If you don't put hardbreaks you can wind up with the
//! "thing which should be on its own line" not getting its own line in the
//! rare case of "really small functions" or such. This re-occurs with comments
//! and explicit blank lines. So in those cases we use a string with a payload
//! we want isolated to a line and an explicit length that's huge, surrounded
//! by two zero-length breaks. The algorithm will try its best to fit it on a
//! line (which it can't) and so naturally place the content on its own line to
//! avoid combining it with other lines and making matters even worse.
//!
//! # Explanation
//!
//! In case you do not have the paper, here is an explanation of what's going
//! on.
//!
//! There is a stream of input tokens flowing through this printer.
//!
//! The printer buffers up to 3N tokens inside itself, where N is linewidth.
//! Yes, linewidth is chars and tokens are multi-char, but in the worst
//! case every token worth buffering is 1 char long, so it's ok.
//!
//! Tokens are String, Break, and Begin/End to delimit blocks.
//!
//! Begin tokens can carry an offset, saying "how far to indent when you break
//! inside here", as well as a flag indicating "consistent" or "inconsistent"
//! breaking. Consistent breaking means that after the first break, no attempt
//! will be made to flow subsequent breaks together onto lines. Inconsistent
//! is the opposite. Inconsistent breaking example would be, say:
//!
//! ```
//! foo(hello, there, good, friends)
//! ```
//!
//! breaking inconsistently to become
//!
//! ```
//! foo(hello, there,
//! good, friends);
//! ```
//!
//! whereas a consistent breaking would yield:
//!
//! ```
//! foo(hello,
//! there,
//! good,
//! friends);
//! ```
//!
//! That is, in the consistent-break blocks we value vertical alignment
//! more than the ability to cram stuff onto a line. But in all cases if it
//! can make a block a one-liner, it'll do so.
//!
//! Carrying on with high-level logic:
//!
//! The buffered tokens go through a ring-buffer, 'tokens'. The 'left' and
//! 'right' indices denote the active portion of the ring buffer as well as
//! describing hypothetical points-in-the-infinite-stream at most 3N tokens
//! apart (i.e., "not wrapped to ring-buffer boundaries"). The paper will switch
//! between using 'left' and 'right' terms to denote the wrapped-to-ring-buffer
//! and point-in-infinite-stream senses freely.
//!
//! There is a parallel ring buffer, `size`, that holds the calculated size of
//! each token. Why calculated? Because for Begin/End pairs, the "size"
//! includes everything between the pair. That is, the "size" of Begin is
//! actually the sum of the sizes of everything between Begin and the paired
//! End that follows. Since that is arbitrarily far in the future, `size` is
//! being rewritten regularly while the printer runs; in fact most of the
//! machinery is here to work out `size` entries on the fly (and give up when
//! they're so obviously over-long that "infinity" is a good enough
//! approximation for purposes of line breaking).
//!
//! The "input side" of the printer is managed as an abstract process called
//! SCAN, which uses `scan_stack`, to manage calculating `size`. SCAN is, in
//! other words, the process of calculating 'size' entries.
//!
//! The "output side" of the printer is managed by an abstract process called
//! PRINT, which uses `print_stack`, `margin` and `space` to figure out what to
//! do with each token/size pair it consumes as it goes. It's trying to consume
//! the entire buffered window, but can't output anything until the size is >=
//! 0 (sizes are set to negative while they're pending calculation).
//!
//! So SCAN takes input and buffers tokens and pending calculations, while
//! PRINT gobbles up completed calculations and tokens from the buffer. The
//! theory is that the two can never get more than 3N tokens apart, because
//! once there's "obviously" too much data to fit on a line, in a size
//! calculation, SCAN will write "infinity" to the size and let PRINT consume
//! it.
//!
//! In this implementation (following the paper, again) the SCAN process is the
//! methods called `Printer::scan_*`, and the 'PRINT' process is the
//! method called `Printer::print`.
mod ring;
use ring::RingBuffer;
use std::borrow::Cow;
use std::collections::VecDeque;
use std::fmt;
/// How to break. Described in more detail in the module docs.
#[derive(Clone, Copy, PartialEq)]
pub enum Breaks {
Consistent,
Inconsistent,
}
#[derive(Clone, Copy)]
pub struct BreakToken {
offset: isize,
blank_space: isize,
}
#[derive(Clone, Copy)]
pub struct BeginToken {
offset: isize,
breaks: Breaks,
}
#[derive(Clone)]
pub enum Token {
// In practice a string token contains either a `&'static str` or a
// `String`. `Cow` is overkill for this because we never modify the data,
// but it's more convenient than rolling our own more specialized type.
String(Cow<'static, str>),
Break(BreakToken),
Begin(BeginToken),
End,
Eof,
}
impl Token {
crate fn is_eof(&self) -> bool {
matches!(self, Token::Eof)
}
pub fn is_hardbreak_tok(&self) -> bool {
matches!(self, Token::Break(BreakToken { offset: 0, blank_space: SIZE_INFINITY }))
}
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
Token::String(ref s) => write!(f, "STR({},{})", s, s.len()),
Token::Break(_) => f.write_str("BREAK"),
Token::Begin(_) => f.write_str("BEGIN"),
Token::End => f.write_str("END"),
Token::Eof => f.write_str("EOF"),
}
}
}
#[derive(Copy, Clone)]
enum PrintStackBreak {
Fits,
Broken(Breaks),
}
#[derive(Copy, Clone)]
struct PrintStackElem {
offset: isize,
pbreak: PrintStackBreak,
}
const SIZE_INFINITY: isize = 0xffff;
pub struct Printer {
out: String,
/// Width of lines we're constrained to
margin: isize,
/// Number of spaces left on line
space: isize,
/// Index of left side of input stream
left: usize,
/// Index of right side of input stream
right: usize,
/// Ring-buffer of tokens and calculated sizes
buf: RingBuffer<BufEntry>,
/// Running size of stream "...left"
left_total: isize,
/// Running size of stream "...right"
right_total: isize,
/// Pseudo-stack, really a ring too. Holds the
/// primary-ring-buffers index of the Begin that started the
/// current block, possibly with the most recent Break after that
/// Begin (if there is any) on top of it. Stuff is flushed off the
/// bottom as it becomes irrelevant due to the primary ring-buffer
/// advancing.
scan_stack: VecDeque<usize>,
/// Stack of blocks-in-progress being flushed by print
print_stack: Vec<PrintStackElem>,
/// Buffered indentation to avoid writing trailing whitespace
pending_indentation: isize,
}
#[derive(Clone)]
struct BufEntry {
token: Token,
size: isize,
}
impl Default for BufEntry {
fn default() -> Self {
BufEntry { token: Token::Eof, size: 0 }
}
}
impl Printer {
pub fn new() -> Self {
let linewidth = 78;
let mut buf = RingBuffer::new();
buf.advance_right();
Printer {
out: String::new(),
margin: linewidth as isize,
space: linewidth as isize,
left: 0,
right: 0,
buf,
left_total: 0,
right_total: 0,
scan_stack: VecDeque::new(),
print_stack: Vec::new(),
pending_indentation: 0,
}
}
pub fn last_token(&self) -> Token {
self.buf[self.right].token.clone()
}
/// Be very careful with this!
pub fn replace_last_token(&mut self, t: Token) {
self.buf[self.right].token = t;
}
fn scan_eof(&mut self) {
if !self.scan_stack.is_empty() {
self.check_stack(0);
self.advance_left();
}
}
fn scan_begin(&mut self, b: BeginToken) {
if self.scan_stack.is_empty() {
self.left_total = 1;
self.right_total = 1;
self.right = self.left;
self.buf.truncate(1);
} else {
self.advance_right();
}
self.scan_push(BufEntry { token: Token::Begin(b), size: -self.right_total });
}
fn scan_end(&mut self) {
if self.scan_stack.is_empty() {
self.print_end();
} else {
self.advance_right();
self.scan_push(BufEntry { token: Token::End, size: -1 });
}
}
fn scan_break(&mut self, b: BreakToken) {
if self.scan_stack.is_empty() {
self.left_total = 1;
self.right_total = 1;
self.right = self.left;
self.buf.truncate(1);
} else {
self.advance_right();
}
self.check_stack(0);
self.scan_push(BufEntry { token: Token::Break(b), size: -self.right_total });
self.right_total += b.blank_space;
}
fn scan_string(&mut self, s: Cow<'static, str>) {
if self.scan_stack.is_empty() {
self.print_string(s);
} else {
self.advance_right();
let len = s.len() as isize;
self.buf[self.right] = BufEntry { token: Token::String(s), size: len };
self.right_total += len;
self.check_stream();
}
}
fn check_stream(&mut self) {
if self.right_total - self.left_total > self.space {
if Some(&self.left) == self.scan_stack.back() {
let scanned = self.scan_pop_bottom();
self.buf[scanned].size = SIZE_INFINITY;
}
self.advance_left();
if self.left != self.right {
self.check_stream();
}
}
}
fn scan_push(&mut self, entry: BufEntry) {
self.buf[self.right] = entry;
self.scan_stack.push_front(self.right);
}
fn scan_pop(&mut self) -> usize {
self.scan_stack.pop_front().unwrap()
}
fn scan_top(&self) -> usize {
*self.scan_stack.front().unwrap()
}
fn scan_pop_bottom(&mut self) -> usize {
self.scan_stack.pop_back().unwrap()
}
fn advance_right(&mut self) {
self.right += 1;
self.buf.advance_right();
}
fn advance_left(&mut self) {
let mut left_size = self.buf[self.left].size;
while left_size >= 0 {
let left = self.buf[self.left].token.clone();
let len = match left {
Token::Break(b) => b.blank_space,
Token::String(ref s) => {
let len = s.len() as isize;
assert_eq!(len, left_size);
len
}
_ => 0,
};
self.print(left, left_size);
self.left_total += len;
if self.left == self.right {
break;
}
self.buf.advance_left();
self.left += 1;
left_size = self.buf[self.left].size;
}
}
fn check_stack(&mut self, k: usize) {
if !self.scan_stack.is_empty() {
let x = self.scan_top();
match self.buf[x].token {
Token::Begin(_) => {
if k > 0 {
self.scan_pop();
self.buf[x].size += self.right_total;
self.check_stack(k - 1);
}
}
Token::End => {
// paper says + not =, but that makes no sense.
self.scan_pop();
self.buf[x].size = 1;
self.check_stack(k + 1);
}
_ => {
self.scan_pop();
self.buf[x].size += self.right_total;
if k > 0 {
self.check_stack(k);
}
}
}
}
}
fn print_newline(&mut self, amount: isize) {
self.out.push('\n');
self.pending_indentation = 0;
self.indent(amount);
}
fn indent(&mut self, amount: isize) {
self.pending_indentation += amount;
}
fn get_top(&self) -> PrintStackElem {
*self.print_stack.last().unwrap_or({
&PrintStackElem { offset: 0, pbreak: PrintStackBreak::Broken(Breaks::Inconsistent) }
})
}
fn print_begin(&mut self, b: BeginToken, l: isize) {
if l > self.space {
let col = self.margin - self.space + b.offset;
self.print_stack
.push(PrintStackElem { offset: col, pbreak: PrintStackBreak::Broken(b.breaks) });
} else {
self.print_stack.push(PrintStackElem { offset: 0, pbreak: PrintStackBreak::Fits });
}
}
fn print_end(&mut self) {
self.print_stack.pop().unwrap();
}
fn print_break(&mut self, b: BreakToken, l: isize) {
let top = self.get_top();
match top.pbreak {
PrintStackBreak::Fits => {
self.space -= b.blank_space;
self.indent(b.blank_space);
}
PrintStackBreak::Broken(Breaks::Consistent) => {
self.print_newline(top.offset + b.offset);
self.space = self.margin - (top.offset + b.offset);
}
PrintStackBreak::Broken(Breaks::Inconsistent) => {
if l > self.space {
self.print_newline(top.offset + b.offset);
self.space = self.margin - (top.offset + b.offset);
} else {
self.indent(b.blank_space);
self.space -= b.blank_space;
}
}
}
}
fn print_string(&mut self, s: Cow<'static, str>) {
let len = s.len() as isize;
// assert!(len <= space);
self.space -= len;
// Write the pending indent. A more concise way of doing this would be:
//
// write!(self.out, "{: >n$}", "", n = self.pending_indentation as usize)?;
//
// But that is significantly slower. This code is sufficiently hot, and indents can get
// sufficiently large, that the difference is significant on some workloads.
self.out.reserve(self.pending_indentation as usize);
self.out.extend(std::iter::repeat(' ').take(self.pending_indentation as usize));
self.pending_indentation = 0;
self.out.push_str(&s);
}
fn print(&mut self, token: Token, l: isize) {
match token {
Token::Begin(b) => self.print_begin(b, l),
Token::End => self.print_end(),
Token::Break(b) => self.print_break(b, l),
Token::String(s) => {
let len = s.len() as isize;
assert_eq!(len, l);
self.print_string(s);
}
Token::Eof => panic!(), // Eof should never get here.
}
}
// Convenience functions to talk to the printer.
/// "raw box"
pub fn rbox(&mut self, indent: usize, b: Breaks) {
self.scan_begin(BeginToken { offset: indent as isize, breaks: b })
}
/// Inconsistent breaking box
pub fn ibox(&mut self, indent: usize) {
self.rbox(indent, Breaks::Inconsistent)
}
/// Consistent breaking box
pub fn cbox(&mut self, indent: usize) {
self.rbox(indent, Breaks::Consistent)
}
pub fn break_offset(&mut self, n: usize, off: isize) {
self.scan_break(BreakToken { offset: off, blank_space: n as isize })
}
pub fn end(&mut self) {
self.scan_end()
}
pub fn eof(mut self) -> String {
self.scan_eof();
self.out
}
pub fn word<S: Into<Cow<'static, str>>>(&mut self, wrd: S) {
let s = wrd.into();
self.scan_string(s)
}
fn spaces(&mut self, n: usize) {
self.break_offset(n, 0)
}
crate fn zerobreak(&mut self) {
self.spaces(0)
}
pub fn space(&mut self) {
self.spaces(1)
}
pub fn hardbreak(&mut self) {
self.spaces(SIZE_INFINITY as usize)
}
pub fn is_beginning_of_line(&self) -> bool {
self.last_token().is_eof() || self.last_token().is_hardbreak_tok()
}
pub fn hardbreak_tok_offset(off: isize) -> Token {
Token::Break(BreakToken { offset: off, blank_space: SIZE_INFINITY })
}
}