fix(lexer): fix curly counting in sub-tokenizers
kollhof committed Nov 28, 2020
1 parent fca9648 commit 42968d2
Showing 3 changed files with 102 additions and 49 deletions.
119 changes: 70 additions & 49 deletions src/lexer/tokens.fnk
@@ -23,14 +23,15 @@ get_loc = fn start, text:

fink_lex = rx'
^(
(?<ignorable>\s+)
|(?<comment_sl>#.+?(?=\n))
|(?<comment_ml>---[\s\S]*?---)
|(?<keyword>\b(
(?<keyword>\b(
fn|match|fold|unfold|else|map|filter|while|until
|await|import|try|throw|rec|dict|seq|list|true|false|pipe
|in|or|and|not
)\b)
|(?<ignorable>\s+)
|(?<comment_sl>#.*?(?=\n))
|(?<comment_ml>---[\s\S]*?---)

|(?<str_start>\'|")

|(?<jsx_elem_close>/>)
@@ -40,25 +41,24 @@ fink_lex = rx'
|(?<compare>((!=)|(==)|(<=)|(>=)|<))
|(?<assign>=)

|(?<overloaded>[>{}])

|(?<arithm>[-+*/%^](?=\s))
|(?<prefix_neg>-(?=\S))

|(?<overloaded>[>{}])

|(?<empty>\b_\b)

|(?<number>[0-9][\._abcdefxob+0-9-]*)
|(?<number>[0-9][\._a-fA-Fxobe+0-9-]*)
|(?<ident>[_$\p{L}][_$\p{L}\p{N}]*)

|(?<spread>\.\.\.)
|(?<member>\.)
|(?<operator>[?|])
|(?<grouping>[()[\]])
|(?<terminator>[,:])
|(?<other>.)
|(?<error>.)
|(?<end>$)
)
'
)'


str_sq_lex = rx"
@@ -95,7 +95,7 @@ jsx_lex = rx'
jsx_elem_lex = rx'
^(
(?<ignorable>\s+)
|(?<comment_sl>#.+?\n)
|(?<comment_sl>#.*?(?=\n))
|(?<comment_ml>---[\s\S]*?---)

|(?<str_start>\'|")
@@ -111,23 +111,25 @@ jsx_elem_lex = rx'

|(?<arithm>[-+*/%^](?=\s))
|(?<prefix_neg>-(?=\S))

|(?<empty>\b_\b)

|(?<number>[0-9][\._abcdefxob+0-9-]*)
|(?<number>[0-9][\._a-fA-Fxobe+0-9-]*)
|(?<ident>[_$\p{L}][_$\p{L}\p{N}]*)

|(?<spread>\.\.\.)
|(?<member>\.)
|(?<operator>[?|])
|(?<grouping>[()[\]])
|(?<terminator>[,:])
|(?<other>.)
|(?<error>.)
|(?<end>$)
)'



get_token_type = fn matched, value, curr_matcher, parent_lex, cc:
get_token_type = fn matched, value, matchers:

match matched.groups:
{str_start: {}}: 'str-start'
{str_text: {}}: 'str-text'
@@ -142,7 +144,7 @@ get_token_type = fn matched, value, curr_matcher, parent_lex, cc:
{ident: {}}: 'ident'
{number: {}}: 'number'

{other: {}}: 'other'
{error: {}}: 'error'
{end: {}}: 'end'

{jsx_elem_start: {}}: 'jsx-elem-start'
@@ -153,17 +155,20 @@ get_token_type = fn matched, value, curr_matcher, parent_lex, cc:
{jsx_text: {}}: 'jsx-text'

{overloaded: {}}:
[[curr_matcher, cc], parent] = matchers
match value:
'{': match curr_matcher:
jsx_elem_lex: 'jsx-expr-start'
else: value

'}': match true:
cc == 0 and parent_lex == str_dq_lex: 'str-expr-end'
cc == 0 and parent_lex == str_sq_lex: 'str-expr-end'
cc == 0 and parent_lex == jsx_lex: 'jsx-expr-end'
cc == 0 and parent_lex == jsx_elem_lex: 'jsx-expr-end'
else: value
'}':
[parent_lex] = parent
match true:
cc == 0 and parent_lex == str_dq_lex: 'str-expr-end'
cc == 0 and parent_lex == str_sq_lex: 'str-expr-end'
cc == 0 and parent_lex == jsx_lex: 'jsx-expr-end'
cc == 0 and parent_lex == jsx_elem_lex: 'jsx-expr-end'
else: value

--- istanbul ignore else TODO: cov should be done by loxia ---
'>': match curr_matcher:
@@ -175,12 +180,13 @@



get_next_token = fn [lex, parent_lex], code, start, cc:
get_next_token = fn matchers, code, start:
[[lex]] = matchers
code_slice = slice code, start.pos

[matched] = match_all code_slice, lex
[value] = matched
type = get_token_type matched, value, lex, parent_lex, cc
type = get_token_type matched, value, matchers


end = get_loc start, value
@@ -190,58 +196,73 @@



get_next_matchers = fn token, [curr_matcher, ...parent_matchers]:
get_next_matchers = fn token, matchers:
[curr_matcher, ...parent_matchers] = matchers

match token:
# {type: 'jsx-elem-close', value: '/>'}: parent_matchers
{type: 'jsx-elem-close'}: parent_matchers

{type: 'jsx-frag-open'}: [jsx_lex, curr_matcher, ...parent_matchers]
{type: 'jsx-elem-start'}: [jsx_elem_lex, curr_matcher, ...parent_matchers]
{type: 'jsx-elem-end'}: [jsx_lex, ...parent_matchers]
{type: 'jsx-frag-open'}: [[jsx_lex, 0], ...matchers]
{type: 'jsx-elem-start'}: [[jsx_elem_lex, 0], ...matchers]
{type: 'jsx-elem-end'}: [[jsx_lex, 0], ...parent_matchers]

{type: 'jsx-expr-start'}: [fink_lex, curr_matcher, ...parent_matchers]
{type: 'jsx-expr-start'}: [[fink_lex, 0], ...matchers]
{type: 'jsx-expr-end'}: parent_matchers

{type: 'str-start', value: '"'}: [str_dq_lex, curr_matcher, ...parent_matchers]
{type: 'str-start', value: "'"}: [str_sq_lex, curr_matcher, ...parent_matchers]
{type: 'str-expr-start'}: [fink_lex, curr_matcher, ...parent_matchers]
{type: 'str-expr-end'}: parent_matchers
{type: 'str-start', value: '"'}: [[str_dq_lex, 0], ...matchers]
{type: 'str-start', value: "'"}: [[str_sq_lex, 0], ...matchers]
{type: 'str-end'}: parent_matchers

{type: 'str-expr-start'}: [[fink_lex, 0], ...matchers]

{type: 'str-expr-end'}: parent_matchers

else: [curr_matcher, ...parent_matchers]



get_cc = fn token, cc:
match token:
{type: '{'}:
cc + 1
{type: '}'}:
cc - 1
update_matcher_state = fn token, matchers:
[[curr_m, curr_cc], ...rest] = matchers

next_cc = match token:
{value: '{'}:
curr_cc + 1
{value: '}'}:
curr_cc - 1
else:
cc
curr_cc

[[curr_m, next_cc], ...rest]



tokenize = fn code, start={pos: 0, line: 1, column: 0}:
initial_ctx = rec:
code
start
matchers: [fink_lex]
cc: 0
matchers: [[fink_lex, 0], []]


pipe:
unfold , {code, start, matchers, cc}=initial_ctx:
[token, next_start] = get_next_token matchers, code, start, cc
unfold , {code, start, matchers}=initial_ctx:
[token, next_start] = get_next_token matchers, code, start

curr_matchers = update_matcher_state token, matchers

next_matchers = get_next_matchers token, curr_matchers

next_matchers = get_next_matchers token, matchers
next_cc = get_cc token, cc
[token, {code, start: next_start, matchers: next_matchers}]

[token, {code, start: next_start, matchers: next_matchers, cc: next_cc}]


# TODO: should prattler allow registering for token-type + value
# if so, we could simply test on type being 'keyword'
ident_or_keyword = seq:
'ident', 'fn', 'pipe', 'match',
'fold', 'unfold', 'map', 'filter', 'while', 'until'
'seq', 'rec', 'await', 'try', 'throw'
'ident', 'import', 'fn', 'pipe', 'match', 'else'
'fold', 'unfold', 'map', 'filter', 'while', 'until'
'seq', 'rec', 'await', 'try', 'throw',
'and', 'or', 'not', 'in', 'true', 'false'
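
Note on the mechanics (a sketch under assumptions, not code from this repo): before this commit, tokenize threaded a single curly counter cc through the whole matcher stack, so braces opened in one frame leaked into the count seen by another. The diff above instead pairs every lexer with its own counter — stack entries become [lexer, cc] — and a '}' only closes an interpolation when the current frame's count is zero. A minimal TypeScript paraphrase of that bookkeeping, with all names illustrative rather than taken from the repo:

    type Lexer = RegExp;
    type Frame = [Lexer, number];  // a lexer plus its own open-curly count

    // Mirrors update_matcher_state: only the top frame's count moves.
    // (As in the diff's initial matchers [[fink_lex, 0], []], the stack
    // is assumed to always hold at least one frame.)
    const updateFrame = (tokenValue: string, [top, ...rest]: Frame[]): Frame[] => {
      const [lexer, cc] = top;
      const delta = tokenValue === '{' ? 1 : tokenValue === '}' ? -1 : 0;
      return [[lexer, cc + delta], ...rest];
    };

    // Mirrors the '}' branch of get_token_type: a '}' ends the sub-expression
    // only when the current frame has no unmatched '{' of its own and the
    // parent frame is a string or JSX lexer.
    const closesSubExpr = (cc: number, parentIsStrOrJsx: boolean): boolean =>
      cc === 0 && parentIsStrOrJsx;

    // Mirrors get_next_matchers: entering an interpolation or JSX expression
    // pushes a fresh frame with a zero count, like [[fink_lex, 0], ...matchers].
    const enterSubExpr = (lexer: Lexer, frames: Frame[]): Frame[] =>
      [[lexer, 0], ...frames];

The test added below appears to cover exactly the failure mode: in {foo: "${ham}"}, the outer dict's '{' used to push the single shared count to 1, so the interpolation's closing '}' was never seen at cc == 0 and never tokenized as str-expr-end; with per-frame counts, the interpolation starts its own frame at zero and closes correctly.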



8 changes: 8 additions & 0 deletions src/lexer/tokens.test.fnk
@@ -40,6 +40,14 @@ describe 'tokenizer', fn:
to_match_snapshot


it 'tokenizes str exprs', fn:
expect
foo '
{foo: "\${ham}"}
'
to_match_snapshot



describe 'JSX tokenizer', fn:
it 'tokenizes fragment', fn:
24 changes: 24 additions & 0 deletions src/lexer/tokens.test.fnk.snap
@@ -258,3 +258,27 @@ ignorable (99-100) (8:13-9:0)
end (100-100) (9:0-9:0)
\\"\\""
`;
exports[`tokenizer tokenizes str exprs 1`] = `
"{ (0-1) (1:0-1:1)
ident (1-4) (1:1-1:4)
\\"foo\\"
: (4-5) (1:4-1:5)
ignorable (5-6) (1:5-1:6)
\\" \\"
str-start (6-7) (1:6-1:7)
\\"\\\\\\"\\"
str-expr-start (7-9) (1:7-1:9)
\\"\${\\"
ident (9-12) (1:9-1:12)
\\"ham\\"
str-expr-end (12-13) (1:12-1:13)
\\"}\\"
str-end (13-14) (1:13-1:14)
\\"\\\\\\"\\"
} (14-15) (1:14-1:15)
ignorable (15-16) (1:15-2:0)
\\"\\\\n\\"
end (16-16) (2:0-2:0)
\\"\\""
`;
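
(Reading the snapshot format: each line appears to encode a token as type (start-end) (line:col-line:col), i.e. absolute character offsets followed by a line:column span — so str-expr-start (7-9) (1:7-1:9) is the two-character ${ at offsets 7-9 on line 1 of the tokenized source.)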
