Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Markdown: allow to end URL with balanced parenthesis #18321

Merged
merged 4 commits into from
Jun 23, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions lib/packages/docutils/rst.nim
Original file line number Diff line number Diff line change
Expand Up @@ -1246,15 +1246,44 @@ proc isUrl(p: RstParser, i: int): bool =
p.tok[i+3].kind == tkWord and
p.tok[i].symbol in ["http", "https", "ftp", "telnet", "file"]

proc checkParen(token: Token, parensStack: var seq[char]): bool {.inline.} =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

completely optional: can you split this into a more general reusable proc and then call it, and then in future work we can move this reusable proc to strbasics or strutils? (similar to https://dlang.org/library/std/algorithm/searching/balanced_parens.html, and same functionality as what you have)

proc balancedParens(a: openArray[char], parensStack: var seq[char]): bool =
  # xxx in future work, move to stdlib
proc checkParen(token: Token, parensStack: var seq[char]): bool {.inline.} =
  call balancedParens somehow

Copy link
Contributor Author

@a-mr a-mr Jun 21, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea of balancedParens is very different: it check whether parens are balanced or not.

checkParen does not always forbid unbalanced parens: e.g. ([) and (]) are legal, and checkParen will return 3 values False,False,True in both cases for 3 tokens.

## Returns `true` iff `token` is a closing parenthesis for some
## previous opening parenthesis saved in `parensStack`.
## This is according Markdown balanced parentheses rule
## (https://spec.commonmark.org/0.29/#link-destination)
## to allow links like
## https://en.wikipedia.org/wiki/APL_(programming_language),
## we use it for RST also.
result = false
if token.kind == tkPunct:
let c = token.symbol[0]
if c in {'(', '[', '{'}: # push
parensStack.add c
elif c in {')', ']', '}'}: # try pop
if parensStack.len > 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pointless if statement, if the collection is empty the loop will iterate 0 times.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed, thanks.

# a case like ([) inside a link is allowed and [ is discarded:
for i in countdown(parensStack.len - 1, 0):
if (parensStack[i] == '(' and c == ')' or
parensStack[i] == '[' and c == ']' or
parensStack[i] == '{' and c == '}'):
parensStack.setLen i
result = true
break

proc parseUrl(p: var RstParser): PRstNode =
## https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#standalone-hyperlinks
result = newRstNode(rnStandaloneHyperlink)
var lastIdx = p.idx
var closedParenIdx = p.idx - 1 # for balanced parens rule
var parensStack: seq[char]
while p.tok[lastIdx].kind in {tkWord, tkPunct, tkOther}:
let isClosing = checkParen(p.tok[lastIdx], parensStack)
if isClosing:
closedParenIdx = lastIdx
inc lastIdx
dec lastIdx
# standalone URL can not end with punctuation in RST
while lastIdx >= p.idx and p.tok[lastIdx].kind == tkPunct and
while lastIdx > closedParenIdx and p.tok[lastIdx].kind == tkPunct and
p.tok[lastIdx].symbol != "/":
dec lastIdx
var s = ""
Expand Down Expand Up @@ -1393,11 +1422,15 @@ proc parseMarkdownLink(p: var RstParser; father: PRstNode): bool =
var desc, link = ""
var i = p.idx

var parensStack: seq[char]
template parse(endToken, dest) =
parensStack.setLen 0
inc i # skip begin token
while true:
if p.tok[i].kind in {tkEof, tkIndent}: return false
if p.tok[i].symbol == endToken: break
let isClosing = checkParen(p.tok[i], parensStack)
if p.tok[i].symbol == endToken and not isClosing:
break
dest.add p.tok[i].symbol
inc i
inc i # skip end token
Expand Down
41 changes: 41 additions & 0 deletions tests/stdlib/trst.nim
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,44 @@ suite "RST inline markup":
rnLeaf ' '
rnLeaf 'end'
""")

test "URL with balanced parentheses (Markdown rule)":
# 2 balanced parens, 1 unbalanced:
check(dedent"""
https://en.wikipedia.org/wiki/APL_((programming_language)))""".toAst ==
dedent"""
rnInner
rnStandaloneHyperlink
rnLeaf 'https://en.wikipedia.org/wiki/APL_((programming_language))'
rnLeaf ')'
""")

# the same for Markdown-style link:
check(dedent"""
[foo [bar]](https://en.wikipedia.org/wiki/APL_((programming_language))))""".toAst ==
dedent"""
rnInner
rnHyperlink
rnLeaf 'foo [bar]'
rnLeaf 'https://en.wikipedia.org/wiki/APL_((programming_language))'
rnLeaf ')'
""")

# unbalanced (here behavior is more RST-like actually):
check(dedent"""
https://en.wikipedia.org/wiki/APL_(programming_language(""".toAst ==
dedent"""
rnInner
rnStandaloneHyperlink
rnLeaf 'https://en.wikipedia.org/wiki/APL_(programming_language'
rnLeaf '('
""")

# unbalanced [, but still acceptable:
check(dedent"""
[my {link example](http://example.com/bracket_(symbol_[))""".toAst ==
dedent"""
rnHyperlink
rnLeaf 'my {link example'
rnLeaf 'http://example.com/bracket_(symbol_[)'
""")