Skip to content

Commit

Permalink
rustdoc: check parsing diffs between pulldown-cmark 0.9.6 and 0.10
Browse files Browse the repository at this point in the history
This commit is not meant to be merged as-is. It's meant to run
in Crater, so that we can estimate the impact of bumping to the
new version of the markdown parser.
  • Loading branch information
notriddle committed Feb 27, 2024
1 parent 829308e commit 8ddc204
Show file tree
Hide file tree
Showing 7 changed files with 431 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4704,6 +4704,7 @@ dependencies = [
"itertools",
"minifier",
"once_cell",
"pulldown-cmark 0.10.0",
"regex",
"rustdoc-json-types",
"serde",
Expand Down
1 change: 1 addition & 0 deletions src/librustdoc/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ itertools = "0.11"
indexmap = "2"
minifier = "0.3.0"
once_cell = "1.10.0"
pulldown-cmark-new = { version = "0.10", package = "pulldown-cmark", default-features = false }
regex = "1"
rustdoc-json-types = { path = "../rustdoc-json-types" }
serde_json = "1.0"
Expand Down
9 changes: 9 additions & 0 deletions src/librustdoc/lint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,14 @@ declare_rustdoc_lint! {
"detects redundant explicit links in doc comments"
}

declare_rustdoc_lint! {
/// This compatibility lint checks for Markdown syntax that works in the old engine but not
/// the new one.
UNPORTABLE_MARKDOWN,
Deny,
"detects markdown that is interpreted differently in different parser"
}

pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
vec![
BROKEN_INTRA_DOC_LINKS,
Expand All @@ -209,6 +217,7 @@ pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
MISSING_CRATE_LEVEL_DOCS,
UNESCAPED_BACKTICKS,
REDUNDANT_EXPLICIT_LINKS,
UNPORTABLE_MARKDOWN,
]
});

Expand Down
2 changes: 2 additions & 0 deletions src/librustdoc/passes/lint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod check_code_block_syntax;
mod html_tags;
mod redundant_explicit_links;
mod unescaped_backticks;
mod unportable_markdown;

use super::Pass;
use crate::clean::*;
Expand All @@ -31,6 +32,7 @@ impl<'a, 'tcx> DocVisitor for Linter<'a, 'tcx> {
html_tags::visit_item(self.cx, item);
unescaped_backticks::visit_item(self.cx, item);
redundant_explicit_links::visit_item(self.cx, item);
unportable_markdown::visit_item(self.cx, item);

self.visit_item_recur(item)
}
Expand Down
314 changes: 314 additions & 0 deletions src/librustdoc/passes/lint/unportable_markdown.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
//! Detects markdown syntax that's different between pulldown-cmark
//! 0.9 and 0.10.
use crate::clean::Item;
use crate::core::DocContext;
use crate::html::markdown::main_body_opts;
use pulldown_cmark as cmarko;
use pulldown_cmark_new as cmarkn;
use rustc_resolve::rustdoc::source_span_for_markdown_range;

pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item) {
let tcx = cx.tcx;
let Some(hir_id) = DocContext::as_local_hir_id(tcx, item.item_id) else {
// If non-local, no need to check anything.
return;
};

let dox = item.doc_value();
if dox.is_empty() {
return;
}

let link_names = item.link_names(&cx.cache);
let mut replacer_old = |broken_link: cmarko::BrokenLink<'_>| {
link_names
.iter()
.find(|link| *link.original_text == *broken_link.reference)
.map(|link| ((*link.href).into(), (*link.new_text).into()))
};
let parser_old = cmarko::Parser::new_with_broken_link_callback(
&dox,
main_body_opts(),
Some(&mut replacer_old),
)
.into_offset_iter()
// Not worth cleaning up minor "distinctions without difference" in the AST.
// Text events get chopped up differently between versions.
// <html> and `code` mistakes are usually covered by unescaped_backticks and html_tags lints.
.filter(|(event, _event_range)| {
!matches!(
event,
cmarko::Event::Code(_)
| cmarko::Event::Text(_)
| cmarko::Event::Html(_)
| cmarko::Event::SoftBreak
)
});

pub fn main_body_opts_new() -> cmarkn::Options {
cmarkn::Options::ENABLE_TABLES
| cmarkn::Options::ENABLE_FOOTNOTES
| cmarkn::Options::ENABLE_STRIKETHROUGH
| cmarkn::Options::ENABLE_TASKLISTS
| cmarkn::Options::ENABLE_SMART_PUNCTUATION
}
let mut replacer_new = |broken_link: cmarkn::BrokenLink<'_>| {
link_names
.iter()
.find(|link| *link.original_text.trim() == *broken_link.reference.trim())
.map(|link| ((*link.href).into(), (*link.new_text).into()))
};
let parser_new = cmarkn::Parser::new_with_broken_link_callback(
&dox,
main_body_opts_new(),
Some(&mut replacer_new),
)
.into_offset_iter()
.filter(|(event, _event_range)| {
!matches!(
event,
cmarkn::Event::Code(_)
| cmarkn::Event::Text(_)
| cmarkn::Event::Html(_)
| cmarkn::Event::InlineHtml(_)
| cmarkn::Event::SoftBreak
)
});

let mut reported_an_error = false;
for ((event_old, event_range_old), (event_new, event_range_new)) in parser_old.zip(parser_new) {
match (event_old, event_new) {
(
cmarko::Event::Start(cmarko::Tag::Emphasis),
cmarkn::Event::Start(cmarkn::Tag::Emphasis),
)
| (
cmarko::Event::Start(cmarko::Tag::Strong),
cmarkn::Event::Start(cmarkn::Tag::Strong),
)
| (
cmarko::Event::Start(cmarko::Tag::Strikethrough),
cmarkn::Event::Start(cmarkn::Tag::Strikethrough),
)
| (
cmarko::Event::End(cmarko::Tag::Emphasis),
cmarkn::Event::End(cmarkn::TagEnd::Emphasis),
)
| (
cmarko::Event::End(cmarko::Tag::Strong),
cmarkn::Event::End(cmarkn::TagEnd::Strong),
)
| (
cmarko::Event::End(cmarko::Tag::Strikethrough),
cmarkn::Event::End(cmarkn::TagEnd::Strikethrough),
)
| (
cmarko::Event::End(cmarko::Tag::Link(..)),
cmarkn::Event::End(cmarkn::TagEnd::Link),
)
| (
cmarko::Event::End(cmarko::Tag::Image(..)),
cmarkn::Event::End(cmarkn::TagEnd::Image),
)
| (cmarko::Event::FootnoteReference(..), cmarkn::Event::FootnoteReference(..))
| (cmarko::Event::TaskListMarker(false), cmarkn::Event::TaskListMarker(false))
| (cmarko::Event::TaskListMarker(true), cmarkn::Event::TaskListMarker(true))
if event_range_old == event_range_new =>
{
// Matching tags. Do nothing.
}
(
cmarko::Event::Start(cmarko::Tag::Link(_, old_dest_url, old_title)),
cmarkn::Event::Start(cmarkn::Tag::Link { dest_url, title, .. }),
)
| (
cmarko::Event::Start(cmarko::Tag::Image(_, old_dest_url, old_title)),
cmarkn::Event::Start(cmarkn::Tag::Image { dest_url, title, .. }),
) if event_range_old == event_range_new
&& &old_dest_url[..] == &dest_url[..]
&& &old_title[..] == &title[..] =>
{
// Matching tags. Do nothing.
}
(cmarko::Event::SoftBreak, cmarkn::Event::SoftBreak)
| (cmarko::Event::HardBreak, cmarkn::Event::HardBreak)
| (cmarko::Event::Rule, cmarkn::Event::Rule)
| (
cmarko::Event::Start(cmarko::Tag::Paragraph),
cmarkn::Event::Start(cmarkn::Tag::Paragraph),
)
| (
cmarko::Event::Start(cmarko::Tag::Heading(..)),
cmarkn::Event::Start(cmarkn::Tag::Heading { .. }),
)
| (
cmarko::Event::Start(cmarko::Tag::BlockQuote),
cmarkn::Event::Start(cmarkn::Tag::BlockQuote),
)
| (
cmarko::Event::Start(cmarko::Tag::CodeBlock(..)),
cmarkn::Event::Start(cmarkn::Tag::CodeBlock(..)),
)
| (
cmarko::Event::Start(cmarko::Tag::List(..)),
cmarkn::Event::Start(cmarkn::Tag::List(..)),
)
| (cmarko::Event::Start(cmarko::Tag::Item), cmarkn::Event::Start(cmarkn::Tag::Item))
| (
cmarko::Event::Start(cmarko::Tag::FootnoteDefinition(..)),
cmarkn::Event::Start(cmarkn::Tag::FootnoteDefinition(..)),
)
| (
cmarko::Event::Start(cmarko::Tag::Table(..)),
cmarkn::Event::Start(cmarkn::Tag::Table(..)),
)
| (
cmarko::Event::Start(cmarko::Tag::TableHead),
cmarkn::Event::Start(cmarkn::Tag::TableHead),
)
| (
cmarko::Event::Start(cmarko::Tag::TableRow),
cmarkn::Event::Start(cmarkn::Tag::TableRow),
)
| (
cmarko::Event::Start(cmarko::Tag::TableCell),
cmarkn::Event::Start(cmarkn::Tag::TableCell),
)
| (
cmarko::Event::End(cmarko::Tag::Paragraph),
cmarkn::Event::End(cmarkn::TagEnd::Paragraph),
)
| (
cmarko::Event::End(cmarko::Tag::Heading(..)),
cmarkn::Event::End(cmarkn::TagEnd::Heading(_)),
)
| (
cmarko::Event::End(cmarko::Tag::BlockQuote),
cmarkn::Event::End(cmarkn::TagEnd::BlockQuote),
)
| (
cmarko::Event::End(cmarko::Tag::CodeBlock(..)),
cmarkn::Event::End(cmarkn::TagEnd::CodeBlock),
)
| (
cmarko::Event::End(cmarko::Tag::List(..)),
cmarkn::Event::End(cmarkn::TagEnd::List(_)),
)
| (cmarko::Event::End(cmarko::Tag::Item), cmarkn::Event::End(cmarkn::TagEnd::Item))
| (
cmarko::Event::End(cmarko::Tag::FootnoteDefinition(..)),
cmarkn::Event::End(cmarkn::TagEnd::FootnoteDefinition),
)
| (
cmarko::Event::End(cmarko::Tag::Table(..)),
cmarkn::Event::End(cmarkn::TagEnd::Table),
)
| (
cmarko::Event::End(cmarko::Tag::TableHead),
cmarkn::Event::End(cmarkn::TagEnd::TableHead),
)
| (
cmarko::Event::End(cmarko::Tag::TableRow),
cmarkn::Event::End(cmarkn::TagEnd::TableRow),
)
| (
cmarko::Event::End(cmarko::Tag::TableCell),
cmarkn::Event::End(cmarkn::TagEnd::TableCell),
) => {
// Matching tags. Do nothing.
//
// Parsers sometimes differ in what they consider the "range of an event,"
// even though the event is really the same. Inlines are pretty consistent,
// but stuff like list items? Not really.
//
// Mismatched block elements will usually nest differently, so ignoring it
// works good enough.
}
// If we've already reported an error on the start tag, don't bother on the end tag.
(cmarko::Event::End(_), _) | (_, cmarkn::Event::End(_)) if reported_an_error => {}
// Non-matching inline.
(cmarko::Event::Start(cmarko::Tag::Link(..)), cmarkn::Event::FootnoteReference(..))
| (
cmarko::Event::Start(cmarko::Tag::Image(..)),
cmarkn::Event::FootnoteReference(..),
)
| (
cmarko::Event::FootnoteReference(..),
cmarkn::Event::Start(cmarkn::Tag::Link { .. }),
)
| (
cmarko::Event::FootnoteReference(..),
cmarkn::Event::Start(cmarkn::Tag::Image { .. }),
) if event_range_old == event_range_new => {
reported_an_error = true;
// If we can't get a span of the backtick, because it is in a `#[doc = ""]` attribute,
// use the span of the entire attribute as a fallback.
let span = source_span_for_markdown_range(
tcx,
&dox,
&event_range_old,
&item.attrs.doc_strings,
)
.unwrap_or_else(|| item.attr_span(tcx));
tcx.node_span_lint(
crate::lint::UNPORTABLE_MARKDOWN,
hir_id,
span,
"unportable markdown",
|lint| {
lint.help(format!("syntax ambiguous between footnote and link"));
},
);
}
// Non-matching results.
(event_old, event_new) => {
reported_an_error = true;
let (range, range_other, desc, desc_other, tag, tag_other) = if event_range_old.end
- event_range_old.start
< event_range_new.end - event_range_new.start
{
(
event_range_old,
event_range_new,
"old",
"new",
format!("{event_old:?}"),
format!("{event_new:?}"),
)
} else {
(
event_range_new,
event_range_old,
"new",
"old",
format!("{event_new:?}"),
format!("{event_old:?}"),
)
};
let (range, tag_other) =
if range_other.start <= range.start && range_other.end <= range.end {
(range_other.start..range.end, tag_other)
} else {
(range, format!("nothing"))
};
// If we can't get a span of the backtick, because it is in a `#[doc = ""]` attribute,
// use the span of the entire attribute as a fallback.
let span =
source_span_for_markdown_range(tcx, &dox, &range, &item.attrs.doc_strings)
.unwrap_or_else(|| item.attr_span(tcx));
tcx.node_span_lint(
crate::lint::UNPORTABLE_MARKDOWN,
hir_id,
span,
"unportable markdown",
|lint| {
lint.help(format!(
"{desc} parser sees {tag}, {desc_other} sees {tag_other}"
));
},
);
}
}
}
}
Loading

0 comments on commit 8ddc204

Please sign in to comment.