Skip to content

Commit 223112b

Browse files
committed
Auto merge of #121659 - notriddle:notriddle/bump-pulldown-cmark, r=<try>
rustdoc: check parsing diffs between pulldown-cmark 0.9.6 and 0.10. This commit is not meant to be merged as-is. It's meant to run in Crater, so that we can estimate the impact of bumping to the new version of the markdown parser. r? rustdoc
2 parents ef32456 + 5b1ebc2 commit 223112b

File tree

7 files changed

+433
-0
lines changed

7 files changed

+433
-0
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4689,6 +4689,7 @@ dependencies = [
46894689
"itertools 0.11.0",
46904690
"minifier",
46914691
"once_cell",
4692+
"pulldown-cmark 0.10.0",
46924693
"regex",
46934694
"rustdoc-json-types",
46944695
"serde",

src/librustdoc/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ itertools = "0.11"
1313
indexmap = "2"
1414
minifier = "0.3.0"
1515
once_cell = "1.10.0"
16+
pulldown-cmark-new = { version = "0.10", package = "pulldown-cmark", default-features = false }
1617
regex = "1"
1718
rustdoc-json-types = { path = "../rustdoc-json-types" }
1819
serde_json = "1.0"

src/librustdoc/lint.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ declare_rustdoc_lint! {
196196
"detects redundant explicit links in doc comments"
197197
}
198198

199+
declare_rustdoc_lint! {
200+
/// This compatibility lint checks for Markdown syntax that works in the old engine but not
201+
/// the new one.
202+
UNPORTABLE_MARKDOWN,
203+
Deny,
204+
"detects markdown that is interpreted differently in different parser"
205+
}
206+
199207
pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
200208
vec![
201209
BROKEN_INTRA_DOC_LINKS,
@@ -209,6 +217,7 @@ pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
209217
MISSING_CRATE_LEVEL_DOCS,
210218
UNESCAPED_BACKTICKS,
211219
REDUNDANT_EXPLICIT_LINKS,
220+
UNPORTABLE_MARKDOWN,
212221
]
213222
});
214223

src/librustdoc/passes/lint.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod check_code_block_syntax;
66
mod html_tags;
77
mod redundant_explicit_links;
88
mod unescaped_backticks;
9+
mod unportable_markdown;
910

1011
use super::Pass;
1112
use crate::clean::*;
@@ -31,6 +32,7 @@ impl<'a, 'tcx> DocVisitor for Linter<'a, 'tcx> {
3132
html_tags::visit_item(self.cx, item);
3233
unescaped_backticks::visit_item(self.cx, item);
3334
redundant_explicit_links::visit_item(self.cx, item);
35+
unportable_markdown::visit_item(self.cx, item);
3436

3537
self.visit_item_recur(item)
3638
}
Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
//! Detects markdown syntax that's parsed differently between pulldown-cmark
2+
//! 0.9 and 0.10.
3+
4+
use crate::clean::Item;
5+
use crate::core::DocContext;
6+
use crate::html::markdown::main_body_opts;
7+
use pulldown_cmark as cmarko;
8+
use pulldown_cmark_new as cmarkn;
9+
use rustc_resolve::rustdoc::source_span_for_markdown_range;
10+
11+
pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item) {
12+
let tcx = cx.tcx;
13+
let Some(hir_id) = DocContext::as_local_hir_id(tcx, item.item_id) else {
14+
// If non-local, no need to check anything.
15+
return;
16+
};
17+
18+
let dox = item.doc_value();
19+
if dox.is_empty() {
20+
return;
21+
}
22+
23+
let link_names = item.link_names(&cx.cache);
24+
let mut replacer_old = |broken_link: cmarko::BrokenLink<'_>| {
25+
link_names
26+
.iter()
27+
.find(|link| *link.original_text == *broken_link.reference)
28+
.map(|link| ((*link.href).into(), (*link.new_text).into()))
29+
};
30+
let parser_old = cmarko::Parser::new_with_broken_link_callback(
31+
&dox,
32+
main_body_opts(),
33+
Some(&mut replacer_old),
34+
)
35+
.into_offset_iter()
36+
// Not worth cleaning up minor "distinctions without a difference" in the AST.
37+
// Text events get chopped up differently between versions.
38+
// <html> and `code` mistakes are usually covered by unescaped_backticks and html_tags lints.
39+
.filter(|(event, _event_range)| {
40+
!matches!(
41+
event,
42+
cmarko::Event::Code(_)
43+
| cmarko::Event::Text(_)
44+
| cmarko::Event::Html(_)
45+
| cmarko::Event::SoftBreak
46+
)
47+
});
48+
49+
pub fn main_body_opts_new() -> cmarkn::Options {
50+
cmarkn::Options::ENABLE_TABLES
51+
| cmarkn::Options::ENABLE_FOOTNOTES
52+
| cmarkn::Options::ENABLE_STRIKETHROUGH
53+
| cmarkn::Options::ENABLE_TASKLISTS
54+
| cmarkn::Options::ENABLE_SMART_PUNCTUATION
55+
}
56+
let mut replacer_new = |broken_link: cmarkn::BrokenLink<'_>| {
57+
link_names
58+
.iter()
59+
.find(|link| *link.original_text.trim() == *broken_link.reference.trim())
60+
.map(|link| ((*link.href).into(), (*link.new_text).into()))
61+
};
62+
let parser_new = cmarkn::Parser::new_with_broken_link_callback(
63+
&dox,
64+
main_body_opts_new(),
65+
Some(&mut replacer_new),
66+
)
67+
.into_offset_iter()
68+
.filter(|(event, _event_range)| {
69+
!matches!(
70+
event,
71+
cmarkn::Event::Code(_)
72+
| cmarkn::Event::Text(_)
73+
| cmarkn::Event::Html(_)
74+
| cmarkn::Event::InlineHtml(_)
75+
| cmarkn::Event::Start(cmarkn::Tag::HtmlBlock)
76+
| cmarkn::Event::End(cmarkn::TagEnd::HtmlBlock)
77+
| cmarkn::Event::SoftBreak
78+
)
79+
});
80+
81+
let mut reported_an_error = false;
82+
for ((event_old, event_range_old), (event_new, event_range_new)) in parser_old.zip(parser_new) {
83+
match (event_old, event_new) {
84+
(
85+
cmarko::Event::Start(cmarko::Tag::Emphasis),
86+
cmarkn::Event::Start(cmarkn::Tag::Emphasis),
87+
)
88+
| (
89+
cmarko::Event::Start(cmarko::Tag::Strong),
90+
cmarkn::Event::Start(cmarkn::Tag::Strong),
91+
)
92+
| (
93+
cmarko::Event::Start(cmarko::Tag::Strikethrough),
94+
cmarkn::Event::Start(cmarkn::Tag::Strikethrough),
95+
)
96+
| (
97+
cmarko::Event::End(cmarko::Tag::Emphasis),
98+
cmarkn::Event::End(cmarkn::TagEnd::Emphasis),
99+
)
100+
| (
101+
cmarko::Event::End(cmarko::Tag::Strong),
102+
cmarkn::Event::End(cmarkn::TagEnd::Strong),
103+
)
104+
| (
105+
cmarko::Event::End(cmarko::Tag::Strikethrough),
106+
cmarkn::Event::End(cmarkn::TagEnd::Strikethrough),
107+
)
108+
| (
109+
cmarko::Event::End(cmarko::Tag::Link(..)),
110+
cmarkn::Event::End(cmarkn::TagEnd::Link),
111+
)
112+
| (
113+
cmarko::Event::End(cmarko::Tag::Image(..)),
114+
cmarkn::Event::End(cmarkn::TagEnd::Image),
115+
)
116+
| (cmarko::Event::FootnoteReference(..), cmarkn::Event::FootnoteReference(..))
117+
| (cmarko::Event::TaskListMarker(false), cmarkn::Event::TaskListMarker(false))
118+
| (cmarko::Event::TaskListMarker(true), cmarkn::Event::TaskListMarker(true))
119+
if event_range_old == event_range_new =>
120+
{
121+
// Matching tags. Do nothing.
122+
}
123+
(
124+
cmarko::Event::Start(cmarko::Tag::Link(_, old_dest_url, old_title)),
125+
cmarkn::Event::Start(cmarkn::Tag::Link { dest_url, title, .. }),
126+
)
127+
| (
128+
cmarko::Event::Start(cmarko::Tag::Image(_, old_dest_url, old_title)),
129+
cmarkn::Event::Start(cmarkn::Tag::Image { dest_url, title, .. }),
130+
) if event_range_old == event_range_new
131+
&& &old_dest_url[..] == &dest_url[..]
132+
&& &old_title[..] == &title[..] =>
133+
{
134+
// Matching tags. Do nothing.
135+
}
136+
(cmarko::Event::SoftBreak, cmarkn::Event::SoftBreak)
137+
| (cmarko::Event::HardBreak, cmarkn::Event::HardBreak)
138+
| (cmarko::Event::Rule, cmarkn::Event::Rule)
139+
| (
140+
cmarko::Event::Start(cmarko::Tag::Paragraph),
141+
cmarkn::Event::Start(cmarkn::Tag::Paragraph),
142+
)
143+
| (
144+
cmarko::Event::Start(cmarko::Tag::Heading(..)),
145+
cmarkn::Event::Start(cmarkn::Tag::Heading { .. }),
146+
)
147+
| (
148+
cmarko::Event::Start(cmarko::Tag::BlockQuote),
149+
cmarkn::Event::Start(cmarkn::Tag::BlockQuote),
150+
)
151+
| (
152+
cmarko::Event::Start(cmarko::Tag::CodeBlock(..)),
153+
cmarkn::Event::Start(cmarkn::Tag::CodeBlock(..)),
154+
)
155+
| (
156+
cmarko::Event::Start(cmarko::Tag::List(..)),
157+
cmarkn::Event::Start(cmarkn::Tag::List(..)),
158+
)
159+
| (cmarko::Event::Start(cmarko::Tag::Item), cmarkn::Event::Start(cmarkn::Tag::Item))
160+
| (
161+
cmarko::Event::Start(cmarko::Tag::FootnoteDefinition(..)),
162+
cmarkn::Event::Start(cmarkn::Tag::FootnoteDefinition(..)),
163+
)
164+
| (
165+
cmarko::Event::Start(cmarko::Tag::Table(..)),
166+
cmarkn::Event::Start(cmarkn::Tag::Table(..)),
167+
)
168+
| (
169+
cmarko::Event::Start(cmarko::Tag::TableHead),
170+
cmarkn::Event::Start(cmarkn::Tag::TableHead),
171+
)
172+
| (
173+
cmarko::Event::Start(cmarko::Tag::TableRow),
174+
cmarkn::Event::Start(cmarkn::Tag::TableRow),
175+
)
176+
| (
177+
cmarko::Event::Start(cmarko::Tag::TableCell),
178+
cmarkn::Event::Start(cmarkn::Tag::TableCell),
179+
)
180+
| (
181+
cmarko::Event::End(cmarko::Tag::Paragraph),
182+
cmarkn::Event::End(cmarkn::TagEnd::Paragraph),
183+
)
184+
| (
185+
cmarko::Event::End(cmarko::Tag::Heading(..)),
186+
cmarkn::Event::End(cmarkn::TagEnd::Heading(_)),
187+
)
188+
| (
189+
cmarko::Event::End(cmarko::Tag::BlockQuote),
190+
cmarkn::Event::End(cmarkn::TagEnd::BlockQuote),
191+
)
192+
| (
193+
cmarko::Event::End(cmarko::Tag::CodeBlock(..)),
194+
cmarkn::Event::End(cmarkn::TagEnd::CodeBlock),
195+
)
196+
| (
197+
cmarko::Event::End(cmarko::Tag::List(..)),
198+
cmarkn::Event::End(cmarkn::TagEnd::List(_)),
199+
)
200+
| (cmarko::Event::End(cmarko::Tag::Item), cmarkn::Event::End(cmarkn::TagEnd::Item))
201+
| (
202+
cmarko::Event::End(cmarko::Tag::FootnoteDefinition(..)),
203+
cmarkn::Event::End(cmarkn::TagEnd::FootnoteDefinition),
204+
)
205+
| (
206+
cmarko::Event::End(cmarko::Tag::Table(..)),
207+
cmarkn::Event::End(cmarkn::TagEnd::Table),
208+
)
209+
| (
210+
cmarko::Event::End(cmarko::Tag::TableHead),
211+
cmarkn::Event::End(cmarkn::TagEnd::TableHead),
212+
)
213+
| (
214+
cmarko::Event::End(cmarko::Tag::TableRow),
215+
cmarkn::Event::End(cmarkn::TagEnd::TableRow),
216+
)
217+
| (
218+
cmarko::Event::End(cmarko::Tag::TableCell),
219+
cmarkn::Event::End(cmarkn::TagEnd::TableCell),
220+
) => {
221+
// Matching tags. Do nothing.
222+
//
223+
// Parsers sometimes differ in what they consider the "range of an event,"
224+
// even though the event is really the same. Inlines are pretty consistent,
225+
// but stuff like list items? Not really.
226+
//
227+
// Mismatched block elements will usually nest differently, so ignoring it
228+
// works well enough.
229+
}
230+
// If we've already reported an error on the start tag, don't bother on the end tag.
231+
(cmarko::Event::End(_), _) | (_, cmarkn::Event::End(_)) if reported_an_error => {}
232+
// Non-matching inline.
233+
(cmarko::Event::Start(cmarko::Tag::Link(..)), cmarkn::Event::FootnoteReference(..))
234+
| (
235+
cmarko::Event::Start(cmarko::Tag::Image(..)),
236+
cmarkn::Event::FootnoteReference(..),
237+
)
238+
| (
239+
cmarko::Event::FootnoteReference(..),
240+
cmarkn::Event::Start(cmarkn::Tag::Link { .. }),
241+
)
242+
| (
243+
cmarko::Event::FootnoteReference(..),
244+
cmarkn::Event::Start(cmarkn::Tag::Image { .. }),
245+
) if event_range_old == event_range_new => {
246+
reported_an_error = true;
247+
// If we can't get a span for the markdown range, because it is in a `#[doc = ""]` attribute,
248+
// use the span of the entire attribute as a fallback.
249+
let span = source_span_for_markdown_range(
250+
tcx,
251+
&dox,
252+
&event_range_old,
253+
&item.attrs.doc_strings,
254+
)
255+
.unwrap_or_else(|| item.attr_span(tcx));
256+
tcx.node_span_lint(
257+
crate::lint::UNPORTABLE_MARKDOWN,
258+
hir_id,
259+
span,
260+
"unportable markdown",
261+
|lint| {
262+
lint.help(format!("syntax ambiguous between footnote and link"));
263+
},
264+
);
265+
}
266+
// Non-matching results.
267+
(event_old, event_new) => {
268+
reported_an_error = true;
269+
let (range, range_other, desc, desc_other, tag, tag_other) = if event_range_old.end
270+
- event_range_old.start
271+
< event_range_new.end - event_range_new.start
272+
{
273+
(
274+
event_range_old,
275+
event_range_new,
276+
"old",
277+
"new",
278+
format!("{event_old:?}"),
279+
format!("{event_new:?}"),
280+
)
281+
} else {
282+
(
283+
event_range_new,
284+
event_range_old,
285+
"new",
286+
"old",
287+
format!("{event_new:?}"),
288+
format!("{event_old:?}"),
289+
)
290+
};
291+
let (range, tag_other) =
292+
if range_other.start <= range.start && range_other.end <= range.end {
293+
(range_other.start..range.end, tag_other)
294+
} else {
295+
(range, format!("nothing"))
296+
};
297+
// If we can't get a span for the markdown range, because it is in a `#[doc = ""]` attribute,
298+
// use the span of the entire attribute as a fallback.
299+
let span =
300+
source_span_for_markdown_range(tcx, &dox, &range, &item.attrs.doc_strings)
301+
.unwrap_or_else(|| item.attr_span(tcx));
302+
tcx.node_span_lint(
303+
crate::lint::UNPORTABLE_MARKDOWN,
304+
hir_id,
305+
span,
306+
"unportable markdown",
307+
|lint| {
308+
lint.help(format!(
309+
"{desc} parser sees {tag}, {desc_other} sees {tag_other}"
310+
));
311+
},
312+
);
313+
}
314+
}
315+
}
316+
}

0 commit comments

Comments
 (0)