From 3b2b8b78266368b85190778780c6b2eb78c7d10e Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:05:59 -0500
Subject: [PATCH 01/18] test(markdown_parser): pin html fragment parsing
behavior before refactor
---
crates/markdown_parser/src/html_parser_tests.rs | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/crates/markdown_parser/src/html_parser_tests.rs b/crates/markdown_parser/src/html_parser_tests.rs
index b1f937793..4ab75a84b 100644
--- a/crates/markdown_parser/src/html_parser_tests.rs
+++ b/crates/markdown_parser/src/html_parser_tests.rs
@@ -553,6 +553,23 @@ fn test_code_and_inline_code() {
);
}
+#[test]
+fn test_parse_html_fragment_after_refactor_inline() {
+ // Regression: a bare inline fragment (no /
) must still parse.
+ // This is what the markdown parser will hand into the new pub(crate) helper.
+ let html = "hi there";
+ let parsed = parse_html(html).expect("HTML should parse");
+ assert!(!parsed.lines.is_empty());
+}
+
+#[test]
+fn test_parse_html_fragment_after_refactor_block() {
+ // Regression: a fragment containing only a block element must still parse.
+ let html = "hello
world
";
+ let parsed = parse_html(html).expect("HTML should parse");
+ assert_eq!(parsed.lines.len(), 2);
+}
+
// Test for CLD-860
#[test]
fn test_confluence_code_block() {
From 4506fc393b0e9faec879c857a61d20fdd8bcb4d3 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:08:44 -0500
Subject: [PATCH 02/18] refactor(markdown_parser): expose pub(crate) html
fragment helpers
Add parse_html_block_lines, parse_html_inline_fragments, and find_body
as pub(crate) helpers above parse_html for use by the upcoming markdown
GFM HTML block/inline dispatch. No behavior change to parse_html itself.
---
crates/markdown_parser/src/html_parser.rs | 56 +++++++++++++++++++++++
1 file changed, 56 insertions(+)
diff --git a/crates/markdown_parser/src/html_parser.rs b/crates/markdown_parser/src/html_parser.rs
index 98d110d18..32f743a6f 100644
--- a/crates/markdown_parser/src/html_parser.rs
+++ b/crates/markdown_parser/src/html_parser.rs
@@ -123,6 +123,62 @@ fn type_matches(attributes: &[Attribute], value: &str) -> bool {
get_attribute(attributes, "type") == Some(value)
}
+/// Parse an HTML fragment and return its block-level lines.
+///
+/// Used by the markdown parser when it encounters a block-level HTML span in a `.md` file.
+/// Unlike `parse_html`, this assumes the caller already classified the span as block-level
+/// and does not perform any top-level element skipping.
+pub(crate) fn parse_html_block_lines(html: &str) -> Vec {
+ parse_html(html)
+ .map(|formatted| formatted.lines.into_iter().collect())
+ .unwrap_or_default()
+}
+
+/// Parse an HTML fragment and return its inline phrasing fragments.
+///
+/// Used by the markdown parser when it encounters an inline-level HTML span in a `.md` file.
+/// Walks `html5ever`'s tree and applies `parse_phrasing_content` to every text/element node it
+/// finds, ignoring block structure.
+pub(crate) fn parse_html_inline_fragments(html: &str) -> Vec {
+ let opts = ParseOpts {
+ tree_builder: TreeBuilderOpts {
+ drop_doctype: true,
+ ..Default::default()
+ },
+ ..Default::default()
+ };
+ let dom = match parse_document(RcDom::default(), opts)
+ .from_utf8()
+ .read_from(&mut html.as_bytes())
+ {
+ Ok(dom) => dom,
+ Err(_) => return Vec::new(),
+ };
+
+ // html5ever wraps fragments in ….
+ // Walk down to and collect its children for phrasing.
+ let body = find_body(&dom.document);
+ let children: Vec> = match body {
+ Some(body) => body.children.borrow().iter().cloned().collect(),
+ None => return Vec::new(),
+ };
+ parse_phrasing_content(&children, Styling::default())
+}
+
+fn find_body(node: &Rc) -> Option> {
+ if let NodeData::Element { name, .. } = &node.data
+ && name.local.as_ref() == "body"
+ {
+ return Some(Rc::clone(node));
+ }
+ for child in node.children.borrow().iter() {
+ if let Some(found) = find_body(child) {
+ return Some(found);
+ }
+ }
+ None
+}
+
// Top-level function to parse a HTML string into a FormattedText document.
pub fn parse_html(html: &str) -> Result {
let opts = ParseOpts {
From a89213283ec63eb799dc5707178ea0dfcac6ba56 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:13:49 -0500
Subject: [PATCH 03/18] feat(markdown_parser): render
details/kbd/sub/sup/del/raw table/img html
---
crates/markdown_parser/src/html_parser.rs | 146 +++++++++++++++++-
.../markdown_parser/src/html_parser_tests.rs | 101 ++++++++++--
2 files changed, 226 insertions(+), 21 deletions(-)
diff --git a/crates/markdown_parser/src/html_parser.rs b/crates/markdown_parser/src/html_parser.rs
index 32f743a6f..949704861 100644
--- a/crates/markdown_parser/src/html_parser.rs
+++ b/crates/markdown_parser/src/html_parser.rs
@@ -22,10 +22,11 @@ use crate::{
// Note that we have "" here because GDocs always include a top level element to add additional
// GDocs specific meta-data for its rich text content.
const TOP_LEVEL_ELEMENT_TAGS_TO_SKIP: &[&str] = &[
- "head", "body", "html", "meta", "table", "b", "div", "ul", "ol", "li", "input",
+ "head", "body", "html", "meta", "b", "div", "ul", "ol", "li", "input",
];
const PHRASING_ELEMENT_TAGS: &[&str] = &[
"span", "i", "code", "strong", "em", "br", "a", "s", "u", "ins",
+ "del", "kbd", "sub", "sup", "mark", "small", "q",
];
pub const WARP_EMBED_ATTRIBUTE_NAME: &str = "data-warp-embedded-item";
@@ -317,6 +318,49 @@ pub fn parse_html(html: &str) -> Result {
pending_inline_nodes.clear();
}
+ if node_name.as_str() == "details" {
+ // Always-expanded rendering: emit a "▾ " line, then walk the children
+ // (excluding the element itself) into block lines.
+ let children_ref = node.children.borrow();
+ let summary_node = children_ref.iter().find(|child| {
+ matches!(&child.data, NodeData::Element { name, .. } if name.local.as_ref() == "summary")
+ });
+ let summary_fragments = match summary_node {
+ Some(summary) => {
+ let summary_children: Vec> =
+ summary.children.borrow().iter().cloned().collect();
+ parse_phrasing_content(&summary_children, Styling::default())
+ }
+ None => vec![FormattedTextFragment::plain_text("Details")],
+ };
+ let mut first_line = vec![FormattedTextFragment::plain_text("▾ ")];
+ first_line.extend(summary_fragments);
+ result.push_back(FormattedTextLine::Line(first_line));
+ // Queue non-summary children for block parsing.
+ for child in children_ref.iter().rev() {
+ let is_summary = matches!(
+ &child.data,
+ NodeData::Element { name, .. } if name.local.as_ref() == "summary"
+ );
+ if !is_summary {
+ nodes.push((Rc::clone(child), indent_level.clone()));
+ }
+ }
+ last_active_indent_level = indent_level;
+ continue;
+ }
+ if node_name.as_str() == "summary" {
+ // Stray outside a renders as a plain line.
+ let summary_children: Vec> =
+ node.children.borrow().iter().cloned().collect();
+ let line = parse_phrasing_content(&summary_children, Styling::default());
+ if !line.is_empty() {
+ result.push_back(FormattedTextLine::Line(line));
+ }
+ last_active_indent_level = indent_level;
+ continue;
+ }
+
// Update styling based on the node's attribute.
decorated_styling.update_with_attributes(&attrs.borrow());
@@ -391,6 +435,17 @@ pub fn parse_html(html: &str) -> Result {
}),
"br" => FormattedTextLine::LineBreak,
"hr" => FormattedTextLine::HorizontalRule,
+ "table" => build_table_from_html(&node.children.borrow()),
+ "img" => {
+ let attrs = attrs.borrow();
+ let source = get_attribute(&attrs, "src").unwrap_or("").to_string();
+ let alt_text = get_attribute(&attrs, "alt").unwrap_or("").to_string();
+ FormattedTextLine::Image(crate::FormattedImage {
+ alt_text,
+ source,
+ title: None,
+ })
+ }
_ => {
// Take into consideration the indent level when parsing the nodes.
let parsed_node = parse_pending_inline_nodes(
@@ -498,11 +553,11 @@ fn parse_phrasing_content(nodes: &[Rc], text_styling: Styling) -> Formatte
match node_name.as_ref() {
"b" | "strong" => decorated_styling.bold = true,
"i" | "em" => decorated_styling.italic = true,
- "s" => decorated_styling.strikethrough = true,
+ "s" | "del" => decorated_styling.strikethrough = true,
"u" | "ins" => decorated_styling.underline = true,
- "code" => decorated_styling.inline_code = true,
- // TODO: We need to add more phrasing styling we support (e.g. links) here.
- // https://linear.app/warpdotdev/issue/CLD-335/add-html-parsing-for-headers-and-lists
+ "code" | "kbd" => decorated_styling.inline_code = true,
+ "sub" | "sup" => decorated_styling.italic = true,
+ "mark" | "small" | "q" | "span" => {}
_ => (),
};
@@ -614,6 +669,87 @@ fn parse_code_block_and_language(nodes: &[Rc]) -> (String, Option)
(text, language)
}
+fn build_table_from_html(rows: &[Rc]) -> FormattedTextLine {
+ use crate::{FormattedTable, TableAlignment};
+
+ let mut headers: Vec = Vec::new();
+ let mut body_rows: Vec> = Vec::new();
+
+ fn walk(
+ nodes: &[Rc],
+ headers: &mut Vec,
+ body_rows: &mut Vec>,
+ ) {
+ for node in nodes {
+ let NodeData::Element { name, .. } = &node.data else { continue };
+ match name.local.as_ref() {
+ "thead" => {
+ for tr in node.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &tr.data else { continue };
+ if name.local.as_ref() != "tr" { continue }
+ for cell in tr.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &cell.data else { continue };
+ if name.local.as_ref() != "th" && name.local.as_ref() != "td" {
+ continue;
+ }
+ let cell_children: Vec> =
+ cell.children.borrow().iter().cloned().collect();
+ headers.push(parse_phrasing_content(&cell_children, Styling::default()));
+ }
+ }
+ }
+ "tbody" | "tfoot" => {
+ for tr in node.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &tr.data else { continue };
+ if name.local.as_ref() != "tr" { continue }
+ let mut row: Vec = Vec::new();
+ for cell in tr.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &cell.data else { continue };
+ if name.local.as_ref() != "th" && name.local.as_ref() != "td" {
+ continue;
+ }
+ let cell_children: Vec> =
+ cell.children.borrow().iter().cloned().collect();
+ row.push(parse_phrasing_content(&cell_children, Styling::default()));
+ }
+ body_rows.push(row);
+ }
+ }
+ "tr" => {
+ // with no / wrapper: first is header.
+ let mut row: Vec = Vec::new();
+ for cell in node.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &cell.data else { continue };
+ if name.local.as_ref() != "th" && name.local.as_ref() != "td" {
+ continue;
+ }
+ let cell_children: Vec> =
+ cell.children.borrow().iter().cloned().collect();
+ row.push(parse_phrasing_content(&cell_children, Styling::default()));
+ }
+ if headers.is_empty() {
+ *headers = row;
+ } else {
+ body_rows.push(row);
+ }
+ }
+ _ => walk(&node.children.borrow(), headers, body_rows),
+ }
+ }
+ }
+
+ walk(rows, &mut headers, &mut body_rows);
+
+ let col_count = headers.len().max(body_rows.iter().map(Vec::len).max().unwrap_or(0));
+ let mut table = FormattedTable {
+ headers,
+ alignments: vec![TableAlignment::Left; col_count],
+ rows: body_rows,
+ };
+ table.normalize_shape();
+ FormattedTextLine::Table(table)
+}
+
// Parse a HTML style string into its corresponding name -> value hashmap
// For example "font-style:italic;font-weight:400" will be parsed into
// {"font-style": "italic", "font-weight": "400"}.
diff --git a/crates/markdown_parser/src/html_parser_tests.rs b/crates/markdown_parser/src/html_parser_tests.rs
index 4ab75a84b..3d35da949 100644
--- a/crates/markdown_parser/src/html_parser_tests.rs
+++ b/crates/markdown_parser/src/html_parser_tests.rs
@@ -203,22 +203,6 @@ fn test_unsupported_html_types() {
])
]
);
-
- assert_eq!(
- test_parse_html(
- ""
- ),
- vec![
- FormattedTextLine::Line(vec![
- FormattedTextFragment::plain_text("Text 1"),
- FormattedTextFragment::plain_text("Text 2")
- ]),
- FormattedTextLine::Line(vec![
- FormattedTextFragment::plain_text("Test"),
- FormattedTextFragment::plain_text("Test")
- ])
- ]
- );
}
#[test]
@@ -582,3 +566,88 @@ fn test_confluence_code_block() {
}),]
);
}
+
+#[test]
+fn test_parse_html_details_with_summary() {
+ let html = "Click me
Hidden text
";
+ let parsed = parse_html(html).expect("HTML should parse");
+ let lines: Vec<_> = parsed.lines.iter().collect();
+ // First line: "▾ Click me"
+ match lines.first() {
+ Some(FormattedTextLine::Line(fragments)) => {
+ let joined: String = fragments.iter().map(|f| f.text.clone()).collect();
+ assert!(joined.starts_with("▾ "), "expected disclosure glyph, got {joined:?}");
+ assert!(joined.contains("Click me"), "expected summary text, got {joined:?}");
+ }
+ other => panic!("expected first line to be Line(_), got {other:?}"),
+ }
+ // The body must follow.
+ assert!(lines.len() >= 2, "expected details body line, got {lines:?}");
+}
+
+#[test]
+fn test_parse_html_raw_table() {
+ let html = "";
+ let parsed = parse_html(html).expect("HTML should parse");
+ let table = parsed
+ .lines
+ .iter()
+ .find_map(|line| match line {
+ FormattedTextLine::Table(t) => Some(t),
+ _ => None,
+ })
+ .expect("expected a Table line");
+ assert_eq!(table.headers.len(), 2);
+ assert_eq!(table.rows.len(), 1);
+ assert_eq!(table.rows[0].len(), 2);
+}
+
+#[test]
+fn test_parse_html_img_block() {
+ let html = r#"
"#;
+ let parsed = parse_html(html).expect("HTML should parse");
+ let image = parsed
+ .lines
+ .iter()
+ .find_map(|line| match line {
+ FormattedTextLine::Image(img) => Some(img),
+ _ => None,
+ })
+ .expect("expected an Image line");
+ assert_eq!(image.source, "cat.png");
+ assert_eq!(image.alt_text, "A cat");
+}
+
+#[test]
+fn test_parse_html_phrasing_tags() {
+ let html = "Press Cmd for 2 + 3, \
+ old, marked.
";
+ let parsed = parse_html(html).expect("HTML should parse");
+ let fragments: Vec<_> = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags),
+ _ => None,
+ })
+ .flatten()
+ .cloned()
+ .collect();
+ let by_text = |needle: &str| {
+ fragments
+ .iter()
+ .find(|f| f.text == needle)
+ .cloned()
+ .unwrap_or_else(|| panic!("no fragment {needle:?} in {fragments:?}"))
+ };
+
+ assert!(by_text("Cmd").styles.inline_code, "kbd → inline_code");
+ assert!(by_text("2").styles.italic, "sub → italic");
+ assert!(by_text("3").styles.italic, "sup → italic");
+ assert!(by_text("old").styles.strikethrough, "del → strikethrough");
+ // has no equivalent style flag for v1 — passes through as plain text.
+ let marked = by_text("marked");
+ assert!(!marked.styles.inline_code);
+ assert_eq!(marked.styles.weight, None);
+}
From ee84b458ea17240c40442efc306ddd54dc21f89e Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:16:56 -0500
Subject: [PATCH 04/18] feat(markdown_parser): add gfm html span lexer with
safe-list
---
crates/markdown_parser/src/gfm_html.rs | 188 +++++++++++++++++++
crates/markdown_parser/src/gfm_html_tests.rs | 68 +++++++
crates/markdown_parser/src/lib.rs | 1 +
3 files changed, 257 insertions(+)
create mode 100644 crates/markdown_parser/src/gfm_html.rs
create mode 100644 crates/markdown_parser/src/gfm_html_tests.rs
diff --git a/crates/markdown_parser/src/gfm_html.rs b/crates/markdown_parser/src/gfm_html.rs
new file mode 100644
index 000000000..6554fb6e7
--- /dev/null
+++ b/crates/markdown_parser/src/gfm_html.rs
@@ -0,0 +1,188 @@
+//! GFM-style HTML span lexing for the markdown parser.
+//!
+//! `try_lex_html_span` recognizes a single HTML span starting at the current
+//! input position and returns it together with the remaining input. Tags are
+//! classified against a hardcoded safe-list / strip-list / block-list.
+//!
+//! This module does NOT parse HTML structure — that work is delegated to
+//! `html_parser::parse_html_inline_fragments` and
+//! `html_parser::parse_html_block_lines`, which both use `html5ever`.
+
+pub(crate) const PHRASING_SAFE_TAGS: &[&str] = &[
+ "a", "b", "br", "code", "del", "em", "i", "ins", "kbd",
+ "mark", "q", "s", "small", "span", "strong", "sub", "sup", "u",
+];
+
+pub(crate) const BLOCK_SAFE_TAGS: &[&str] = &[
+ "blockquote", "caption", "dd", "details", "div", "dl", "dt",
+ "h1", "h2", "h3", "h4", "h5", "h6",
+ "hr", "img", "li", "ol", "p", "pre", "summary",
+ "table", "tbody", "td", "tfoot", "th", "thead", "tr", "ul",
+];
+
+pub(crate) const STRIPPED_TAGS: &[&str] = &[
+ "applet", "body", "button", "embed", "form", "frame", "frameset",
+ "head", "html", "iframe", "input", "link", "meta", "noscript",
+ "object", "script", "style", "title",
+];
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum HtmlSpanKind {
+ PhrasingSafe,
+ BlockSafe,
+ Stripped,
+ Unknown,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) struct HtmlSpan<'a> {
+ /// The entire matched span, including `` … `` (or just the open tag if
+ /// self-closing or unmatched).
+ pub(crate) raw: &'a str,
+ /// Lowercased tag name.
+ pub(crate) tag: &'a str,
+ pub(crate) kind: HtmlSpanKind,
+}
+
+impl HtmlSpan<'_> {
+ pub(crate) fn is_block(&self) -> bool {
+ matches!(self.kind, HtmlSpanKind::BlockSafe)
+ }
+ pub(crate) fn is_phrasing(&self) -> bool {
+ matches!(self.kind, HtmlSpanKind::PhrasingSafe)
+ }
+ pub(crate) fn is_stripped(&self) -> bool {
+ matches!(self.kind, HtmlSpanKind::Stripped)
+ }
+}
+
+pub(crate) fn classify(tag: &str) -> HtmlSpanKind {
+ let lower = tag.to_ascii_lowercase();
+ if PHRASING_SAFE_TAGS.iter().any(|t| *t == lower) {
+ HtmlSpanKind::PhrasingSafe
+ } else if BLOCK_SAFE_TAGS.iter().any(|t| *t == lower) {
+ HtmlSpanKind::BlockSafe
+ } else if STRIPPED_TAGS.iter().any(|t| *t == lower) {
+ HtmlSpanKind::Stripped
+ } else {
+ HtmlSpanKind::Unknown
+ }
+}
+
+pub(crate) fn try_lex_html_span(input: &str) -> Option<(HtmlSpan<'_>, &str)> {
+ let bytes = input.as_bytes();
+ if bytes.first() != Some(&b'<') {
+ return None;
+ }
+ // HTML comment:
+ if input.starts_with("").map(|i| i + 4 + 3).unwrap_or(input.len());
+ return Some((
+ HtmlSpan { raw: &input[..end], tag: "", kind: HtmlSpanKind::Stripped },
+ &input[end..],
+ ));
+ }
+ let mut idx = 1;
+ let is_close = bytes.get(idx) == Some(&b'/');
+ if is_close {
+ idx += 1;
+ }
+ let tag_start = idx;
+ while idx < bytes.len() && (bytes[idx].is_ascii_alphanumeric() || bytes[idx] == b'-') {
+ idx += 1;
+ }
+ if idx == tag_start {
+ return None; // not a tag
+ }
+ let tag = &input[tag_start..idx];
+ // For closing tags we have no body to scan; just find '>'.
+ if is_close {
+ let close_offset = input[idx..].find('>')?;
+ let after = idx + close_offset + 1;
+ return Some((
+ HtmlSpan { raw: &input[..after], tag, kind: classify(tag) },
+ &input[after..],
+ ));
+ }
+ // Skip the open-tag's attributes. We need to honor quoted attribute values
+ // so we don't mistake `'>'` inside a quote for end-of-tag.
+ let mut quote: Option = None;
+ let mut self_closing = false;
+ while idx < bytes.len() {
+ let c = bytes[idx];
+ match (quote, c) {
+ (Some(q), c) if c == q => quote = None,
+ (None, b'"') | (None, b'\'') => quote = Some(c),
+ (None, b'/') if bytes.get(idx + 1) == Some(&b'>') => {
+ self_closing = true;
+ idx += 2;
+ break;
+ }
+ (None, b'>') => {
+ idx += 1;
+ break;
+ }
+ _ => {}
+ }
+ idx += 1;
+ }
+ // Void / self-closing elements end here.
+ let void_tags = ["br", "hr", "img", "input", "meta", "link"];
+ if self_closing || void_tags.iter().any(|t| tag.eq_ignore_ascii_case(t)) {
+ return Some((
+ HtmlSpan { raw: &input[..idx], tag, kind: classify(tag) },
+ &input[idx..],
+ ));
+ }
+ // Scan for matching close tag, honoring nested same-tag pairs.
+ let mut depth = 1usize;
+ let mut scan = idx;
+ while scan < bytes.len() {
+ let Some(rel) = input[scan..].find('<') else { break };
+ scan += rel;
+ if input[scan..].starts_with("") {
+ Some(i) => scan + 4 + i + 3,
+ None => bytes.len(),
+ };
+ continue;
+ }
+ let is_close_here = bytes.get(scan + 1) == Some(&b'/');
+ let name_start = scan + if is_close_here { 2 } else { 1 };
+ let mut name_end = name_start;
+ while name_end < bytes.len()
+ && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
+ {
+ name_end += 1;
+ }
+ let same = input[name_start..name_end].eq_ignore_ascii_case(tag);
+ let close_rel = match input[name_end..].find('>') {
+ Some(i) => i + 1,
+ None => bytes.len() - name_end,
+ };
+ let after_tag = name_end + close_rel;
+ if same {
+ if is_close_here {
+ depth -= 1;
+ if depth == 0 {
+ return Some((
+ HtmlSpan { raw: &input[..after_tag], tag, kind: classify(tag) },
+ &input[after_tag..],
+ ));
+ }
+ } else if !void_tags.iter().any(|t| (*t).eq_ignore_ascii_case(tag)) {
+ depth += 1;
+ }
+ }
+ scan = after_tag;
+ }
+ // No matching close; treat the open tag alone as the span.
+ Some((
+ HtmlSpan { raw: &input[..idx], tag, kind: classify(tag) },
+ &input[idx..],
+ ))
+}
+
+#[cfg(test)]
+#[path = "gfm_html_tests.rs"]
+mod tests;
diff --git a/crates/markdown_parser/src/gfm_html_tests.rs b/crates/markdown_parser/src/gfm_html_tests.rs
new file mode 100644
index 000000000..23d818449
--- /dev/null
+++ b/crates/markdown_parser/src/gfm_html_tests.rs
@@ -0,0 +1,68 @@
+use super::*;
+
+#[test]
+fn lex_simple_open_close() {
+ let (span, rest) = try_lex_html_span("hi rest").unwrap();
+ assert_eq!(span.raw, "hi");
+ assert_eq!(span.tag, "b");
+ assert_eq!(span.kind, HtmlSpanKind::PhrasingSafe);
+ assert_eq!(rest, " rest");
+}
+
+#[test]
+fn lex_self_closing() {
+ let (span, rest) = try_lex_html_span("
after").unwrap();
+ assert_eq!(span.raw, "
");
+ assert_eq!(rest, "after");
+}
+
+#[test]
+fn lex_void_no_slash() {
+ let (span, rest) = try_lex_html_span("
after").unwrap();
+ assert_eq!(span.raw, "
");
+ assert_eq!(rest, "after");
+}
+
+#[test]
+fn lex_attribute_with_gt_in_quote() {
+ let (span, rest) = try_lex_html_span(r#"link!"#).unwrap();
+ assert_eq!(span.raw, r#"link"#);
+ assert_eq!(rest, "!");
+}
+
+#[test]
+fn lex_nested_same_tag() {
+ let (span, rest) =
+ try_lex_html_span("x tail").unwrap();
+ assert_eq!(span.raw, "x ");
+ assert_eq!(rest, "tail");
+}
+
+#[test]
+fn lex_missing_close_returns_open_only() {
+ let (span, rest) = try_lex_html_span("x and more").unwrap();
+ assert_eq!(span.raw, "");
+ assert_eq!(rest, "x and more");
+}
+
+#[test]
+fn lex_comment_stripped() {
+ let (span, rest) = try_lex_html_span("after").unwrap();
+ assert!(span.is_stripped());
+ assert_eq!(rest, "after");
+}
+
+#[test]
+fn lex_not_a_tag_returns_none() {
+ assert!(try_lex_html_span("< not a tag").is_none());
+ assert!(try_lex_html_span("a ").is_none());
+}
+
+#[test]
+fn lex_classification() {
+ assert_eq!(classify("kbd"), HtmlSpanKind::PhrasingSafe);
+ assert_eq!(classify("details"), HtmlSpanKind::BlockSafe);
+ assert_eq!(classify("script"), HtmlSpanKind::Stripped);
+ assert_eq!(classify("foo"), HtmlSpanKind::Unknown);
+ assert_eq!(classify("DETAILS"), HtmlSpanKind::BlockSafe, "case-insensitive");
+}
diff --git a/crates/markdown_parser/src/lib.rs b/crates/markdown_parser/src/lib.rs
index f2bc1bc61..f3e052b11 100644
--- a/crates/markdown_parser/src/lib.rs
+++ b/crates/markdown_parser/src/lib.rs
@@ -6,6 +6,7 @@ use std::{collections::VecDeque, fmt, fmt::Debug};
pub mod html_parser;
pub mod markdown_parser;
pub mod weight;
+mod gfm_html;
pub use html_parser::parse_html;
use itertools::Itertools;
pub use markdown_parser::{
From 1fc08cd47bcb9c2d14690e178d74d77b98ec432f Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:23:46 -0500
Subject: [PATCH 05/18] feat(markdown_parser): dispatch block html in markdown
body
---
crates/markdown_parser/src/markdown_parser.rs | 37 +++++++++++++++++++
.../src/markdown_parser_tests.rs | 32 ++++++++++++++++
2 files changed, 69 insertions(+)
diff --git a/crates/markdown_parser/src/markdown_parser.rs b/crates/markdown_parser/src/markdown_parser.rs
index b8742762f..a165ce7a7 100644
--- a/crates/markdown_parser/src/markdown_parser.rs
+++ b/crates/markdown_parser/src/markdown_parser.rs
@@ -188,6 +188,15 @@ fn parse_markdown_internal<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
let mut remaining = markdown;
let mut lines = Vec::new();
while !remaining.is_empty() {
+ // Try block HTML first; if it matches, splice in its result.
+ if let Ok((rest, html_lines)) = parse_block_html::(remaining) {
+ if !html_lines.is_empty() {
+ lines.extend(html_lines);
+ }
+ remaining = rest;
+ indentation_context.borrow_mut().clear();
+ continue;
+ }
let (remaining_after_block, mut line) = block(remaining)?;
remaining = remaining_after_block;
@@ -221,6 +230,34 @@ fn parse_markdown_internal<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
Ok((remaining, lines))
}
+/// Parse a block-level HTML span, dispatching to the html_parser helper for rendering.
+fn parse_block_html<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
+ input: &'a str,
+) -> IResult<&'a str, Vec, E> {
+ use crate::gfm_html::{HtmlSpanKind, try_lex_html_span};
+ use crate::html_parser::parse_html_block_lines;
+
+ let Some((span, rest)) = try_lex_html_span(input) else {
+ return Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)));
+ };
+ match span.kind {
+ HtmlSpanKind::BlockSafe => {
+ let lines = parse_html_block_lines(span.raw);
+ // Consume one trailing newline if present so we don't emit a spurious blank line.
+ let rest = rest.strip_prefix('\n').unwrap_or(rest);
+ Ok((rest, lines))
+ }
+ HtmlSpanKind::Stripped => {
+ let rest = rest.strip_prefix('\n').unwrap_or(rest);
+ Ok((rest, Vec::new()))
+ }
+ HtmlSpanKind::PhrasingSafe | HtmlSpanKind::Unknown => {
+ // Not a block — let the paragraph parser handle it.
+ Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)))
+ }
+ }
+}
+
/// Parse a single paragraph of Markdown text.
fn parse_paragraph<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
markdown: &'a str,
diff --git a/crates/markdown_parser/src/markdown_parser_tests.rs b/crates/markdown_parser/src/markdown_parser_tests.rs
index 8c69fdfc1..f4e6f275e 100644
--- a/crates/markdown_parser/src/markdown_parser_tests.rs
+++ b/crates/markdown_parser/src/markdown_parser_tests.rs
@@ -2752,3 +2752,35 @@ fn test_parse_table_with_strikethrough() {
panic!("Expected table");
}
}
+
+#[test]
+fn test_parse_block_html_details() {
+ let source = "Click
Body
\n";
+ let parsed = test_parse_markdown(source);
+ let first = parsed.first().expect("at least one line");
+ match first {
+ FormattedTextLine::Line(fragments) => {
+ let joined: String = fragments.iter().map(|f| f.text.clone()).collect();
+ assert!(joined.starts_with("▾ Click"), "got {joined:?}");
+ }
+ other => panic!("expected Line, got {other:?}"),
+ }
+}
+
+#[test]
+fn test_parse_block_html_strips_script() {
+ let source = "before\n\nafter\n";
+ let parsed = test_parse_markdown(source);
+ let texts: Vec = parsed
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(fragments) => {
+ Some(fragments.iter().map(|f| f.text.clone()).collect::())
+ }
+ _ => None,
+ })
+ .collect();
+ assert!(texts.iter().any(|t| t == "before"));
+ assert!(texts.iter().any(|t| t == "after"));
+ assert!(!texts.iter().any(|t| t.contains("alert")), "script body must be stripped: {texts:?}");
+}
From 0095b4d95d980dda25169f3e930fd3058d9b5348 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:24:51 -0500
Subject: [PATCH 06/18] docs(specs): add gfm markdown preview spec and
implementation plan
---
.../PLAN-01-html-and-footnotes.md | 1887 +++++++++++++++++
specs/castcodes-md-gfm-html/PRODUCT.md | 94 +
specs/castcodes-md-gfm-html/TECH.md | 212 ++
3 files changed, 2193 insertions(+)
create mode 100644 specs/castcodes-md-gfm-html/PLAN-01-html-and-footnotes.md
create mode 100644 specs/castcodes-md-gfm-html/PRODUCT.md
create mode 100644 specs/castcodes-md-gfm-html/TECH.md
diff --git a/specs/castcodes-md-gfm-html/PLAN-01-html-and-footnotes.md b/specs/castcodes-md-gfm-html/PLAN-01-html-and-footnotes.md
new file mode 100644
index 000000000..7b3b1047c
--- /dev/null
+++ b/specs/castcodes-md-gfm-html/PLAN-01-html-and-footnotes.md
@@ -0,0 +1,1887 @@
+# GFM Markdown Preview — Plan 1: HTML + Footnotes
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use `superpowers:subagent-driven-development` (recommended) or `superpowers:executing-plans` to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. Read [`PRODUCT.md`](./PRODUCT.md) and [`TECH.md`](./TECH.md) before starting; this plan assumes both have been read.
+>
+> **Signing rule:** Every `git commit` in this plan MUST pass `-S`. The repo's local `commit.gpgsign` is false; without `-S` commits land unsigned. After each commit, run `git log -1 --show-signature` and confirm a `Good "git" signature` line. If signing failed, STOP and surface to the user.
+>
+> **AI attribution rule:** Commit messages and any file added by this plan MUST contain no AI-attribution markers. Do NOT add `Co-Authored-By:` lines naming an AI tool, model, or harness. Do NOT add "Generated with …" footers. Run `./script/check_ai_attribution` before every commit.
+>
+> **Branch verification rule:** Before each commit run `git branch --show-current`. If the branch is not `cast/md-html-support` (or whatever branch you started this work on), STOP — a parallel call may have switched HEAD.
+
+**Goal:** Render the GFM HTML safe-list and footnotes inside `.md` files when viewed in CodeView's Rendered preview, with no regression to existing markdown rendering or HTML-paste handling.
+
+**Architecture:** All work lives in `crates/markdown_parser/`. Refactor `html_parser.rs` to expose two pub-crate helpers (`parse_html_inline_fragments`, `parse_html_block_lines`) that the existing `parse_html` entry point also uses. Add a new `gfm_html.rs` with the safe-list constants and a hand-rolled HTML-span lexer. Wire the lexer into `parse_markdown_internal` (block context) and `parse_inline_token` (inline context). Add a new `footnotes.rs` with a two-pass extract-and-rewrite implementation called from `parse_markdown_impl`. No editor, renderer, AI-block, or notebook changes.
+
+**Tech Stack:** Rust 2024 edition, `nom` parser combinators (existing), `html5ever` + `markup5ever_rcdom` (existing), `anyhow` (existing). No new dependencies.
+
+---
+
+## Files created or modified
+
+**Created:**
+
+| Path | Responsibility |
+|---|---|
+| `crates/markdown_parser/src/gfm_html.rs` | Safe-list / stripped-list tag constants; `try_lex_html_span` lexer; `HtmlSpan` type. |
+| `crates/markdown_parser/src/gfm_html_tests.rs` | Unit tests for the lexer (open/close pairing, self-closing, attributes, nesting, missing close). |
+| `crates/markdown_parser/src/footnotes.rs` | `FootnoteDef` collection, reference rewriting, appended-section construction. |
+| `crates/markdown_parser/src/footnotes_tests.rs` | Unit tests for the two-pass footnote pipeline. |
+| `crates/markdown_parser/test-fixtures/gfm-smoketest.md` | Manual smoke fixture for CodeView visual verification (not a runnable test). |
+
+**Modified:**
+
+| Path | Change |
+|---|---|
+| `crates/markdown_parser/src/lib.rs` | Add `mod gfm_html;` and `mod footnotes;` lines. No public API changes. |
+| `crates/markdown_parser/src/html_parser.rs` | Refactor: split the existing `parse_html` body into two new `pub(crate)` helpers `parse_html_inline_fragments(html: &str) -> Vec` and `parse_html_block_lines(html: &str) -> Vec`. `parse_html` becomes a thin wrapper. Extend the per-element match to handle `details`, `summary`, `kbd`, `sub`, `sup`, `del`, `mark`, raw ``, raw `
`. |
+| `crates/markdown_parser/src/markdown_parser.rs` | Add a block-level HTML branch in `parse_markdown_internal`'s `alt` chain. Add an inline-level HTML branch in `parse_inline_token`. Wrap the existing `parse_markdown_impl` call with the footnote pipeline. |
+| `crates/markdown_parser/src/markdown_parser_tests.rs` | Add the new HTML + footnote test cases (listed in TECH §6). |
+| `crates/markdown_parser/src/html_parser_tests.rs` | Add cases that confirm the refactor's existing-paste behavior is unchanged. |
+| `DESIGN-CHANGES.md` | Append a short entry noting GFM HTML + footnote support in the `.md` preview. |
+
+---
+
+## Phase 0 — Branch / environment checks
+
+### Task 0.1: Confirm working environment
+
+**Files:** none.
+
+- [ ] **Step 1: Confirm branch and clean working tree (besides expected auth files)**
+
+Run:
+```bash
+git branch --show-current
+git status
+```
+Expected: branch `cast/md-html-support`; only `app/src/auth/auth_manager.rs` and `app/src/auth/auth_state.rs` show as modified (pre-existing changes from prior session — leave them alone).
+
+- [ ] **Step 2: Confirm signing key is set**
+
+Run:
+```bash
+git config --get user.signingkey
+git config --get gpg.format
+```
+Expected: both return non-empty values. If either is empty, STOP and tell the user signing is misconfigured.
+
+- [ ] **Step 3: Confirm AI-attribution guard is executable**
+
+Run:
+```bash
+./script/check_ai_attribution
+echo "exit=$?"
+```
+Expected: prints `AI attribution guard passed.` and `exit=0`.
+
+- [ ] **Step 4: Establish a baseline test run for the parser crate**
+
+Run:
+```bash
+cargo test -p markdown_parser --quiet
+```
+Expected: all tests pass. Note the count. If anything fails, STOP — the baseline is broken before you've made any changes.
+
+---
+
+## Phase 1 — Refactor `html_parser.rs` to expose pub-crate helpers (no behavior change)
+
+The existing `parse_html` does three things in one body: parse the document, walk top-level elements (with a skip-list), and dispatch per-element. Phase 1 splits this so the markdown parser can call into the same per-element logic for HTML fragments embedded in `.md`.
+
+### Task 1.1: Add a regression baseline test for `parse_html`
+
+**Files:**
+- Modify: `crates/markdown_parser/src/html_parser_tests.rs`
+
+- [ ] **Step 1: Add the regression test**
+
+Append to `crates/markdown_parser/src/html_parser_tests.rs`:
+
+```rust
+#[test]
+fn test_parse_html_fragment_after_refactor_inline() {
+ // Regression: a bare inline fragment (no /) must still parse.
+ // This is what the markdown parser will hand into the new pub(crate) helper.
+ let html = "hi there";
+ let parsed = parse_html(html).expect("HTML should parse");
+ assert!(!parsed.lines.is_empty());
+}
+
+#[test]
+fn test_parse_html_fragment_after_refactor_block() {
+ // Regression: a fragment containing only a block element must still parse.
+ let html = "hello
world
";
+ let parsed = parse_html(html).expect("HTML should parse");
+ assert_eq!(parsed.lines.len(), 2);
+}
+```
+
+- [ ] **Step 2: Run the tests to confirm they pass on current code**
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_html_fragment_after_refactor --quiet
+```
+Expected: both tests PASS (current `parse_html` already handles these fragments via html5ever's body auto-creation).
+
+- [ ] **Step 3: Commit the baseline tests**
+
+Run:
+```bash
+git branch --show-current # must print: cast/md-html-support
+./script/check_ai_attribution
+git add crates/markdown_parser/src/html_parser_tests.rs
+git commit -S -m "test(markdown_parser): pin html fragment parsing behavior before refactor"
+git log -1 --show-signature # confirm: Good "git" signature
+```
+
+If `git log -1 --show-signature` does NOT print `Good "git" signature`, STOP.
+
+### Task 1.2: Extract `parse_html_inline_fragments` and `parse_html_block_lines`
+
+**Files:**
+- Modify: `crates/markdown_parser/src/html_parser.rs`
+
+- [ ] **Step 1: Locate the body of `parse_html`**
+
+`parse_html` currently lives at `crates/markdown_parser/src/html_parser.rs:127`. Read it (lines 127–367) before editing — you'll be cutting a large block.
+
+- [ ] **Step 2: Add the two pub-crate helpers above `parse_html`**
+
+In `crates/markdown_parser/src/html_parser.rs`, immediately above the existing `pub fn parse_html`, add:
+
+```rust
+/// Parse an HTML fragment and return its block-level lines.
+///
+/// Used by the markdown parser when it encounters a block-level HTML span in a `.md` file.
+/// Unlike `parse_html`, this assumes the caller already classified the span as block-level
+/// and does not perform any top-level element skipping.
+pub(crate) fn parse_html_block_lines(html: &str) -> Vec {
+ parse_html(html)
+ .map(|formatted| formatted.lines.into_iter().collect())
+ .unwrap_or_default()
+}
+
+/// Parse an HTML fragment and return its inline phrasing fragments.
+///
+/// Used by the markdown parser when it encounters an inline-level HTML span in a `.md` file.
+/// Walks `html5ever`'s tree and applies `parse_phrasing_content` to every text/element node it
+/// finds, ignoring block structure.
+pub(crate) fn parse_html_inline_fragments(html: &str) -> Vec {
+ let opts = ParseOpts {
+ tree_builder: TreeBuilderOpts {
+ drop_doctype: true,
+ ..Default::default()
+ },
+ ..Default::default()
+ };
+ let dom = match parse_document(RcDom::default(), opts)
+ .from_utf8()
+ .read_from(&mut html.as_bytes())
+ {
+ Ok(dom) => dom,
+ Err(_) => return Vec::new(),
+ };
+
+ // html5ever wraps fragments in ….
+ // Walk down to and collect its children for phrasing.
+ let body = find_body(&dom.document);
+ let children: Vec> = match body {
+ Some(body) => body.children.borrow().iter().cloned().collect(),
+ None => return Vec::new(),
+ };
+ parse_phrasing_content(&children, Styling::default())
+}
+
+fn find_body(node: &Rc) -> Option> {
+ if let NodeData::Element { name, .. } = &node.data
+ && name.local.as_ref() == "body"
+ {
+ return Some(Rc::clone(node));
+ }
+ for child in node.children.borrow().iter() {
+ if let Some(found) = find_body(child) {
+ return Some(found);
+ }
+ }
+ None
+}
+```
+
+- [ ] **Step 3: Run the existing html_parser tests and the new regression tests**
+
+Run:
+```bash
+cargo test -p markdown_parser html_parser --quiet
+cargo test -p markdown_parser test_parse_html_fragment_after_refactor --quiet
+```
+Expected: all pass. If `find_body` or any new symbol fails to compile, fix the import — `Rc` and `NodeData` are already imported in this file.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/html_parser.rs
+git commit -S -m "refactor(markdown_parser): expose pub(crate) html fragment helpers"
+git log -1 --show-signature
+```
+
+### Task 1.3: Add per-element handling for new safe-list tags inside `html_parser.rs`
+
+**Files:**
+- Modify: `crates/markdown_parser/src/html_parser.rs`
+
+The existing per-element match (lines 267–350 of the original file) handles `pre`, `h1`–`h6`, `br`, `hr`, plus the phrasing tags inside `parse_phrasing_content`. We need to add: `details`, `summary`, `kbd`, `sub`, `sup`, `del`, `mark`, raw `
`, raw `` (full mapping per TECH §2).
+
+- [ ] **Step 1: Add the failing test for `` with ``**
+
+Append to `crates/markdown_parser/src/html_parser_tests.rs`:
+
+```rust
+#[test]
+fn test_parse_html_details_with_summary() {
+ let html = "Click me
Hidden text
";
+ let parsed = parse_html(html).expect("HTML should parse");
+ let lines: Vec<_> = parsed.lines.iter().collect();
+ // First line: "▾ Click me"
+ match lines.first() {
+ Some(FormattedTextLine::Line(fragments)) => {
+ let joined: String = fragments.iter().map(|f| f.text.clone()).collect();
+ assert!(joined.starts_with("▾ "), "expected disclosure glyph, got {joined:?}");
+ assert!(joined.contains("Click me"), "expected summary text, got {joined:?}");
+ }
+ other => panic!("expected first line to be Line(_), got {other:?}"),
+ }
+ // The body must follow.
+ assert!(lines.len() >= 2, "expected details body line, got {lines:?}");
+}
+```
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_html_details_with_summary --quiet
+```
+Expected: FAIL (the current parser does not recognize ``/`` specifically).
+
+- [ ] **Step 2: Wire `` handling**
+
+In `crates/markdown_parser/src/html_parser.rs`, locate the `TOP_LEVEL_ELEMENT_TAGS_TO_SKIP` constant at lines 24–26. Remove `"table"` from that list (we now want to render raw tables) and add a new branch in the element match for `details`. Replace the relevant section so the per-element dispatch (`match node_name.as_str()`) includes:
+
+```rust
+"details" => {
+ // Always-expanded rendering: emit a "▾ " line, then walk the children
+ // (excluding the element itself) into block lines.
+ let children = node.children.borrow();
+ let summary_node = children.iter().find(|child| {
+ matches!(&child.data, NodeData::Element { name, .. } if name.local.as_ref() == "summary")
+ });
+ let summary_text = match summary_node {
+ Some(summary) => {
+ let summary_children: Vec> =
+ summary.children.borrow().iter().cloned().collect();
+ parse_phrasing_content(&summary_children, Styling::default())
+ }
+ None => vec![FormattedTextFragment::plain_text("Details")],
+ };
+ let mut first_line = vec![FormattedTextFragment::plain_text("▾ ")];
+ first_line.extend(summary_text);
+ result.push_back(FormattedTextLine::Line(first_line));
+ // Queue non-summary children for block parsing.
+ for child in children.iter().rev() {
+ let is_summary = matches!(
+ &child.data,
+ NodeData::Element { name, .. } if name.local.as_ref() == "summary"
+ );
+ if !is_summary {
+ nodes.push((Rc::clone(child), indent_level.clone()));
+ }
+ }
+ last_active_indent_level = indent_level;
+ continue;
+}
+"summary" => {
+ // Stray outside a renders as a plain line.
+ let summary_children: Vec> =
+ node.children.borrow().iter().cloned().collect();
+ let line = parse_phrasing_content(&summary_children, Styling::default());
+ if !line.is_empty() {
+ result.push_back(FormattedTextLine::Line(line));
+ }
+ last_active_indent_level = indent_level;
+ continue;
+}
+```
+
+The exact placement is inside the `NodeData::Element { name, attrs, .. } => { … }` branch, immediately after the `TOP_LEVEL_ELEMENT_TAGS_TO_SKIP` `continue` and before the phrasing-tag `continue`. Read the surrounding code to ensure the borrowing patterns match (the existing code calls `node.children.borrow()` repeatedly; keep that pattern).
+
+- [ ] **Step 3: Run the test to confirm it passes**
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_html_details_with_summary --quiet
+```
+Expected: PASS.
+
+- [ ] **Step 4: Add and pass tests for raw `` and `
` block**
+
+Append to `crates/markdown_parser/src/html_parser_tests.rs`:
+
+```rust
+#[test]
+fn test_parse_html_raw_table() {
+ let html = "";
+ let parsed = parse_html(html).expect("HTML should parse");
+ let table = parsed
+ .lines
+ .iter()
+ .find_map(|line| match line {
+ FormattedTextLine::Table(t) => Some(t),
+ _ => None,
+ })
+ .expect("expected a Table line");
+ assert_eq!(table.headers.len(), 2);
+ assert_eq!(table.rows.len(), 1);
+ assert_eq!(table.rows[0].len(), 2);
+}
+
+#[test]
+fn test_parse_html_img_block() {
+ let html = r#"
"#;
+ let parsed = parse_html(html).expect("HTML should parse");
+ let image = parsed
+ .lines
+ .iter()
+ .find_map(|line| match line {
+ FormattedTextLine::Image(img) => Some(img),
+ _ => None,
+ })
+ .expect("expected an Image line");
+ assert_eq!(image.source, "cat.png");
+ assert_eq!(image.alt_text, "A cat");
+}
+```
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_html_raw_table test_parse_html_img_block --quiet
+```
+Expected: FAIL initially. Then add the per-element handling:
+
+In the `match node_name.as_str()` block, add:
+
+```rust
+"table" => build_table_from_html(&node.children.borrow()),
+"img" => {
+ let attrs = attrs.borrow();
+ let source = get_attribute(&attrs, "src").unwrap_or("").to_string();
+ let alt_text = get_attribute(&attrs, "alt").unwrap_or("").to_string();
+ FormattedTextLine::Image(crate::FormattedImage {
+ alt_text,
+ source,
+ title: None,
+ })
+}
+```
+
+Below `parse_code_block_and_language` (near the end of the file), add a new helper:
+
+```rust
+fn build_table_from_html(rows: &[Rc]) -> FormattedTextLine {
+ use crate::{FormattedTable, TableAlignment};
+
+ let mut headers: Vec = Vec::new();
+ let mut body_rows: Vec> = Vec::new();
+
+ fn walk(
+ nodes: &[Rc],
+ headers: &mut Vec,
+ body_rows: &mut Vec>,
+ ) {
+ for node in nodes {
+ let NodeData::Element { name, .. } = &node.data else { continue };
+ match name.local.as_ref() {
+ "thead" => {
+ for tr in node.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &tr.data else { continue };
+ if name.local.as_ref() != "tr" { continue }
+ for cell in tr.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &cell.data else { continue };
+ if name.local.as_ref() != "th" && name.local.as_ref() != "td" {
+ continue;
+ }
+ let cell_children: Vec> =
+ cell.children.borrow().iter().cloned().collect();
+ headers.push(parse_phrasing_content(&cell_children, Styling::default()));
+ }
+ }
+ }
+ "tbody" | "tfoot" => {
+ for tr in node.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &tr.data else { continue };
+ if name.local.as_ref() != "tr" { continue }
+ let mut row: Vec = Vec::new();
+ for cell in tr.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &cell.data else { continue };
+ if name.local.as_ref() != "th" && name.local.as_ref() != "td" {
+ continue;
+ }
+ let cell_children: Vec> =
+ cell.children.borrow().iter().cloned().collect();
+ row.push(parse_phrasing_content(&cell_children, Styling::default()));
+ }
+ body_rows.push(row);
+ }
+ }
+ "tr" => {
+ // with no / wrapper: first is header.
+ let mut row: Vec = Vec::new();
+ for cell in node.children.borrow().iter() {
+ let NodeData::Element { name, .. } = &cell.data else { continue };
+ if name.local.as_ref() != "th" && name.local.as_ref() != "td" {
+ continue;
+ }
+ let cell_children: Vec> =
+ cell.children.borrow().iter().cloned().collect();
+ row.push(parse_phrasing_content(&cell_children, Styling::default()));
+ }
+ if headers.is_empty() {
+ headers = row;
+ } else {
+ body_rows.push(row);
+ }
+ }
+ _ => walk(&node.children.borrow(), headers, body_rows),
+ }
+ }
+ }
+
+ walk(rows, &mut headers, &mut body_rows);
+
+ let col_count = headers.len().max(body_rows.iter().map(Vec::len).max().unwrap_or(0));
+ let mut table = FormattedTable {
+ headers,
+ alignments: vec![TableAlignment::Left; col_count],
+ rows: body_rows,
+ };
+ table.normalize_shape();
+ FormattedTextLine::Table(table)
+}
+```
+
+NOTE: the `headers = row;` line inside the nested closure won't compile because `headers` is an `&mut Vec` borrow. Use `*headers = row;` instead. The replacement code above is in the same spirit as the existing `parse_phrasing_content` recursion, where the helpers take `&mut Vec`. When editing, double-check borrow semantics against your local error output and adjust by adding `*` where needed.
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_html_raw_table test_parse_html_img_block --quiet
+```
+Expected: PASS. If clippy complains about the closure-borrow pattern, refactor to a free function that takes `&mut Vec` and returns `()`.
+
+- [ ] **Step 5: Add per-tag tests for kbd, sub, sup, del, mark**
+
+Append to `crates/markdown_parser/src/html_parser_tests.rs`:
+
+```rust
+#[test]
+fn test_parse_html_phrasing_tags() {
+ let html = "Press Cmd for 2 + 3, \
+ old, marked.
";
+ let parsed = parse_html(html).expect("HTML should parse");
+ let fragments: Vec<_> = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags),
+ _ => None,
+ })
+ .flatten()
+ .cloned()
+ .collect();
+ let by_text = |needle: &str| {
+ fragments
+ .iter()
+ .find(|f| f.text == needle)
+ .cloned()
+ .unwrap_or_else(|| panic!("no fragment {needle:?} in {fragments:?}"))
+ };
+
+ assert!(by_text("Cmd").styles.inline_code, "kbd → inline_code");
+ assert!(by_text("2").styles.italic, "sub → italic");
+ assert!(by_text("3").styles.italic, "sup → italic");
+ assert!(by_text("old").styles.strikethrough, "del → strikethrough");
+ // has no equivalent style flag for v1 — passes through as plain text.
+ let marked = by_text("marked");
+ assert!(!marked.styles.inline_code);
+ assert_eq!(marked.styles.weight, None);
+}
+```
+
+In `crates/markdown_parser/src/html_parser.rs`, find the `match node_name.as_ref()` inside `parse_phrasing_content` (currently lines 442–451). Replace it with:
+
+```rust
+match node_name.as_ref() {
+ "b" | "strong" => decorated_styling.bold = true,
+ "i" | "em" => decorated_styling.italic = true,
+ "s" | "del" => decorated_styling.strikethrough = true,
+ "u" | "ins" => decorated_styling.underline = true,
+ "code" | "kbd" => decorated_styling.inline_code = true,
+ "sub" | "sup" => decorated_styling.italic = true,
+ "mark" | "small" | "q" | "span" => {}
+ _ => (),
+};
+```
+
+Also update `PHRASING_ELEMENT_TAGS` at line 27–29 to include the newcomers so the dispatcher routes them as phrasing rather than block:
+
+```rust
+const PHRASING_ELEMENT_TAGS: &[&str] = &[
+ "span", "i", "code", "strong", "em", "br", "a", "s", "u", "ins",
+ "del", "kbd", "sub", "sup", "mark", "small", "q",
+];
+```
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_html_phrasing_tags --quiet
+```
+Expected: PASS.
+
+- [ ] **Step 6: Run the full html_parser test suite**
+
+```bash
+cargo test -p markdown_parser html_parser --quiet
+```
+Expected: all pass (existing + new).
+
+- [ ] **Step 7: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/html_parser.rs crates/markdown_parser/src/html_parser_tests.rs
+git commit -S -m "feat(markdown_parser): render details/kbd/sub/sup/del/raw table/img html"
+git log -1 --show-signature
+```
+
+---
+
+## Phase 2 — HTML span lexer (`gfm_html.rs`)
+
+### Task 2.1: Scaffold `gfm_html.rs` with safe-list constants
+
+**Files:**
+- Create: `crates/markdown_parser/src/gfm_html.rs`
+- Modify: `crates/markdown_parser/src/lib.rs`
+
+- [ ] **Step 1: Create the file**
+
+Write `crates/markdown_parser/src/gfm_html.rs`:
+
+```rust
+//! GFM-style HTML span lexing for the markdown parser.
+//!
+//! `try_lex_html_span` recognizes a single HTML span starting at the current
+//! input position and returns it together with the remaining input. Tags are
+//! classified against a hardcoded safe-list / strip-list / block-list.
+//!
+//! This module does NOT parse HTML structure — that work is delegated to
+//! `html_parser::parse_html_inline_fragments` and
+//! `html_parser::parse_html_block_lines`, which both use `html5ever`.
+
+pub(crate) const PHRASING_SAFE_TAGS: &[&str] = &[
+ "a", "b", "br", "code", "del", "em", "i", "ins", "kbd",
+ "mark", "q", "s", "small", "span", "strong", "sub", "sup", "u",
+];
+
+pub(crate) const BLOCK_SAFE_TAGS: &[&str] = &[
+ "blockquote", "caption", "dd", "details", "div", "dl", "dt",
+ "h1", "h2", "h3", "h4", "h5", "h6",
+ "hr", "img", "li", "ol", "p", "pre", "summary",
+ "table", "tbody", "td", "tfoot", "th", "thead", "tr", "ul",
+];
+
+pub(crate) const STRIPPED_TAGS: &[&str] = &[
+ "applet", "body", "button", "embed", "form", "frame", "frameset",
+ "head", "html", "iframe", "input", "link", "meta", "noscript",
+ "object", "script", "style", "title",
+];
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum HtmlSpanKind {
+ PhrasingSafe,
+ BlockSafe,
+ Stripped,
+ Unknown,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) struct HtmlSpan<'a> {
+ /// The entire matched span, including `` … `` (or just the open tag if
+ /// self-closing or unmatched).
+ pub(crate) raw: &'a str,
+ /// Lowercased tag name.
+ pub(crate) tag: &'a str,
+ pub(crate) kind: HtmlSpanKind,
+}
+
+impl HtmlSpan<'_> {
+ pub(crate) fn is_block(&self) -> bool {
+ matches!(self.kind, HtmlSpanKind::BlockSafe)
+ }
+ pub(crate) fn is_phrasing(&self) -> bool {
+ matches!(self.kind, HtmlSpanKind::PhrasingSafe)
+ }
+ pub(crate) fn is_stripped(&self) -> bool {
+ matches!(self.kind, HtmlSpanKind::Stripped)
+ }
+}
+
+pub(crate) fn classify(tag: &str) -> HtmlSpanKind {
+ let lower = tag.to_ascii_lowercase();
+ if PHRASING_SAFE_TAGS.iter().any(|t| *t == lower) {
+ HtmlSpanKind::PhrasingSafe
+ } else if BLOCK_SAFE_TAGS.iter().any(|t| *t == lower) {
+ HtmlSpanKind::BlockSafe
+ } else if STRIPPED_TAGS.iter().any(|t| *t == lower) {
+ HtmlSpanKind::Stripped
+ } else {
+ HtmlSpanKind::Unknown
+ }
+}
+
+pub(crate) fn try_lex_html_span(input: &str) -> Option<(HtmlSpan<'_>, &str)> {
+ let bytes = input.as_bytes();
+ if bytes.first() != Some(&b'<') {
+ return None;
+ }
+ // HTML comment:
+ if input.starts_with("").map(|i| i + 4 + 3).unwrap_or(input.len());
+ return Some((
+ HtmlSpan { raw: &input[..end], tag: "", kind: HtmlSpanKind::Stripped },
+ &input[end..],
+ ));
+ }
+ let mut idx = 1;
+ let is_close = bytes.get(idx) == Some(&b'/');
+ if is_close {
+ idx += 1;
+ }
+ let tag_start = idx;
+ while idx < bytes.len() && (bytes[idx].is_ascii_alphanumeric() || bytes[idx] == b'-') {
+ idx += 1;
+ }
+ if idx == tag_start {
+ return None; // not a tag
+ }
+ let tag = &input[tag_start..idx];
+ let tag_lower_len = tag.len();
+ // For closing tags we have no body to scan; just find '>'.
+ if is_close {
+ let close_offset = input[idx..].find('>')?;
+ let after = idx + close_offset + 1;
+ return Some((
+ HtmlSpan { raw: &input[..after], tag, kind: classify(tag) },
+ &input[after..],
+ ));
+ }
+ // Skip the open-tag's attributes. We need to honor quoted attribute values
+ // so we don't mistake `'>'` inside a quote for end-of-tag.
+ let mut quote: Option = None;
+ let mut self_closing = false;
+ while idx < bytes.len() {
+ let c = bytes[idx];
+ match (quote, c) {
+ (Some(q), c) if c == q => quote = None,
+ (None, b'"') | (None, b'\'') => quote = Some(c),
+ (None, b'/') if bytes.get(idx + 1) == Some(&b'>') => {
+ self_closing = true;
+ idx += 2;
+ break;
+ }
+ (None, b'>') => {
+ idx += 1;
+ break;
+ }
+ _ => {}
+ }
+ idx += 1;
+ }
+ // Void / self-closing elements end here.
+ let void_tags = ["br", "hr", "img", "input", "meta", "link"];
+ let _ = tag_lower_len;
+ if self_closing || void_tags.iter().any(|t| tag.eq_ignore_ascii_case(t)) {
+ return Some((
+ HtmlSpan { raw: &input[..idx], tag, kind: classify(tag) },
+ &input[idx..],
+ ));
+ }
+ // Scan for matching close tag, honoring nested same-tag pairs.
+ let mut depth = 1usize;
+ let mut scan = idx;
+ while scan < bytes.len() {
+ // Skip non-`<` quickly.
+ let Some(rel) = input[scan..].find('<') else { break };
+ scan += rel;
+ // `") {
+ Some(i) => scan + 4 + i + 3,
+ None => bytes.len(),
+ };
+ continue;
+ }
+ let is_close_here = bytes.get(scan + 1) == Some(&b'/');
+ let name_start = scan + if is_close_here { 2 } else { 1 };
+ let mut name_end = name_start;
+ while name_end < bytes.len()
+ && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
+ {
+ name_end += 1;
+ }
+ let same = input[name_start..name_end].eq_ignore_ascii_case(tag);
+ // Find this tag's '>' to advance scan past it.
+ let close_rel = match input[name_end..].find('>') {
+ Some(i) => i + 1,
+ None => bytes.len() - name_end,
+ };
+ let after_tag = name_end + close_rel;
+ if same {
+ if is_close_here {
+ depth -= 1;
+ if depth == 0 {
+ return Some((
+ HtmlSpan { raw: &input[..after_tag], tag, kind: classify(tag) },
+ &input[after_tag..],
+ ));
+ }
+ } else if !void_tags.iter().any(|t| (*t).eq_ignore_ascii_case(tag)) {
+ depth += 1;
+ }
+ }
+ scan = after_tag;
+ }
+ // No matching close; treat the open tag alone as the span.
+ Some((
+ HtmlSpan { raw: &input[..idx], tag, kind: classify(tag) },
+ &input[idx..],
+ ))
+}
+
+#[cfg(test)]
+#[path = "gfm_html_tests.rs"]
+mod tests;
+```
+
+Add `mod gfm_html;` to `crates/markdown_parser/src/lib.rs`. Place it next to the existing `pub mod html_parser;` and `pub mod markdown_parser;` declarations.
+
+```rust
+// crates/markdown_parser/src/lib.rs (around line 6)
+pub mod html_parser;
+pub mod markdown_parser;
+pub mod weight;
+mod gfm_html; // ← new
+```
+
+- [ ] **Step 2: Create the test stub file**
+
+Write `crates/markdown_parser/src/gfm_html_tests.rs`:
+
+```rust
+use super::*;
+
+#[test]
+fn lex_simple_open_close() {
+ let (span, rest) = try_lex_html_span("hi rest").unwrap();
+ assert_eq!(span.raw, "hi");
+ assert_eq!(span.tag, "b");
+ assert_eq!(span.kind, HtmlSpanKind::PhrasingSafe);
+ assert_eq!(rest, " rest");
+}
+
+#[test]
+fn lex_self_closing() {
+ let (span, rest) = try_lex_html_span("
after").unwrap();
+ assert_eq!(span.raw, "
");
+ assert_eq!(rest, "after");
+}
+
+#[test]
+fn lex_void_no_slash() {
+ let (span, rest) = try_lex_html_span("
after").unwrap();
+ assert_eq!(span.raw, "
");
+ assert_eq!(rest, "after");
+}
+
+#[test]
+fn lex_attribute_with_gt_in_quote() {
+ let (span, rest) = try_lex_html_span(r#"link!"#).unwrap();
+ assert_eq!(span.raw, r#"link"#);
+ assert_eq!(rest, "!");
+}
+
+#[test]
+fn lex_nested_same_tag() {
+ let (span, rest) =
+ try_lex_html_span("x tail").unwrap();
+ assert_eq!(span.raw, "x ");
+ assert_eq!(rest, "tail");
+}
+
+#[test]
+fn lex_missing_close_returns_open_only() {
+ let (span, rest) = try_lex_html_span("x and more").unwrap();
+ assert_eq!(span.raw, "");
+ assert_eq!(rest, "x and more");
+}
+
+#[test]
+fn lex_comment_stripped() {
+ let (span, rest) = try_lex_html_span("after").unwrap();
+ assert!(span.is_stripped());
+ assert_eq!(rest, "after");
+}
+
+#[test]
+fn lex_not_a_tag_returns_none() {
+ assert!(try_lex_html_span("< not a tag").is_none());
+ assert!(try_lex_html_span("a ").is_none());
+}
+
+#[test]
+fn lex_classification() {
+ assert_eq!(classify("kbd"), HtmlSpanKind::PhrasingSafe);
+ assert_eq!(classify("details"), HtmlSpanKind::BlockSafe);
+ assert_eq!(classify("script"), HtmlSpanKind::Stripped);
+ assert_eq!(classify("foo"), HtmlSpanKind::Unknown);
+ assert_eq!(classify("DETAILS"), HtmlSpanKind::BlockSafe, "case-insensitive");
+}
+```
+
+- [ ] **Step 3: Run the lexer tests**
+
+```bash
+cargo test -p markdown_parser gfm_html --quiet
+```
+Expected: all 9 tests PASS. If they fail, fix the lexer (most likely culprit: the attribute-in-quote scan or the nested-same-tag depth counter).
+
+- [ ] **Step 4: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/gfm_html.rs crates/markdown_parser/src/gfm_html_tests.rs crates/markdown_parser/src/lib.rs
+git commit -S -m "feat(markdown_parser): add gfm html span lexer with safe-list"
+git log -1 --show-signature
+```
+
+---
+
+## Phase 3 — Wire HTML dispatch into the markdown parser
+
+### Task 3.1: Add block-level HTML dispatch in `parse_markdown_internal`
+
+**Files:**
+- Modify: `crates/markdown_parser/src/markdown_parser.rs`
+- Modify: `crates/markdown_parser/src/markdown_parser_tests.rs`
+
+- [ ] **Step 1: Add the failing test**
+
+Append to `crates/markdown_parser/src/markdown_parser_tests.rs`:
+
+```rust
+#[test]
+fn test_parse_block_html_details() {
+ let source = "Click
Body
\n";
+ let parsed = test_parse_markdown(source);
+ let first = parsed.first().expect("at least one line");
+ match first {
+ FormattedTextLine::Line(fragments) => {
+ let joined: String = fragments.iter().map(|f| f.text.clone()).collect();
+ assert!(joined.starts_with("▾ Click"), "got {joined:?}");
+ }
+ other => panic!("expected Line, got {other:?}"),
+ }
+}
+
+#[test]
+fn test_parse_block_html_strips_script() {
+ let source = "before\n\nafter\n";
+ let parsed = test_parse_markdown(source);
+ let texts: Vec = parsed
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(fragments) => {
+ Some(fragments.iter().map(|f| f.text.clone()).collect::())
+ }
+ _ => None,
+ })
+ .collect();
+ assert!(texts.iter().any(|t| t == "before"));
+ assert!(texts.iter().any(|t| t == "after"));
+ assert!(!texts.iter().any(|t| t.contains("alert")), "script body must be stripped: {texts:?}");
+}
+```
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_block_html_details test_parse_block_html_strips_script --quiet
+```
+Expected: FAIL (both).
+
+- [ ] **Step 2: Add the block dispatch parser**
+
+In `crates/markdown_parser/src/markdown_parser.rs`, near the other block parsers (around line 235), add a new parser:
+
+```rust
+fn parse_block_html<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
+ input: &'a str,
+) -> IResult<&'a str, Vec, E> {
+ use crate::gfm_html::{HtmlSpanKind, try_lex_html_span};
+ use crate::html_parser::parse_html_block_lines;
+
+ // Must be at the start of a line (caller invariant — `parse_markdown_internal`
+ // calls this between lines).
+ let Some((span, rest)) = try_lex_html_span(input) else {
+ return Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)));
+ };
+ match span.kind {
+ HtmlSpanKind::BlockSafe => {
+ let lines = parse_html_block_lines(span.raw);
+ // Consume one trailing newline if present so we don't emit a spurious blank line.
+ let rest = rest.strip_prefix('\n').unwrap_or(rest);
+ Ok((rest, lines))
+ }
+ HtmlSpanKind::Stripped => {
+ let rest = rest.strip_prefix('\n').unwrap_or(rest);
+ Ok((rest, Vec::new()))
+ }
+ HtmlSpanKind::PhrasingSafe | HtmlSpanKind::Unknown => {
+ // Not a block — let the paragraph parser handle it.
+ Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)))
+ }
+ }
+}
+```
+
+The block parser at lines 142–185 returns a single `FormattedTextLine` per call via `alt`. Our new parser returns `Vec`. To integrate, alter the main loop (around lines 190–219) to splice in either the single line or the multi-line result:
+
+```rust
+let mut remaining = markdown;
+let mut lines = Vec::new();
+while !remaining.is_empty() {
+ // Try block HTML first; if it matches, splice in its result.
+ if let Ok((rest, html_lines)) = parse_block_html::(remaining) {
+ if !html_lines.is_empty() {
+ lines.extend(html_lines);
+ }
+ remaining = rest;
+ indentation_context.borrow_mut().clear();
+ continue;
+ }
+ let (remaining_after_block, mut line) = block(remaining)?;
+ remaining = remaining_after_block;
+ // … existing match/list-handling code, unchanged …
+ lines.push(line);
+}
+```
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_block_html_details test_parse_block_html_strips_script --quiet
+```
+Expected: PASS.
+
+- [ ] **Step 3: Confirm no existing test regressed**
+
+```bash
+cargo test -p markdown_parser --quiet
+```
+Expected: all pass.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/markdown_parser.rs crates/markdown_parser/src/markdown_parser_tests.rs
+git commit -S -m "feat(markdown_parser): dispatch block html in markdown body"
+git log -1 --show-signature
+```
+
+### Task 3.2: Add inline-level HTML dispatch in `parse_inline_token`
+
+**Files:**
+- Modify: `crates/markdown_parser/src/markdown_parser.rs`
+- Modify: `crates/markdown_parser/src/markdown_parser_tests.rs`
+
+- [ ] **Step 1: Add the failing test**
+
+Append:
+
+```rust
+#[test]
+fn test_parse_inline_html_kbd() {
+ let source = "Press Cmd+K to clear.";
+ let parsed = test_parse_markdown(source);
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(line)) => line,
+ other => panic!("expected Line, got {other:?}"),
+ };
+ let cmd = line.iter().find(|f| f.text == "Cmd").expect("Cmd fragment");
+ assert!(cmd.styles.inline_code, "kbd should map to inline_code: {cmd:?}");
+}
+
+#[test]
+fn test_parse_inline_html_sub_sup() {
+ let source = "H2O and E=mc2";
+ let parsed = test_parse_markdown(source);
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(line)) => line,
+ other => panic!("expected Line, got {other:?}"),
+ };
+ let two = line.iter().find(|f| f.text == "2").expect("subscript 2");
+ assert!(two.styles.italic, "sub should be italic for v1: {two:?}");
+}
+
+#[test]
+fn test_parse_inline_html_unknown_tag_passes_through() {
+ let source = "bar";
+ let parsed = test_parse_markdown(source);
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(line)) => line,
+ other => panic!("expected Line, got {other:?}"),
+ };
+ let joined: String = line.iter().map(|f| f.text.clone()).collect();
+ assert_eq!(joined, "bar");
+}
+```
+
+Run and expect FAIL.
+
+- [ ] **Step 2: Add the inline HTML token**
+
+In `crates/markdown_parser/src/markdown_parser.rs`, extend the `InlineToken` enum (around line 1679):
+
+```rust
+enum InlineToken<'a> {
+ // … existing variants …
+ /// A complete inline HTML span (open + body + close, or self-closing) on the
+ /// phrasing safe-list. The fragments are produced by html5ever, NOT by
+ /// the markdown inline parser.
+ InlineHtml(Vec),
+ /// An HTML span on the stripped-list. Emits nothing.
+ StrippedHtml,
+}
+```
+
+Add a parser for it (place near the other `parse_inline_token_*` helpers):
+
+```rust
+fn parse_inline_token_html<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
+ input: &'a str,
+) -> IResult<&'a str, InlineToken<'a>, E> {
+ use crate::gfm_html::{HtmlSpanKind, try_lex_html_span};
+ use crate::html_parser::parse_html_inline_fragments;
+
+ let Some((span, rest)) = try_lex_html_span(input) else {
+ return Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)));
+ };
+ match span.kind {
+ HtmlSpanKind::PhrasingSafe => {
+ let fragments = parse_html_inline_fragments(span.raw);
+ Ok((rest, InlineToken::InlineHtml(fragments)))
+ }
+ HtmlSpanKind::Stripped => Ok((rest, InlineToken::StrippedHtml)),
+ // Block-safe or unknown tags fall through; the lexer doesn't consume them here.
+ HtmlSpanKind::BlockSafe | HtmlSpanKind::Unknown => {
+ Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)))
+ }
+ }
+}
+```
+
+Register it in `parse_inline_token`'s `alt` (line ~1535). Place it BEFORE `parse_inline_token_underline_start` (since `…` should now route through the generic phrasing path, but the existing `UnderlineEnd` token also handles it — both work because both produce underline-styled fragments). To avoid double-handling, drop the dedicated `` underline-start/end parsers in favor of the generic HTML path:
+
+Replace the `alt` block (lines ~1533–1554) with:
+
+```rust
+context(
+ "inline_token",
+ alt((
+ backslash_escape,
+ html_entity,
+ code_span,
+ parse_inline_token_link_start,
+ parse_inline_token_link_end,
+ parse_inline_token_asterisk,
+ parse_inline_token_underscore,
+ parse_inline_token_strikethrough,
+ parse_inline_token_autolink,
+ parse_inline_token_html,
+ whitespace,
+ text,
+ unmatched_char,
+ )),
+)(input)
+```
+
+Notice the removal of `parse_inline_token_underline_start` and `parse_inline_token_underline_end`. Update the `InlineToken` match in `parse_inline` (around lines 987–1033) to remove the `UnderlineEnd` branch and add handling for the new variants:
+
+```rust
+InlineToken::InlineHtml(fragments) => {
+ for fragment in fragments {
+ state.push_closed_node(fragment);
+ }
+}
+InlineToken::StrippedHtml => {
+ // Emit nothing.
+}
+```
+
+Also remove the now-unused `UnderlineEnd` variant from the enum and the corresponding parser functions (`parse_inline_token_underline_start`, `parse_inline_token_underline_end`, `parse_underline`). Run cargo check after each removal to catch leftover references.
+
+- [ ] **Step 3: Run the inline HTML tests**
+
+```bash
+cargo test -p markdown_parser test_parse_inline_html --quiet
+cargo test -p markdown_parser --quiet
+```
+Expected: new tests PASS; `test_parse_single_line` (which exercises `Hooray!`) STILL PASSES because `` routes through the new HTML path, which still applies the underline style.
+
+If `test_parse_single_line` fails because the underline assertion no longer matches the structure (e.g. the fragment is now produced by html5ever and has slightly different styling), inspect the actual output via `cargo test -p markdown_parser test_parse_single_line -- --nocapture` and either:
+- Adjust the assertion to be structure-tolerant (compare via raw text + style flags), or
+- Add a normalization step in the `InlineHtml` handler that merges adjacent fragments with identical styles.
+
+The simpler fix is the second — see if the existing `consolidate_fragments` (called at the end of `parse_inline`) already covers this. If not, call it on the InlineHtml fragments before pushing.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/markdown_parser.rs crates/markdown_parser/src/markdown_parser_tests.rs
+git commit -S -m "feat(markdown_parser): dispatch inline html via html5ever phrasing helper"
+git log -1 --show-signature
+```
+
+### Task 3.3: Add tests for `
` line break and unknown-tag pass-through inline
+
+**Files:**
+- Modify: `crates/markdown_parser/src/markdown_parser_tests.rs`
+
+- [ ] **Step 1: Add the tests**
+
+Append:
+
+```rust
+#[test]
+fn test_parse_inline_html_br_emits_linebreak() {
+ let source = "before
after";
+ let parsed = test_parse_markdown(source);
+ // The HTML helper emits the
as a LineBreak line, then "after" as a
+ // continuation. Joined raw text should reflect the break.
+ let joined = parsed
+ .iter()
+ .map(|line| line.raw_text())
+ .collect::();
+ assert!(joined.contains("before"));
+ assert!(joined.contains("after"));
+}
+
+#[test]
+fn test_parse_inline_html_stripped_emits_nothing() {
+ let source = "before after";
+ let parsed = test_parse_markdown(source);
+ let joined: String = parsed
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(fragments) => {
+ Some(fragments.iter().map(|f| f.text.clone()).collect::())
+ }
+ _ => None,
+ })
+ .collect();
+ assert!(joined.contains("before"));
+ assert!(joined.contains("after"));
+ assert!(!joined.contains("alert"), "got {joined:?}");
+}
+```
+
+Run:
+```bash
+cargo test -p markdown_parser test_parse_inline_html_br_emits_linebreak test_parse_inline_html_stripped_emits_nothing --quiet
+```
+Expected: PASS.
+
+- [ ] **Step 2: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/markdown_parser_tests.rs
+git commit -S -m "test(markdown_parser): cover inline html br and stripped tags"
+git log -1 --show-signature
+```
+
+---
+
+## Phase 4 — Footnotes (`footnotes.rs`)
+
+### Task 4.1: Scaffold the module and definition extraction
+
+**Files:**
+- Create: `crates/markdown_parser/src/footnotes.rs`
+- Create: `crates/markdown_parser/src/footnotes_tests.rs`
+- Modify: `crates/markdown_parser/src/lib.rs`
+
+- [ ] **Step 1: Create the module**
+
+Write `crates/markdown_parser/src/footnotes.rs`:
+
+```rust
+//! Footnote pre/post-processing for the markdown parser.
+//!
+//! GFM footnotes have two parts:
+//! - Definitions of the form `[^id]: text` (block-level, may continue on
+//! subsequent indented lines).
+//! - References of the form `[^id]` (inline).
+//!
+//! This module pre-extracts definitions from the raw markdown (returning the
+//! source minus the definition lines), and post-rewrites references in the
+//! parsed `FormattedText`, appending a footnotes section if any references
+//! were resolved.
+
+use std::collections::HashMap;
+
+use crate::{
+ FormattedIndentTextInline, FormattedText, FormattedTextFragment, FormattedTextLine,
+ FormattedTextStyles, Hyperlink, OrderedFormattedIndentTextInline,
+};
+
+#[derive(Debug, Clone)]
+pub(crate) struct FootnoteDef {
+ pub(crate) id: String,
+ pub(crate) content: String,
+}
+
+pub(crate) struct FootnoteContext {
+ pub(crate) definitions: HashMap,
+ /// Resolved id → assigned number (1-based, in order of first reference).
+ pub(crate) numbers: HashMap,
+ /// Definitions used at least once, in number order.
+ pub(crate) used: Vec,
+}
+
+impl FootnoteContext {
+ pub(crate) fn empty() -> Self {
+ Self {
+ definitions: HashMap::new(),
+ numbers: HashMap::new(),
+ used: Vec::new(),
+ }
+ }
+}
+
+/// Strip footnote definitions from the source. Returns the source minus the
+/// definition lines and the collected definition map.
+pub(crate) fn extract_definitions(source: &str) -> (String, HashMap) {
+ let mut definitions: HashMap = HashMap::new();
+ let mut output = String::with_capacity(source.len());
+ let mut lines = source.split_inclusive('\n').peekable();
+
+ while let Some(line) = lines.next() {
+ let trimmed = line.trim_end_matches(['\r', '\n']);
+ if let Some((id, first)) = parse_definition_line(trimmed) {
+ let mut content = first.to_string();
+ // Absorb indented continuation lines (4 spaces or a tab).
+ while let Some(peek) = lines.peek() {
+ let peek_no_eol = peek.trim_end_matches(['\r', '\n']);
+ if peek_no_eol.starts_with(" ") || peek_no_eol.starts_with('\t') {
+ let cont = lines.next().unwrap();
+ let cont_trimmed = cont
+ .trim_end_matches(['\r', '\n'])
+ .trim_start_matches(['\t'])
+ .trim_start_matches(" ");
+ content.push(' ');
+ content.push_str(cont_trimmed);
+ } else {
+ break;
+ }
+ }
+ definitions.insert(id.clone(), FootnoteDef { id, content });
+ continue;
+ }
+ output.push_str(line);
+ }
+
+ (output, definitions)
+}
+
+fn parse_definition_line(line: &str) -> Option<(String, &str)> {
+ let rest = line.strip_prefix("[^")?;
+ let close = rest.find(']')?;
+ let id = &rest[..close];
+ if id.is_empty() || id.contains(char::is_whitespace) {
+ return None;
+ }
+ let after = &rest[close + 1..];
+ let body = after.strip_prefix(':')?.trim_start();
+ Some((id.to_string(), body))
+}
+
+#[cfg(test)]
+#[path = "footnotes_tests.rs"]
+mod tests;
+```
+
+Add `mod footnotes;` to `crates/markdown_parser/src/lib.rs` next to `mod gfm_html;`.
+
+Write `crates/markdown_parser/src/footnotes_tests.rs`:
+
+```rust
+use super::*;
+
+#[test]
+fn extract_single_definition() {
+ let source = "claim[^x]\n\n[^x]: defn\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, "claim[^x]\n\n");
+ assert_eq!(defs.len(), 1);
+ assert_eq!(defs.get("x").unwrap().content, "defn");
+}
+
+#[test]
+fn extract_no_definitions_passthrough() {
+ let source = "plain text\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, "plain text\n");
+ assert!(defs.is_empty());
+}
+
+#[test]
+fn extract_continuation_line() {
+ let source = "[^x]: first\n continued\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, "");
+ assert_eq!(defs.get("x").unwrap().content, "first continued");
+}
+
+#[test]
+fn extract_id_with_space_skipped() {
+ // GFM ids don't allow whitespace; the line falls through as a regular paragraph.
+ let source = "[^bad id]: defn\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, source);
+ assert!(defs.is_empty());
+}
+```
+
+- [ ] **Step 2: Run extraction tests**
+
+```bash
+cargo test -p markdown_parser footnotes::tests --quiet
+```
+Expected: PASS.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/footnotes.rs crates/markdown_parser/src/footnotes_tests.rs crates/markdown_parser/src/lib.rs
+git commit -S -m "feat(markdown_parser): extract footnote definitions from markdown source"
+git log -1 --show-signature
+```
+
+### Task 4.2: Reference rewriting + appended section
+
+**Files:**
+- Modify: `crates/markdown_parser/src/footnotes.rs`
+- Modify: `crates/markdown_parser/src/footnotes_tests.rs`
+
+- [ ] **Step 1: Add the failing tests**
+
+Append to `footnotes_tests.rs`:
+
+```rust
+use crate::{parse_markdown};
+
+#[test]
+fn rewrite_single_reference_appends_section() {
+ let parsed = parse_markdown("Some claim[^x].\n\n[^x]: Because reasons.\n")
+ .expect("parse");
+ // Expect: paragraph line with "Some claim" + "1" + "."; then hr; then ordered-list with content + " ↩"
+ let last_three: Vec<_> = parsed.lines.iter().rev().take(3).collect();
+ let (back, hr, list_or_paragraph) = (last_three[0], last_three[1], last_three[2]);
+ let _ = (back, hr, list_or_paragraph);
+ // Structural assertions:
+ assert!(parsed.lines.iter().any(|l| matches!(l, FormattedTextLine::HorizontalRule)));
+ let has_back_ref = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::OrderedList(list) => Some(&list.indented_text.text),
+ _ => None,
+ })
+ .flatten()
+ .any(|f| f.text == " ↩" && matches!(&f.styles.hyperlink, Some(Hyperlink::Url(u)) if u.contains("fnref")));
+ assert!(has_back_ref, "expected back-reference fragment");
+ // The reference itself is now a hyperlink "1"
+ let reference_hyperlink_present = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags),
+ _ => None,
+ })
+ .flatten()
+ .any(|f| f.text == "1" && matches!(&f.styles.hyperlink, Some(Hyperlink::Url(u)) if u == "#fn-x"));
+ assert!(reference_hyperlink_present, "expected #fn-x reference");
+}
+
+#[test]
+fn unused_definition_dropped() {
+ let parsed = parse_markdown("Plain text.\n\n[^never]: Unused.\n").expect("parse");
+ assert!(parsed.lines.iter().all(|l| !matches!(l, FormattedTextLine::HorizontalRule)));
+}
+
+#[test]
+fn undefined_reference_passes_through() {
+ let parsed = parse_markdown("Some claim[^missing] here.\n").expect("parse");
+ let joined: String = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags.iter().map(|f| f.text.clone()).collect::()),
+ _ => None,
+ })
+ .collect();
+ assert!(joined.contains("[^missing]"), "expected literal pass-through: {joined:?}");
+}
+
+#[test]
+fn repeated_references_share_number() {
+ let parsed = parse_markdown("A[^x] B[^x]\n\n[^x]: D\n").expect("parse");
+ let line_fragments: Vec<_> = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags),
+ _ => None,
+ })
+ .flatten()
+ .filter(|f| f.text == "1")
+ .count();
+ assert_eq!(line_fragments, 2, "both references should be number 1");
+}
+```
+
+Run and confirm FAIL.
+
+- [ ] **Step 2: Implement reference rewriting**
+
+Append to `crates/markdown_parser/src/footnotes.rs`:
+
+```rust
+/// Walk every inline fragment in `text` and rewrite occurrences of `[^id]` (for
+/// `id` present in `defs`) into a hyperlink fragment numbered in first-reference order.
+///
+/// Returns the rewritten text and a `FootnoteContext` populated with the
+/// definitions that were actually used (in numbered order).
+pub(crate) fn rewrite_references(
+ mut text: FormattedText,
+ defs: HashMap,
+) -> (FormattedText, FootnoteContext) {
+ let mut ctx = FootnoteContext {
+ definitions: defs,
+ numbers: HashMap::new(),
+ used: Vec::new(),
+ };
+
+ for line in text.lines.iter_mut() {
+ rewrite_line(line, &mut ctx);
+ }
+
+ (text, ctx)
+}
+
+fn rewrite_line(line: &mut FormattedTextLine, ctx: &mut FootnoteContext) {
+ let fragments = match line {
+ FormattedTextLine::Line(frags) => frags,
+ FormattedTextLine::Heading(h) => &mut h.text,
+ FormattedTextLine::OrderedList(list) => &mut list.indented_text.text,
+ FormattedTextLine::UnorderedList(list) => &mut list.text,
+ FormattedTextLine::TaskList(list) => &mut list.text,
+ _ => return,
+ };
+ rewrite_fragments(fragments, ctx);
+}
+
+fn rewrite_fragments(
+ fragments: &mut Vec,
+ ctx: &mut FootnoteContext,
+) {
+ let mut out: Vec = Vec::with_capacity(fragments.len());
+ for fragment in fragments.drain(..) {
+ if !fragment.text.contains("[^") || fragment.styles.inline_code {
+ out.push(fragment);
+ continue;
+ }
+ // Walk the fragment text and split out [^id] references.
+ let original_styles = fragment.styles.clone();
+ let mut remaining = fragment.text.as_str();
+ let mut buf = String::new();
+ while let Some(at) = remaining.find("[^") {
+ buf.push_str(&remaining[..at]);
+ let after = &remaining[at + 2..];
+ if let Some(close) = after.find(']') {
+ let id = &after[..close];
+ if !id.is_empty()
+ && !id.contains(char::is_whitespace)
+ && ctx.definitions.contains_key(id)
+ {
+ // Resolve.
+ if !buf.is_empty() {
+ out.push(FormattedTextFragment {
+ text: std::mem::take(&mut buf),
+ styles: original_styles.clone(),
+ });
+ }
+ let number = match ctx.numbers.get(id) {
+ Some(n) => *n,
+ None => {
+ let n = ctx.used.len() + 1;
+ ctx.numbers.insert(id.to_string(), n);
+ ctx.used.push(ctx.definitions[id].clone());
+ n
+ }
+ };
+ out.push(FormattedTextFragment {
+ text: number.to_string(),
+ styles: FormattedTextStyles {
+ italic: true,
+ hyperlink: Some(Hyperlink::Url(format!("#fn-{id}"))),
+ ..Default::default()
+ },
+ });
+ remaining = &after[close + 1..];
+ continue;
+ }
+ }
+ // Not a defined reference — emit the [^ literally and keep scanning.
+ buf.push_str("[^");
+ remaining = after;
+ }
+ buf.push_str(remaining);
+ if !buf.is_empty() {
+ out.push(FormattedTextFragment {
+ text: buf,
+ styles: original_styles,
+ });
+ }
+ }
+ *fragments = out;
+}
+
+/// Append a footnotes section to `text` based on `ctx.used`.
+pub(crate) fn append_section(text: &mut FormattedText, ctx: &FootnoteContext) {
+ if ctx.used.is_empty() {
+ return;
+ }
+ text.lines.push_back(FormattedTextLine::HorizontalRule);
+ for (index, def) in ctx.used.iter().enumerate() {
+ let number = index + 1;
+ let body_fragments = crate::parse_inline_markdown(&def.content);
+ let mut content_fragments: Vec = body_fragments.into_iter().collect();
+ content_fragments.push(FormattedTextFragment {
+ text: " ↩".to_string(),
+ styles: FormattedTextStyles {
+ hyperlink: Some(Hyperlink::Url(format!("#fnref-{}", def.id))),
+ ..Default::default()
+ },
+ });
+ text.lines
+ .push_back(FormattedTextLine::OrderedList(OrderedFormattedIndentTextInline {
+ number: Some(number),
+ indented_text: FormattedIndentTextInline {
+ indent_level: 0,
+ text: content_fragments,
+ },
+ }));
+ }
+}
+```
+
+- [ ] **Step 3: Wire the footnote pipeline into `parse_markdown_impl`**
+
+In `crates/markdown_parser/src/markdown_parser.rs`, change `parse_markdown_impl` to run the two passes:
+
+```rust
+fn parse_markdown_impl(markdown: &str, parse_gfm_tables: bool) -> Result {
+ let (stripped, defs) = crate::footnotes::extract_definitions(markdown);
+ let parsed = parse_markdown_internal::<'_, nom::error::Error<_>>(&stripped, parse_gfm_tables)
+ .map(|(_, mut res)| {
+ if let Some(FormattedTextLine::LineBreak) = res.last() {
+ res.pop();
+ }
+ FormattedText { lines: res.into() }
+ })
+ .map_err(|err| {
+ if cfg!(debug_assertions) {
+ anyhow::anyhow!("Failed to parse Markdown: {err}")
+ } else {
+ anyhow::anyhow!("Failed to parse Markdown")
+ }
+ })?;
+ let (mut rewritten, ctx) = crate::footnotes::rewrite_references(parsed, defs);
+ crate::footnotes::append_section(&mut rewritten, &ctx);
+ Ok(rewritten)
+}
+```
+
+- [ ] **Step 4: Run the footnote tests**
+
+```bash
+cargo test -p markdown_parser footnotes --quiet
+cargo test -p markdown_parser --quiet
+```
+Expected: all pass. If any existing test breaks because the footnote pipeline rewrites a fragment containing `[^x]` that's part of regular markdown (e.g. unrelated `^` usage), inspect and adjust: the `defs.contains_key(id)` gate protects against this — only defined ids get rewritten.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/src/footnotes.rs crates/markdown_parser/src/footnotes_tests.rs crates/markdown_parser/src/markdown_parser.rs
+git commit -S -m "feat(markdown_parser): resolve footnote references and append section"
+git log -1 --show-signature
+```
+
+---
+
+## Phase 5 — Smoke test in CodeView
+
+### Task 5.1: Create the smoke fixture
+
+**Files:**
+- Create: `crates/markdown_parser/test-fixtures/gfm-smoketest.md`
+
+- [ ] **Step 1: Create the directory and file**
+
+```bash
+mkdir -p crates/markdown_parser/test-fixtures
+```
+
+Write `crates/markdown_parser/test-fixtures/gfm-smoketest.md`:
+
+````markdown
+# GFM Smoke Test
+
+This file is opened in CodeView's `.md` Rendered preview to confirm the GFM HTML
++ footnote work renders correctly. It is not a runnable test.
+
+## Inline HTML
+
+Press Cmd+K to clear. H2O reacts with E=mc2.
+This is important, this is old, and this is highlighted.
+
+## Block HTML
+
+
+Click to see commands
+
+Body text inside details.
+
+
+
+
+| Cmd | What |
+
+| build | Compile the project |
+| test | Run the test suite |
+
+
+
+## Mermaid
+
+```mermaid
+graph TD
+ A[Start] --> B{Decision}
+ B -->|Yes| C[Continue]
+ B -->|No| D[Stop]
+```
+
+## Footnotes
+
+A claim with a footnote.[^why] And another reference to the same one.[^why]
+And a separate footnote.[^how]
+
+[^why]: Because the spec says so.
+[^how]: With a hand-rolled two-pass parser.
+
+## Stripped
+
+
+This text appears.
+
+## Existing GFM (regression baseline)
+
+- [ ] Task list works
+- [x] So does this one
+- ~~strikethrough~~
+- A bare URL: https://github.com
+- A pipe table:
+
+| Col A | Col B |
+|-------|-------|
+| 1 | 2 |
+````
+
+- [ ] **Step 2: Commit**
+
+```bash
+git branch --show-current
+./script/check_ai_attribution
+git add crates/markdown_parser/test-fixtures/gfm-smoketest.md
+git commit -S -m "test(markdown_parser): add gfm smoketest fixture for codeview preview"
+git log -1 --show-signature
+```
+
+### Task 5.2: Run the app, open the fixture, verify
+
+**Files:** none modified.
+
+- [ ] **Step 1: Build the app**
+
+Run:
+```bash
+cargo build -p warp-app --bin cast-codes
+```
+Expected: build succeeds. The package name is `warp-app` (per `app/Cargo.toml:7`); the OSS binary is `cast-codes` (per `default-run` on line 3). If the build fails, follow the existing `castcodes-dev-loop` skill guidance.
+
+- [ ] **Step 2: Launch and open the fixture**
+
+```bash
+cargo run -p warp-app --bin cast-codes -- crates/markdown_parser/test-fixtures/gfm-smoketest.md
+```
+
+If the binary does not accept a file path positional argument, launch without args and open the file via the CodeView open-file affordance instead.
+
+In the app:
+1. The file opens in CodeView.
+2. Click the **Rendered** segmented control toggle in the file header (created by `update_markdown_mode_segmented_control` in `app/src/code/view.rs:299`).
+
+- [ ] **Step 3: Visual checks**
+
+Confirm each of:
+
+| Section | Expected |
+|---|---|
+| `# GFM Smoke Test` | Renders as an H1. |
+| `Cmd+K` | Each key label rendered as an inline-code chip. |
+| `H2O` and `mc2` | The `2` characters are italicized; rest is plain text. |
+| `important` | Underlined. |
+| `old` | Strikethrough. |
+| `highlighted` | Plain text (v1 has no highlight style). |
+| `` block | First line shows `▾ Click to see commands`; body paragraph follows immediately below. |
+| `` block | Renders as a 2-column table with the same headers and rows as a pipe table would. |
+| Mermaid block | Renders as an SVG (this is the verification step — no code expected to change). If it shows the placeholder text indefinitely or shows the raw text, capture a screenshot and STOP. |
+| Footnotes section | A horizontal rule appears below the document, followed by an ordered list (`1.`, `2.`). The numeric `1` and `2` in the body are styled (italic + hyperlink). The list items end with ` ↩`. |
+| `` → emits nothing.
+- `test_parse_html_unknown_tag_passes_through` — `x` → literal "x".
+- `test_parse_html_malformed_open_no_close` — `x` (no ` `) → renders the open tag as a void element + "x" as the next paragraph.
+- `test_parse_footnote_single` — `claim[^1]\n\n[^1]: defn` → "claim¹" + hr + ordered-list "1. defn ↩".
+- `test_parse_footnote_repeated_reference` — two `[^x]` references share number 1.
+- `test_parse_footnote_unused_definition_dropped` — `[^x]: never used` produces no footnote section.
+- `test_parse_footnote_undefined_reference_passthrough` — `claim[^missing]` with no definition → literal "claim[^missing]".
+
+### Integration (smoke test, performed manually before merging)
+
+Add `crates/markdown_parser/test-fixtures/gfm-smoketest.md` (a fixture file, not a runnable test) containing the safe-list tags + footnote + mermaid + GFM tables + task lists. Open it in CodeView, switch to Rendered, visually confirm:
+
+- Each block tag renders with the matching visual treatment.
+- The mermaid block renders as an SVG (verifies the existing wiring; no code expected to change).
+- The footnote reference and back-reference render as styled hyperlinks.
+- No leftover angle-bracket text or panics.
+
+Failure of the mermaid step expands the scope of this PR with a follow-up commit; passing it closes that question.
+
+### Existing tests
+
+All existing `markdown_parser_tests.rs` cases must continue to pass unchanged. In particular, the HTML-paste tests in `html_parser_tests.rs` must be unaffected by the extraction of helpers.
+
+## 7. Risk and rollback
+
+- **Risk**: the refactor of `html_parser.rs` accidentally changes HTML-paste behavior (paste from Notion, GDocs, Confluence). Mitigation: the extracted helpers are called from `parse_html` via the same code path; the existing `html_parser_tests.rs` suite is the regression net.
+- **Risk**: footnote rewriting touches every fragment of every line, which could be quadratic on documents with many footnote references. Mitigation: build a `HashMap` of definitions once, and short-circuit when the line text contains no `[^`.
+- **Risk**: `` opens that contain block content the markdown parser would normally treat as paragraphs (e.g. lists inside ``) get parsed by `html5ever`, which doesn't run markdown inside HTML blocks. GitHub *does* render markdown inside HTML blocks (with a blank line separator). v1 documents this as a known limitation: markdown inside `` is rendered as HTML only. If users hit this, a follow-up can add a "re-enter markdown parser when an HTML block contains a blank-line-separated markdown island" pass.
+- **Rollback**: revert the parser PR. No downstream consumers are touched; no DB migration; no settings persistence; no feature flag to clean up.
+
+## 8. Sequencing
+
+1. Extract `parse_html_inline_fragments` and `parse_html_block_lines` from `html_parser.rs`; confirm existing tests pass.
+2. Add `gfm_html.rs` with constants + `try_lex_html_span`. Unit-test the lexer.
+3. Wire HTML dispatch into `parse_markdown_internal` (block path) and `parse_markdown_line` (inline path). Add the HTML test cases.
+4. Add `footnotes.rs` with the two-pass implementation. Wire into `parse_markdown_impl`. Add the footnote test cases.
+5. Hand-run the smoke fixture in CodeView. Capture screenshots in the PR description.
From 2d44e0d63c3f7fb91736ee8871d64faad2507b36 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:36:30 -0500
Subject: [PATCH 07/18] feat(markdown_parser): dispatch inline html via
html5ever phrasing helper
---
crates/markdown_parser/src/html_parser.rs | 11 +-
crates/markdown_parser/src/markdown_parser.rs | 139 ++++++++++--------
.../src/markdown_parser_tests.rs | 36 +++++
3 files changed, 126 insertions(+), 60 deletions(-)
diff --git a/crates/markdown_parser/src/html_parser.rs b/crates/markdown_parser/src/html_parser.rs
index 949704861..3396c9511 100644
--- a/crates/markdown_parser/src/html_parser.rs
+++ b/crates/markdown_parser/src/html_parser.rs
@@ -44,7 +44,7 @@ enum ListType {
ListItem { ordered: bool },
}
-#[derive(Clone, Default)]
+#[derive(Clone, Default, PartialEq)]
struct Styling {
bold: bool,
italic: bool,
@@ -561,10 +561,17 @@ fn parse_phrasing_content(nodes: &[Rc], text_styling: Styling) -> Formatte
_ => (),
};
+ let before = result.len();
result.extend(parse_phrasing_content(
node.children.borrow().as_ref(),
- decorated_styling,
+ decorated_styling.clone(),
));
+ // If the element added no children (e.g. ``) but carries
+ // non-default styling, emit an empty fragment so callers can
+ // detect and preserve the style even when the text is "".
+ if result.len() == before && decorated_styling != Styling::default() {
+ result.push(phrasing_to_formatted_text("", &decorated_styling));
+ }
}
}
}
diff --git a/crates/markdown_parser/src/markdown_parser.rs b/crates/markdown_parser/src/markdown_parser.rs
index a165ce7a7..95d5f391d 100644
--- a/crates/markdown_parser/src/markdown_parser.rs
+++ b/crates/markdown_parser/src/markdown_parser.rs
@@ -1064,8 +1064,13 @@ fn parse_inline<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
InlineToken::LinkEnd => {
input = parse_link(&mut state, remaining);
}
- InlineToken::UnderlineEnd => {
- input = parse_underline(&mut state, remaining);
+ InlineToken::InlineHtml(fragments) => {
+ for fragment in fragments {
+ state.push_closed_node(fragment);
+ }
+ }
+ InlineToken::StrippedHtml => {
+ // Emit nothing.
}
}
}
@@ -1312,41 +1317,6 @@ fn parse_link_target<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
}
/// Parses underlined text using the same logic as parse_link.
-fn parse_underline<'a>(state: &mut InlineState, remaining: &'a str) -> &'a str {
- let Some((underline_start_index, underline_start)) = state
- .delimiters
- .iter()
- .enumerate()
- .rev()
- .find(|(_, delimiter)| delimiter.kind == DelimiterKind::UnderlineStart)
- else {
- // If there's no underline start, treat this as a literal ``.
- state.push_text("");
- return remaining;
- };
-
- if !underline_start.active {
- // If the start is inactive, remove it - this prevents nested underlines.
- state.delimiters.remove(underline_start_index);
- state.push_text("");
- return remaining;
- }
-
- let underline_start_node = underline_start.node_index;
- state.backtrack_styles(underline_start_node, |styles| styles.underline = true);
- process_emphasis(state, Some(underline_start_index));
-
- state.delimiters.remove(underline_start_index);
- for delimiter in &mut state.delimiters[..underline_start_index] {
- if delimiter.kind == DelimiterKind::UnderlineStart {
- delimiter.active = false;
- }
- }
-
- state.remove_node(underline_start_node);
- state.last_node_closed = true;
- remaining
-}
/// Process emphasis delimiters on the state's delimiter stack, bounded by `stack_bottom`.
///
@@ -1579,8 +1549,7 @@ fn parse_inline_token<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
parse_inline_token_underscore,
parse_inline_token_strikethrough,
parse_inline_token_autolink,
- parse_inline_token_underline_start,
- parse_inline_token_underline_end,
+ parse_inline_token_html,
whitespace,
text,
// This _must_ be the last parser in the chain. It unconditionally consumes a single
@@ -1663,27 +1632,78 @@ fn parse_inline_token_link_end<'a, E: ContextError<&'a str> + ParseError<&'a str
context("link_end", map(tag("]"), |_| InlineToken::LinkEnd))(input)
}
-/// Parse an underline-start delimiter.
-fn parse_inline_token_underline_start<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
- input: &'a str,
-) -> IResult<&'a str, InlineToken<'a>, E> {
- context(
- "underline_start",
- map(tag(""), |_| InlineToken::Delimiter {
- kind: DelimiterKind::UnderlineStart,
- count: 1,
- }),
- )(input)
+/// Return `true` if `raw` looks like a syntactically well-formed HTML tag or tag pair.
+///
+/// Checks (in order):
+/// 1. The span must contain `>` (the open tag was properly closed).
+/// 2. The character immediately after the tag name must be a valid separator:
+/// whitespace, `>`, or `/`. This rejects malformed tags like ``.
+/// 3. For non-void, non-self-closing open tags, the span must contain a matching
+/// close tag (``). This ensures open-only tags like a bare `` are not
+/// consumed as complete spans — they fall through to literal-text parsing.
+fn is_well_formed_html_span(raw: &str) -> bool {
+ if !raw.contains('>') {
+ return false;
+ }
+ let bytes = raw.as_bytes();
+ let is_close_tag = bytes.get(1) == Some(&b'/');
+ // Skip `<` and optional `/` for close tags.
+ let name_start = if is_close_tag { 2 } else { 1 };
+ // Advance through the tag name.
+ let mut i = name_start;
+ while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
+ i += 1;
+ }
+ // After the tag name there must be whitespace, `>`, or `/` (for self-closing).
+ // Anything else (e.g. `)`, `(`) means this is not a real tag.
+ if !matches!(bytes.get(i), Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None) {
+ return false;
+ }
+ // Close tags and self-closing tags are already complete.
+ if is_close_tag || raw.ends_with("/>") {
+ return true;
+ }
+ // Void elements are inherently self-contained.
+ let void_tags = ["br", "hr", "img", "input", "meta", "link"];
+ let tag_name = &raw[name_start..i];
+ if void_tags.iter().any(|t| tag_name.eq_ignore_ascii_case(t)) {
+ return true;
+ }
+ // For regular open tags, require a matching close tag in the same span.
+ // A bare `` (no ``) is rejected so it falls through to literal text.
+ raw.contains("")
}
-/// Parse an underline-end delimiter.
-fn parse_inline_token_underline_end<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
+/// Parse an inline HTML span. Recognises phrasing-safe tags (routes to html5ever) and
+/// stripped tags (silently dropped). Block-safe and unknown tags are rejected so the caller
+/// falls through to the character-by-character `unmatched_char` parser.
+fn parse_inline_token_html<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
input: &'a str,
) -> IResult<&'a str, InlineToken<'a>, E> {
- context(
- "underline_end",
- map(tag(""), |_| InlineToken::UnderlineEnd),
- )(input)
+ use crate::gfm_html::{HtmlSpanKind, try_lex_html_span};
+ use crate::html_parser::parse_html_inline_fragments;
+
+ let Some((span, rest)) = try_lex_html_span(input) else {
+ return Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)));
+ };
+ // Reject spans where the open tag was never properly terminated by '>',
+ // or where the character immediately following the tag name is not a
+ // valid separator (whitespace, `>`, `/`). Both of these indicate a
+ // malformed tag that should fall through to literal-text parsing.
+ // E.g. `` — `)` is not a valid attribute separator.
+ if !is_well_formed_html_span(span.raw) {
+ return Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)));
+ }
+ match span.kind {
+ HtmlSpanKind::PhrasingSafe => {
+ let fragments = parse_html_inline_fragments(span.raw);
+ Ok((rest, InlineToken::InlineHtml(fragments)))
+ }
+ HtmlSpanKind::Stripped => Ok((rest, InlineToken::StrippedHtml)),
+ HtmlSpanKind::BlockSafe | HtmlSpanKind::Unknown => {
+ Err(nom::Err::Error(E::from_error_kind(input, ErrorKind::Tag)))
+ }
+ }
}
/// Helper to parse a run of delimiters.
@@ -1729,8 +1749,11 @@ enum InlineToken<'a> {
AutoLink(&'a str),
/// A closing `]` bracket, which triggers link parsing.
LinkEnd,
- /// A closing , which triggers underline parsing.
- UnderlineEnd,
+ /// A complete inline HTML span on the phrasing safe-list, already parsed
+ /// into fragments by `html_parser::parse_html_inline_fragments`.
+ InlineHtml(Vec),
+ /// An HTML span on the stripped-list. Emits nothing.
+ StrippedHtml,
}
/// An entry in the [delimiter stack](https://spec.commonmark.org/0.30/#delimiter-stack)
diff --git a/crates/markdown_parser/src/markdown_parser_tests.rs b/crates/markdown_parser/src/markdown_parser_tests.rs
index f4e6f275e..dea7b4fa7 100644
--- a/crates/markdown_parser/src/markdown_parser_tests.rs
+++ b/crates/markdown_parser/src/markdown_parser_tests.rs
@@ -2784,3 +2784,39 @@ fn test_parse_block_html_strips_script() {
assert!(texts.iter().any(|t| t == "after"));
assert!(!texts.iter().any(|t| t.contains("alert")), "script body must be stripped: {texts:?}");
}
+
+#[test]
+fn test_parse_inline_html_kbd() {
+ let source = "Press Cmd+K to clear.";
+ let parsed = test_parse_markdown(source);
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(line)) => line,
+ other => panic!("expected Line, got {other:?}"),
+ };
+ let cmd = line.iter().find(|f| f.text == "Cmd").expect("Cmd fragment");
+ assert!(cmd.styles.inline_code, "kbd should map to inline_code: {cmd:?}");
+}
+
+#[test]
+fn test_parse_inline_html_sub_sup() {
+ let source = "H2O and E=mc2";
+ let parsed = test_parse_markdown(source);
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(line)) => line,
+ other => panic!("expected Line, got {other:?}"),
+ };
+ let two = line.iter().find(|f| f.text == "2").expect("subscript 2");
+ assert!(two.styles.italic, "sub should be italic for v1: {two:?}");
+}
+
+#[test]
+fn test_parse_inline_html_unknown_tag_passes_through() {
+ let source = "bar";
+ let parsed = test_parse_markdown(source);
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(line)) => line,
+ other => panic!("expected Line, got {other:?}"),
+ };
+ let joined: String = line.iter().map(|f| f.text.clone()).collect();
+ assert_eq!(joined, "bar");
+}
From e43d47c6dbaadb6834150a3e16c6ecc553bb5d3b Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:39:25 -0500
Subject: [PATCH 08/18] test(markdown_parser): cover inline html br and
stripped tags
---
.../src/markdown_parser_tests.rs | 32 +++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/crates/markdown_parser/src/markdown_parser_tests.rs b/crates/markdown_parser/src/markdown_parser_tests.rs
index dea7b4fa7..b76c17bf7 100644
--- a/crates/markdown_parser/src/markdown_parser_tests.rs
+++ b/crates/markdown_parser/src/markdown_parser_tests.rs
@@ -2820,3 +2820,35 @@ fn test_parse_inline_html_unknown_tag_passes_through() {
let joined: String = line.iter().map(|f| f.text.clone()).collect();
assert_eq!(joined, "bar");
}
+
+#[test]
+fn test_parse_inline_html_br_emits_linebreak() {
+ let source = "before
after";
+ let parsed = test_parse_markdown(source);
+ // The HTML helper emits the
as a LineBreak line, then "after" as a
+ // continuation. Joined raw text should reflect the break.
+ let joined = parsed
+ .iter()
+ .map(|line| line.raw_text())
+ .collect::();
+ assert!(joined.contains("before"));
+ assert!(joined.contains("after"));
+}
+
+#[test]
+fn test_parse_inline_html_stripped_emits_nothing() {
+ let source = "before after";
+ let parsed = test_parse_markdown(source);
+ let joined: String = parsed
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(fragments) => {
+ Some(fragments.iter().map(|f| f.text.clone()).collect::())
+ }
+ _ => None,
+ })
+ .collect();
+ assert!(joined.contains("before"));
+ assert!(joined.contains("after"));
+ assert!(!joined.contains("alert"), "got {joined:?}");
+}
From 50d34acfd9b19528d512e39ac5e63c7c14a4da76 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:41:03 -0500
Subject: [PATCH 09/18] feat(markdown_parser): extract footnote definitions
from markdown source
---
crates/markdown_parser/src/footnotes.rs | 93 +++++++++++++++++++
crates/markdown_parser/src/footnotes_tests.rs | 35 +++++++
crates/markdown_parser/src/lib.rs | 1 +
3 files changed, 129 insertions(+)
create mode 100644 crates/markdown_parser/src/footnotes.rs
create mode 100644 crates/markdown_parser/src/footnotes_tests.rs
diff --git a/crates/markdown_parser/src/footnotes.rs b/crates/markdown_parser/src/footnotes.rs
new file mode 100644
index 000000000..a4999aec7
--- /dev/null
+++ b/crates/markdown_parser/src/footnotes.rs
@@ -0,0 +1,93 @@
+//! Footnote pre/post-processing for the markdown parser.
+//!
+//! GFM footnotes have two parts:
+//! - Definitions of the form `[^id]: text` (block-level, may continue on
+//! subsequent indented lines).
+//! - References of the form `[^id]` (inline).
+//!
+//! This module pre-extracts definitions from the raw markdown (returning the
+//! source minus the definition lines), and post-rewrites references in the
+//! parsed `FormattedText`, appending a footnotes section if any references
+//! were resolved.
+
+use std::collections::HashMap;
+
+use crate::{
+ FormattedIndentTextInline, FormattedText, FormattedTextFragment, FormattedTextLine,
+ FormattedTextStyles, Hyperlink, OrderedFormattedIndentTextInline,
+};
+
+#[derive(Debug, Clone)]
+pub(crate) struct FootnoteDef {
+ pub(crate) id: String,
+ pub(crate) content: String,
+}
+
+pub(crate) struct FootnoteContext {
+ pub(crate) definitions: HashMap,
+ /// Resolved id → assigned number (1-based, in order of first reference).
+ pub(crate) numbers: HashMap,
+ /// Definitions used at least once, in number order.
+ pub(crate) used: Vec,
+}
+
+impl FootnoteContext {
+ pub(crate) fn empty() -> Self {
+ Self {
+ definitions: HashMap::new(),
+ numbers: HashMap::new(),
+ used: Vec::new(),
+ }
+ }
+}
+
+/// Strip footnote definitions from the source. Returns the source minus the
+/// definition lines and the collected definition map.
+pub(crate) fn extract_definitions(source: &str) -> (String, HashMap) {
+ let mut definitions: HashMap = HashMap::new();
+ let mut output = String::with_capacity(source.len());
+ let mut lines = source.split_inclusive('\n').peekable();
+
+ while let Some(line) = lines.next() {
+ let trimmed = line.trim_end_matches(['\r', '\n']);
+ if let Some((id, first)) = parse_definition_line(trimmed) {
+ let mut content = first.to_string();
+ // Absorb indented continuation lines (4 spaces or a tab).
+ while let Some(peek) = lines.peek() {
+ let peek_no_eol = peek.trim_end_matches(['\r', '\n']);
+ if peek_no_eol.starts_with(" ") || peek_no_eol.starts_with('\t') {
+ let cont = lines.next().unwrap();
+ let cont_trimmed = cont
+ .trim_end_matches(['\r', '\n'])
+ .trim_start_matches(['\t'])
+ .trim_start_matches(" ");
+ content.push(' ');
+ content.push_str(cont_trimmed);
+ } else {
+ break;
+ }
+ }
+ definitions.insert(id.clone(), FootnoteDef { id, content });
+ continue;
+ }
+ output.push_str(line);
+ }
+
+ (output, definitions)
+}
+
+fn parse_definition_line(line: &str) -> Option<(String, &str)> {
+ let rest = line.strip_prefix("[^")?;
+ let close = rest.find(']')?;
+ let id = &rest[..close];
+ if id.is_empty() || id.contains(char::is_whitespace) {
+ return None;
+ }
+ let after = &rest[close + 1..];
+ let body = after.strip_prefix(':')?.trim_start();
+ Some((id.to_string(), body))
+}
+
+#[cfg(test)]
+#[path = "footnotes_tests.rs"]
+mod tests;
diff --git a/crates/markdown_parser/src/footnotes_tests.rs b/crates/markdown_parser/src/footnotes_tests.rs
new file mode 100644
index 000000000..4f34abc8a
--- /dev/null
+++ b/crates/markdown_parser/src/footnotes_tests.rs
@@ -0,0 +1,35 @@
+use super::*;
+
+#[test]
+fn extract_single_definition() {
+ let source = "claim[^x]\n\n[^x]: defn\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, "claim[^x]\n\n");
+ assert_eq!(defs.len(), 1);
+ assert_eq!(defs.get("x").unwrap().content, "defn");
+}
+
+#[test]
+fn extract_no_definitions_passthrough() {
+ let source = "plain text\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, "plain text\n");
+ assert!(defs.is_empty());
+}
+
+#[test]
+fn extract_continuation_line() {
+ let source = "[^x]: first\n continued\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, "");
+ assert_eq!(defs.get("x").unwrap().content, "first continued");
+}
+
+#[test]
+fn extract_id_with_space_skipped() {
+ // GFM ids don't allow whitespace; the line falls through as a regular paragraph.
+ let source = "[^bad id]: defn\n";
+ let (out, defs) = extract_definitions(source);
+ assert_eq!(out, source);
+ assert!(defs.is_empty());
+}
diff --git a/crates/markdown_parser/src/lib.rs b/crates/markdown_parser/src/lib.rs
index f3e052b11..aa65ff7ed 100644
--- a/crates/markdown_parser/src/lib.rs
+++ b/crates/markdown_parser/src/lib.rs
@@ -6,6 +6,7 @@ use std::{collections::VecDeque, fmt, fmt::Debug};
pub mod html_parser;
pub mod markdown_parser;
pub mod weight;
+mod footnotes;
mod gfm_html;
pub use html_parser::parse_html;
use itertools::Itertools;
From 931359aba4ffc27db6804f2b2f746245730bdb45 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:43:28 -0500
Subject: [PATCH 10/18] feat(markdown_parser): resolve footnote references and
append section
Wire a two-pass pipeline into parse_markdown_impl: extract_definitions
strips [^id]: blocks before parsing, rewrite_references rewrites inline
[^id] tokens to numbered hyperlinks, and append_section appends the
rendered footnote list with back-reference links.
---
crates/markdown_parser/src/footnotes.rs | 126 ++++++++++++++++++
crates/markdown_parser/src/footnotes_tests.rs | 66 +++++++++
crates/markdown_parser/src/markdown_parser.rs | 8 +-
3 files changed, 198 insertions(+), 2 deletions(-)
diff --git a/crates/markdown_parser/src/footnotes.rs b/crates/markdown_parser/src/footnotes.rs
index a4999aec7..aaf41d17a 100644
--- a/crates/markdown_parser/src/footnotes.rs
+++ b/crates/markdown_parser/src/footnotes.rs
@@ -88,6 +88,132 @@ fn parse_definition_line(line: &str) -> Option<(String, &str)> {
Some((id.to_string(), body))
}
+/// Walk every inline fragment in `text` and rewrite occurrences of `[^id]` (for
+/// `id` present in `defs`) into a hyperlink fragment numbered in first-reference order.
+///
+/// Returns the rewritten text and a `FootnoteContext` populated with the
+/// definitions that were actually used (in numbered order).
+pub(crate) fn rewrite_references(
+ mut text: FormattedText,
+ defs: HashMap,
+) -> (FormattedText, FootnoteContext) {
+ let mut ctx = FootnoteContext {
+ definitions: defs,
+ numbers: HashMap::new(),
+ used: Vec::new(),
+ };
+
+ for line in text.lines.iter_mut() {
+ rewrite_line(line, &mut ctx);
+ }
+
+ (text, ctx)
+}
+
+fn rewrite_line(line: &mut FormattedTextLine, ctx: &mut FootnoteContext) {
+ let fragments = match line {
+ FormattedTextLine::Line(frags) => frags,
+ FormattedTextLine::Heading(h) => &mut h.text,
+ FormattedTextLine::OrderedList(list) => &mut list.indented_text.text,
+ FormattedTextLine::UnorderedList(list) => &mut list.text,
+ FormattedTextLine::TaskList(list) => &mut list.text,
+ _ => return,
+ };
+ rewrite_fragments(fragments, ctx);
+}
+
+fn rewrite_fragments(
+ fragments: &mut Vec,
+ ctx: &mut FootnoteContext,
+) {
+ let mut out: Vec = Vec::with_capacity(fragments.len());
+ for fragment in fragments.drain(..) {
+ if !fragment.text.contains("[^") || fragment.styles.inline_code {
+ out.push(fragment);
+ continue;
+ }
+ let original_styles = fragment.styles.clone();
+ let mut remaining = fragment.text.as_str();
+ let mut buf = String::new();
+ while let Some(at) = remaining.find("[^") {
+ buf.push_str(&remaining[..at]);
+ let after = &remaining[at + 2..];
+ if let Some(close) = after.find(']') {
+ let id = &after[..close];
+ if !id.is_empty()
+ && !id.contains(char::is_whitespace)
+ && ctx.definitions.contains_key(id)
+ {
+ if !buf.is_empty() {
+ out.push(FormattedTextFragment {
+ text: std::mem::take(&mut buf),
+ styles: original_styles.clone(),
+ });
+ }
+ let number = match ctx.numbers.get(id) {
+ Some(n) => *n,
+ None => {
+ let n = ctx.used.len() + 1;
+ ctx.numbers.insert(id.to_string(), n);
+ ctx.used.push(ctx.definitions[id].clone());
+ n
+ }
+ };
+ out.push(FormattedTextFragment {
+ text: number.to_string(),
+ styles: FormattedTextStyles {
+ italic: true,
+ hyperlink: Some(Hyperlink::Url(format!("#fn-{id}"))),
+ ..Default::default()
+ },
+ });
+ remaining = &after[close + 1..];
+ continue;
+ }
+ }
+ // Not a defined reference — emit the [^ literally and keep scanning.
+ buf.push_str("[^");
+ remaining = after;
+ }
+ buf.push_str(remaining);
+ if !buf.is_empty() {
+ out.push(FormattedTextFragment {
+ text: buf,
+ styles: original_styles,
+ });
+ }
+ }
+ *fragments = out;
+}
+
+/// Append a footnotes section to `text` based on `ctx.used`.
+pub(crate) fn append_section(text: &mut FormattedText, ctx: &FootnoteContext) {
+ if ctx.used.is_empty() {
+ return;
+ }
+ text.lines.push_back(FormattedTextLine::HorizontalRule);
+ for (index, def) in ctx.used.iter().enumerate() {
+ let number = index + 1;
+ let body_fragments = crate::parse_inline_markdown(&def.content);
+ let mut content_fragments: Vec = body_fragments.into_iter().collect();
+ content_fragments.push(FormattedTextFragment {
+ text: " ↩".to_string(),
+ styles: FormattedTextStyles {
+ hyperlink: Some(Hyperlink::Url(format!("#fnref-{}", def.id))),
+ ..Default::default()
+ },
+ });
+ text.lines
+ .push_back(FormattedTextLine::OrderedList(OrderedFormattedIndentTextInline {
+ number: Some(number),
+ indented_text: FormattedIndentTextInline {
+ indent_level: 0,
+ text: content_fragments,
+ },
+ }));
+ }
+}
+
#[cfg(test)]
#[path = "footnotes_tests.rs"]
mod tests;
diff --git a/crates/markdown_parser/src/footnotes_tests.rs b/crates/markdown_parser/src/footnotes_tests.rs
index 4f34abc8a..590535214 100644
--- a/crates/markdown_parser/src/footnotes_tests.rs
+++ b/crates/markdown_parser/src/footnotes_tests.rs
@@ -1,4 +1,70 @@
use super::*;
+use crate::{parse_markdown};
+
+#[test]
+fn rewrite_single_reference_appends_section() {
+ let parsed = parse_markdown("Some claim[^x].\n\n[^x]: Because reasons.\n")
+ .expect("parse");
+ // Structural assertions:
+ assert!(parsed.lines.iter().any(|l| matches!(l, FormattedTextLine::HorizontalRule)));
+ let has_back_ref = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::OrderedList(list) => Some(&list.indented_text.text),
+ _ => None,
+ })
+ .flatten()
+ .any(|f| f.text == " ↩" && matches!(&f.styles.hyperlink, Some(Hyperlink::Url(u)) if u.contains("fnref")));
+ assert!(has_back_ref, "expected back-reference fragment");
+ // The reference itself is now a hyperlink "1"
+ let reference_hyperlink_present = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags),
+ _ => None,
+ })
+ .flatten()
+ .any(|f| f.text == "1" && matches!(&f.styles.hyperlink, Some(Hyperlink::Url(u)) if u == "#fn-x"));
+ assert!(reference_hyperlink_present, "expected #fn-x reference");
+}
+
+#[test]
+fn unused_definition_dropped() {
+ let parsed = parse_markdown("Plain text.\n\n[^never]: Unused.\n").expect("parse");
+ assert!(parsed.lines.iter().all(|l| !matches!(l, FormattedTextLine::HorizontalRule)));
+}
+
+#[test]
+fn undefined_reference_passes_through() {
+ let parsed = parse_markdown("Some claim[^missing] here.\n").expect("parse");
+ let joined: String = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags.iter().map(|f| f.text.clone()).collect::()),
+ _ => None,
+ })
+ .collect();
+ assert!(joined.contains("[^missing]"), "expected literal pass-through: {joined:?}");
+}
+
+#[test]
+fn repeated_references_share_number() {
+ let parsed = parse_markdown("A[^x] B[^x]\n\n[^x]: D\n").expect("parse");
+ let count = parsed
+ .lines
+ .iter()
+ .filter_map(|line| match line {
+ FormattedTextLine::Line(frags) => Some(frags),
+ _ => None,
+ })
+ .flatten()
+ .filter(|f| f.text == "1")
+ .count();
+ assert_eq!(count, 2, "both references should be number 1");
+}
#[test]
fn extract_single_definition() {
diff --git a/crates/markdown_parser/src/markdown_parser.rs b/crates/markdown_parser/src/markdown_parser.rs
index 95d5f391d..d3e446b46 100644
--- a/crates/markdown_parser/src/markdown_parser.rs
+++ b/crates/markdown_parser/src/markdown_parser.rs
@@ -112,7 +112,8 @@ pub fn parse_markdown_with_gfm_tables(markdown: &str) -> Result {
}
fn parse_markdown_impl(markdown: &str, parse_gfm_tables: bool) -> Result {
- parse_markdown_internal::<'_, nom::error::Error<_>>(markdown, parse_gfm_tables)
+ let (stripped, defs) = crate::footnotes::extract_definitions(markdown);
+ let parsed = parse_markdown_internal::<'_, nom::error::Error<_>>(&stripped, parse_gfm_tables)
.map(|(_, mut res)| {
if let Some(FormattedTextLine::LineBreak) = res.last() {
res.pop();
@@ -125,7 +126,10 @@ fn parse_markdown_impl(markdown: &str, parse_gfm_tables: bool) -> Result Result {
From 16ba4ecf135bf0fd5cf43e67c5a5f1cbbb309294 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:44:12 -0500
Subject: [PATCH 11/18] test(markdown_parser): add gfm smoketest fixture for
codeview preview
---
.../test-fixtures/gfm-smoketest.md | 60 +++++++++++++++++++
1 file changed, 60 insertions(+)
create mode 100644 crates/markdown_parser/test-fixtures/gfm-smoketest.md
diff --git a/crates/markdown_parser/test-fixtures/gfm-smoketest.md b/crates/markdown_parser/test-fixtures/gfm-smoketest.md
new file mode 100644
index 000000000..ff1b5471a
--- /dev/null
+++ b/crates/markdown_parser/test-fixtures/gfm-smoketest.md
@@ -0,0 +1,60 @@
+# GFM Smoke Test
+
+This file is opened in CodeView's `.md` Rendered preview to confirm the GFM HTML
++ footnote work renders correctly. It is not a runnable test.
+
+## Inline HTML
+
+Press Cmd+K to clear. H2O reacts with E=mc2.
+This is important, this is old, and this is highlighted.
+
+## Block HTML
+
+
+Click to see commands
+
+Body text inside details.
+
+
+
+
+| Cmd | What |
+
+| build | Compile the project |
+| test | Run the test suite |
+
+
+
+## Mermaid
+
+```mermaid
+graph TD
+ A[Start] --> B{Decision}
+ B -->|Yes| C[Continue]
+ B -->|No| D[Stop]
+```
+
+## Footnotes
+
+A claim with a footnote.[^why] And another reference to the same one.[^why]
+And a separate footnote.[^how]
+
+[^why]: Because the spec says so.
+[^how]: With a hand-rolled two-pass parser.
+
+## Stripped
+
+
+This text appears.
+
+## Existing GFM (regression baseline)
+
+- [ ] Task list works
+- [x] So does this one
+- ~~strikethrough~~
+- A bare URL: https://github.com
+- A pipe table:
+
+| Col A | Col B |
+|-------|-------|
+| 1 | 2 |
From 07d7351c345984fc3f00cdab8ae15cbac38722ab Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:44:39 -0500
Subject: [PATCH 12/18] docs(design-changes): record gfm html and footnote
preview support
---
DESIGN-CHANGES.md | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/DESIGN-CHANGES.md b/DESIGN-CHANGES.md
index 8916e7b5c..620d08dd9 100644
--- a/DESIGN-CHANGES.md
+++ b/DESIGN-CHANGES.md
@@ -46,6 +46,17 @@ crates.
## Applied (this PR)
+### GitHub-Flavored Markdown preview
+
+CodeView's `.md` Rendered preview now renders the GFM HTML safe-list
+(``, ``, ``, ``, ``, ``, raw ``,
+`
`, plus inline phrasing tags) and GFM footnotes (`[^id]` references with
+`[^id]:` definitions). `` blocks render always-expanded with a `▾`
+glyph; interactive collapse is not provided in v1. Mermaid blocks render as
+SVG diagrams in the preview, unchanged from prior behavior. Inline HTML is
+parsed via the existing `html5ever` pipeline in `markdown_parser`; no new
+dependencies were added.
+
### Brand rebrand sweep
- `app/src/drive/index.rs:109` — `WARP_DRIVE_TITLE` literal updated from
From 89019b634e55909adb053873460aba81ac81b75d Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 10:48:21 -0500
Subject: [PATCH 13/18] chore(markdown_parser): drop unused HtmlSpan and
FootnoteContext helpers
---
crates/markdown_parser/src/footnotes.rs | 10 ----------
crates/markdown_parser/src/gfm_html.rs | 12 ------------
crates/markdown_parser/src/gfm_html_tests.rs | 2 +-
3 files changed, 1 insertion(+), 23 deletions(-)
diff --git a/crates/markdown_parser/src/footnotes.rs b/crates/markdown_parser/src/footnotes.rs
index aaf41d17a..d4cba5f0e 100644
--- a/crates/markdown_parser/src/footnotes.rs
+++ b/crates/markdown_parser/src/footnotes.rs
@@ -31,16 +31,6 @@ pub(crate) struct FootnoteContext {
pub(crate) used: Vec,
}
-impl FootnoteContext {
- pub(crate) fn empty() -> Self {
- Self {
- definitions: HashMap::new(),
- numbers: HashMap::new(),
- used: Vec::new(),
- }
- }
-}
-
/// Strip footnote definitions from the source. Returns the source minus the
/// definition lines and the collected definition map.
pub(crate) fn extract_definitions(source: &str) -> (String, HashMap) {
diff --git a/crates/markdown_parser/src/gfm_html.rs b/crates/markdown_parser/src/gfm_html.rs
index 6554fb6e7..399228112 100644
--- a/crates/markdown_parser/src/gfm_html.rs
+++ b/crates/markdown_parser/src/gfm_html.rs
@@ -44,18 +44,6 @@ pub(crate) struct HtmlSpan<'a> {
pub(crate) kind: HtmlSpanKind,
}
-impl HtmlSpan<'_> {
- pub(crate) fn is_block(&self) -> bool {
- matches!(self.kind, HtmlSpanKind::BlockSafe)
- }
- pub(crate) fn is_phrasing(&self) -> bool {
- matches!(self.kind, HtmlSpanKind::PhrasingSafe)
- }
- pub(crate) fn is_stripped(&self) -> bool {
- matches!(self.kind, HtmlSpanKind::Stripped)
- }
-}
-
pub(crate) fn classify(tag: &str) -> HtmlSpanKind {
let lower = tag.to_ascii_lowercase();
if PHRASING_SAFE_TAGS.iter().any(|t| *t == lower) {
diff --git a/crates/markdown_parser/src/gfm_html_tests.rs b/crates/markdown_parser/src/gfm_html_tests.rs
index 23d818449..dd9a9a5e5 100644
--- a/crates/markdown_parser/src/gfm_html_tests.rs
+++ b/crates/markdown_parser/src/gfm_html_tests.rs
@@ -48,7 +48,7 @@ fn lex_missing_close_returns_open_only() {
#[test]
fn lex_comment_stripped() {
let (span, rest) = try_lex_html_span("after").unwrap();
- assert!(span.is_stripped());
+ assert_eq!(span.kind, HtmlSpanKind::Stripped);
assert_eq!(rest, "after");
}
From 2f68db5be75f840fe7d5d8477758847dae778f87 Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 11:01:25 -0500
Subject: [PATCH 14/18] fix(markdown_parser): inline
emits hard break +
review nits
---
crates/markdown_parser/src/gfm_html.rs | 7 +++++--
crates/markdown_parser/src/html_parser.rs | 13 +++++++++++--
crates/markdown_parser/src/markdown_parser.rs | 2 --
.../src/markdown_parser_tests.rs | 17 +++++++++--------
4 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/crates/markdown_parser/src/gfm_html.rs b/crates/markdown_parser/src/gfm_html.rs
index 399228112..a72949b89 100644
--- a/crates/markdown_parser/src/gfm_html.rs
+++ b/crates/markdown_parser/src/gfm_html.rs
@@ -63,8 +63,11 @@ pub(crate) fn try_lex_html_span(input: &str) -> Option<(HtmlSpan<'_>, &str)> {
return None;
}
// HTML comment:
- if input.starts_with("").map(|i| i + 4 + 3).unwrap_or(input.len());
+ if let Some(after_open) = input.strip_prefix("")
+ .map(|i| input.len() - after_open.len() + i + 3)
+ .unwrap_or(input.len());
return Some((
HtmlSpan { raw: &input[..end], tag: "", kind: HtmlSpanKind::Stripped },
&input[end..],
diff --git a/crates/markdown_parser/src/html_parser.rs b/crates/markdown_parser/src/html_parser.rs
index 3396c9511..d56eb2296 100644
--- a/crates/markdown_parser/src/html_parser.rs
+++ b/crates/markdown_parser/src/html_parser.rs
@@ -127,8 +127,9 @@ fn type_matches(attributes: &[Attribute], value: &str) -> bool {
/// Parse an HTML fragment and return its block-level lines.
///
/// Used by the markdown parser when it encounters a block-level HTML span in a `.md` file.
-/// Unlike `parse_html`, this assumes the caller already classified the span as block-level
-/// and does not perform any top-level element skipping.
+/// Delegates to `parse_html`, which top-level-skips wrapper elements like ``, ``,
+/// ``, ``. Those tags are not in the GFM safe-list, so the skipping is benign
+/// for the markdown→HTML dispatch path.
pub(crate) fn parse_html_block_lines(html: &str) -> Vec {
parse_html(html)
.map(|formatted| formatted.lines.into_iter().collect())
@@ -548,6 +549,14 @@ fn parse_phrasing_content(nodes: &[Rc], text_styling: Styling) -> Formatte
}
NodeData::Element { name, attrs, .. } => {
let node_name = name.local.to_string();
+ // `
` in inline context emits a hard line break as a `\n`
+ // text fragment so adjacent text stays visually separated.
+ // Block-context `
` is handled by the per-element dispatch
+ // in `parse_html` and produces `FormattedTextLine::LineBreak`.
+ if node_name.as_str() == "br" {
+ result.push(phrasing_to_formatted_text("\n", &text_styling));
+ continue;
+ }
let mut decorated_styling = text_styling.clone();
decorated_styling.update_with_attributes(&attrs.borrow());
match node_name.as_ref() {
diff --git a/crates/markdown_parser/src/markdown_parser.rs b/crates/markdown_parser/src/markdown_parser.rs
index d3e446b46..c4adf82ce 100644
--- a/crates/markdown_parser/src/markdown_parser.rs
+++ b/crates/markdown_parser/src/markdown_parser.rs
@@ -1320,8 +1320,6 @@ fn parse_link_target<'a, E: ContextError<&'a str> + ParseError<&'a str>>(
Ok((input, target))
}
-/// Parses underlined text using the same logic as parse_link.
-
/// Process emphasis delimiters on the state's delimiter stack, bounded by `stack_bottom`.
///
/// This is approximately equivalent to the CommonMark [process emphasis](https://spec.commonmark.org/0.30/#phase-2-inline-structure)
diff --git a/crates/markdown_parser/src/markdown_parser_tests.rs b/crates/markdown_parser/src/markdown_parser_tests.rs
index b76c17bf7..ec8f8d051 100644
--- a/crates/markdown_parser/src/markdown_parser_tests.rs
+++ b/crates/markdown_parser/src/markdown_parser_tests.rs
@@ -2825,14 +2825,15 @@ fn test_parse_inline_html_unknown_tag_passes_through() {
fn test_parse_inline_html_br_emits_linebreak() {
let source = "before
after";
let parsed = test_parse_markdown(source);
- // The HTML helper emits the
as a LineBreak line, then "after" as a
- // continuation. Joined raw text should reflect the break.
- let joined = parsed
- .iter()
- .map(|line| line.raw_text())
- .collect::();
- assert!(joined.contains("before"));
- assert!(joined.contains("after"));
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(fragments)) => fragments,
+ other => panic!("expected single Line, got {other:?}"),
+ };
+ // Inline
emits a "\n" text fragment between "before" and "after"
+ // so the two halves aren't visually smashed together. (Block-context
+ //
, handled by parse_html, produces FormattedTextLine::LineBreak.)
+ let joined: String = line.iter().map(|f| f.text.clone()).collect();
+ assert_eq!(joined, "before\nafter", "got {joined:?}");
}
#[test]
From 3ae55a99fdc65473ecaa9433fcf46eef5cd8950b Mon Sep 17 00:00:00 2001
From: Val Alexander
Date: Wed, 20 May 2026 13:39:52 -0500
Subject: [PATCH 15/18] Potential fix for pull request finding
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
crates/markdown_parser/src/gfm_html.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crates/markdown_parser/src/gfm_html.rs b/crates/markdown_parser/src/gfm_html.rs
index a72949b89..11dd8e498 100644
--- a/crates/markdown_parser/src/gfm_html.rs
+++ b/crates/markdown_parser/src/gfm_html.rs
@@ -39,7 +39,7 @@ pub(crate) struct HtmlSpan<'a> {
/// The entire matched span, including `` … `` (or just the open tag if
/// self-closing or unmatched).
pub(crate) raw: &'a str,
- /// Lowercased tag name.
+ /// Tag name as matched in the input, preserving the original case.
pub(crate) tag: &'a str,
pub(crate) kind: HtmlSpanKind,
}
From 3b8f896633464b8a9a78d11d14440090f792e79c Mon Sep 17 00:00:00 2001
From: Val Alexander
Date: Wed, 20 May 2026 13:50:36 -0500
Subject: [PATCH 16/18] Potential fix for pull request finding
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
crates/markdown_parser/src/html_parser.rs | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/crates/markdown_parser/src/html_parser.rs b/crates/markdown_parser/src/html_parser.rs
index d56eb2296..46f6cf0e5 100644
--- a/crates/markdown_parser/src/html_parser.rs
+++ b/crates/markdown_parser/src/html_parser.rs
@@ -127,9 +127,9 @@ fn type_matches(attributes: &[Attribute], value: &str) -> bool {
/// Parse an HTML fragment and return its block-level lines.
///
/// Used by the markdown parser when it encounters a block-level HTML span in a `.md` file.
-/// Delegates to `parse_html`, which top-level-skips wrapper elements like ``, ``,
-/// ``, ``. Those tags are not in the GFM safe-list, so the skipping is benign
-/// for the markdown→HTML dispatch path.
+/// Delegates to `parse_html`, inheriting its behavior of skipping certain top-level wrapper
+/// or container elements when extracting block lines. This helper intentionally preserves that
+/// behavior for the markdown→HTML dispatch path.
pub(crate) fn parse_html_block_lines(html: &str) -> Vec {
parse_html(html)
.map(|formatted| formatted.lines.into_iter().collect())
From 080171be36c70b9acb955d54519968180503202f Mon Sep 17 00:00:00 2001
From: Val Alexander
Date: Wed, 20 May 2026 13:51:03 -0500
Subject: [PATCH 17/18] Potential fix for pull request finding
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
crates/markdown_parser/src/footnotes.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crates/markdown_parser/src/footnotes.rs b/crates/markdown_parser/src/footnotes.rs
index d4cba5f0e..69afdf2b9 100644
--- a/crates/markdown_parser/src/footnotes.rs
+++ b/crates/markdown_parser/src/footnotes.rs
@@ -154,7 +154,7 @@ fn rewrite_fragments(
styles: FormattedTextStyles {
italic: true,
hyperlink: Some(Hyperlink::Url(format!("#fn-{id}"))),
- ..Default::default()
+ ..original_styles.clone()
},
});
remaining = &after[close + 1..];
From c39adcd88a6a781226a08abca970fec49fbeb7fc Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Wed, 20 May 2026 13:58:22 -0500
Subject: [PATCH 18/18] fix(markdown_parser): preserve close-only html tags
---
crates/markdown_parser/src/markdown_parser.rs | 16 ++++++++++------
.../markdown_parser/src/markdown_parser_tests.rs | 12 ++++++++++++
2 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/crates/markdown_parser/src/markdown_parser.rs b/crates/markdown_parser/src/markdown_parser.rs
index c4adf82ce..2fd3f9cdd 100644
--- a/crates/markdown_parser/src/markdown_parser.rs
+++ b/crates/markdown_parser/src/markdown_parser.rs
@@ -1638,9 +1638,10 @@ fn parse_inline_token_link_end<'a, E: ContextError<&'a str> + ParseError<&'a str
///
/// Checks (in order):
/// 1. The span must contain `>` (the open tag was properly closed).
-/// 2. The character immediately after the tag name must be a valid separator:
+/// 2. Close-only tags are rejected so they fall through to literal text parsing.
+/// 3. The character immediately after the tag name must be a valid separator:
/// whitespace, `>`, or `/`. This rejects malformed tags like ``.
-/// 3. For non-void, non-self-closing open tags, the span must contain a matching
+/// 4. For non-void, non-self-closing open tags, the span must contain a matching
/// close tag (``). This ensures open-only tags like a bare `` are not
/// consumed as complete spans — they fall through to literal-text parsing.
fn is_well_formed_html_span(raw: &str) -> bool {
@@ -1649,8 +1650,11 @@ fn is_well_formed_html_span(raw: &str) -> bool {
}
let bytes = raw.as_bytes();
let is_close_tag = bytes.get(1) == Some(&b'/');
- // Skip `<` and optional `/` for close tags.
- let name_start = if is_close_tag { 2 } else { 1 };
+ if is_close_tag {
+ return false;
+ }
+ // Skip `<`.
+ let name_start = 1;
// Advance through the tag name.
let mut i = name_start;
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
@@ -1661,8 +1665,8 @@ fn is_well_formed_html_span(raw: &str) -> bool {
if !matches!(bytes.get(i), Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None) {
return false;
}
- // Close tags and self-closing tags are already complete.
- if is_close_tag || raw.ends_with("/>") {
+ // Self-closing tags are already complete.
+ if raw.ends_with("/>") {
return true;
}
// Void elements are inherently self-contained.
diff --git a/crates/markdown_parser/src/markdown_parser_tests.rs b/crates/markdown_parser/src/markdown_parser_tests.rs
index ec8f8d051..954b1bafc 100644
--- a/crates/markdown_parser/src/markdown_parser_tests.rs
+++ b/crates/markdown_parser/src/markdown_parser_tests.rs
@@ -2821,6 +2821,18 @@ fn test_parse_inline_html_unknown_tag_passes_through() {
assert_eq!(joined, "bar");
}
+#[test]
+fn test_parse_inline_html_close_only_tag_passes_through() {
+ let source = "before after";
+ let parsed = test_parse_markdown(source);
+ let line = match parsed.first() {
+ Some(FormattedTextLine::Line(line)) => line,
+ other => panic!("expected Line, got {other:?}"),
+ };
+ let joined: String = line.iter().map(|f| f.text.clone()).collect();
+ assert_eq!(joined, "before after");
+}
+
#[test]
fn test_parse_inline_html_br_emits_linebreak() {
let source = "before
after";