mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-16 01:02:13 +08:00
fix(make-pdf): decode HTML entities in titles and TOC to prevent double-escape
A markdown title like "# Herbert & Garry" rendered as "Herbert &amp; Garry" in <title>, cover block, and TOC entries. marked emits "&amp;" (correct HTML), but extractFirstHeading and extractHeadings only call stripTags — leaving the entity intact. That string then flows through escapeHtml, producing the double-encode. - render.ts: new decodeTextEntities helper, distinct from decodeTypographicEntities (which runs on in-pipeline HTML and intentionally preserves "&amp;"). Covers named entities (lt/gt/quot/apos/39/x27/amp) AND numeric (decimal + hex) so inputs like "&#169;" or "&#8212;" don't create the same partial-fix bug. Amp-last ordering prevents double-decode on "&amp;lt;" et al. - Apply in both extractFirstHeading and extractHeadings. extractHeadings feeds buildTocBlock → escapeHtml, so the TOC site had the same bug. - render.test.ts: 8 tests covering the contract — parameterized across &, <, >, ©, — chars; single-escape in <title>/cover; TOC double-escape check; numeric entity decode; smartypants-interacts-with-quotes contract (no raw equality). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -288,7 +288,7 @@ function extractHeadings(html: string): Array<{ level: number; text: string }> {
|
||||
let match;
|
||||
while ((match = re.exec(html)) !== null) {
|
||||
const level = parseInt(match[1].slice(1), 10);
|
||||
const text = stripTags(match[2]).trim();
|
||||
const text = decodeTextEntities(stripTags(match[2]).trim());
|
||||
if (text) headings.push({ level, text });
|
||||
}
|
||||
return headings;
|
||||
@@ -324,7 +324,32 @@ function wrapChaptersByH1(html: string): string {
|
||||
|
||||
/**
 * Return the text of the first `<h1>` element in `html`, or `null` when the
 * document contains no `<h1>`.
 *
 * The captured inner HTML is passed through `stripTags`, trimmed, and then
 * through `decodeTextEntities`, so the result is plain text rather than
 * HTML-escaped text. Callers that embed the value in HTML are expected to
 * escape it themselves; returning the still-encoded form here is what caused
 * titles such as "Herbert & Garry" to be escaped twice.
 *
 * @param html - Rendered HTML to search.
 * @returns The decoded heading text, or `null` if no `<h1>` was found.
 */
function extractFirstHeading(html: string): string | null {
  // Non-greedy capture so only the first heading's content is taken, even
  // when several <h1> elements appear in the document.
  const m = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
  return m ? decodeTextEntities(stripTags(m[1]).trim()) : null;
}
|
||||
|
||||
/**
 * Decode HTML entities in plain text extracted from rendered HTML.
 *
 * Intended for text that will be HTML-escaped again before being emitted
 * (document title, cover block, TOC entries). If the text still carried
 * "&amp;" at that point, the later escape would turn it into "&amp;amp;".
 *
 * Recognized references:
 *   - named:   &lt; &gt; &quot; &apos; &amp;
 *   - decimal: &#NNN;      (e.g. &#39;, &#169;)
 *   - hex:     &#xHHH;     (e.g. &#x27;, &#x2014;; "x" may be upper-case)
 *
 * Decoding happens in a single left-to-right pass, so every reference in the
 * input is decoded exactly once and the output of one replacement is never
 * re-scanned. This keeps "&amp;lt;" as the literal text "&lt;" and
 * "&#38;amp;" as the literal text "&amp;", which a chain of sequential
 * replace() calls cannot guarantee for every ordering.
 *
 * Numeric references beyond U+10FFFF are left untouched instead of being
 * handed to String.fromCodePoint, which would throw a RangeError.
 * Unrecognized named references (e.g. &nbsp;) are also left as-is.
 *
 * @param s - Plain text that may contain HTML character references.
 * @returns The text with recognized references replaced by their characters.
 */
function decodeTextEntities(s: string): string {
  const named: Record<string, string> = {
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    amp: "&",
  };
  return s.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(lt|gt|quot|apos|amp));/g,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) return named[name] ?? entity;
      // Exactly one of dec/hex is defined here because the regex matched.
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // Out-of-range (or unparsable) values fail this comparison and are kept verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
}
|
||||
|
||||
function stripTags(html: string): string {
|
||||
|
||||
Reference in New Issue
Block a user