mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 18:32:28 +08:00
fix(make-pdf): decode HTML entities in titles and TOC to prevent double-escape
A markdown title like "# Herbert & Garry" rendered as "Herbert &amp; Garry" (i.e. "&amp;amp;" in the HTML source) in <title>, cover block, and TOC entries. marked emits "&amp;" (correct HTML), but extractFirstHeading and extractHeadings only stripTags — leaving the entity intact. That string then flows through escapeHtml, producing the double-encode. - render.ts: new decodeTextEntities helper, distinct from decodeTypographicEntities (which runs on in-pipeline HTML and intentionally preserves "&amp;"). Covers named entities (lt/gt/quot/apos/39/x27/amp) AND numeric (decimal + hex) so inputs like "&#169;" or "&#x2014;" don't create the same partial-fix bug. Amp-last ordering prevents double-decode on "&amp;lt;" et al. - Apply in both extractFirstHeading and extractHeadings. extractHeadings feeds buildTocBlock → escapeHtml, so the TOC site had the same bug. - render.test.ts: 8 tests covering the contract — parameterized across &, <, >, ©, — chars; single-escape in <title>/cover; TOC double-escape check; numeric entity decode; smartypants-interacts-with-quotes contract (no raw equality). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -354,3 +354,79 @@ describe("render() — pageNumbers data flow", () => {
|
||||
expect(result.printCss).toMatch(/@bottom-center\s*\{\s*content:\s*counter\(page\)/);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── render() — HTML entity handling in titles, cover, TOC ───────────
|
||||
|
||||
/**
 * Regression suite: heading text containing HTML-significant characters must
 * be escaped exactly once wherever it is re-emitted (<title>, cover block,
 * TOC). Every negative assertion below targets the DOUBLE-escaped form
 * (e.g. "&amp;amp;"); the single-escaped form ("&amp;") is the correct output
 * and must remain allowed.
 */
describe("render() — no double HTML entity escaping", () => {
  type Case = { char: string; inTitle: string; expectedTitleMeta: string };

  // Only characters that should flow through unchanged. `"` and `'` are
  // omitted from this set because smartypants converts them to curly quotes
  // before heading extraction — asserted separately below.
  const cases: Case[] = [
    { char: "&", inTitle: "A & B", expectedTitleMeta: "A & B" },
    { char: "<", inTitle: "A < B", expectedTitleMeta: "A < B" },
    { char: ">", inTitle: "A > B", expectedTitleMeta: "A > B" },
    { char: "©", inTitle: "A © B", expectedTitleMeta: "A © B" },
    { char: "—", inTitle: "A — B", expectedTitleMeta: "A — B" },
  ];

  for (const { char, inTitle, expectedTitleMeta } of cases) {
    test(`"${char}" in H1 has no double-escape in <title> or cover`, () => {
      const result = render({
        markdown: `# ${inTitle}\n\nBody.`,
        cover: true,
        author: "A",
      });
      // Meta: decoded plain text.
      expect(result.meta.title).toBe(expectedTitleMeta);
      // HTML: <title>...</title> never contains double-escape patterns,
      // i.e. an already-encoded entity whose "&" was escaped a second time.
      expect(result.html).not.toMatch(/<title>[^<]*&amp;amp;/);
      expect(result.html).not.toMatch(/<title>[^<]*&amp;lt;/);
      expect(result.html).not.toMatch(/<title>[^<]*&amp;gt;/);
      expect(result.html).not.toMatch(/<title>[^<]*&amp;#\d+;/);
      expect(result.html).not.toMatch(/<title>[^<]*&amp;#x[0-9a-fA-F]+;/);
      // Cover block also single-escape.
      expect(result.html).not.toMatch(/class="cover-title"[^>]*>[^<]*&amp;amp;/);
    });
  }

  test('ampersand in <title> renders as exactly one "&"', () => {
    const result = render({ markdown: `# Herbert & Garry\n\nBody.` });
    // Single-escaped in the HTML source; a browser shows it as one "&".
    expect(result.html).toContain("<title>Herbert &amp; Garry</title>");
    expect(result.html).not.toContain("&amp;amp;");
  });

  test("TOC entries have no double-escape when a heading contains '&'", () => {
    const result = render({
      markdown: `# Doc\n\n## Herbert & Garry\n\nBody.\n\n## Other\n\nMore.`,
      toc: true,
    });
    // TOC renders the heading text through escapeHtml; must be single-escaped.
    expect(result.html).toContain("Herbert &amp; Garry");
    expect(result.html).not.toContain("&amp;amp;");
  });

  test('numeric entity in H1 (e.g. "©") decodes cleanly to <title>', () => {
    // Marked passes through numeric entities verbatim in the HTML output,
    // so the decoder must handle them. The input therefore has to be the
    // numeric reference itself (&#169; is the decimal reference for ©).
    const result = render({ markdown: `# A &#169; B\n\nBody.` });
    expect(result.meta.title).toBe("A © B");
    expect(result.html).toContain("<title>A © B</title>");
  });

  test("smartypants converts raw quotes in title BEFORE extraction (contract)", () => {
    // We do NOT assert raw `"` survives — smartypants is expected to convert it.
    // The contract is: no double-escape of the encoded form.
    const result = render({ markdown: `# Say "hi"\n\nBody.` });
    expect(result.html).not.toContain("&amp;quot;");
    expect(result.html).not.toContain("&amp;#39;");
    // And <title> contains exactly one level of escaping.
    const titleMatch = result.html.match(/<title>([^<]*)<\/title>/);
    expect(titleMatch).toBeTruthy();
    if (titleMatch) {
      // Never contains a double-encoded entity.
      expect(titleMatch[1]).not.toMatch(/&amp;(amp|lt|gt|quot|#\d+);/);
    }
  });
});
|
||||
|
||||
Reference in New Issue
Block a user