From 9cb98a710333acd2fb226dde398e7eea086d2480 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Thu, 7 May 2026 13:37:14 -0700
Subject: [PATCH] feat(browse): --navigate flag on download for browser-triggered files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the --navigate strategy from community PR #1355 (originally from
@garrytan-agents). When set, download navigates to the URL with
waitUntil:'commit' and captures the resulting browser download via
page.waitForEvent('download'), then saves it via download.saveAs().
Handles URLs that trigger file downloads via Content-Disposition
headers, multi-hop CDN redirects requiring browser cookies, or anti-bot
CDN chains where page.request.fetch() can't follow the auth/redirect
chain.

The default path still uses the existing direct-fetch strategy;
--navigate is opt-in. It goes through the same validateNavigationUrl
SSRF gate as goto, so download --navigate cannot reach IPv4 metadata
endpoints (AWS IMDSv1, GCP/Azure equivalents) or arbitrary internal
hosts.

Infers the content type from the suggested filename for common
extensions (epub, pdf, zip, gz, mp3/mp4, jpg/jpeg/png, txt, html,
json), falling back to application/octet-stream. Same 200MB cap as
Strategy 1.

Frames the use case generically (anti-bot CDN, Content-Disposition,
redirect chains) rather than naming any specific site, per project
voice rules.

Co-Authored-By: @garrytan-agents
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 browse/src/commands.ts       |  2 +-
 browse/src/write-commands.ts | 59 ++++++++++++++++++++++++++++++++++--
 gstack/llms.txt              |  2 +-
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/browse/src/commands.ts b/browse/src/commands.ts
index 493c19ea..1af127d5 100644
--- a/browse/src/commands.ts
+++ b/browse/src/commands.ts
@@ -134,7 +134,7 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; description: string; usage: string }> = {
-  'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies', usage: 'download <url|selector> [path] [--base64]' },
+  'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies. Use --navigate for URLs that trigger browser downloads (CDN redirects, Content-Disposition, anti-bot protected sites)', usage: 'download <url|selector> [path] [--base64] [--navigate]' },
   'scrape': { category: 'Extraction', description: 'Bulk download all media from page. Writes manifest.json', usage: 'scrape [--selector sel] [--dir path] [--limit N]' },
   'archive': { category: 'Extraction', description: 'Save complete page as MHTML via CDP', usage: 'archive [path]' },
   // Visual
diff --git a/browse/src/write-commands.ts b/browse/src/write-commands.ts
index 73896ba3..61c84d83 100644
--- a/browse/src/write-commands.ts
+++ b/browse/src/write-commands.ts
@@ -1137,9 +1137,10 @@ export async function handleWriteCommand(
     }
 
     case 'download': {
-      if (args.length === 0) throw new Error('Usage: download <url|selector> [path] [--base64]');
+      if (args.length === 0) throw new Error('Usage: download <url|selector> [path] [--base64] [--navigate]');
       const isBase64 = args.includes('--base64');
-      const filteredArgs = args.filter(a => a !== '--base64');
+      const useNavigate = args.includes('--navigate');
+      const filteredArgs = args.filter(a => a !== '--base64' && a !== '--navigate');
       let url = filteredArgs[0];
       const outputPath = filteredArgs[1];
 
@@ -1200,6 +1201,60 @@
         if (!match) throw new Error('Failed to decode blob data');
         contentType = match[1];
         buffer = Buffer.from(match[2], 'base64');
+      } else if (useNavigate) {
+        // Strategy 2: Navigate to URL and capture browser-triggered download.
+        // Handles URLs that trigger file downloads via redirects,
+        // Content-Disposition headers, or anti-bot CDN chains where
+        // page.request.fetch() can't follow the auth/redirect chain.
+        await validateNavigationUrl(url);
+        const downloadPromise = page.waitForEvent('download', { timeout: 60000 });
+        // Use goto with 'commit' wait — the page may redirect to trigger
+        // the download, so 'domcontentloaded' may never fire.
+        page.goto(url, { waitUntil: 'commit', timeout: 30000 }).catch(() => {
+          // Navigation may "fail" because the response is a download,
+          // not a page. The download event handles it.
+        });
+        const download = await downloadPromise;
+        const failure = await download.failure();
+        if (failure) {
+          throw new Error(`Download failed: ${failure}`);
+        }
+        // Save to temp location first, then read into buffer
+        const tempPath = path.join(TEMP_DIR, `browse-nav-download-${Date.now()}`);
+        await download.saveAs(tempPath);
+        buffer = fs.readFileSync(tempPath);
+        // Try to infer content type from suggested filename
+        const suggested = download.suggestedFilename();
+        if (suggested) {
+          const extMatch = suggested.match(/\.([a-z0-9]+)$/i);
+          if (extMatch) {
+            const extLower = extMatch[1].toLowerCase();
+            const mimeMap: Record<string, string> = {
+              epub: 'application/epub+zip', pdf: 'application/pdf',
+              zip: 'application/zip', gz: 'application/gzip',
+              mp3: 'audio/mpeg', mp4: 'video/mp4',
+              jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png',
+              txt: 'text/plain', html: 'text/html', json: 'application/json',
+            };
+            contentType = mimeMap[extLower] || 'application/octet-stream';
+          }
+        }
+        // Clean up temp file if we're going to write elsewhere
+        if (outputPath || isBase64) {
+          try { fs.unlinkSync(tempPath); } catch { /* ignore */ }
+        } else {
+          // No explicit output path — rename temp file with inferred extension.
+          const ext = contentType.split(';')[0].includes('/')
+            ? mimeToExt(contentType.split(';')[0].trim())
+            : '.bin';
+          const finalPath = path.join(TEMP_DIR, `browse-download-${Date.now()}${ext}`);
+          fs.renameSync(tempPath, finalPath);
+          const sizeKB = Math.round(buffer.length / 1024);
+          return `Downloaded: ${finalPath} (${sizeKB}KB, ${contentType.split(';')[0].trim()})${suggested ? ` [${suggested}]` : ''}`;
+        }
+        if (buffer.length > 200 * 1024 * 1024) {
+          throw new Error('File too large (>200MB).');
+        }
       } else {
         // Strategy 1: Direct URL via page.request.fetch().
         // Gate the URL through the same validator `goto` uses. Without
diff --git a/gstack/llms.txt b/gstack/llms.txt
index 7fb00400..8c5d4a39 100644
--- a/gstack/llms.txt
+++ b/gstack/llms.txt
@@ -64,7 +64,7 @@ Run with `browse <command> [args]`. Full reference: `browse/SKILL.md`.
 
 ### Extraction
 - `archive [path]`: Save complete page as MHTML via CDP
-- `download <url|selector> [path] [--base64]`: Download URL or media element to disk using browser cookies
+- `download <url|selector> [path] [--base64] [--navigate]`: Download URL or media element to disk using browser cookies.
 - `scrape [--selector sel] [--dir path] [--limit N]`: Bulk download all media from page.
 
 ### Inspection
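
For readers unfamiliar with the Playwright pattern behind Strategy 2, the following is a minimal standalone sketch of the navigate-and-capture flow, assuming an already-connected Playwright Page; the helper name captureNavigationDownload is illustrative and not part of this patch.

    import type { Page } from 'playwright';

    // Hypothetical helper showing the navigate-and-capture pattern used by
    // download --navigate; not part of the patch above.
    async function captureNavigationDownload(page: Page, url: string, destPath: string): Promise<string> {
      // Register the download listener before navigating so the event is not missed.
      const downloadPromise = page.waitForEvent('download', { timeout: 60000 });
      // 'commit' resolves as soon as the navigation is committed; later states such as
      // 'domcontentloaded' may never fire when the response is a file download.
      page.goto(url, { waitUntil: 'commit', timeout: 30000 }).catch(() => {
        // The navigation promise often rejects when the server answers with
        // Content-Disposition: attachment; the download event carries the payload.
      });
      const download = await downloadPromise;
      const failure = await download.failure();
      if (failure) throw new Error(`Download failed: ${failure}`);
      await download.saveAs(destPath);
      return download.suggestedFilename();
    }

A caller would invoke it as, for example, await captureNavigationDownload(page, 'https://example.com/file', '/tmp/out.bin'), which mirrors what the download command does internally when --navigate is passed.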