feat(browse): --navigate flag on download for browser-triggered files

Adds the --navigate strategy from community PR #1355 (originally from
@garrytan-agents). When set, download navigates to the URL with
waitUntil:'commit' and captures the resulting browser download via
page.waitForEvent('download'), then saves via download.saveAs().
Handles URLs that trigger files via Content-Disposition headers,
multi-hop CDN redirects requiring browser cookies, or anti-bot CDN
chains where page.request.fetch() can't follow the auth/redirect
chain.

The default behavior still uses the existing direct-fetch strategy;
--navigate is opt-in.

Goes through the same validateNavigationUrl SSRF gate as goto, so
download --navigate cannot reach IPv4 metadata endpoints (AWS IMDSv1,
GCP/Azure equivalents) or arbitrary internal hosts.

Infers the content type from the suggested filename for common
extensions (epub, pdf, zip, gz, mp3/mp4, jpg/jpeg/png, txt, html,
json), falling back to application/octet-stream. Same 200MB cap as
Strategy 1.

Frames the use case generically (anti-bot CDN, Content-Disposition,
redirect chains) rather than naming any specific site, per project
voice rules.

Co-Authored-By: @garrytan-agents
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-07 13:37:14 -07:00
parent 04a813e21f
commit 9cb98a7103
3 changed files with 59 additions and 4 deletions

View File

@@ -134,7 +134,7 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' },
'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' },
// Data extraction
'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies', usage: 'download <url|@ref> [path] [--base64]' },
'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies. Use --navigate for URLs that trigger browser downloads (CDN redirects, Content-Disposition, anti-bot protected sites)', usage: 'download <url|@ref> [path] [--base64] [--navigate]' },
'scrape': { category: 'Extraction', description: 'Bulk download all media from page. Writes manifest.json', usage: 'scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]' },
'archive': { category: 'Extraction', description: 'Save complete page as MHTML via CDP', usage: 'archive [path]' },
// Visual

View File

@@ -1137,9 +1137,10 @@ export async function handleWriteCommand(
}
case 'download': {
if (args.length === 0) throw new Error('Usage: download <url|@ref> [path] [--base64]');
if (args.length === 0) throw new Error('Usage: download <url|@ref> [path] [--base64] [--navigate]');
const isBase64 = args.includes('--base64');
const filteredArgs = args.filter(a => a !== '--base64');
const useNavigate = args.includes('--navigate');
const filteredArgs = args.filter(a => a !== '--base64' && a !== '--navigate');
let url = filteredArgs[0];
const outputPath = filteredArgs[1];
@@ -1200,6 +1201,60 @@ export async function handleWriteCommand(
if (!match) throw new Error('Failed to decode blob data');
contentType = match[1];
buffer = Buffer.from(match[2], 'base64');
} else if (useNavigate) {
// Strategy 2: Navigate to URL and capture browser-triggered download.
// Handles URLs that trigger file downloads via redirects,
// Content-Disposition headers, or anti-bot CDN chains where
// page.request.fetch() can't follow the auth/redirect chain.
await validateNavigationUrl(url);
const downloadPromise = page.waitForEvent('download', { timeout: 60000 });
// Use goto with 'commit' wait — the page may redirect to trigger
// the download, so 'domcontentloaded' may never fire.
page.goto(url, { waitUntil: 'commit', timeout: 30000 }).catch(() => {
// Navigation may "fail" because the response is a download,
// not a page. The download event handles it.
});
const download = await downloadPromise;
const failure = await download.failure();
if (failure) {
throw new Error(`Download failed: ${failure}`);
}
// Save to temp location first, then read into buffer
const tempPath = path.join(TEMP_DIR, `browse-nav-download-${Date.now()}`);
await download.saveAs(tempPath);
buffer = fs.readFileSync(tempPath);
// Try to infer content type from suggested filename
const suggested = download.suggestedFilename();
if (suggested) {
const extMatch = suggested.match(/\.([a-z0-9]+)$/i);
if (extMatch) {
const extLower = extMatch[1].toLowerCase();
const mimeMap: Record<string, string> = {
epub: 'application/epub+zip', pdf: 'application/pdf',
zip: 'application/zip', gz: 'application/gzip',
mp3: 'audio/mpeg', mp4: 'video/mp4',
jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png',
txt: 'text/plain', html: 'text/html', json: 'application/json',
};
contentType = mimeMap[extLower] || 'application/octet-stream';
}
}
// Clean up temp file if we're going to write elsewhere
if (outputPath || isBase64) {
try { fs.unlinkSync(tempPath); } catch { /* ignore */ }
} else {
// No explicit output path — rename temp file with inferred extension.
const ext = contentType.split(';')[0].includes('/')
? mimeToExt(contentType.split(';')[0].trim())
: '.bin';
const finalPath = path.join(TEMP_DIR, `browse-download-${Date.now()}${ext}`);
fs.renameSync(tempPath, finalPath);
const sizeKB = Math.round(buffer.length / 1024);
return `Downloaded: ${finalPath} (${sizeKB}KB, ${contentType.split(';')[0].trim()})${suggested ? ` [${suggested}]` : ''}`;
}
if (buffer.length > 200 * 1024 * 1024) {
throw new Error('File too large (>200MB).');
}
} else {
// Strategy 1: Direct URL via page.request.fetch().
// Gate the URL through the same validator `goto` uses. Without

View File

@@ -64,7 +64,7 @@ Run with `browse <command> [args]`. Full reference: `browse/SKILL.md`.
### Extraction
- `archive [path]`: Save complete page as MHTML via CDP
- `download <url|@ref> [path] [--base64]`: Download URL or media element to disk using browser cookies
- `download <url|@ref> [path] [--base64] [--navigate]`: Download URL or media element to disk using browser cookies. Use `--navigate` for URLs that trigger browser downloads (CDN redirects, Content-Disposition, anti-bot protected sites).
- `scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]`: Bulk download all media from page.
### Inspection