feat(browse): --navigate flag on download for browser-triggered files
Adds the --navigate strategy from community PR #1355 (originally from @garrytan-agents). When set, download navigates to the URL with waitUntil: 'commit' and captures the resulting browser download via page.waitForEvent('download'), then saves it via download.saveAs(). This handles URLs that deliver files via Content-Disposition headers, multi-hop CDN redirects requiring browser cookies, or anti-bot CDN chains that page.request.fetch() can't follow.

The default remains the existing direct-fetch strategy; --navigate is opt-in. It goes through the same validateNavigationUrl SSRF gate as goto, so download --navigate cannot reach IPv4 metadata endpoints (AWS IMDSv1, GCP/Azure equivalents) or arbitrary internal hosts.

Content type is inferred from the suggested filename for common extensions (epub, pdf, zip, gz, mp3/mp4, jpg/jpeg/png, txt, html, json), falling back to application/octet-stream. The same 200MB cap as Strategy 1 applies.

The use case is framed generically (anti-bot CDN, Content-Disposition, redirect chains) rather than naming any specific site, per project voice rules.

Co-Authored-By: @garrytan-agents
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
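For orientation, here is the navigate-and-capture pattern in isolation: a minimal sketch assuming an existing Playwright `page`, with the function name and destination argument invented for illustration (the real logic lives inline in the `download` handler in the diff below).

```ts
import type { Page } from 'playwright';

// Sketch of the Strategy 2 flow this commit adds (not the actual handler code).
async function downloadViaNavigation(page: Page, url: string, dest: string): Promise<string> {
  // Arm the listener before navigating so the download event can't be missed.
  const downloadPromise = page.waitForEvent('download', { timeout: 60000 });
  // 'commit' is deliberate: a Content-Disposition response aborts the
  // navigation, so 'domcontentloaded' or 'load' may never fire.
  page.goto(url, { waitUntil: 'commit', timeout: 30000 }).catch(() => {
    // Expected when the response is a file rather than a page.
  });
  const download = await downloadPromise;
  const failure = await download.failure();
  if (failure) throw new Error(`Download failed: ${failure}`);
  await download.saveAs(dest);
  return download.suggestedFilename();
}
```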
@@ -134,7 +134,7 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
   'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' },
   'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' },
   // Data extraction
-  'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies', usage: 'download <url|@ref> [path] [--base64]' },
+  'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies. Use --navigate for URLs that trigger browser downloads (CDN redirects, Content-Disposition, anti-bot protected sites)', usage: 'download <url|@ref> [path] [--base64] [--navigate]' },
   'scrape': { category: 'Extraction', description: 'Bulk download all media from page. Writes manifest.json', usage: 'scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]' },
   'archive': { category: 'Extraction', description: 'Save complete page as MHTML via CDP', usage: 'archive [path]' },
   // Visual

@@ -1137,9 +1137,10 @@ export async function handleWriteCommand(
     }
 
     case 'download': {
-      if (args.length === 0) throw new Error('Usage: download <url|@ref> [path] [--base64]');
+      if (args.length === 0) throw new Error('Usage: download <url|@ref> [path] [--base64] [--navigate]');
       const isBase64 = args.includes('--base64');
-      const filteredArgs = args.filter(a => a !== '--base64');
+      const useNavigate = args.includes('--navigate');
+      const filteredArgs = args.filter(a => a !== '--base64' && a !== '--navigate');
       let url = filteredArgs[0];
       const outputPath = filteredArgs[1];
 

@@ -1200,6 +1201,60 @@ export async function handleWriteCommand(
         if (!match) throw new Error('Failed to decode blob data');
         contentType = match[1];
         buffer = Buffer.from(match[2], 'base64');
+      } else if (useNavigate) {
+        // Strategy 2: Navigate to URL and capture browser-triggered download.
+        // Handles URLs that trigger file downloads via redirects,
+        // Content-Disposition headers, or anti-bot CDN chains where
+        // page.request.fetch() can't follow the auth/redirect chain.
+        await validateNavigationUrl(url);
+        const downloadPromise = page.waitForEvent('download', { timeout: 60000 });
+        // Use goto with 'commit' wait — the page may redirect to trigger
+        // the download, so 'domcontentloaded' may never fire.
+        page.goto(url, { waitUntil: 'commit', timeout: 30000 }).catch(() => {
+          // Navigation may "fail" because the response is a download,
+          // not a page. The download event handles it.
+        });
+        const download = await downloadPromise;
+        const failure = await download.failure();
+        if (failure) {
+          throw new Error(`Download failed: ${failure}`);
+        }
+        // Save to temp location first, then read into buffer
+        const tempPath = path.join(TEMP_DIR, `browse-nav-download-${Date.now()}`);
+        await download.saveAs(tempPath);
+        buffer = fs.readFileSync(tempPath);
+        // Try to infer content type from suggested filename
+        const suggested = download.suggestedFilename();
+        if (suggested) {
+          const extMatch = suggested.match(/\.([a-z0-9]+)$/i);
+          if (extMatch) {
+            const extLower = extMatch[1].toLowerCase();
+            const mimeMap: Record<string, string> = {
+              epub: 'application/epub+zip', pdf: 'application/pdf',
+              zip: 'application/zip', gz: 'application/gzip',
+              mp3: 'audio/mpeg', mp4: 'video/mp4',
+              jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png',
+              txt: 'text/plain', html: 'text/html', json: 'application/json',
+            };
+            contentType = mimeMap[extLower] || 'application/octet-stream';
+          }
+        }
+        // Clean up temp file if we're going to write elsewhere
+        if (outputPath || isBase64) {
+          try { fs.unlinkSync(tempPath); } catch { /* ignore */ }
+        } else {
+          // No explicit output path — rename temp file with inferred extension.
+          const ext = contentType.split(';')[0].includes('/')
+            ? mimeToExt(contentType.split(';')[0].trim())
+            : '.bin';
+          const finalPath = path.join(TEMP_DIR, `browse-download-${Date.now()}${ext}`);
+          fs.renameSync(tempPath, finalPath);
+          const sizeKB = Math.round(buffer.length / 1024);
+          return `Downloaded: ${finalPath} (${sizeKB}KB, ${contentType.split(';')[0].trim()})${suggested ? ` [${suggested}]` : ''}`;
+        }
+        if (buffer.length > 200 * 1024 * 1024) {
+          throw new Error('File too large (>200MB).');
+        }
       } else {
         // Strategy 1: Direct URL via page.request.fetch().
         // Gate the URL through the same validator `goto` uses. Without
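The `validateNavigationUrl` implementation is outside this diff. As a rough, hypothetical sketch of the kind of gate the commit message describes (blocking link-local metadata endpoints and private ranges; the names and exact checks here are assumptions, not the repo's code):

```ts
import { lookup } from 'node:dns/promises';
import net from 'node:net';

// Hypothetical SSRF gate in the spirit of validateNavigationUrl.
function isBlockedIPv4(ip: string): boolean {
  const [a, b] = ip.split('.').map(Number);
  return (
    a === 10 || a === 127 ||              // private, loopback
    (a === 169 && b === 254) ||           // link-local, incl. 169.254.169.254 (cloud metadata)
    (a === 172 && b >= 16 && b <= 31) ||  // private
    (a === 192 && b === 168)              // private
  );
}

async function validateNavigationUrlSketch(raw: string): Promise<void> {
  const url = new URL(raw);
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error(`Blocked protocol: ${url.protocol}`);
  }
  // Resolve hostnames so a DNS name pointing at an internal address is also
  // caught; a hardened gate would pin the resolved IP for the actual request.
  const host = url.hostname;
  const ip = net.isIP(host) ? host : (await lookup(host, { family: 4 })).address;
  if (net.isIP(ip) === 4 && isBlockedIPv4(ip)) {
    throw new Error(`Blocked internal address: ${host} -> ${ip}`);
  }
}
```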
@@ -64,7 +64,7 @@ Run with `browse <command> [args]`. Full reference: `browse/SKILL.md`.
 
 ### Extraction
 - `archive [path]`: Save complete page as MHTML via CDP
-- `download <url|@ref> [path] [--base64]`: Download URL or media element to disk using browser cookies
+- `download <url|@ref> [path] [--base64] [--navigate]`: Download URL or media element to disk using browser cookies. Use `--navigate` for URLs that trigger browser downloads (CDN redirects, Content-Disposition, anti-bot protected sites)
 - `scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]`: Bulk download all media from page.
 
 ### Inspection
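As a usage sketch for the `--navigate` bullet above (URL hypothetical): `browse download https://example.com/report.pdf --navigate` saves the browser-triggered file, inferring the content type from the suggested filename; the same command without the flag uses the direct-fetch strategy with browser cookies.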