diff --git a/bin/gstack-gbrain-sync.ts b/bin/gstack-gbrain-sync.ts
index 6aea0f93..48563aa3 100644
--- a/bin/gstack-gbrain-sync.ts
+++ b/bin/gstack-gbrain-sync.ts
@@ -33,6 +33,7 @@ import { existsSync, statSync, mkdirSync, writeFileSync, readFileSync, unlinkSyn
 import { join, dirname } from "path";
 import { execSync, execFileSync, spawnSync } from "child_process";
 import { homedir } from "os";
+import { createHash } from "crypto";
 import { detectEngineTier, withErrorContext, canonicalizeRemote } from "../lib/gstack-memory-helpers";
 import { sourcePageCount } from "../lib/gbrain-sources";
 
@@ -158,20 +159,55 @@ function originUrl(): string | null {
 }
 
 /**
- * Derive a stable source id for the cwd code corpus. Pattern: `gstack-code-<slug>`,
- * where <slug> comes from canonicalizeRemote() then `/` → `-` (e.g.,
- * `github.com/garrytan/gstack` → `gstack-code-github-com-garrytan-gstack`).
+ * Derive a stable source id for the cwd code corpus. Pattern: `gstack-code-<slug>`.
  *
- * Falls back to `gstack-code-<basename>` when there is no origin (local repo).
+ * gbrain enforces source ids to be 1-32 lowercase alnum chars with optional interior
+ * hyphens. We use the last two segments of the canonical remote (org/repo) and skip
+ * the host — `github.com` etc. is the same for nearly every user and just eats budget.
+ * If the resulting id still exceeds 32 chars, we keep the tail (most distinctive end)
+ * and append a 6-char hash of the full slug for collision resistance.
+ *
+ * Falls back to the repo basename when there is no origin (local repo).
  */
 function deriveCodeSourceId(repoPath: string): string {
   const remote = canonicalizeRemote(originUrl());
   if (remote) {
-    return `gstack-code-${remote.replace(/[\/\s]+/g, "-").replace(/-+/g, "-")}`;
+    const segs = remote.split("/").filter(Boolean);
+    const slugSource = segs.slice(-2).join("-");
+    return constrainSourceId("gstack-code", slugSource);
   }
-  // Fallback for repos without a remote.
   const base = repoPath.split("/").pop() || "repo";
-  return `gstack-code-${base.toLowerCase().replace(/[^a-z0-9-]+/g, "-").replace(/-+/g, "-")}`;
+  return constrainSourceId("gstack-code", base);
+}
+
+/**
+ * Build a gbrain-valid source id (1-32 lowercase alnum + interior hyphens).
+ *
+ * Sanitizes `raw` into a slug and prefixes it with `prefix`. When the result would
+ * exceed 32 chars, keeps the tail of the slug and appends a 6-char SHA-1 prefix of the
+ * full slug. When nothing in `raw` survives sanitizing, the id is `prefix` plus a hash
+ * of `raw`, so it never ends in a bare hyphen.
+ *
+ * `prefix` is assumed to be a valid id fragment of at most 25 chars, which leaves room
+ * for the separator and the hash.
+ */
+function constrainSourceId(prefix: string, raw: string): string {
+  const MAX = 32;
+  const HASH_LEN = 6;
+  const shortHash = (s: string): string =>
+    createHash("sha1").update(s).digest("hex").slice(0, HASH_LEN);
+  const slug = raw.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
+  // Nothing survived sanitizing (e.g. "___" or a fully non-ASCII name): `${prefix}-` would
+  // end in a hyphen and break the interior-hyphen rule, so identify the input by its hash.
+  if (!slug) return `${prefix}-${shortHash(raw)}`;
+  const full = `${prefix}-${slug}`;
+  if (full.length <= MAX) return full;
+  const hash = shortHash(slug);
+  // Total budget: prefix + "-" + tail + "-" + hash
+  const tailBudget = MAX - prefix.length - 2 - HASH_LEN;
+  if (tailBudget < 1) return `${prefix}-${hash}`;
+  const tail = slug.slice(-tailBudget).replace(/^-+|-+$/g, "");
+  return tail ? `${prefix}-${tail}-${hash}` : `${prefix}-${hash}`;
 }
 
 function gbrainAvailable(): boolean {
diff --git a/test/gstack-gbrain-sync.test.ts b/test/gstack-gbrain-sync.test.ts
index 5401fc48..6693615d 100644
--- a/test/gstack-gbrain-sync.test.ts
+++ b/test/gstack-gbrain-sync.test.ts
@@ -108,6 +108,47 @@ describe("gstack-gbrain-sync CLI", () => {
     rmSync(home, { recursive: true, force: true });
   });
 
+  it("derived source ids are gbrain-valid (≤32 chars, alnum + interior hyphens, no dots) for any remote", () => {
+    // gbrain enforces source ids to be 1-32 lowercase alnum chars with optional interior
+    // hyphens. Pre-fix, the slug came from canonicalizeRemote() with only `/` and
+    // whitespace stripped — leaving dots from hostnames (`github.com`) and no length cap.
+    // For `github.com/<org>/<repo>`, the id was `gstack-code-github.com-<org>-<repo>`,
+    // which fails validation on both counts. This test exercises the derivation against
+    // controlled remotes by spawning the CLI in a temp git repo.
+    const cases = [
+      "https://github.com/radubach/platform.git", // dot in hostname, total > 32 with old slug
+      "git@github.com:garrytan/gstack.git", // SCP-style remote
+      "https://gitlab.example.com/team/proj.git", // multi-dot host, non-github
+      "https://github.com/some-very-long-org-name/some-very-long-repo-name.git", // forces hash-truncate
+    ];
+    const VALID_ID = /^[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?$/;
+    for (const remote of cases) {
+      const home = makeTestHome();
+      const gstackHome = join(home, ".gstack");
+      mkdirSync(gstackHome, { recursive: true });
+      const repo = mkdtempSync(join(tmpdir(), "gstack-source-id-repo-"));
+      spawnSync("git", ["init", "--quiet", "-b", "main"], { cwd: repo });
+      spawnSync("git", ["remote", "add", "origin", remote], { cwd: repo });
+
+      const r = spawnSync("bun", [SCRIPT, "--dry-run", "--code-only", "--quiet"], {
+        encoding: "utf-8",
+        timeout: 60000,
+        cwd: repo,
+        env: { ...process.env, HOME: home, GSTACK_HOME: gstackHome },
+      });
+      expect(r.status).toBe(0);
+      const m = (r.stdout || "").match(/gbrain sources add (\S+)/);
+      expect(m).not.toBeNull();
+      const id = m![1];
+      expect(id.length).toBeLessThanOrEqual(32);
+      expect(id).toMatch(VALID_ID);
+      expect(id.startsWith("gstack-code-")).toBe(true);
+
+      rmSync(repo, { recursive: true, force: true });
+      rmSync(home, { recursive: true, force: true });
+    }
+  });
+
   it("dry-run does NOT acquire the lock file (lock is for write paths only)", () => {
     const home = makeTestHome();
     const gstackHome = join(home, ".gstack");