fix(sync-gbrain): generate gbrain-valid source ids for repos with dots or long names

`deriveCodeSourceId` previously concatenated the canonicalized remote with only `/`
and whitespace stripped, leaving dots from hostnames (`github.com`) and no length
cap. gbrain rejects any source id containing characters outside [a-z0-9-] or longer
than 32 chars, so `github.com/<org>/<repo>` produced `gstack-code-github.com-<org>-<repo>`
(40 chars, plus dots) and registration failed:

    code  source registration failed: Invalid source id
          "gstack-code-github.com-radubach-platform". Must be 1-32 lowercase alnum
          chars with optional interior hyphens.

Fix:
- Drop the host segment (`github.com` is the same for nearly every user and just
  consumes the 32-char budget). Use only the last two path segments (org-repo).
- Sanitize any remaining non-alnum to hyphens, then collapse and trim.
- For genuinely long org/repo names that still exceed the budget, keep the tail
  (most distinctive end of the slug) and append a 6-char sha1 hash for collision
  resistance.

Adds a regression test that spawns the CLI in temp git repos with controlled
remotes (dot in hostname, SCP-style, multi-dot host, long names forcing
hash-truncation) and asserts every derived id is ≤32 chars and matches the
gbrain validator regex.
This commit is contained in:
Richard Dubach
2026-05-05 11:20:11 -07:00
parent db9447c333
commit 59d3075708
2 changed files with 72 additions and 7 deletions

View File

@@ -33,6 +33,7 @@ import { existsSync, statSync, mkdirSync, writeFileSync, readFileSync, unlinkSyn
import { join, dirname } from "path";
import { execSync, execFileSync, spawnSync } from "child_process";
import { homedir } from "os";
import { createHash } from "crypto";
import { detectEngineTier, withErrorContext, canonicalizeRemote } from "../lib/gstack-memory-helpers";
import { sourcePageCount } from "../lib/gbrain-sources";
@@ -158,20 +159,43 @@ function originUrl(): string | null {
}
/**
* Derive a stable source id for the cwd code corpus. Pattern: `gstack-code-<slug>`,
* where <slug> comes from canonicalizeRemote() then `/` → `-` (e.g.,
* `github.com/garrytan/gstack` → `gstack-code-github-com-garrytan-gstack`).
* Derive a stable source id for the cwd code corpus. Pattern: `gstack-code-<slug>`.
*
* Falls back to `gstack-code-<basename(repo)>` when there is no origin (local repo).
* gbrain enforces source ids to be 1-32 lowercase alnum chars with optional interior
* hyphens. We use the last two segments of the canonical remote (org/repo) and skip
* the host — `github.com` etc. is the same for nearly every user and just eats budget.
* If the resulting id still exceeds 32 chars, we keep the tail (most distinctive end)
* and append a 6-char hash of the full slug for collision resistance.
*
* Falls back to the repo basename when there is no origin (local repo).
*/
function deriveCodeSourceId(repoPath: string): string {
const remote = canonicalizeRemote(originUrl());
if (remote) {
return `gstack-code-${remote.replace(/[\/\s]+/g, "-").replace(/-+/g, "-")}`;
const segs = remote.split("/").filter(Boolean);
const slugSource = segs.slice(-2).join("-");
return constrainSourceId("gstack-code", slugSource);
}
// Fallback for repos without a remote.
const base = repoPath.split("/").pop() || "repo";
return `gstack-code-${base.toLowerCase().replace(/[^a-z0-9-]+/g, "-").replace(/-+/g, "-")}`;
return constrainSourceId("gstack-code", base);
}
/**
* Build a gbrain-valid source id (1-32 lowercase alnum + interior hyphens). Sanitizes
* `raw`, prefixes with `prefix`, and falls back to a hashed-tail form when total length
* would exceed 32 chars.
*/
function constrainSourceId(prefix: string, raw: string): string {
const MAX = 32;
const slug = raw.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
const full = `${prefix}-${slug}`;
if (full.length <= MAX) return full;
const hash = createHash("sha1").update(slug).digest("hex").slice(0, 6);
// Total budget: prefix + "-" + tail + "-" + hash
const tailBudget = MAX - prefix.length - 2 - hash.length;
if (tailBudget < 1) return `${prefix}-${hash}`;
const tail = slug.slice(-tailBudget).replace(/^-+|-+$/g, "");
return tail ? `${prefix}-${tail}-${hash}` : `${prefix}-${hash}`;
}
function gbrainAvailable(): boolean {

View File

@@ -108,6 +108,47 @@ describe("gstack-gbrain-sync CLI", () => {
rmSync(home, { recursive: true, force: true });
});
it("derived source ids are gbrain-valid (≤32 chars, alnum + interior hyphens, no dots) for any remote", () => {
// gbrain enforces source ids to be 1-32 lowercase alnum chars with optional interior
// hyphens. Pre-fix, the slug came from canonicalizeRemote() with only `/` and
// whitespace stripped — leaving dots from hostnames (`github.com`) and no length cap.
// For `github.com/<org>/<repo>`, the id was `gstack-code-github.com-<org>-<repo>`,
// which fails validation on both counts. This test exercises the derivation against
// controlled remotes by spawning the CLI in a temp git repo.
const cases = [
"https://github.com/radubach/platform.git", // dot in hostname, total > 32 with old slug
"git@github.com:garrytan/gstack.git", // SCP-style remote
"https://gitlab.example.com/team/proj.git", // multi-dot host, non-github
"https://github.com/some-very-long-org-name/some-very-long-repo-name.git", // forces hash-truncate
];
const VALID_ID = /^[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?$/;
for (const remote of cases) {
const home = makeTestHome();
const gstackHome = join(home, ".gstack");
mkdirSync(gstackHome, { recursive: true });
const repo = mkdtempSync(join(tmpdir(), "gstack-source-id-repo-"));
spawnSync("git", ["init", "--quiet", "-b", "main"], { cwd: repo });
spawnSync("git", ["remote", "add", "origin", remote], { cwd: repo });
const r = spawnSync("bun", [SCRIPT, "--dry-run", "--code-only", "--quiet"], {
encoding: "utf-8",
timeout: 60000,
cwd: repo,
env: { ...process.env, HOME: home, GSTACK_HOME: gstackHome },
});
expect(r.status).toBe(0);
const m = (r.stdout || "").match(/gbrain sources add (\S+)/);
expect(m).not.toBeNull();
const id = m![1];
expect(id.length).toBeLessThanOrEqual(32);
expect(id).toMatch(VALID_ID);
expect(id.startsWith("gstack-code-")).toBe(true);
rmSync(repo, { recursive: true, force: true });
rmSync(home, { recursive: true, force: true });
}
});
it("dry-run does NOT acquire the lock file (lock is for write paths only)", () => {
const home = makeTestHome();
const gstackHome = join(home, ".gstack");