Files
gstack/bin/gstack-gbrain-sync.ts
Richard Dubach 59d3075708 fix(sync-gbrain): generate gbrain-valid source ids for repos with dots or long names
`deriveCodeSourceId` previously concatenated the canonicalized remote with only `/`
and whitespace stripped, leaving dots from hostnames (`github.com`) and no length
cap. gbrain rejects any source id containing characters outside [a-z0-9-] or longer
than 32 chars, so `github.com/<org>/<repo>` produced `gstack-code-github.com-<org>-<repo>`
(40 chars, plus dots) and registration failed:

    code  source registration failed: Invalid source id
          "gstack-code-github.com-radubach-platform". Must be 1-32 lowercase alnum
          chars with optional interior hyphens.

Fix:
- Drop the host segment (`github.com` is the same for nearly every user and just
  consumes the 32-char budget). Use only the last two path segments (org-repo).
- Sanitize any remaining non-alnum to hyphens, then collapse and trim.
- For genuinely long org/repo names that still exceed the budget, keep the tail
  (most distinctive end of the slug) and append a 6-char sha1 hash for collision
  resistance.

Adds a regression test that spawns the CLI in temp git repos with controlled
remotes (dot in hostname, SCP-style, multi-dot host, long names forcing
hash-truncation) and asserts every derived id is ≤32 chars and matches the
gbrain validator regex.
2026-05-05 11:20:11 -07:00

578 lines
20 KiB
TypeScript

#!/usr/bin/env bun
/**
* gstack-gbrain-sync — V1 unified sync verb.
*
* Orchestrates three storage tiers per plan §"Storage tiering":
*
* 1. Code (current repo) → `gbrain sources add` (idempotent via
* lib/gbrain-sources.ts) + `gbrain sync
* --strategy code` (incremental) or
* `gbrain reindex-code --yes` (--full).
* NEVER `gbrain import` (markdown only).
* 2. Transcripts + curated memory → gstack-memory-ingest (typed put_page)
* 3. Curated artifacts to git → gstack-brain-sync (existing pipeline)
*
* Modes:
* --incremental (default) — mtime fast-path; runs all 3 stages with cache hits
* --full — first-run; full walk + reindex; honest budget per ED2
* --dry-run — preview what would sync; no writes anywhere (incl. state file)
*
* Concurrency safety per /plan-eng-review D1:
* - Lock file at ~/.gstack/.sync-gbrain.lock (PID + start ts).
* - Stale-lock takeover after 5 min (process death).
* - State file written via tmp+rename for atomicity.
* - Lock released in finally; SIGINT/SIGTERM trapped for cleanup.
*
* --watch (V1.5 P0 TODO): file-watcher daemon. NOTE: gbrain v0.25.1 already
* ships `gbrain sync --watch [--interval N]` and `gbrain sync --install-cron`;
* when revisited, /sync-gbrain --watch wires through to the gbrain CLI rather
* than building a gstack-side daemon.
*/
import { existsSync, statSync, mkdirSync, writeFileSync, readFileSync, unlinkSync, renameSync } from "fs";
import { join, dirname } from "path";
import { execSync, execFileSync, spawnSync } from "child_process";
import { homedir } from "os";
import { createHash } from "crypto";
import { detectEngineTier, withErrorContext, canonicalizeRemote } from "../lib/gstack-memory-helpers";
import { sourcePageCount } from "../lib/gbrain-sources";
// ── Types ──────────────────────────────────────────────────────────────────
type Mode = "incremental" | "full" | "dry-run";
interface CliArgs {
mode: Mode;
quiet: boolean;
noCode: boolean;
noMemory: boolean;
noBrainSync: boolean;
codeOnly: boolean;
}
interface CodeStageDetail {
source_id?: string;
source_path?: string;
page_count?: number | null;
last_imported?: string;
status?: "ok" | "skipped" | "failed";
}
interface StageResult {
name: string;
ran: boolean;
ok: boolean;
duration_ms: number;
summary: string;
/** Stage-specific structured detail. Code stage carries source_id + page_count. */
detail?: CodeStageDetail;
}
// ── Constants ──────────────────────────────────────────────────────────────
const HOME = homedir();
const GSTACK_HOME = process.env.GSTACK_HOME || join(HOME, ".gstack");
const STATE_PATH = join(GSTACK_HOME, ".gbrain-sync-state.json");
const LOCK_PATH = join(GSTACK_HOME, ".sync-gbrain.lock");
const STALE_LOCK_MS = 5 * 60 * 1000;
// ── CLI ────────────────────────────────────────────────────────────────────
function printUsage(): void {
console.error(`Usage: gstack-gbrain-sync [--incremental|--full|--dry-run] [options]
Modes:
--incremental Default. mtime fast-path; ~50ms steady-state.
--full First-run; full walk + reindex. Honest ~25-35 min for big Macs (ED2).
--dry-run Preview what would sync; no writes anywhere.
Options:
--quiet Suppress per-stage output.
--no-code Skip the cwd code-import stage.
--no-memory Skip the gstack-memory-ingest stage (transcripts + artifacts).
--no-brain-sync Skip the gstack-brain-sync git pipeline stage.
--code-only Only run the code-import stage (alias for --no-memory --no-brain-sync).
--help This text.
Stages run in order: code → memory ingest → curated git push.
Each stage failure is non-fatal; subsequent stages still run.
`);
}
function parseArgs(): CliArgs {
const args = process.argv.slice(2);
let mode: Mode = "incremental";
let quiet = false;
let noCode = false;
let noMemory = false;
let noBrainSync = false;
let codeOnly = false;
for (let i = 0; i < args.length; i++) {
const a = args[i];
switch (a) {
case "--incremental": mode = "incremental"; break;
case "--full": mode = "full"; break;
case "--dry-run": mode = "dry-run"; break;
case "--quiet": quiet = true; break;
case "--no-code": noCode = true; break;
case "--no-memory": noMemory = true; break;
case "--no-brain-sync": noBrainSync = true; break;
case "--code-only":
codeOnly = true;
noMemory = true;
noBrainSync = true;
break;
case "--help":
case "-h":
printUsage();
process.exit(0);
default:
console.error(`Unknown argument: ${a}`);
printUsage();
process.exit(1);
}
}
return { mode, quiet, noCode, noMemory, noBrainSync, codeOnly };
}
// ── Helpers ────────────────────────────────────────────────────────────────
function repoRoot(): string | null {
try {
const out = execSync("git rev-parse --show-toplevel", { encoding: "utf-8", timeout: 2000 });
return out.trim();
} catch {
return null;
}
}
function originUrl(): string | null {
try {
const out = execSync("git remote get-url origin", { encoding: "utf-8", timeout: 2000 });
return out.trim();
} catch {
return null;
}
}
/**
* Derive a stable source id for the cwd code corpus. Pattern: `gstack-code-<slug>`.
*
* gbrain enforces source ids to be 1-32 lowercase alnum chars with optional interior
* hyphens. We use the last two segments of the canonical remote (org/repo) and skip
* the host — `github.com` etc. is the same for nearly every user and just eats budget.
* If the resulting id still exceeds 32 chars, we keep the tail (most distinctive end)
* and append a 6-char hash of the full slug for collision resistance.
*
* Falls back to the repo basename when there is no origin (local repo).
*/
function deriveCodeSourceId(repoPath: string): string {
const remote = canonicalizeRemote(originUrl());
if (remote) {
const segs = remote.split("/").filter(Boolean);
const slugSource = segs.slice(-2).join("-");
return constrainSourceId("gstack-code", slugSource);
}
const base = repoPath.split("/").pop() || "repo";
return constrainSourceId("gstack-code", base);
}
/**
* Build a gbrain-valid source id (1-32 lowercase alnum + interior hyphens). Sanitizes
* `raw`, prefixes with `prefix`, and falls back to a hashed-tail form when total length
* would exceed 32 chars.
*/
function constrainSourceId(prefix: string, raw: string): string {
const MAX = 32;
const slug = raw.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
const full = `${prefix}-${slug}`;
if (full.length <= MAX) return full;
const hash = createHash("sha1").update(slug).digest("hex").slice(0, 6);
// Total budget: prefix + "-" + tail + "-" + hash
const tailBudget = MAX - prefix.length - 2 - hash.length;
if (tailBudget < 1) return `${prefix}-${hash}`;
const tail = slug.slice(-tailBudget).replace(/^-+|-+$/g, "");
return tail ? `${prefix}-${tail}-${hash}` : `${prefix}-${hash}`;
}
function gbrainAvailable(): boolean {
try {
execSync("command -v gbrain", { stdio: "ignore" });
return true;
} catch {
return false;
}
}
// ── Lock file (D1) ─────────────────────────────────────────────────────────
interface LockInfo {
pid: number;
started_at: string;
}
function acquireLock(): boolean {
mkdirSync(GSTACK_HOME, { recursive: true });
if (existsSync(LOCK_PATH)) {
// Check if stale.
try {
const stat = statSync(LOCK_PATH);
const ageMs = Date.now() - stat.mtimeMs;
if (ageMs > STALE_LOCK_MS) {
// Stale; take over.
unlinkSync(LOCK_PATH);
} else {
return false;
}
} catch {
// Cannot stat; bail conservatively.
return false;
}
}
const info: LockInfo = { pid: process.pid, started_at: new Date().toISOString() };
try {
writeFileSync(LOCK_PATH, JSON.stringify(info), { encoding: "utf-8", flag: "wx" });
return true;
} catch {
return false;
}
}
function releaseLock(): void {
try {
if (!existsSync(LOCK_PATH)) return;
const raw = readFileSync(LOCK_PATH, "utf-8");
const info = JSON.parse(raw) as LockInfo;
if (info.pid === process.pid) {
unlinkSync(LOCK_PATH);
}
} catch {
// Best-effort cleanup.
}
}
// ── Stage runners ──────────────────────────────────────────────────────────
function runCodeImport(args: CliArgs): StageResult {
const t0 = Date.now();
const root = repoRoot();
if (!root) {
return { name: "code", ran: false, ok: true, duration_ms: 0, summary: "skipped (not in git repo)" };
}
if (!gbrainAvailable()) {
return { name: "code", ran: false, ok: false, duration_ms: 0, summary: "skipped (gbrain CLI not in PATH)" };
}
const sourceId = deriveCodeSourceId(root);
if (args.mode === "dry-run") {
return {
name: "code",
ran: false,
ok: true,
duration_ms: 0,
summary: `would: gbrain sources add ${sourceId} --path ${root} --federated; gbrain sync --strategy code --source ${sourceId}`,
detail: { source_id: sourceId, source_path: root, status: "skipped" },
};
}
// Step 1: Ensure source registered (idempotent).
let registered = false;
try {
// ensureSourceRegistered is async — but we're in a sync stage runner. Use a deasync pattern.
// Bun supports top-level await in main(), but stage runners are sync per orchestrator contract.
// Workaround: run as a child Bun script for the registration probe.
// Simpler: call gbrain CLI directly via the sync helpers in lib/gbrain-sources.ts probeSource.
// For symmetry, we duplicate the small ensureSourceRegistered logic synchronously here using
// execFileSync. (The lib helper is preferred for async callers; sync helpers below.)
registered = ensureSourceRegisteredSync(sourceId, root);
} catch (err) {
return {
name: "code",
ran: true,
ok: false,
duration_ms: Date.now() - t0,
summary: `source registration failed: ${(err as Error).message}`,
detail: { source_id: sourceId, source_path: root, status: "failed" },
};
}
// Step 2: Run sync or reindex.
const syncArgs = args.mode === "full"
? ["reindex-code", "--source", sourceId, "--yes"]
: ["sync", "--strategy", "code", "--source", sourceId];
const syncResult = spawnSync("gbrain", syncArgs, {
stdio: args.quiet ? ["ignore", "ignore", "ignore"] : ["ignore", "inherit", "inherit"],
timeout: 35 * 60 * 1000,
});
if (syncResult.status !== 0) {
return {
name: "code",
ran: true,
ok: false,
duration_ms: Date.now() - t0,
summary: `gbrain ${syncArgs.join(" ")} exited ${syncResult.status}`,
detail: { source_id: sourceId, source_path: root, status: "failed" },
};
}
// Step 3: Read page_count from gbrain sources list.
const pageCount = sourcePageCount(sourceId);
return {
name: "code",
ran: true,
ok: true,
duration_ms: Date.now() - t0,
summary: `${registered ? "registered + " : ""}synced ${sourceId} (page_count=${pageCount ?? "unknown"})`,
detail: {
source_id: sourceId,
source_path: root,
page_count: pageCount,
last_imported: new Date().toISOString(),
status: "ok",
},
};
}
/**
* Synchronous mirror of ensureSourceRegistered for use inside the synchronous
* stage runner. Returns true if registration changed (added or re-added).
*/
function ensureSourceRegisteredSync(id: string, path: string): boolean {
// Probe.
let probeOut: string;
try {
probeOut = execFileSync("gbrain", ["sources", "list", "--json"], {
encoding: "utf-8",
timeout: 10_000,
stdio: ["ignore", "pipe", "pipe"],
});
} catch (err) {
const e = err as NodeJS.ErrnoException & { stderr?: Buffer };
const stderr = e.stderr?.toString() || "";
if (e.code === "ENOENT") throw new Error("gbrain CLI not on PATH");
if (stderr.includes("Cannot connect to database") || stderr.includes("config.json")) {
throw new Error("gbrain not configured (run /setup-gbrain)");
}
throw err;
}
let parsed: { sources?: Array<{ id?: string; local_path?: string }> };
try {
parsed = JSON.parse(probeOut);
} catch (err) {
throw new Error(`gbrain sources list returned non-JSON: ${(err as Error).message}`);
}
const sources = parsed.sources || [];
const match = sources.find((s) => s.id === id);
if (match && match.local_path === path) {
return false; // no-op
}
if (match && match.local_path !== path) {
const rm = spawnSync("gbrain", ["sources", "remove", id, "--yes"], {
encoding: "utf-8",
timeout: 30_000,
});
if (rm.status !== 0) {
throw new Error(`gbrain sources remove ${id} failed: ${rm.stderr || rm.stdout || `exit ${rm.status}`}`);
}
}
const add = spawnSync("gbrain", ["sources", "add", id, "--path", path, "--federated"], {
encoding: "utf-8",
timeout: 30_000,
});
if (add.status !== 0) {
throw new Error(`gbrain sources add ${id} failed: ${add.stderr || add.stdout || `exit ${add.status}`}`);
}
return true;
}
function runMemoryIngest(args: CliArgs): StageResult {
const t0 = Date.now();
if (args.mode === "dry-run") {
return { name: "memory", ran: false, ok: true, duration_ms: 0, summary: "would: gstack-memory-ingest --probe" };
}
const ingestPath = join(import.meta.dir, "gstack-memory-ingest.ts");
const ingestArgs = ["run", ingestPath];
if (args.mode === "full") ingestArgs.push("--bulk");
else ingestArgs.push("--incremental");
if (args.quiet) ingestArgs.push("--quiet");
const result = spawnSync("bun", ingestArgs, {
encoding: "utf-8",
timeout: 35 * 60 * 1000,
});
const summary = (result.stderr || "").split("\n").filter((l) => l.includes("[memory-ingest]")).slice(-1)[0] || "ingest pass complete";
return {
name: "memory",
ran: true,
ok: result.status === 0,
duration_ms: Date.now() - t0,
summary: result.status === 0 ? summary : `memory ingest exited ${result.status}`,
};
}
function runBrainSyncPush(args: CliArgs): StageResult {
const t0 = Date.now();
if (args.mode === "dry-run") {
return { name: "brain-sync", ran: false, ok: true, duration_ms: 0, summary: "would: gstack-brain-sync --discover-new --once" };
}
const brainSyncPath = join(import.meta.dir, "gstack-brain-sync");
if (!existsSync(brainSyncPath)) {
return { name: "brain-sync", ran: false, ok: true, duration_ms: 0, summary: "skipped (gstack-brain-sync not installed)" };
}
spawnSync(brainSyncPath, ["--discover-new"], {
stdio: args.quiet ? ["ignore", "ignore", "ignore"] : ["ignore", "inherit", "inherit"],
timeout: 60 * 1000,
});
const result = spawnSync(brainSyncPath, ["--once"], {
stdio: args.quiet ? ["ignore", "ignore", "ignore"] : ["ignore", "inherit", "inherit"],
timeout: 60 * 1000,
});
return {
name: "brain-sync",
ran: true,
ok: result.status === 0,
duration_ms: Date.now() - t0,
summary: result.status === 0 ? "curated artifacts pushed" : `gstack-brain-sync exited ${result.status}`,
};
}
// ── State file ─────────────────────────────────────────────────────────────
interface SyncState {
schema_version: 1;
last_writer: string;
last_sync?: string;
last_full_sync?: string;
last_stages?: StageResult[];
}
function loadSyncState(): SyncState {
if (!existsSync(STATE_PATH)) {
return { schema_version: 1, last_writer: "gstack-gbrain-sync" };
}
try {
const raw = JSON.parse(readFileSync(STATE_PATH, "utf-8")) as SyncState;
if (raw.schema_version === 1) return raw;
} catch {
// fall through
}
return { schema_version: 1, last_writer: "gstack-gbrain-sync" };
}
/**
* Atomic state file write per /plan-eng-review D1: write tmp file then rename.
* rename(2) is atomic on POSIX filesystems.
*/
function saveSyncState(state: SyncState): void {
try {
mkdirSync(dirname(STATE_PATH), { recursive: true });
const tmp = `${STATE_PATH}.tmp.${process.pid}`;
writeFileSync(tmp, JSON.stringify(state, null, 2), "utf-8");
renameSync(tmp, STATE_PATH);
} catch {
// non-fatal
}
}
// ── Output ─────────────────────────────────────────────────────────────────
function formatStage(s: StageResult): string {
const status = !s.ran ? "SKIP" : s.ok ? "OK" : "ERR";
const dur = s.duration_ms > 0 ? ` (${(s.duration_ms / 1000).toFixed(1)}s)` : "";
return ` ${status.padEnd(5)} ${s.name.padEnd(12)} ${s.summary}${dur}`;
}
// ── Main ───────────────────────────────────────────────────────────────────
async function main(): Promise<void> {
const args = parseArgs();
if (!args.quiet) {
const engine = detectEngineTier();
console.error(`[gbrain-sync] mode=${args.mode} engine=${engine.engine}`);
}
// Acquire lock (skip on dry-run since dry-run never writes).
const needsLock = args.mode !== "dry-run";
let haveLock = false;
if (needsLock) {
haveLock = acquireLock();
if (!haveLock) {
console.error(
`[gbrain-sync] another /sync-gbrain is running (lock at ${LOCK_PATH}). ` +
`If that process died, the lock auto-clears after 5 min, or remove it manually.`
);
process.exit(2);
}
}
const cleanup = () => {
if (haveLock) releaseLock();
};
process.on("SIGINT", () => { cleanup(); process.exit(130); });
process.on("SIGTERM", () => { cleanup(); process.exit(143); });
let exitCode = 0;
try {
const state = loadSyncState();
const stages: StageResult[] = [];
if (!args.noCode) {
stages.push(await withErrorContext("sync:code", () => runCodeImport(args), "gstack-gbrain-sync"));
}
if (!args.noMemory) {
stages.push(await withErrorContext("sync:memory", () => runMemoryIngest(args), "gstack-gbrain-sync"));
}
if (!args.noBrainSync) {
stages.push(await withErrorContext("sync:brain-sync", () => runBrainSyncPush(args), "gstack-gbrain-sync"));
}
if (args.mode !== "dry-run") {
state.last_sync = new Date().toISOString();
if (args.mode === "full") state.last_full_sync = state.last_sync;
state.last_stages = stages;
saveSyncState(state);
}
if (!args.quiet || args.mode === "dry-run") {
console.log(`\ngstack-gbrain-sync (${args.mode}):`);
for (const s of stages) console.log(formatStage(s));
const okCount = stages.filter((s) => s.ok).length;
const errCount = stages.filter((s) => !s.ok && s.ran).length;
console.log(`\n ${okCount} ok, ${errCount} error, ${stages.length - okCount - errCount} skipped`);
}
const anyError = stages.some((s) => s.ran && !s.ok);
exitCode = anyError ? 1 : 0;
} finally {
cleanup();
}
process.exit(exitCode);
}
main().catch((err) => {
console.error(`gstack-gbrain-sync fatal: ${err instanceof Error ? err.message : String(err)}`);
releaseLock();
process.exit(1);
});