directory.ts 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813
  1. /**
  2. * Directory Management
  3. *
  4. * Manages the .codegraph/ directory structure for CodeGraph data.
  5. */
  6. import * as fs from 'fs';
  7. import * as os from 'os';
  8. import * as path from 'path';
  9. /** The default per-project data directory name. */
  10. const DEFAULT_CODEGRAPH_DIR = '.codegraph';
  11. let warnedBadDirName = false;
  12. /**
  13. * Resolve the per-project data directory name, honoring the `CODEGRAPH_DIR`
  14. * environment override (default `.codegraph`). The override is a single path
  15. * segment that lives in the project root.
  16. *
  17. * Why this exists: two environments that share one working tree must NOT share
  18. * one `.codegraph/` — most concretely Windows-native and WSL (issue #636). The
  19. * daemon lockfile (`.codegraph/daemon.pid`) records a platform-specific pid and
  20. * socket path (a Windows named pipe vs a WSL Unix socket), and SQLite file
  21. * locking across the WSL2 ↔ Windows filesystem boundary is unreliable, so two
  22. * daemons sharing one index risks corruption. Setting `CODEGRAPH_DIR=.codegraph-win`
  23. * on one side gives each environment its own index in the same tree.
  24. *
  25. * Read live (not captured at load) so it is both process-accurate and testable.
  26. * An override that isn't a plain directory name — empty, containing a path
  27. * separator, `.`, `..`/traversal, or absolute — is ignored (we keep the
  28. * default) rather than risk writing the index outside the project or into the
  29. * project root itself; we warn once to stderr so the misconfiguration is seen.
  30. */
  31. export function codeGraphDirName(): string {
  32. const raw = process.env.CODEGRAPH_DIR?.trim();
  33. if (!raw) return DEFAULT_CODEGRAPH_DIR;
  34. const invalid =
  35. raw === '.' ||
  36. raw.includes('..') ||
  37. raw.includes('/') ||
  38. raw.includes('\\') ||
  39. path.isAbsolute(raw);
  40. if (invalid) {
  41. if (!warnedBadDirName) {
  42. warnedBadDirName = true;
  43. // stderr only — stdout is the MCP protocol channel.
  44. console.warn(
  45. `[codegraph] Ignoring invalid CODEGRAPH_DIR="${raw}" — it must be a plain ` +
  46. `directory name (no path separators, no "..", not absolute). Using "${DEFAULT_CODEGRAPH_DIR}".`
  47. );
  48. }
  49. return DEFAULT_CODEGRAPH_DIR;
  50. }
  51. return raw;
  52. }
/**
 * CodeGraph directory name — a load-time snapshot of {@link codeGraphDirName}.
 * It is evaluated once, when this module is first loaded, so it equals the
 * live value only as long as `process.env.CODEGRAPH_DIR` is not changed
 * afterwards. It's kept as a stable string export for backward compatibility.
 * Internal code resolves the name through {@link codeGraphDirName} /
 * {@link getCodeGraphDir} so the `CODEGRAPH_DIR` override always applies.
 */
export const CODEGRAPH_DIR = codeGraphDirName();
  61. /**
  62. * Is `name` (a single path segment) a CodeGraph data directory? Matches the
  63. * default `.codegraph`, the active `CODEGRAPH_DIR` override, and any
  64. * `.codegraph-*` sibling. File-watching and the indexer skip ALL of these, so
  65. * when two environments share one working tree (Windows + WSL, issue #636)
  66. * neither indexes or watches the other's index directory.
  67. */
  68. export function isCodeGraphDataDir(name: string): boolean {
  69. return (
  70. name === DEFAULT_CODEGRAPH_DIR ||
  71. name === codeGraphDirName() ||
  72. name.startsWith(DEFAULT_CODEGRAPH_DIR + '-')
  73. );
  74. }
  75. /**
  76. * Get the .codegraph directory path for a project
  77. */
  78. export function getCodeGraphDir(projectRoot: string): string {
  79. return path.join(projectRoot, codeGraphDirName());
  80. }
  81. /**
  82. * Check if a project has been initialized with CodeGraph
  83. * Requires both .codegraph/ directory AND codegraph.db to exist
  84. */
  85. export function isInitialized(projectRoot: string): boolean {
  86. const codegraphDir = getCodeGraphDir(projectRoot);
  87. if (!fs.existsSync(codegraphDir) || !fs.statSync(codegraphDir).isDirectory()) {
  88. return false;
  89. }
  90. // Must have codegraph.db, not just .codegraph folder
  91. const dbPath = path.join(codegraphDir, 'codegraph.db');
  92. return fs.existsSync(dbPath);
  93. }
  94. /**
  95. * Find the nearest parent directory containing .codegraph/
  96. *
  97. * Walks up from the given path to find a CodeGraph-initialized project,
  98. * similar to how git finds .git/ directories.
  99. *
  100. * @param startPath - Directory to start searching from
  101. * @returns The project root containing .codegraph/, or null if not found
  102. */
  103. /**
  104. * Reason a directory is unsafe to use as an index ROOT, or null when it's fine.
  105. *
  106. * Indexing your home directory or a filesystem root drags in caches, `Library`,
  107. * every other project, etc. — a multi-GB index, constant file-watcher churn, and
  108. * (pre-1.0 on macOS) a file-descriptor blowup that exhausted `kern.maxfiles` and
  109. * took unrelated apps / the whole machine down (#845). The classic trigger:
  110. * running the installer or `codegraph init` from `$HOME`, which auto-indexes the
  111. * current directory. These are never intended project roots, so the installer
  112. * and `init`/`index` refuse them (overridable with `--force`).
  113. *
  114. * Pure-ish (reads only `os.homedir()` + realpath) so it's easy to unit-test.
  115. * The returned string is a human phrase that slots into "… looks like {reason}".
  116. */
  117. export function unsafeIndexRootReason(projectRoot: string): string | null {
  118. const resolve = (p: string): string => {
  119. try {
  120. return fs.realpathSync(path.resolve(p));
  121. } catch {
  122. return path.resolve(p);
  123. }
  124. };
  125. const resolved = resolve(projectRoot);
  126. // Filesystem root: `/` on POSIX, a drive root like `C:\` on Windows.
  127. if (path.parse(resolved).root === resolved) {
  128. return 'the filesystem root';
  129. }
  130. const home = resolve(os.homedir());
  131. // Case-insensitive on macOS/Windows (case-preserving but case-insensitive FS).
  132. const norm = (p: string): string =>
  133. process.platform === 'darwin' || process.platform === 'win32' ? p.toLowerCase() : p;
  134. const r = norm(resolved);
  135. const h = norm(home);
  136. if (r === h) {
  137. return 'your home directory';
  138. }
  139. // An ancestor of home (e.g. `/Users`, `/home`) — even broader than home.
  140. if (h.startsWith(r + path.sep)) {
  141. return 'a parent of your home directory';
  142. }
  143. return null;
  144. }
  145. export function findNearestCodeGraphRoot(startPath: string): string | null {
  146. let current = path.resolve(startPath);
  147. const root = path.parse(current).root;
  148. while (current !== root) {
  149. if (isInitialized(current)) {
  150. return current;
  151. }
  152. const parent = path.dirname(current);
  153. if (parent === current) break; // Reached filesystem root
  154. current = parent;
  155. }
  156. // Check root as well
  157. if (isInitialized(current)) {
  158. return current;
  159. }
  160. return null;
  161. }
  162. /** Heavy/irrelevant directory names the sub-project scan never descends into. */
  163. const SUBPROJECT_SCAN_SKIP = new Set([
  164. 'node_modules', '.git', '.svn', '.hg', 'dist', 'build', 'out', 'target',
  165. 'vendor', 'bin', 'obj', '.next', '.nuxt', '.svelte-kit', '.cache', 'coverage',
  166. '.venv', 'venv', '__pycache__', '.turbo', '.idea', '.vscode', 'tmp', 'temp',
  167. ]);
  168. /** Manifests that mark a directory as a project/workspace root. The down-scan
  169. * is gated on one of these so a non-project cwd (e.g. `$HOME`) is a cheap
  170. * no-op instead of a deep filesystem crawl. */
  171. const WORKSPACE_ROOT_MANIFESTS = [
  172. 'package.json', 'pnpm-workspace.yaml', 'lerna.json', 'nx.json', 'turbo.json',
  173. 'go.work', 'go.mod', 'Cargo.toml', 'pom.xml', 'build.gradle', 'build.gradle.kts',
  174. 'settings.gradle', 'pyproject.toml', 'composer.json', 'Gemfile', 'rush.json',
  175. 'WORKSPACE', 'WORKSPACE.bazel',
  176. ];
  177. function looksLikeProjectRoot(dir: string): boolean {
  178. return WORKSPACE_ROOT_MANIFESTS.some((m) => fs.existsSync(path.join(dir, m)));
  179. }
  180. function escapeRegExp(s: string): string {
  181. return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  182. }
  183. /**
  184. * Indexed sub-project roots beneath `root` (bounded breadth-first scan). For
  185. * the monorepo case behind #964: the index lives in a CHILD
  186. * (`packages/x/.codegraph/`), not at the workspace root the agent's cwd points
  187. * at. Descent stops at the first indexed directory on a branch (a project's
  188. * own sub-dirs aren't separate projects) and is bounded by depth + count so it
  189. * never turns into a full-tree crawl on a large repo.
  190. */
  191. export function findIndexedSubprojectRoots(
  192. root: string,
  193. opts: { maxDepth?: number; max?: number } = {},
  194. ): string[] {
  195. const maxDepth = opts.maxDepth ?? 4;
  196. const max = opts.max ?? 64;
  197. const out: string[] = [];
  198. const walk = (dir: string, depth: number): void => {
  199. if (out.length >= max || depth > maxDepth) return;
  200. let entries: fs.Dirent[];
  201. try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { return; }
  202. for (const e of entries) {
  203. if (out.length >= max) return;
  204. if (!e.isDirectory()) continue;
  205. if (e.name.startsWith('.') || SUBPROJECT_SCAN_SKIP.has(e.name)) continue;
  206. const child = path.join(dir, e.name);
  207. if (isInitialized(child)) { out.push(child); continue; } // don't descend into an indexed project
  208. walk(child, depth + 1);
  209. }
  210. };
  211. walk(root, 1);
  212. return out;
  213. }
  214. /**
  215. * Unicode-aware word-boundary emulation for the keyword lists below. JS's `\b`
  216. * is ASCII-only — it fires only at `[A-Za-z0-9_]` edges — so it can never bound
  217. * a keyword whose first or last character is accented or non-Latin: `/\boù\b/`
  218. * NEVER matches "où est …" (ù isn't an ASCII word char, so no boundary exists
  219. * next to it). That is the #994 CJK mechanism resurfaced for Latin scripts and
  220. * Cyrillic (#1126). A lookaround — "not flanked by a letter, digit, or
  221. * underscore" — is the script-independent equivalent.
  222. */
  223. const NOT_WORD_BEFORE = /(?<![\p{L}\p{N}_])/u.source;
  224. const NOT_WORD_AFTER = /(?![\p{L}\p{N}_])/u.source;
/**
 * Structural keywords matched as EXACT words (boundary on both sides): short
 * or ambiguous tokens where prefix matching would false-positive ("flow" in
 * "flower", "path" in "pathological"). Grouped by language; a term appears once
 * even when several languages share it ("como" is Portuguese for how AND
 * unaccented-typed Spanish "cómo").
 *
 * Each entry is a regular-expression FRAGMENT, not a literal: the list is
 * joined with `|` into one alternation when the matcher is built, which is why
 * entries such as `'flows?'` or `'reach(?:es|ed)?'` can carry regex syntax.
 * Any new entry must therefore be regex-safe.
 */
const STRUCTURAL_WORDS = [
  // English — the pre-#1126 list minus what moved to STRUCTURAL_STEMS: the
  // bare-stem entries never matched their own derived forms (`\barchitect\b`
  // can't match "architecture"), and "what calls" is subsumed by the "call" stem.
  'how', 'where', 'tracing', 'flows?', 'paths?', 'reach(?:es|ed)?', 'wired?', 'breaks?', 'why does',
  // French (où=where, flux=flow, chemin=path, casse=breaks)
  'comment', 'où', 'flux', 'chemins?', 'casse',
  // Spanish (cómo/como=how, dónde/donde=where, flujo=flow, ruta/camino=path,
  // rompe=breaks, llaman / quién llama = call(s) — bare "llama" is excluded:
  // it's also the animal/model name in English prompts)
  'cómo', 'dónde', 'donde', 'flujos?', 'rutas?', 'caminos?', 'rompe', 'llaman', 'quién llama', 'quien llama',
  // Portuguese (como=how — also covers unaccented Spanish; onde=where,
  // fluxo=flow, caminho=path)
  'como', 'onde', 'fluxos?', 'caminhos?',
  // German (wie=how, wo/woher/wohin=where, Pfad=path, Fluss/Ablauf=flow,
  // bricht/kaputt=breaks, ruft=calls, hängt=depends — "hängt … von X ab"
  // splits the separable verb "abhängen", so the "abhäng" stem can't catch it)
  'wie', 'wo', 'woher', 'wohin', 'pfade?', 'fluss', 'ablauf', 'bricht', 'kaputt', 'ruft', 'hängt',
  // Italian (dove=where, flusso=flow, percorso/i=path)
  'dove', 'flusso', 'percors[oi]',
  // Russian (как=how, где=where, путь/пути=path, работает=works)
  'как', 'где', 'путь', 'пути', 'работает',
  // Ukrainian (як=how, де=where, потік=flow — oblique cases reuse the RU
  // "поток" stem; працює=works)
  'як', 'де', 'потік', 'працює',
  // Dutch (hoe=how, waar=where, roept=calls, werkt=works, aangeroepen=called —
  // the ge- participle escapes the "aanroep" stem)
  'hoe', 'waar', 'roept', 'werkt', 'aangeroepen',
  // Polish + Czech (jak=how — shared; gdzie/kde=where, cesta=path)
  'jak', 'gdzie', 'kde', 'cesta',
  // Romanian (cum=how, unde=where; flux is shared with French)
  'cum', 'unde',
  // Hungarian (hogyan=how, hol=where)
  'hogyan', 'hol',
  // Turkish (nasıl=how, mimari=architecture, takip=trace/follow)
  'nasıl', 'mimari', 'takip',
  // Indonesian/Malay (bagaimana=how, di mana/dimana=where, alur=flow, jalur=path)
  'bagaimana', 'di mana', 'dimana', 'alur', 'jalur',
  // Vietnamese — spaced Latin with heavy diacritics, the exact class ASCII `\b`
  // breaks (làm sao/thế nào=how, ở đâu=where, gọi=call, phụ thuộc=depend,
  // ảnh hưởng=affect, kiến trúc=architecture, cấu trúc=structure, luồng=flow,
  // đường dẫn=path, hoạt động=works, giải thích=explain, theo dõi=trace)
  'làm sao', 'thế nào', 'ở đâu', 'gọi', 'phụ thuộc', 'ảnh hưởng', 'kiến trúc',
  'cấu trúc', 'luồng', 'đường dẫn', 'hoạt động', 'giải thích', 'theo dõi',
  // Swedish / Danish / Norwegian (hur/hvordan=how, hvor=where, beror=depends,
  // flöde=flow)
  'hur', 'hvordan', 'hvor', 'beror', 'flöde',
  // Finnish (miten=how, missä=where, toimii=works)
  'miten', 'missä', 'toimii',
  // Greek (πώς=how, πού=where — accented forms only: unaccented πως/που are
  // ubiquitous conjunctions; καλεί=calls, δομή=structure, ροή=flow)
  'πώς', 'πού', 'καλεί', 'δομή', 'ροή',
  // Hindi (कैसे=how, कहाँ/कहां=where, कॉल=call, निर्भर=depends,
  // संरचना=structure, प्रवाह=flow)
  'कैसे', 'कहाँ', 'कहां', 'कॉल', 'निर्भर', 'संरचना', 'प्रवाह',
];
/**
 * Structural keyword STEMS matched as word PREFIXES (boundary on the left
 * only), so derived forms match without enumerating each: "architect" fires on
 * architecture/architectural, "depend" on depends/dependency/dependencies,
 * "вызыва" on вызывает/вызывается. Mid-word occurrences stay excluded —
 * "restructure"/"independent" don't fire — so precision stays close to the
 * exact-word class. Add a stem only when every plausible completion is still a
 * structural word; a stem with ordinary-English completions must instead
 * enumerate its structural suffixes and re-assert the right boundary (see the
 * four bounded English entries below, #1138).
 *
 * Like the exact-word list, every entry is a regular-expression FRAGMENT that
 * is joined with `|` into one alternation, so entries must be regex-safe; the
 * four bounded entries rely on this to embed their own suffix group and
 * `NOT_WORD_AFTER` lookahead.
 */
const STRUCTURAL_STEMS = [
  // English + the Latin-script languages that share the spelling (French
  // architecture/structure/trace/impact, Spanish depende/implementa/impacto, …).
  // call/trace/affect/connect are NOT safe as open prefixes — callus,
  // calligraphy, Connecticut, connective, affectionate, Tracey are ordinary
  // words that would false-fire the full-explore tier (#1138) — so they carry
  // an enumerated suffix set + right boundary. "tracing" lives in
  // STRUCTURAL_WORDS (the e is dropped, so no trace-prefix form matches it).
  'architect', 'structur', 'depend', 'implement', 'impact', 'explain',
  `call(?:s|ing|ed|ers?|backs?|able|sites?)?${NOT_WORD_AFTER}`,
  `trace(?:s|d|rs?)?${NOT_WORD_AFTER}`,
  `affect(?:s|ed|ing)?${NOT_WORD_AFTER}`,
  `connect(?:s|ed|ing|ions?|ors?|ivity)?${NOT_WORD_AFTER}`,
  // French (appel(le)=call, dépend=depends, implément(e)=implement,
  // connex(ion)=connection, expliqu(e)=explain, fonctionn(e/ement)=works)
  'appel', 'dépend', 'implément', 'connex', 'expliqu', 'fonctionn',
  // Spanish (llamad(a)=call, afect(a)=affect, conect(a)/conexi(ón)=connect,
  // arquitec(tura)=architecture, estructur(a)=structure, funcion(a)=works,
  // traza(r)=trace, explica=explain)
  'llamad', 'afect', 'conect', 'conexi', 'arquitec', 'estructur', 'funcion', 'traza', 'explica',
  // Portuguese (chama(da)=call, afeta=affect, arquitet(ura)=architecture,
  // estrutur(a)=structure, quebra(do)=breaks)
  'chama', 'afeta', 'arquitet', 'estrutur', 'quebra',
  // German (abhäng(t)=depend, Auswirkung=impact, beeinfluss(t)=affect,
  // verbind(et)=connect, Architektur, Struktur, funktionier(t)=works,
  // Aufruf/aufgerufen=call, erklär(t)=explain, verfolg(en)=trace)
  'abhäng', 'auswirkung', 'beeinfluss', 'verbind', 'architekt', 'struktur', 'funktionier', 'aufruf', 'aufgerufen', 'erklär', 'verfolg',
  // Italian (chiam(a/ata)=call, dipend(e/enza)=depend, impatt(o)=impact,
  // connett(e)/conness(ione)=connect, architett(ura), struttur(a),
  // funzion(a/amento)=works, tracci(a)=trace, spiega(mi)=explain)
  'chiam', 'dipend', 'impatt', 'connett', 'conness', 'architett', 'struttur', 'funzion', 'tracci', 'spiega',
  // Russian (вызыва(ет)=calls, завис(ит)=depends, влия(ет)=affects,
  // реализ(ация)=implementation, структур(а), архитектур(а),
  // трассир(овка)=trace, лома(ет)=breaks, объясн(и)=explain, поток=flow)
  'вызыва', 'завис', 'влия', 'реализ', 'структур', 'архитектур', 'трассир', 'лома', 'объясн', 'поток',
  // Ukrainian — і/и spellings diverge from Russian (виклика(є)=calls,
  // залеж(ить)=depends, вплива(є)=affects, архітектур(а), реаліз(ація),
  // поясн(и)=explain, шлях(у)=path; структур(а) is shared with Russian)
  'виклика', 'залеж', 'вплива', 'архітектур', 'реаліз', 'поясн', 'шлях',
  // Dutch (aanroep(en)=call, afhankelijk(heid)=depends, beïnvloed(t)=affects,
  // structuur — "structur" can't reach the uu; uitleg(gen)=explain)
  'aanroep', 'afhankelijk', 'beïnvloed', 'structuur', 'uitleg',
  // Polish (wywoł(uje)=calls, zależ(y)=depends, wpływ(a)=affects/impact,
  // przepływ=flow, ścieżk(a)=path, działa(nie)=works, wyjaśni(j)=explain,
  // śledz(enie)=trace; architektura/struktura fire via the German stems)
  'wywoł', 'zależ', 'wpływ', 'przepływ', 'ścieżk', 'działa', 'wyjaśni', 'śledz',
  // Czech (volá(ní)=calls, závis(í)=depends, ovlivň(uje)=affects,
  // funguj(e)=works, vysvětl(i)=explain)
  'volá', 'závis', 'ovlivň', 'funguj', 'vysvětl',
  // Romanian (apel(ează)=calls, depind(e)=depends — i not e, so "depend" misses
  // it; arhitectur(a) — no c; funcțion(ează)=works, explică=explain)
  'apel', 'depind', 'arhitectur', 'funcțion', 'explică',
  // Hungarian (hív(ja)=calls, függ(őség)=depends, működ(ik)=works,
  // struktúr(a) — ú escapes "struktur"; magyaráz(d)=explain;
  // architektúra fires via the German stem)
  'hív', 'függ', 'működ', 'struktúr', 'magyaráz',
  // Turkish — agglutinative, so stems beat exact words (nere(de/ye/den)=where,
  // çağır/çağrı=call, bağıml(ı)=depends, bağlant(ı)=connection, akış(ı)=flow,
  // etkile(r)/etkisi=affects/impact)
  'nere', 'çağır', 'çağrı', 'bağıml', 'bağlant', 'akış', 'etkile', 'etkisi',
  // Indonesian/Malay — me-/di-/ber- prefixes block a bare stem, so affixed
  // forms are listed too (panggil(an)/memanggil/dipanggil=call,
  // bergantung/tergantung=depends, pengaruh/mempengaruhi/memengaruhi=affect,
  // arsitektur=architecture, fungsi/berfungsi=works,
  // jelaskan/menjelaskan=explain)
  'panggil', 'memanggil', 'dipanggil', 'bergantung', 'tergantung', 'pengaruh',
  'mempengaruhi', 'memengaruhi', 'arsitektur', 'fungsi', 'berfungsi', 'jelaskan', 'menjelaskan',
  // Swedish / Danish / Norwegian (anrop(ar)=calls, påverk(ar)/påvirk(er)=affects,
  // afhæng(er)/avheng(er)=depends, förklar(a)/forklar=explain,
  // arkitektur — k not ch; funger(ar/er)=works)
  'anrop', 'påverk', 'påvirk', 'afhæng', 'avheng', 'förklar', 'forklar', 'arkitektur', 'funger',
  // Finnish (kutsu(u)=calls, riippu(u)=depends, arkkitehtuur(i),
  // rakente(en)=structure, selit(ä)=explain)
  'kutsu', 'riippu', 'arkkitehtuur', 'rakente', 'selit',
  // Greek — accented and unaccented stem spellings both occur
  // (εξαρτ(άται)=depends, επηρε(άζει)=affects, αρχιτεκτονικ(ή),
  // διαδρομ(ή)=path, εξηγ/εξήγ(ησε)=explain)
  'εξαρτ', 'επηρε', 'αρχιτεκτονικ', 'διαδρομ', 'εξηγ', 'εξήγ',
  // Hindi (समझा(ओ/इए)=explain, आर्किटेक्चर=architecture)
  'समझा', 'आर्किटेक्चर',
];
  380. const STRUCTURAL_WORDS_RE = new RegExp(`${NOT_WORD_BEFORE}(?:${STRUCTURAL_WORDS.join('|')})${NOT_WORD_AFTER}`, 'iu');
  381. const STRUCTURAL_STEMS_RE = new RegExp(`${NOT_WORD_BEFORE}(?:${STRUCTURAL_STEMS.join('|')})`, 'iu');
/**
 * Structural keywords matched as bare SUBSTRINGS, for languages where a
 * boundary can't be relied on: scripts with no word separators (Chinese —
 * simplified AND traditional; the original #994 set was simplified-only —
 * Japanese, Thai), Korean (spaced, but particles attach directly to the noun:
 * 구조가/구조를), and Arabic / Farsi / Hebrew (spaced, but proclitics attach to
 * the word: وكيف "and-how", והמבנה "and-the-structure"). JS's `\b` can never
 * fire between Han characters, which was issue #994: the English-only gate
 * silently no-op'd every Chinese prompt, so non-English users got no front-load
 * nudge and no error to explain why. The sets mirror the English intent
 * (如何/怎么/怎麼/どうやって/どのように/어떻게/كيف/چگونه/چطور/איך/อย่างไร/ยังไง=how,
 * 在哪/哪里/哪裡/어디/أين/كجا/איפה/ที่ไหน=where, 流程/流向/流れ/흐름/تدفق/זרימה=flow,
 * 路径/路徑/経路/경로/مسار/مسیر/נתיב/เส้นทาง=path,
 * 调用/調用/呼び出/호출/يستدعي/استدعاء/فراخوان/קורא/เรียกใช้=call,
 * 依赖/依賴/依存/의존/يعتمد/تعتمد/وابسته/תלוי/ขึ้นอยู่กับ=depend,
 * 影响/影響/영향/يؤثر/تأثير/تأثیر/משפיע/ผลกระทบ=impact/affect,
 * 实现/實現/実装/구현=implement,
 * 架构/架構/アーキテクチャ/아키텍처/معماري/معماری/ארכיטקטור/สถาปัตยกรรม=architecture,
 * 结构/結構/構造/구조/بنية/هيكل/ساختار/מבנה/โครงสร้าง=structure,
 * 追踪/跟踪/追蹤/追跡/トレース/추적/تتبع/ติดตาม=trace,
 * يعمل/تعمل/ทำงาน=works) plus structural-overview words with no single clean
 * English equivalent (介绍/介紹/解析/分析/原理/机制/機制/仕組み/説明/설명/動作/동작/작동/
 * اشرح/شرح/توضیح/הסבר/อธิบาย=explain).
 *
 * KNOWN, ACCEPTED false-positive class (#1140): substring matching cannot see
 * homograph compounds — Korean 구조 (structure) also fires inside 구조대
 * (rescue squad). Verified unfixable at this layer: ICU word segmentation
 * (Intl.Segmenter) returns 구조대 and the particle form 구조가 (which the gate
 * MUST keep matching) as equally opaque single segments, and a 구조대 denylist
 * would break 구조대로 ("according to the structure" — 구조 + the 대로
 * particle), a legitimate structural prompt. The miss rate this design avoids
 * (silently no-op'ing every prompt in these languages, #994) outweighs the
 * occasional off-domain fire.
 *
 * The pattern is a plain alternation with no flags: there are no boundary
 * assertions (a hit anywhere in the prompt counts), and without the `g` flag
 * `.test()` keeps no state between calls.
 */
const STRUCTURAL_UNSEGMENTED = /如何|怎么|怎麼|在哪|哪里|哪裡|追踪|跟踪|追蹤|追跡|トレース|流程|流向|流れ|路径|路徑|経路|调用|調用|呼び出|依赖|依賴|依存|影响|影響|实现|實現|実装|架构|架構|アーキテクチャ|结构|結構|構造|介绍|介紹|解析|分析|原理|机制|機制|仕組み|説明|動作|どうやって|どのように|어떻게|어디|호출|흐름|경로|의존|영향|구현|구조|아키텍처|추적|동작|작동|설명|كيف|أين|اين|يستدعي|استدعاء|يعتمد|تعتمد|يؤثر|تأثير|معماري|بنية|هيكل|تدفق|مسار|تتبع|يعمل|تعمل|اشرح|شرح|چگونه|چطور|کجا|فراخوان|وابسته|تأثیر|معماری|ساختار|مسیر|توضیح|איך|איפה|קורא|תלוי|משפיע|ארכיטקטור|מבנה|זרימה|נתיב|הסבר|อย่างไร|ยังไง|ที่ไหน|เรียกใช้|ขึ้นอยู่กับ|ผลกระทบ|สถาปัตยกรรม|โครงสร้าง|เส้นทาง|ติดตาม|ทำงาน|อธิบาย/;
/**
 * Doc/data/asset file extensions — a `name.ext` of this kind is a file
 * reference, not a code symbol, so it must not trip the member-access signal.
 * The pattern is anchored at the end (`$`) and case-insensitive, so it is meant
 * to be tested against a single `name.ext` candidate rather than a whole prompt.
 */
const DOC_DATA_EXT = /\.(md|markdown|txt|rst|json|ya?ml|toml|lock|csv|tsv|log|ini|cfg|conf|env|xml|html?|png|jpe?g|gif|svg|pdf)$/i;
  420. /**
  421. * Does `prompt` contain an explicit structural keyword? A keyword is a strong,
  422. * self-contained signal, so the front-load hook fires on it directly — no graph
  423. * check needed. (A *code-token* match, by contrast, is only a candidate the
  424. * hook verifies against the graph first; see {@link extractCodeTokens}.)
  425. * Coverage is multilingual (#994, #1126): the ~29 languages with the largest
  426. * developer populations, across Latin, Cyrillic, Greek, CJK, Hangul, Arabic,
  427. * Hebrew, Thai, and Devanagari scripts. Languages beyond the keyword table
  428. * still fire through the language-agnostic code-token path.
  429. */
  430. export function hasStructuralKeyword(prompt: string): boolean {
  431. return (
  432. !!prompt &&
  433. (STRUCTURAL_WORDS_RE.test(prompt) || STRUCTURAL_STEMS_RE.test(prompt) || STRUCTURAL_UNSEGMENTED.test(prompt))
  434. );
  435. }
  436. /**
  437. * Identifier-shaped tokens in `prompt` — camelCase / PascalCase-with-inner-cap,
  438. * snake_case, a `name(` call, or the two sides of an `a.b` member access. Naming
  439. * a symbol is a code question whatever the surrounding human language, and these
  440. * shapes almost never occur in ordinary prose, so they catch the common
  441. * "<symbol> 的调用链?" / "where is <symbol> 定義" prompts no keyword list would.
  442. *
  443. * These are *candidates*, not a verdict: a tech brand like `JavaScript` or
  444. * `GitHub` is identifier-shaped too, so the front-load hook checks each token
  445. * against the actual index ({@link getNodesByName}) and only fires when one is a
  446. * real symbol here — otherwise a brand-name prompt would inject ~16KB of
  447. * low-relevance context (issue #994 follow-up). A doc/data filename ("README.md")
  448. * is excluded from the member-access form since it's a file reference, not a symbol.
  449. */
  450. export function extractCodeTokens(prompt: string): string[] {
  451. if (!prompt) return [];
  452. const out = new Set<string>();
  453. // camelCase / PascalCase-with-inner-cap (getUserId, parseToken, UserService) or
  454. // snake_case (article_publish, get_user) — a whole identifier run that has an
  455. // inner lower→upper transition or an underscore flanked by alphanumerics.
  456. for (const m of prompt.matchAll(/[A-Za-z_$][\w$]*/g)) {
  457. const w = m[0];
  458. if (/[a-z][A-Z]/.test(w) || /[A-Za-z0-9]_[A-Za-z0-9]/.test(w)) out.add(w);
  459. }
  460. // call form: an identifier directly before '(' — parseToken(, render(). No
  461. // whitespace before '(' so prose like "the function (entry point)" doesn't trip it.
  462. for (const m of prompt.matchAll(/([A-Za-z_$][\w$]*)\(/g)) out.add(m[1]!);
  463. // member access on identifiers (user.login) — but not a doc/data filename.
  464. for (const m of prompt.matchAll(/([A-Za-z_$][\w$]*)\.([A-Za-z_$][\w$]*)/g)) {
  465. if (!DOC_DATA_EXT.test(m[0])) { out.add(m[1]!); out.add(m[2]!); }
  466. }
  467. return [...out];
  468. }
  469. /**
  470. * Cheap, graph-free candidate gate for the front-load hook: could `prompt` be a
  471. * structural / flow / impact / "where-how" question worth front-loading context
  472. * for? True on an explicit keyword in any covered language (#994, #1126) OR an
  473. * identifier-shaped token. A keyword is sufficient to fire on its own; a
  474. * token-only match is only a candidate the hook then verifies against the graph
  475. * (a brand name like `JavaScript` is token-shaped but isn't a symbol). Every
  476. * non-candidate prompt ("fix this typo", in any language) stays a zero-cost no-op.
  477. */
  478. export function isStructuralPrompt(prompt: string): boolean {
  479. return hasStructuralKeyword(prompt) || extractCodeTokens(prompt).length > 0;
  480. }
/**
 * What the front-load hook should do for a prompt issued from a directory.
 * Produced by `planFrontload`; the "do nothing" plan is
 * `{ exploreRoot: null, nudgeProjects: [], viaSubScan: false }`.
 */
export interface FrontloadPlan {
  /** Open + explore this project and inject its source as context. `null` when
   * there's no single project to front-load (none indexed, or several indexed
   * sub-projects with no clear match — see {@link FrontloadPlan.nudgeProjects}). */
  exploreRoot: string | null;
  /** Indexed sub-projects to surface in a "pass `projectPath`" nudge: the rest
   * of a monorepo's indexed projects alongside `exploreRoot`, or — when no one
   * project clearly matches — the full list (with `exploreRoot` null). Empty
   * when there is nothing to nudge about. */
  nudgeProjects: string[];
  /** True when the plan came from scanning DOWN into sub-projects (cwd itself
   * is not under any index) — the monorepo case, where a follow-up
   * `codegraph_explore` needs an explicit `projectPath`. */
  viaSubScan: boolean;
}
  498. /**
  499. * Decide what the front-load hook injects for a `prompt` issued from `cwd`,
  500. * shaped by where the `.codegraph/` index(es) actually are:
  501. * 1. **cwd (or an ancestor) is indexed** → front-load that project. The
  502. * normal single-project / nested-file case.
  503. * 2. **cwd isn't indexed but looks like a workspace root** → the indexes live
  504. * in sub-projects (the monorepo case behind #964). One indexed
  505. * sub-project → front-load it; several → front-load the one the prompt
  506. * names (by relative path like `packages/api`, or package directory name)
  507. * and nudge about the rest; several with no match → nudge the full list so
  508. * the agent passes `projectPath`, rather than guessing wrong.
  509. * 3. **nothing indexed reachable** → do nothing (the agent's own tools apply).
  510. */
  511. export function planFrontload(cwd: string, prompt: string): FrontloadPlan {
  512. const none: FrontloadPlan = { exploreRoot: null, nudgeProjects: [], viaSubScan: false };
  513. // 1. up-walk — nearest indexed ancestor (incl. cwd). Cheap; covers the common
  514. // single-project case without a down-scan.
  515. let dir = path.resolve(cwd);
  516. for (let i = 0; i < 6; i++) {
  517. if (isInitialized(dir)) return { exploreRoot: dir, nudgeProjects: [], viaSubScan: false };
  518. const parent = path.dirname(dir);
  519. if (parent === dir) break;
  520. dir = parent;
  521. }
  522. // 2. down-scan — only from something that looks like a workspace root, so a
  523. // non-project cwd (e.g. $HOME) is a cheap no-op, not a deep crawl.
  524. const base = path.resolve(cwd);
  525. if (!looksLikeProjectRoot(base)) return none;
  526. const subs = findIndexedSubprojectRoots(base);
  527. if (subs.length === 0) return none;
  528. if (subs.length === 1) return { exploreRoot: subs[0]!, nudgeProjects: [], viaSubScan: true };
  529. // Several indexed sub-projects — pick the one the prompt points at, if any.
  530. const p = prompt.toLowerCase();
  531. let best: { root: string; score: number; relLen: number } | null = null;
  532. for (const s of subs) {
  533. const rel = path.relative(base, s);
  534. const relLc = rel.split(path.sep).join('/').toLowerCase();
  535. const name = path.basename(s).toLowerCase();
  536. let score = 0;
  537. if (relLc && p.includes(relLc)) score = 10; // "packages/api"
  538. else if (name.length >= 3 && new RegExp(`\\b${escapeRegExp(name)}\\b`).test(p)) score = 5; // "api"
  539. if (score > 0 && (!best || score > best.score || (score === best.score && rel.length < best.relLen))) {
  540. best = { root: s, score, relLen: rel.length };
  541. }
  542. }
  543. if (best) {
  544. return { exploreRoot: best.root, nudgeProjects: subs.filter((s) => s !== best!.root), viaSubScan: true };
  545. }
  546. // No clear match — nudge the full list rather than front-load a guess.
  547. return { exploreRoot: null, nudgeProjects: subs, viaSubScan: true };
  548. }
  549. /**
  550. * Contents of `.codegraph/.gitignore`. A single wildcard ignore keeps every
  551. * transient file in the index dir — the database, `daemon.pid`, the socket,
  552. * logs, cache, and anything future versions add — out of git, without having
  553. * to enumerate each name (issues #788, #492, #484). Older versions wrote an
  554. * explicit allowlist that never listed `daemon.pid` or the socket, so those
  555. * runtime files were silently committed.
  556. */
  557. const GITIGNORE_CONTENT = `# CodeGraph data files — local to each machine, not for committing.
  558. # Ignore everything in .codegraph/ except this file itself, so transient
  559. # files (the database, daemon.pid, sockets, logs) never show up in git.
  560. *
  561. !.gitignore
  562. `;
  563. /** Header line that prefixes every .gitignore CodeGraph has auto-generated. */
  564. const GITIGNORE_MARKER = '# CodeGraph data files';
  565. /**
  566. * Is `content` a stale CodeGraph-generated `.gitignore` that should be
  567. * regenerated in place? True when it carries our header but predates the
  568. * wildcard ignore (it has no bare `*` line) — i.e. one of the old explicit
  569. * allowlists (`*.db`, `cache/`, `.dirty`, …) that never ignored `daemon.pid`
  570. * or the socket (issue #788). A file WITHOUT our header is user-authored and
  571. * is left untouched; one that already has the wildcard is current. Matching
  572. * on the header (not a byte-exact list of past defaults) heals every old
  573. * variant — v0.7.x through 0.9.9 — and is idempotent once upgraded.
  574. */
  575. function isStaleDefaultGitignore(content: string): boolean {
  576. if (!content.trimStart().startsWith(GITIGNORE_MARKER)) return false;
  577. return !content.split('\n').some((line) => line.trim() === '*');
  578. }
  579. /**
  580. * Write `.codegraph/.gitignore` if it's absent, or upgrade a stale
  581. * CodeGraph-generated default in place; a user-customized file is left alone.
  582. * Best-effort — returns `false` only if a needed write failed.
  583. */
  584. function ensureGitignore(gitignorePath: string): boolean {
  585. let existing: string | null;
  586. try {
  587. existing = fs.readFileSync(gitignorePath, 'utf-8');
  588. } catch {
  589. existing = null; // absent (ENOENT) or unreadable — (re)create below
  590. }
  591. // Current default or a user-authored file: nothing to do.
  592. if (existing !== null && !isStaleDefaultGitignore(existing)) return true;
  593. try {
  594. fs.writeFileSync(gitignorePath, GITIGNORE_CONTENT, 'utf-8');
  595. return true;
  596. } catch {
  597. return false;
  598. }
  599. }
  600. /**
  601. * Create the .codegraph directory structure
  602. * Note: Only throws if codegraph.db already exists, not just if .codegraph/ exists.
  603. */
  604. export function createDirectory(projectRoot: string): void {
  605. const codegraphDir = getCodeGraphDir(projectRoot);
  606. const dbPath = path.join(codegraphDir, 'codegraph.db');
  607. // Only throw if CodeGraph is actually initialized (db exists)
  608. // .codegraph/ folder alone is fine
  609. if (fs.existsSync(dbPath)) {
  610. throw new Error(`CodeGraph already initialized in ${projectRoot}`);
  611. }
  612. // Create main directory (if it doesn't exist)
  613. fs.mkdirSync(codegraphDir, { recursive: true });
  614. // Write .gitignore inside .codegraph (create if absent, upgrade a stale
  615. // pre-wildcard default left by an older version — issue #788).
  616. ensureGitignore(path.join(codegraphDir, '.gitignore'));
  617. }
  618. /**
  619. * Remove the .codegraph directory
  620. */
  621. export function removeDirectory(projectRoot: string): void {
  622. const codegraphDir = getCodeGraphDir(projectRoot);
  623. if (!fs.existsSync(codegraphDir)) {
  624. return;
  625. }
  626. // Verify .codegraph is a real directory, not a symlink pointing elsewhere
  627. const lstat = fs.lstatSync(codegraphDir);
  628. if (lstat.isSymbolicLink()) {
  629. // Only remove the symlink itself, never follow it for recursive delete
  630. fs.unlinkSync(codegraphDir);
  631. return;
  632. }
  633. if (!lstat.isDirectory()) {
  634. // Not a directory - remove the single file
  635. fs.unlinkSync(codegraphDir);
  636. return;
  637. }
  638. // Recursively remove directory
  639. fs.rmSync(codegraphDir, { recursive: true, force: true });
  640. }
  641. /**
  642. * Get all files in the .codegraph directory
  643. */
  644. export function listDirectoryContents(projectRoot: string): string[] {
  645. const codegraphDir = getCodeGraphDir(projectRoot);
  646. if (!fs.existsSync(codegraphDir)) {
  647. return [];
  648. }
  649. const files: string[] = [];
  650. function walkDir(dir: string, prefix: string = ''): void {
  651. const entries = fs.readdirSync(dir, { withFileTypes: true });
  652. for (const entry of entries) {
  653. const relativePath = prefix ? `${prefix}/${entry.name}` : entry.name;
  654. // Skip symlinks to prevent following links outside .codegraph
  655. if (entry.isSymbolicLink()) {
  656. continue;
  657. }
  658. if (entry.isDirectory()) {
  659. walkDir(path.join(dir, entry.name), relativePath);
  660. } else {
  661. files.push(relativePath);
  662. }
  663. }
  664. }
  665. walkDir(codegraphDir);
  666. return files;
  667. }
  668. /**
  669. * Get the total size of the .codegraph directory in bytes
  670. */
  671. export function getDirectorySize(projectRoot: string): number {
  672. const codegraphDir = getCodeGraphDir(projectRoot);
  673. if (!fs.existsSync(codegraphDir)) {
  674. return 0;
  675. }
  676. let totalSize = 0;
  677. function walkDir(dir: string): void {
  678. const entries = fs.readdirSync(dir, { withFileTypes: true });
  679. for (const entry of entries) {
  680. // Skip symlinks to prevent following links outside .codegraph
  681. if (entry.isSymbolicLink()) {
  682. continue;
  683. }
  684. const fullPath = path.join(dir, entry.name);
  685. if (entry.isDirectory()) {
  686. walkDir(fullPath);
  687. } else {
  688. const stats = fs.statSync(fullPath);
  689. totalSize += stats.size;
  690. }
  691. }
  692. }
  693. walkDir(codegraphDir);
  694. return totalSize;
  695. }
  696. /**
  697. * Ensure a subdirectory exists within .codegraph
  698. */
  699. export function ensureSubdirectory(projectRoot: string, subdirName: string): string {
  700. if (subdirName.includes('..') || subdirName.includes(path.sep) || subdirName.includes('/')) {
  701. throw new Error(`Invalid subdirectory name: ${subdirName}`);
  702. }
  703. const subdirPath = path.join(getCodeGraphDir(projectRoot), subdirName);
  704. if (!fs.existsSync(subdirPath)) {
  705. fs.mkdirSync(subdirPath, { recursive: true });
  706. }
  707. return subdirPath;
  708. }
  709. /**
  710. * Check if the .codegraph directory has valid structure
  711. */
  712. export function validateDirectory(projectRoot: string): {
  713. valid: boolean;
  714. errors: string[];
  715. } {
  716. const errors: string[] = [];
  717. const codegraphDir = getCodeGraphDir(projectRoot);
  718. if (!fs.existsSync(codegraphDir)) {
  719. errors.push('CodeGraph directory does not exist');
  720. return { valid: false, errors };
  721. }
  722. if (!fs.statSync(codegraphDir).isDirectory()) {
  723. errors.push('.codegraph exists but is not a directory');
  724. return { valid: false, errors };
  725. }
  726. // Auto-repair / upgrade .gitignore (non-critical file). A missing one is
  727. // recreated; a stale pre-wildcard default that never ignored daemon.pid is
  728. // regenerated in place (issue #788); a user-authored file is left alone.
  729. const gitignorePath = path.join(codegraphDir, '.gitignore');
  730. const existedBefore = fs.existsSync(gitignorePath);
  731. if (!ensureGitignore(gitignorePath) && !existedBefore) {
  732. // Only a missing-and-uncreatable file is surfaced; a failed in-place
  733. // upgrade of an existing file is non-fatal — the index still works.
  734. errors.push('.gitignore missing in .codegraph directory and could not be created');
  735. }
  736. return {
  737. valid: errors.length === 0,
  738. errors,
  739. };
  740. }