check-grammar.mjs 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. #!/usr/bin/env node
  2. // Verify a tree-sitter grammar wasm is HEALTHY under the project's web-tree-sitter
  3. // runtime BEFORE writing an extractor. Prints the ABI version and parses a valid
  4. // sample many times in a multi-grammar context, to catch heap-corruption bugs
  5. // that silently drop nodes on every parse after the first.
  6. //
  7. // Why this exists: the tree-sitter-wasms Lua grammar is ABI 13 and corrupts the
  8. // shared WASM heap under web-tree-sitter 0.25 — Lua extraction degraded on every
  9. // file after the first (nested calls/imports vanished). The fix was to vendor the
  10. // upstream ABI-15 wasm. Run this on any new grammar first; if it FAILs, vendor a
  11. // newer build instead of using the tree-sitter-wasms one.
  12. //
  13. // Usage: node scripts/add-lang/check-grammar.mjs <lang|wasm-path> <valid-sample> [iterations]
  14. // Exit: 0 healthy, 1 corruption / parse errors, 2 could not run.
  15. // NOTE: the sample must be SYNTACTICALLY VALID — a broken sample fails for the
  16. // wrong reason.
  17. import { readFileSync, existsSync } from 'node:fs';
  18. import { createRequire } from 'node:module';
  19. import { Parser, Language } from 'web-tree-sitter';
  20. const require = createRequire(import.meta.url);
  21. const fail = (code, msg) => { console.error(`[check-grammar] ${msg}`); process.exit(code); };
  22. const [token, sample, iterArg] = process.argv.slice(2);
  23. if (!token || !sample) fail(2, 'usage: check-grammar.mjs <lang|wasm-path> <valid-sample> [iterations]');
  24. if (!existsSync(sample)) fail(2, `sample not found: ${sample}`);
  25. const iters = iterArg ? parseInt(iterArg, 10) : 20;
  26. const SPECIAL = { csharp: 'c_sharp', 'c#': 'c_sharp' };
  27. function resolveWasm(t) {
  28. if (t.endsWith('.wasm')) return existsSync(t) ? t : fail(2, `wasm not found: ${t}`);
  29. const base = SPECIAL[t.toLowerCase()] ?? t.toLowerCase();
  30. try { return require.resolve(`tree-sitter-wasms/out/tree-sitter-${base}.wasm`); } catch { /* try vendored */ }
  31. const vendored = `src/extraction/wasm/tree-sitter-${base}.wasm`;
  32. if (existsSync(vendored)) return vendored;
  33. return fail(2, `no grammar for "${t}" — not in tree-sitter-wasms and not vendored`);
  34. }
  35. const wasmPath = resolveWasm(token);
  36. const source = readFileSync(sample, 'utf8');
  37. try { await Parser.init(); }
  38. catch { await Parser.init({ locateFile: () => require.resolve('web-tree-sitter/tree-sitter.wasm') }); }
  39. // Load a second, known-good grammar — the corruption surfaces under the
  40. // multi-grammar runtime that real indexing uses, not a single grammar in isolation.
  41. try { await Language.load(require.resolve('tree-sitter-wasms/out/tree-sitter-python.wasm')); } catch { /* ok */ }
  42. let language;
  43. try { language = await Language.load(wasmPath); }
  44. catch (e) { fail(2, `failed to load ${wasmPath}: ${e.message}`); }
  45. const parser = new Parser();
  46. parser.setLanguage(language);
  47. let ok = 0, err = 0;
  48. for (let i = 0; i < iters; i++) {
  49. const tree = parser.parse(source);
  50. if (tree.rootNode.hasError) err++; else ok++;
  51. }
  52. console.log(`grammar: ${wasmPath.split('/').pop()}`);
  53. console.log(` ABI version: ${language.abiVersion}`);
  54. console.log(` parses: ${ok} clean / ${err} with errors (of ${iters})`);
  55. if (err > 0) {
  56. console.log(
  57. `RESULT: FAIL — ${err}/${iters} parses produced ERROR trees on a valid sample. ` +
  58. `This grammar corrupts under web-tree-sitter; vendor a newer (ABI 14/15) wasm ` +
  59. `(see SKILL.md "Find a grammar"). Confirm your sample is syntactically valid first.`
  60. );
  61. process.exit(1);
  62. }
  63. console.log('RESULT: PASS — grammar parses cleanly and reuses safely.');
  64. process.exit(0);