1
0

dump-ast.mjs 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. #!/usr/bin/env node
  2. // Dump the tree-sitter AST for a sample file so you can write a LanguageExtractor
  3. // mapping. Loads a grammar .wasm directly via web-tree-sitter (the same runtime
  4. // codegraph uses) — you do NOT need to register the language first.
  5. //
  6. // Usage:
  7. // node scripts/add-lang/dump-ast.mjs <lang|wasm-path> <sample-file> [--depth=N] [--full]
  8. // Examples:
  9. // node scripts/add-lang/dump-ast.mjs lua sample.lua
  10. // node scripts/add-lang/dump-ast.mjs src/extraction/wasm/tree-sitter-zig.wasm a.zig --depth=4
  11. //
  12. // Output: an indented AST (named nodes, with field names) followed by a
  13. // node-type FREQUENCY table. The frequency table is the payoff — it tells you
  14. // which node types to map to functionTypes / classTypes / importTypes / etc.
  15. import { readFileSync, existsSync } from 'node:fs';
  16. import { createRequire } from 'node:module';
  17. import { Parser, Language } from 'web-tree-sitter';
  // ES modules have no built-in `require`; create one anchored at this file so
  // `require.resolve` can be used to locate package assets.
  const require = createRequire(import.meta.url);

  /**
   * Report a fatal error and terminate the process.
   * Writes the message to stderr with a `[dump-ast]` prefix, then exits with status 1.
   * @param {string} msg - human-readable description of what went wrong
   */
  const fail = (msg) => {
    const line = `[dump-ast] ${msg}`;
    console.error(line);
    process.exit(1);
  };
  // ---- Command-line parsing -------------------------------------------------
  // Everything after `node <script>`.
  const argv = process.argv.slice(2);
  // Arguments that do not start with `--` are positional:
  // <lang|wasm-path> first, then <sample-file>.
  const positional = argv.filter((a) => !a.startsWith('--'));
  const [langOrWasm, sampleFile] = positional;
  const depthFlag = argv.find((a) => a.startsWith('--depth='));
  const showAll = argv.includes('--full'); // also print anonymous (token) nodes

  if (!langOrWasm || !sampleFile) {
    fail('usage: dump-ast.mjs <lang|wasm-path> <sample-file> [--depth=N] [--full]');
  }
  if (!existsSync(sampleFile)) fail(`sample file not found: ${sampleFile}`);

  // An explicit --depth=N wins; otherwise the limit is 8, or unlimited with --full.
  const maxDepth = depthFlag
    ? Number.parseInt(depthFlag.slice('--depth='.length), 10)
    : (showAll ? Infinity : 8);
  // A NaN or negative limit makes the `depth <= maxDepth` check in walk() false
  // for every node, which would silently print an empty AST. Reject it up front.
  if (Number.isNaN(maxDepth) || maxDepth < 0) {
    fail(`invalid ${depthFlag}: expected a non-negative integer, e.g. --depth=4`);
  }
  // Language tokens whose tree-sitter-wasms file name is spelled differently
  // from the token the user types.
  const WASM_SPECIAL = {
    csharp: 'c_sharp',
    'c#': 'c_sharp',
  };

  /**
   * Turn a CLI token into the path of a grammar .wasm file.
   *
   * Lookup order:
   *   1. a token ending in `.wasm` is taken as an explicit path (must exist);
   *   2. `tree-sitter-wasms/out/tree-sitter-<name>.wasm` resolved via `require`;
   *   3. `src/extraction/wasm/tree-sitter-<name>.wasm`, relative to the
   *      current working directory.
   * `<name>` is the lower-cased token, remapped through WASM_SPECIAL if listed.
   * Calls `fail` (which exits the process) when nothing is found.
   *
   * @param {string} token - language name or path to a .wasm file
   * @returns {string} path to the grammar .wasm
   */
  function resolveWasm(token) {
    const isExplicitPath = token.endsWith('.wasm');
    if (isExplicitPath && !existsSync(token)) fail(`wasm not found: ${token}`);
    if (isExplicitPath) return token;

    const key = token.toLowerCase();
    const grammarName = WASM_SPECIAL[key] ?? key;

    let packaged = null;
    try {
      packaged = require.resolve(`tree-sitter-wasms/out/tree-sitter-${grammarName}.wasm`);
    } catch {
      /* not in tree-sitter-wasms — try a vendored copy */
    }
    if (packaged !== null) return packaged;

    const vendored = `src/extraction/wasm/tree-sitter-${grammarName}.wasm`;
    if (existsSync(vendored)) return vendored;

    const notFound =
      `no grammar for "${token}" — not in tree-sitter-wasms and not vendored at ` +
      `${vendored}. Pass an explicit .wasm path, or vendor one (see SKILL.md "Find a grammar").`;
    fail(notFound);
  }
  // ---- Load inputs, start the runtime, parse --------------------------------
  const wasmPath = resolveWasm(langOrWasm);
  const source = readFileSync(sampleFile, 'utf8');

  // Try the default runtime initialisation first. If that throws, retry with a
  // locateFile hook that resolves 'web-tree-sitter/tree-sitter.wasm' via require.
  try {
    await Parser.init();
  } catch {
    const locateFile = () => require.resolve('web-tree-sitter/tree-sitter.wasm');
    await Parser.init({ locateFile });
  }

  // Load the grammar; a load failure is fatal and reported through fail().
  let language;
  try {
    language = await Language.load(wasmPath);
  } catch (err) {
    fail(`failed to load grammar ${wasmPath}: ${err.message}`);
  }

  const parser = new Parser();
  parser.setLanguage(language);
  const tree = parser.parse(source);

  // node type -> number of named nodes of that type; filled in by walk().
  const freq = new Map();
  /**
   * Build a short one-line preview of a node's source text.
   *
   * Runs of whitespace collapse to a single space and the ends are trimmed.
   * Text longer than 48 characters is cut to 48 and suffixed with `…`.
   * Truncation counts Unicode code points rather than UTF-16 code units, so a
   * surrogate pair (e.g. an emoji) is never split into a lone surrogate.
   *
   * @param {{ text: string }} node - any object exposing the source text as `text`
   * @returns {string} the preview string
   */
  const snippet = (node) => {
    const limit = 48;
    const flat = node.text.replace(/\s+/g, ' ').trim();
    // Fast path: at most `limit` code units means at most `limit` code points.
    if (flat.length <= limit) return flat;

    // String iteration yields whole code points; stop as soon as one more than
    // `limit` is seen so very long leaf text is not scanned in full.
    let head = '';
    let taken = 0;
    for (const ch of flat) {
      if (taken === limit) return `${head}…`;
      head += ch;
      taken += 1;
    }
    return flat;
  };
  /**
   * Recursively visit `node` and its children in document order.
   *
   * For every named node the per-type counter in `freq` is incremented,
   * regardless of depth. A node is printed only when it is named (or `showAll`
   * is set) and `depth` does not exceed `maxDepth`. Each printed line is
   * indented by `depth`, shows the field name (if any), the node type, its
   * 1-based row and raw column, and a text preview for childless nodes.
   *
   * @param {object} node - tree-sitter node to visit
   * @param {number} depth - nesting level of `node` (root is 0)
   * @param {string|null} fieldName - field under which the parent holds `node`
   */
  function walk(node, depth, fieldName) {
    const { isNamed, type, childCount } = node;

    if (isNamed) {
      const seen = freq.get(type) || 0;
      freq.set(type, seen + 1);
    }

    const visible = isNamed || showAll;
    if (visible && depth <= maxDepth) {
      const indent = ' '.repeat(depth);
      const label = fieldName ? `${fieldName}: ` : '';
      const preview = childCount === 0 ? ` "${snippet(node)}"` : '';
      const { row, column } = node.startPosition;
      console.log(`${indent}${label}${type} @${row + 1}:${column}${preview}`);
    }

    // Always descend, even past maxDepth, so the frequency table covers the
    // whole tree and not just the printed part.
    for (let idx = 0; idx < childCount; idx += 1) {
      const kid = node.child(idx);
      if (!kid) continue;
      walk(kid, depth + 1, node.fieldNameForChild(idx));
    }
  }
  // ---- Report ----------------------------------------------------------------
  // Header names the sample and the grammar file (last '/'-separated path segment).
  const grammarFile = wasmPath.split('/').pop();
  console.log(`\n# AST for ${sampleFile} (grammar: ${grammarFile})\n`);

  // Printing the tree also fills `freq` as a side effect of the traversal.
  walk(tree.rootNode, 0, null);

  console.log('\n# Node-type frequency (named nodes) — map the relevant ones in your extractor:\n');
  // Most frequent node types first; counts are right-aligned to width 5.
  const ranked = [...freq.entries()].sort((a, b) => b[1] - a[1]);
  for (const [type, n] of ranked) {
    console.log(` ${String(n).padStart(5)} ${type}`);
  }
  console.log();