// tree-sitter-types.ts
  /**
   * Tree-sitter Extraction Types
   *
   * Defines the LanguageExtractor interface and related types used by
   * the core TreeSitterExtractor and per-language extraction configs.
   * Extracted to a leaf module to avoid circular imports.
   */
  8. import { Node as SyntaxNode } from 'web-tree-sitter';
  9. import {
  10. Node,
  11. NodeKind,
  12. UnresolvedReference,
  13. } from '../types';
  /**
   * Information returned by a language's `extractImport` hook for a single
   * import node.
   */
  export interface ImportInfo {
    /** The module/package name being imported. */
    moduleName: string;
    /** Full import statement text, kept for display. */
    signature: string;
    /** If true, the hook already created the unresolved references itself. */
    handledRefs?: boolean;
  }
  /**
   * Information about a single variable within a declaration.
   * Returned (one entry per declared variable) by a language's
   * `extractVariables` hook.
   */
  export interface VariableInfo {
    /** Variable name. */
    name: string;
    /** Node kind: 'variable' or 'constant'. */
    kind: NodeKind;
    /** Optional signature string. */
    signature?: string;
    /** If set, this declarator is actually a function and should be extracted as such. */
    delegateToFunction?: SyntaxNode;
    /** The AST node to use for positioning (may differ from the declaration node). */
    positionNode?: SyntaxNode;
  }
  /**
   * Context object passed to language hooks that need to call back into the
   * core extractor. Provides a controlled API surface — hooks can create nodes,
   * visit children, and add references without accessing the full
   * TreeSitterExtractor internals.
   */
  export interface ExtractorContext {
    /**
     * Create a node and add it to the extraction result.
     * May return null; the conditions under which no node is produced are
     * decided by the implementation and are not specified here.
     */
    createNode(kind: NodeKind, name: string, node: SyntaxNode, extra?: Partial<Node>): Node | null;
    /** Visit a child node (dispatches through the standard visitNode logic). */
    visitNode(node: SyntaxNode): void;
    /** Visit a function body to extract calls. */
    visitFunctionBody(body: SyntaxNode, functionId: string): void;
    /** Add an unresolved reference. */
    addUnresolvedReference(ref: UnresolvedReference): void;
    /** Push a node ID onto the scope stack (for containment/qualified name building). */
    pushScope(nodeId: string): void;
    /** Pop the last node ID from the scope stack. */
    popScope(): void;

    /** Current file path. */
    readonly filePath: string;
    /** Current source text. */
    readonly source: string;
    /** Stack of parent node IDs (current scope). */
    readonly nodeStack: readonly string[];
    /** All nodes extracted so far. */
    readonly nodes: readonly Node[];
  }
  /**
   * Language-specific extraction configuration.
   *
   * Each supported language provides an implementation of this interface
   * that configures which AST node types to look for and how to extract
   * language-specific details like signatures, visibility, and imports.
   */
  export interface LanguageExtractor {
    /**
     * Optional source transform applied immediately before the grammar parses the
     * file. Used to work around grammar gaps that would otherwise corrupt the
     * parse tree (e.g. C# blanks conditional-compilation directive lines the
     * grammar mis-parses inside enum bodies). MUST preserve byte offsets (replace
     * removed text with spaces, keep newlines) so node positions and getNodeText
     * stay correct; the returned string is used for both parsing and extraction.
     * `filePath` lets a transform key off the concrete file extension when one
     * language id serves several dialects (C++ also parses `.metal` shaders).
     */
    preParse?: (source: string, filePath?: string) => string;

    // --- Node type mappings ---

    /** Node types that represent functions */
    functionTypes: string[];
    /** Node types that represent classes */
    classTypes: string[];
    /** Node types that represent methods */
    methodTypes: string[];
    /** Node types that represent interfaces/protocols/traits */
    interfaceTypes: string[];
    /** Node types that represent structs */
    structTypes: string[];
    /** Node types that represent enums */
    enumTypes: string[];
    /** Node types that represent enum members/cases (e.g. Swift: 'enum_entry', Rust: 'enum_variant') */
    enumMemberTypes?: string[];
    /** Node types that represent type aliases (e.g. `type X = ...`) */
    typeAliasTypes: string[];
    /** Node types that represent imports */
    importTypes: string[];
    /** Node types that represent function calls */
    callTypes: string[];
    /** Node types that represent variable declarations (const, let, var, etc.) */
    variableTypes: string[];
    /** Node types that represent class fields (extracted as 'field' kind inside class bodies) */
    fieldTypes?: string[];
    /** Node types that represent class properties (extracted as 'property' kind inside class bodies) */
    propertyTypes?: string[];

    // --- Field name mappings ---

    /** Field name for identifier/name */
    nameField: string;
    /** Field name for body */
    bodyField: string;
    /** Field name for parameters */
    paramsField: string;
    /** Field name for return type */
    returnField?: string;

    // --- Existing hooks ---

    /** Override symbol name extraction (e.g. ObjC multi-part selectors). */
    resolveName?: (node: SyntaxNode, source: string) => string | undefined;
    /**
     * Post-process an already-extracted name to recover a real identifier from a
     * name still mangled by a macro the pre-parse didn't blank (C/C++:
     * `MACRO Ret name(` misparses to the name "Ret name"). Applied to every name
     * this extractor produces, so it MUST be a no-op on a well-formed name — only
     * C/C++ set it, because a mangled name there is unambiguous (an internal space),
     * whereas e.g. Kotlin/Scala backtick identifiers legitimately contain spaces.
     */
    recoverMangledName?: (name: string) => string;
    /** Extract property name when the generic name walk fails (e.g. ObjC @property). */
    extractPropertyName?: (node: SyntaxNode, source: string) => string | null;
    /** Extract signature from node */
    getSignature?: (node: SyntaxNode, source: string) => string | undefined;
    /** Extract visibility from node */
    getVisibility?: (node: SyntaxNode) => 'public' | 'private' | 'protected' | 'internal' | undefined;
    /** Check if node is exported */
    isExported?: (node: SyntaxNode, source: string) => boolean;
    /** Check if node is async */
    isAsync?: (node: SyntaxNode) => boolean;
    /** Check if node is static */
    isStatic?: (node: SyntaxNode) => boolean;
    /** Check if variable declaration is a constant (const vs let/var) */
    isConst?: (node: SyntaxNode) => boolean;
    /**
     * Extract extra symbol-level modifier keywords to persist on the node's
     * `decorators` list (e.g. Kotlin `expect`/`actual` multiplatform markers).
     * Called generically for every created node; return undefined/[] when none.
     * Used by the resolver to link `expect` declarations to their `actual`
     * implementations across source sets.
     */
    extractModifiers?: (node: SyntaxNode) => string[] | undefined;

    // --- New config properties ---

    /** Additional node types to treat as class declarations (e.g. Dart: 'mixin_declaration') */
    extraClassNodeTypes?: string[];
    /** Whether methods can be top-level without enclosing class (Go: true) */
    methodsAreTopLevel?: boolean;
    /**
     * Skip a bodiless class node as a forward declaration / elaborated type,
     * mirroring the bodiless-struct/enum skip. Set only for languages where a
     * bodiless `class` specifier is NOT a complete definition — C/C++
     * (`class Foo;` is a forward decl). Leave unset for languages where a
     * bodiless class IS complete (Kotlin `class Empty`, Scala `case object`). (#1093)
     */
    skipBodilessClass?: boolean;
    /** NodeKind to use for interface-like declarations (Rust: 'trait'). Default: 'interface' */
    interfaceKind?: NodeKind;

    // --- New hooks ---

    /**
     * Custom node visitor. Return true if the node was fully handled (skip default dispatch).
     * Used by languages with fundamentally different AST structures (e.g. Pascal).
     */
    visitNode?: (node: SyntaxNode, ctx: ExtractorContext) => boolean;
    /**
     * Synthesize members that exist at compile time but not in the source AST,
     * called at the end of class extraction with the class still on the scope
     * stack (so `ctx.createNode` attaches containment + qualified names) and the
     * class's real members already extracted (so the hook can skip a member the
     * source explicitly declares). Used by Java for Lombok-generated accessors
     * (`@Getter`/`@Setter`/`@Data`/`@Value`/`@Builder` → `getX`/`setX`/`builder`/
     * `equals`/`hashCode`/`toString` + the `log` field), which are otherwise
     * invisible and break call-chain analysis (#912). The created nodes carry a
     * `lombok` decorator + a docstring naming the generating annotation, so an
     * agent can tell them apart from hand-written code.
     */
    synthesizeMembers?: (classNode: SyntaxNode, ctx: ExtractorContext) => void;
    /**
     * Classify a class_declaration node when the grammar reuses one node type
     * for multiple concepts (e.g. Swift uses class_declaration for classes, structs, and enums).
     */
    classifyClassNode?: (node: SyntaxNode) => 'class' | 'struct' | 'enum' | 'interface' | 'trait';
    /**
     * Classify a methodTypes node when the grammar reuses one node type for
     * both callable and data members (#808): TS/JS class FIELDS
     * (`public_field_definition` / `field_definition`) are methods only when
     * their value is callable (`onClick = () => {}`); a plain field
     * (`public fonts: Fonts;`, `count = 0`) is a property. Default: 'method'.
     */
    classifyMethodNode?: (node: SyntaxNode) => 'method' | 'property';
    /**
     * Resolve the body node for a function/method/class when it's not a child field.
     * (e.g. Dart puts function_body as a sibling, not a child.)
     */
    resolveBody?: (node: SyntaxNode, bodyField: string) => SyntaxNode | null;
    /**
     * Extract import information from an import node.
     * Return null if the node isn't a recognized import form.
     */
    extractImport?: (node: SyntaxNode, source: string) => ImportInfo | null;
    /**
     * Extract variable declarations from a variable declaration node.
     * Returns info about each declared variable, allowing the core to create nodes.
     */
    extractVariables?: (node: SyntaxNode, source: string) => VariableInfo[];
    /**
     * Extract receiver/owner type name from a method declaration.
     * Used by Go to get the struct receiver (e.g., "scrapeLoop" from "func (sl *scrapeLoop) run()").
     * When present, the receiver type is included in the qualified name for better searchability.
     */
    getReceiverType?: (node: SyntaxNode, source: string) => string | undefined;
    /**
     * Extract a function/method's normalized return type name (bare class name,
     * smart-pointer pointee unwrapped), stored on the node as `returnType`. Used
     * by C/C++ so resolution can infer a chained receiver's type from what the
     * inner call returns (`Foo::instance().bar()` → resolve `bar` on `Foo`,
     * issue #645). Return undefined for primitives / void / constructors.
     */
    getReturnType?: (node: SyntaxNode, source: string) => string | undefined;
    /**
     * Resolve the actual node kind for a type alias declaration.
     * Used by Go where `type_spec` is the named declaration wrapper for structs/interfaces:
     * `type Foo struct { ... }` → type_spec (name: "Foo") → struct_type
     * Returns 'struct', 'interface', etc. to override the default 'type_alias' kind,
     * or undefined to keep it as a type alias.
     */
    resolveTypeAliasKind?: (node: SyntaxNode, source: string) => NodeKind | undefined;
    /**
     * Check if a function/method name is a misparse artifact that should be skipped.
     * Used by C/C++ where macros (e.g. NLOHMANN_JSON_NAMESPACE_BEGIN) cause tree-sitter
     * to misparse namespace blocks as function_definitions. When this returns true,
     * the function node is NOT created, but the body is still visited for calls and
     * structural nodes (classes, structs, enums).
     */
    isMisparsedFunction?: (name: string, node: SyntaxNode) => boolean;
    /**
     * Detect bare method calls that don't use call expression syntax.
     * Used by Ruby where `reset` (no parens, no receiver) is a method call but
     * tree-sitter parses it as a plain `identifier` node instead of `call`/`method_call`.
     * Returns the callee name if this node is a bare call, or undefined if not.
     */
    extractBareCall?: (node: SyntaxNode, source: string) => string | undefined;
    /**
     * Node types representing a file-level package/namespace declaration
     * (e.g. Kotlin `package_header`, Java `package_declaration`). When set,
     * the core wraps every top-level declaration in an implicit `namespace`
     * node carrying the FQN, so cross-file import resolution can match by
     * qualifiedName instead of filename (Kotlin filename ≠ class name).
     */
    packageTypes?: string[];
    /** Extract the dotted package name from a package declaration node. */
    extractPackage?: (node: SyntaxNode, source: string) => string | null;
  }