c-cpp.ts 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. import type { Node as SyntaxNode } from 'web-tree-sitter';
  2. import { getChildByField, getNodeText } from '../tree-sitter-helpers';
  3. import type { LanguageExtractor } from '../tree-sitter-types';
  /**
   * Children of a declarator that can never hold the function's own name but may
   * well contain qualified identifiers of their own:
   * - `parameter_list`          — parameter types (`const std::string& x`)
   * - `trailing_return_type`    — `-> std::string`
   * - `noexcept`                — `noexcept(noexcept(std::swap(a, b)))`
   * - `throw_specifier`         — `throw(std::bad_alloc)`
   * - `requires_clause`         — `requires std::integral<T>`
   * - `attribute_specifier` / `attribute_declaration` — attribute arguments
   */
  const CPP_NON_NAME_DECLARATOR_CHILDREN: ReadonlySet<string> = new Set([
    'parameter_list',
    'trailing_return_type',
    'noexcept',
    'throw_specifier',
    'requires_clause',
    'attribute_specifier',
    'attribute_declaration',
  ]);

  /**
   * Find the function NAME's `qualified_identifier` (`Foo::bar`) inside a
   * declarator, searching breadth-first so the shallowest match wins.
   *
   * Sub-trees that only carry types or expressions (see
   * `CPP_NON_NAME_DECLARATOR_CHILDREN`) are not descended into: a parameter with
   * a qualified type (`const std::string& x`) or an exception specification
   * mentioning `std::swap` must NOT be mistaken for the method name. Without the
   * skip, a plain free function `std::string TableFileName(const std::string&...)`
   * was named `string` (from the parameter type), so calls to it never resolved
   * and its file looked like nothing depended on it.
   *
   * @param declarator The `declarator` field of a `function_definition`.
   * @returns The qualified identifier naming the function, or `undefined` when
   *          the name is unqualified (a free function or in-class definition).
   */
  function findDeclaratorQualifiedId(declarator: SyntaxNode): SyntaxNode | undefined {
    // `pending` is an append-only FIFO walked with a cursor, which avoids the
    // O(n) cost of `Array.prototype.shift` on every step.
    const pending: SyntaxNode[] = [declarator];
    for (let head = 0; head < pending.length; head++) {
      const current = pending[head]!;
      if (current.type === 'qualified_identifier') return current;
      for (let i = 0; i < current.namedChildCount; i++) {
        const child = current.namedChild(i);
        if (child && !CPP_NON_NAME_DECLARATOR_CHILDREN.has(child.type)) {
          pending.push(child);
        }
      }
    }
    return undefined;
  }
  /**
   * Resolve the unqualified name of an out-of-line definition: for
   * `void Foo::bar() { … }` this yields `bar`.
   *
   * @returns The last `::`-separated segment of the declarator's qualified
   *          identifier, or `undefined` when the node has no declarator or the
   *          declarator carries no qualified identifier.
   */
  function extractCppQualifiedMethodName(node: SyntaxNode, source: string): string | undefined {
    const declarator = getChildByField(node, 'declarator');
    const qualifiedId = declarator ? findDeclaratorQualifiedId(declarator) : undefined;
    if (!qualifiedId) return undefined;

    const segments = getNodeText(qualifiedId, source)
      .trim()
      .split('::')
      .filter((segment) => segment !== '');
    // The final segment is the method itself; everything before it is scope.
    return segments.pop();
  }
  /**
   * Resolve the scope an out-of-line definition belongs to: for
   * `void ns::Foo::bar() { … }` this yields `ns::Foo`.
   *
   * @returns Every `::`-separated segment except the last, re-joined with `::`,
   *          or `undefined` when there is no declarator, no qualified
   *          identifier, or the identifier has only a single segment.
   */
  function extractCppReceiverType(node: SyntaxNode, source: string): string | undefined {
    const declarator = getChildByField(node, 'declarator');
    const qualifiedId = declarator ? findDeclaratorQualifiedId(declarator) : undefined;
    if (!qualifiedId) return undefined;

    const segments = getNodeText(qualifiedId, source)
      .trim()
      .split('::')
      .filter((segment) => segment !== '');
    if (segments.length <= 1) return undefined;

    // Drop the method name; what remains is the receiver's qualified scope.
    segments.pop();
    return segments.join('::');
  }
  /**
   * Built-in / non-class return types that can never be a method receiver. We
   * store no `returnType` for these so resolution never tries to resolve a method
   * on `void` / `int` / etc.
   */
  const CPP_NON_CLASS_RETURN = new Set([
    'void', 'bool', 'char', 'short', 'int', 'long', 'float', 'double', 'unsigned',
    'signed', 'size_t', 'ssize_t', 'auto', 'wchar_t', 'char8_t', 'char16_t',
    'char32_t', 'int8_t', 'int16_t', 'int32_t', 'int64_t', 'uint8_t', 'uint16_t',
    'uint32_t', 'uint64_t', 'intptr_t', 'uintptr_t', 'nullptr_t',
  ]);

  /**
   * Matches a type whose OUTERMOST template is a smart pointer / optional, up to
   * and including the opening `<`. Anchored at the start (after optional
   * cv-qualifiers and namespace qualification) so a wrapper that merely appears
   * as an argument of another template (`std::vector<std::unique_ptr<T>>`) is
   * not unwrapped — `->`/`.` on that value hits the vector, not `T`.
   */
  const CPP_WRAPPER_HEAD =
    /^(?:(?:const|volatile|typename)\s+)*(?:::\s*)?(?:[A-Za-z_]\w*\s*::\s*)*(?:unique_ptr|shared_ptr|weak_ptr|optional)\s*</;

  /**
   * Return the text of the first template argument of the `<…>` group opening at
   * index `open`, honouring nested angle brackets. Returns `undefined` when the
   * group is never closed.
   */
  function firstCppTemplateArg(text: string, open: number): string | undefined {
    let depth = 0;
    for (let i = open; i < text.length; i++) {
      const ch = text[i];
      if (ch === '<') {
        depth++;
      } else if (ch === '>') {
        depth--;
        if (depth === 0) return text.slice(open + 1, i);
      } else if (ch === ',' && depth === 1) {
        // A top-level comma ends the first argument (`unique_ptr<T, Deleter>`).
        return text.slice(open + 1, i);
      }
    }
    return undefined;
  }

  /**
   * Normalize a C++ return type to the bare class name a method could be called
   * on. Unwraps smart-pointer / optional wrappers to their element type
   * (`std::unique_ptr<Widget>` → `Widget`) so a factory's `->method()` resolves on
   * the pointee. Strips cv-qualifiers, `&`/`*`, namespace qualifiers, and other
   * template args — including nested ones, so `Foo<Bar<int>>` → `Foo` and
   * `std::unique_ptr<std::vector<int>>` → `vector`.
   *
   * @param raw Source text of the return type.
   * @returns The bare class name, or `undefined` for primitives / void / `auto` /
   *          empty input / anything that is not a single identifier once
   *          normalized (e.g. `unsigned int`, `decltype(auto)`).
   */
  export function normalizeCppReturnType(raw: string): string | undefined {
    let t = raw.trim();
    if (!t) return undefined;

    // Unwrap smart pointers / optional to their pointee (the thing you call `->`
    // on). Repeats for stacked wrappers; every round strictly shortens `t`.
    for (let head = CPP_WRAPPER_HEAD.exec(t); head; head = CPP_WRAPPER_HEAD.exec(t)) {
      const pointee = firstCppTemplateArg(t, head[0].length - 1)?.trim();
      if (!pointee) break;
      t = pointee;
    }

    t = stripCppTemplateArgs(
      t.replace(/\b(?:const|volatile|typename|struct|class|enum)\b/g, ' ')
    )
      .replace(/[*&]+/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
    if (!t) return undefined;

    const last = t.split('::').filter(Boolean).pop();
    if (!last) return undefined;
    if (CPP_NON_CLASS_RETURN.has(last)) return undefined;
    // Multi-word leftovers (`unsigned int`) and non-identifiers are not classes.
    if (!/^[A-Za-z_]\w*$/.test(last)) return undefined;
    return last;
  }

  /**
   * Strip C++ template arguments from a base-type reference name so it matches the
   * bare class/struct the template was DEFINED as. `template<typename T> class
   * Base { … }` is indexed as a node named `Base`, but a derived class
   * `class D : public Base<int>` records its base as the full `Base<int>` (and
   * `class Q : public ns::Tpl<int>` as `ns::Tpl<int>`) — neither name-matches
   * `Base` / `ns::Tpl`, so the `extends` edge never resolves and the derived class
   * looks like it inherits from nothing (#1043).
   *
   * Removes every balanced `<…>` group regardless of nesting or position, so
   * `Base<int>` → `Base`, `ns::Tpl<Foo<int>>` → `ns::Tpl`, and the rare
   * `Outer<int>::Inner` → `Outer::Inner`. The remaining qualified head is exactly
   * what the non-templated base case already produces, so resolution treats them
   * identically. A name with no template args passes through unchanged.
   */
  export function stripCppTemplateArgs(name: string): string {
    if (!name.includes('<')) return name;
    let out = '';
    let depth = 0;
    for (const ch of name) {
      if (ch === '<') {
        depth++;
      } else if (ch === '>') {
        // A stray `>` with nothing open is dropped rather than driving depth negative.
        if (depth > 0) depth--;
      } else if (depth === 0) {
        out += ch;
      }
    }
    return out.trim();
  }
  /**
   * Read a function/method's return type from the `type` field of its
   * `function_definition` (`Metrics& Metrics::instance()` → `Metrics`) and
   * normalize it to a bare class name.
   *
   * @returns The normalized class name, or `undefined` when the node has no
   *          `type` field (constructors, destructors, conversion operators) or
   *          the type normalizes to nothing usable.
   */
  function extractCppReturnType(node: SyntaxNode, source: string): string | undefined {
    const typeNode = getChildByField(node, 'type');
    return typeNode ? normalizeCppReturnType(getNodeText(typeNode, source)) : undefined;
  }
  /** Extraction rules for plain C sources. */
  export const cExtractor: LanguageExtractor = {
    functionTypes: ['function_definition'],
    classTypes: [],
    methodTypes: [],
    interfaceTypes: [],
    structTypes: ['struct_specifier'],
    enumTypes: ['enum_specifier'],
    enumMemberTypes: ['enumerator'],
    typeAliasTypes: ['type_definition'], // typedef
    importTypes: ['preproc_include'],
    callTypes: ['call_expression'],
    variableTypes: ['declaration'],
    nameField: 'declarator',
    bodyField: 'body',
    paramsField: 'parameters',

    // File-scope `const` / `static const` declarations have a `type_qualifier`
    // child whose text is "const"; those become `constant`, everything else a
    // plain `variable`.
    isConst: (node) => {
      const isConstQualifier = (c: SyntaxNode): boolean =>
        c.type === 'type_qualifier' && c.text === 'const';
      return node.namedChildren.some(isConstQualifier);
    },

    getReturnType: extractCppReturnType,

    // `typedef enum { ... } name;` / `typedef struct { ... } name;` wrap an
    // anonymous specifier. Report the specifier's kind so the typedef name ends
    // up naming the enum/struct node instead of a bare alias.
    resolveTypeAliasKind: (node, _source) => {
      for (let index = 0; index < node.namedChildCount; index++) {
        const specifier = node.namedChild(index);
        if (!specifier) continue;
        // Only a specifier that carries a body defines the type inline.
        const definesBody = (): boolean => Boolean(getChildByField(specifier, 'body'));
        if (specifier.type === 'enum_specifier' && definesBody()) return 'enum';
        if (specifier.type === 'struct_specifier' && definesBody()) return 'struct';
      }
      return undefined;
    },

    // Handles both `#include <stdio.h>` and `#include "myheader.h"`.
    extractImport: (node, source) => {
      const signature = source.substring(node.startIndex, node.endIndex).trim();
      const findNamed = (parent: SyntaxNode, type: string) =>
        parent.namedChildren.find((c: SyntaxNode) => c.type === type);

      // System header: drop the surrounding angle brackets.
      const systemHeader = findNamed(node, 'system_lib_string');
      if (systemHeader) {
        const moduleName = getNodeText(systemHeader, source).replace(/^<|>$/g, '');
        return { moduleName, signature };
      }

      // Quoted header: the path is the literal's `string_content` child.
      const quoted = findNamed(node, 'string_literal');
      const quotedPath = quoted ? findNamed(quoted, 'string_content') : undefined;
      if (quotedPath) {
        return { moduleName: getNodeText(quotedPath, source), signature };
      }

      return null;
    },
  };
  /**
   * Recognize the `function_definition` tree-sitter produces when it misparses a
   * macro-annotated class/struct such as `class MACRO Name { … }` or
   * `class MACRO Name : public Base { … }` (#946).
   *
   * Because `MACRO` is unknown, `class MACRO` is read as an *elaborated type
   * specifier* — a bodyless `class_specifier`/`struct_specifier` whose "type
   * name" is the macro — and the remainder as a function: `Name` becomes the
   * declarator and `{ … }` the function body. The whole declaration therefore
   * shows up as a function named after the class, spanning the entire class
   * body. (A base clause, if any, additionally lands in an `ERROR` node, but the
   * leading macro alone is enough to trigger the misparse.)
   *
   * Two structural checks identify it without touching genuine code:
   * - the `type` field is a class/struct specifier WITHOUT a field list, i.e. an
   *   elaborated type rather than an inline-defined return type like
   *   `struct P { int x; } makeP() { … }`; and
   * - the declarator is not a `function_declarator`. Real function definitions
   *   always have one, so the legal-but-rare `class Foo f() { … }` is left alone.
   *
   * The class body is mangled by the same misparse and cannot be recovered, so
   * callers drop the node — as is done for macro-prefixed C prototypes — rather
   * than keep a misleading whole-body `function` that pollutes callers/impact
   * and skews kind statistics.
   */
  function isMacroMisparsedTypeDecl(node: SyntaxNode): boolean {
    const returnSpec = getChildByField(node, 'type');
    if (!returnSpec) return false;

    const isClassLike =
      returnSpec.type === 'class_specifier' || returnSpec.type === 'struct_specifier';
    if (!isClassLike) return false;

    // A field list means the type is genuinely defined inline — not a misparse.
    const definesMembers = returnSpec.namedChildren.some(
      (c: SyntaxNode) => c.type === 'field_declaration_list'
    );
    if (definesMembers) return false;

    // A missing declarator, or anything other than a function declarator, marks
    // the misparse.
    const declarator = getChildByField(node, 'declarator');
    return !(declarator && declarator.type === 'function_declarator');
  }
  /**
   * Blank an export/visibility macro in a `class/struct EXPORT_MACRO Name …`
   * *definition* header before parsing. Not knowing the macro, tree-sitter reads
   * `class EXPORT_MACRO` as an elaborated type specifier and the rest as a
   * function, so the whole class — its name, base clause, and members — drops out
   * of the index (#946 catches the resulting phantom function but can't recover
   * the class), which silently breaks type-hierarchy / inheritance-impact queries
   * for effectively every Unreal-Engine (`*_API`), Qt/Boost (`*_EXPORT`), LLVM
   * (`*_ABI`), … class. Replacing the macro with equal-length spaces preserves
   * every byte offset (and thus line/column), so the declaration then parses as a
   * normal class_specifier and the existing extraction emits the node, members,
   * and `extends` edge. (#1061, follow-up to #946.)
   *
   * Matched tightly so it can't touch the same macro used as an ordinary value
   * elsewhere (`int x = SOME_API;`): the macro is the ALL-CAPS token sitting
   * *between* `class`/`struct` and the type name, and the trailing `[:{]`
   * definition-guard fires only when a base clause or body follows — the only
   * shape that misparses. That guard also leaves elaborated-type variable
   * declarations (`struct FOO var;`, `class FOO obj = …`) untouched, since those
   * end in `;` / `=` / `[`, never `:` / `{`.
   *
   * The token after the candidate macro must not itself be the `final`
   * specifier: in `class URL final : public Base {` the ALL-CAPS token IS the
   * class name, and blanking it would erase the class instead of recovering it.
   *
   * @param source Raw C++ source text.
   * @returns `source` with each matched macro replaced by the same number of
   *          spaces; the result always has the same length as the input.
   */
  export function blankCppExportMacros(source: string): string {
    // Cheap pre-check: without either keyword there is nothing to rewrite.
    if (!source.includes('class') && !source.includes('struct')) return source;
    return source.replace(
      /\b(class|struct)(\s+)([A-Z][A-Z0-9_]+)(?=\s+(?!final\b)[A-Za-z_]\w*(?:\s+final)?\s*[:{])/g,
      (_m, kw: string, ws: string, macro: string) => kw + ws + ' '.repeat(macro.length)
    );
  }
  /**
   * Blank a known inline-specifier macro sitting in front of a function's return
   * type (`FORCEINLINE FString GetName(…)`), before parsing. Not knowing the
   * macro, tree-sitter can't reconcile `MACRO <return-type> <name>(` — an extra
   * type-like token before the name — and drops into error recovery: the macro
   * becomes the return type and, for a non-primitive return, the return type gets
   * glued onto the name (`GetName` → `"FString GetName"`), so the function can't
   * be found by name and its callers don't link. This is pervasive in Unreal
   * Engine, where inline helpers are written `FORCEINLINE <ret> <name>(…)`.
   * Replacing the macro with equal-length spaces preserves every byte offset (so
   * line/column stay exact) and the declaration then parses as an ordinary
   * function — recovering the real name AND the return type — mirroring how
   * `blankCppExportMacros` recovers macro-annotated classes (#946/#1061).
   *
   * Matched tightly so it can't touch an ordinary identifier: only the exact,
   * well-known UE inline specifiers, and only in specifier position — immediately
   * followed by whitespace and the identifier that starts the return type or name.
   * That lookahead leaves value/expression uses (`x = FORCEINLINE ? …`), string
   * literals, and `FORCEINLINE_SOMETHINGELSE` (word-boundary) alone. To cover a
   * new codebase's inline macro, add its exact token here — the matcher below is
   * derived from this list.
   */
  const CPP_INLINE_MACROS = ['FORCEINLINE_DEBUGGABLE', 'FORCENOINLINE', 'FORCEINLINE'] as const;

  /**
   * Matcher built from `CPP_INLINE_MACROS`, longest token first so
   * `FORCEINLINE_DEBUGGABLE` wins over `FORCEINLINE`.
   */
  const CPP_INLINE_MACRO_RE = new RegExp(
    `\\b(${[...CPP_INLINE_MACROS].sort((a, b) => b.length - a.length).join('|')})\\b(?=\\s+[A-Za-z_])`,
    'g'
  );

  /** Text from a line's start up to a match that lies on a preprocessor directive line. */
  const CPP_DIRECTIVE_PREFIX_RE = /^[ \t]*#/;

  /**
   * Replace each specifier-position inline macro with spaces of equal length.
   *
   * Occurrences on a preprocessor directive line are left alone: blanking the
   * name in `#define FORCEINLINE inline` would redefine a different macro, and
   * blanking it in `#ifdef FORCEINLINE` (where the lookahead's `\s+` reaches the
   * code on the next line) would leave a directive with no operand.
   *
   * @param source Raw C++ source text.
   * @returns Text of the same length as `source` with the macros blanked.
   */
  export function blankCppInlineMacros(source: string): string {
    // Cheap pre-check: skip the regex pass when no known token occurs at all.
    if (!CPP_INLINE_MACROS.some((m) => source.includes(m))) return source;
    return source.replace(CPP_INLINE_MACRO_RE, (match: string, _macro: string, offset: number) => {
      const lineStart = source.lastIndexOf('\n', offset - 1) + 1;
      if (CPP_DIRECTIVE_PREFIX_RE.test(source.slice(lineStart, offset))) return match;
      return ' '.repeat(match.length);
    });
  }
  /**
   * Source pre-processing applied before tree-sitter parses the file. Runs the
   * two macro-blanking passes in sequence — export macros on class/struct
   * definitions first, then inline-specifier macros on functions. Both passes
   * swap a macro for equal-length whitespace, so offsets are preserved.
   */
  function preParseCppSource(source: string): string {
    const withoutExportMacros = blankCppExportMacros(source);
    return blankCppInlineMacros(withoutExportMacros);
  }
  /** Extraction rules for C++ sources. */
  export const cppExtractor: LanguageExtractor = {
    // Recover macro-annotated class/struct definitions (`class MYMODULE_API Foo : Base`,
    // #1061/#946) and macro-prefixed functions (`FORCEINLINE FString Foo()`, #1093
    // follow-up) that tree-sitter otherwise misparses.
    preParse: preParseCppSource,
    functionTypes: ['function_definition'],
    classTypes: ['class_specifier'],
    // A bodiless `class_specifier` is a forward declaration (`class Foo;`) or an
    // elaborated type reference, not a definition. Skip it so dozens of forward
    // decls across headers don't mint phantom `class` nodes that crowd out — and
    // get picked as the blast-radius representative over — the single real
    // definition, exactly as bodiless struct/enum specifiers are already skipped. (#1093)
    skipBodilessClass: true,
    methodTypes: ['function_definition'],
    interfaceTypes: [],
    structTypes: ['struct_specifier'],
    enumTypes: ['enum_specifier'],
    enumMemberTypes: ['enumerator'],
    typeAliasTypes: ['type_definition', 'alias_declaration'], // typedef and using
    importTypes: ['preproc_include'],
    callTypes: ['call_expression'],
    variableTypes: ['declaration'],
    nameField: 'declarator',
    bodyField: 'body',
    paramsField: 'parameters',
    resolveName: extractCppQualifiedMethodName,
    getReceiverType: extractCppReceiverType,
    getReturnType: extractCppReturnType,
    /**
     * Access level of a class member: the NEAREST `access_specifier` preceding it
     * among its siblings (a later `private:` overrides an earlier `public:`).
     * When no specifier precedes it, the C++ default applies — `private` in a
     * `class`, `public` in a `struct`/`union`. Returns `undefined` for anything
     * that is not directly inside a class body (free functions, out-of-line
     * definitions).
     */
    getVisibility: (node) => {
      // A member template is wrapped in `template_declaration`; the access
      // specifiers are siblings of that wrapper, not of the definition itself.
      let member: SyntaxNode = node;
      while (member.parent && member.parent.type === 'template_declaration') {
        member = member.parent;
      }

      // Walk backwards so the closest preceding specifier decides.
      for (let sibling = member.previousSibling; sibling; sibling = sibling.previousSibling) {
        if (sibling.type !== 'access_specifier') continue;
        const text = sibling.text;
        if (text.includes('public')) return 'public';
        if (text.includes('private')) return 'private';
        if (text.includes('protected')) return 'protected';
      }

      // No explicit specifier before the member: fall back to the default access
      // of the enclosing aggregate.
      const body = member.parent;
      if (body && body.type === 'field_declaration_list') {
        const ownerType = body.parent?.type;
        if (ownerType === 'class_specifier') return 'private';
        if (ownerType === 'struct_specifier' || ownerType === 'union_specifier') return 'public';
      }
      return undefined;
    },
    resolveTypeAliasKind: (node, _source) => {
      // C++ typedef: `typedef enum { ... } name;` or `typedef struct { ... } name;`
      // — report the inline-defined specifier's kind so the typedef name becomes
      // the enum/struct node name.
      for (let i = 0; i < node.namedChildCount; i++) {
        const child = node.namedChild(i);
        if (!child) continue;
        if (child.type === 'enum_specifier' && getChildByField(child, 'body')) return 'enum';
        if (child.type === 'struct_specifier' && getChildByField(child, 'body')) return 'struct';
      }
      return undefined;
    },
    isMisparsedFunction: (name, node) => {
      // C++ macros like NLOHMANN_JSON_NAMESPACE_BEGIN cause tree-sitter to misparse
      // namespace blocks as function_definitions (e.g. name = "namespace detail").
      // Match the keyword as a whole word only, so a genuine function such as
      // `namespaceOf` or `namespace_init` is not thrown away.
      if (/^namespace(?:\s|$)/.test(name)) return true;
      // Also filter C++ keywords that tree-sitter occasionally misinterprets as
      // function/method names (e.g. switch statements inside macro-confused scopes).
      const cppKeywords = ['switch', 'if', 'for', 'while', 'do', 'case', 'return'];
      if (cppKeywords.includes(name)) return true;
      // `class MACRO Name : public Base { … }` misparses to a function_definition
      // named after the class. `blankCppExportMacros` (preParse) recovers the
      // common ALL-CAPS export-macro shape; this drop is the fallback for any
      // residual misparse it doesn't blank — still no phantom function (#1061/#946).
      return isMacroMisparsedTypeDecl(node);
    },
    extractImport: (node, source) => {
      const importText = source.substring(node.startIndex, node.endIndex).trim();
      // C++ includes: #include <iostream>, #include "myheader.h"
      const systemLib = node.namedChildren.find((c: SyntaxNode) => c.type === 'system_lib_string');
      if (systemLib) {
        // Drop the surrounding angle brackets from the header name.
        return { moduleName: getNodeText(systemLib, source).replace(/^<|>$/g, ''), signature: importText };
      }
      const stringLiteral = node.namedChildren.find((c: SyntaxNode) => c.type === 'string_literal');
      if (stringLiteral) {
        // The quoted path lives in the literal's `string_content` child.
        const stringContent = stringLiteral.namedChildren.find((c: SyntaxNode) => c.type === 'string_content');
        if (stringContent) {
          return { moduleName: getNodeText(stringContent, source), signature: importText };
        }
      }
      return null;
    },
  };