| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428 |
- import type { Node as SyntaxNode } from 'web-tree-sitter';
- import { getChildByField, getNodeText } from '../tree-sitter-helpers';
- import type { LanguageExtractor } from '../tree-sitter-types';
/**
 * Locate the `qualified_identifier` (`Foo::bar`) that names the function inside
 * a declarator subtree.
 *
 * The search is breadth-first, so the shallowest qualified identifier wins. It
 * never descends into a `parameter_list` or a `trailing_return_type`: qualified
 * names found there are types (`const std::string& x`, `-> std::string`), not
 * the function name. Without that exclusion a free function such as
 * `std::string TableFileName(const std::string&...)` would be named `string`
 * after its parameter type.
 *
 * @param declarator Root of the declarator subtree to search.
 * @returns The first qualified identifier in breadth-first order, or undefined
 *   when the subtree contains none outside the excluded branches.
 */
function findDeclaratorQualifiedId(declarator: SyntaxNode): SyntaxNode | undefined {
  // Branches whose qualified identifiers are type names, never the function name.
  const isTypeOnlyBranch = (kind: string): boolean =>
    kind === 'parameter_list' || kind === 'trailing_return_type';

  // `pending` doubles as the FIFO queue; `cursor` is its read head, so nodes are
  // visited in exactly the order they were discovered.
  const pending: SyntaxNode[] = [declarator];
  for (let cursor = 0; cursor < pending.length; cursor++) {
    const node = pending[cursor];
    if (!node) continue;
    if (node.type === 'qualified_identifier') return node;

    const childTotal = node.namedChildCount;
    for (let idx = 0; idx < childTotal; idx++) {
      const kid = node.namedChild(idx);
      if (!kid || isTypeOnlyBranch(kid.type)) continue;
      pending.push(kid);
    }
  }
  return undefined;
}
/**
 * Resolve the unqualified name from a qualified declarator: the text of the
 * declarator's qualified identifier is split on `::` and the final non-empty
 * segment is returned (`Foo::bar` → `bar`).
 *
 * @param node Node whose `declarator` field is inspected.
 * @param source Full source text the node was parsed from.
 * @returns The last `::` segment, or undefined when the node has no declarator
 *   or the declarator holds no qualified identifier.
 */
function extractCppQualifiedMethodName(node: SyntaxNode, source: string): string | undefined {
  const declarator = getChildByField(node, 'declarator');
  const qualified = declarator ? findDeclaratorQualifiedId(declarator) : undefined;
  if (!qualified) return undefined;

  const segments = getNodeText(qualified, source)
    .trim()
    .split('::')
    .filter((segment) => segment.length > 0);
  return segments.pop();
}
/**
 * Resolve the qualifier in front of a qualified declarator name: everything
 * before the final `::` segment, re-joined with `::` (`ns::Foo::bar` →
 * `ns::Foo`).
 *
 * @param node Node whose `declarator` field is inspected.
 * @param source Full source text the node was parsed from.
 * @returns The qualifier, or undefined when there is no declarator, no
 *   qualified identifier, or the identifier has only a single segment.
 */
function extractCppReceiverType(node: SyntaxNode, source: string): string | undefined {
  const declarator = getChildByField(node, 'declarator');
  const qualified = declarator ? findDeclaratorQualifiedId(declarator) : undefined;
  if (!qualified) return undefined;

  const segments = getNodeText(qualified, source)
    .trim()
    .split('::')
    .filter((segment) => segment.length > 0);
  if (segments.length <= 1) return undefined;

  // Drop the member name; what remains is the receiver's (possibly nested) type.
  segments.pop();
  return segments.join('::');
}
/**
 * Built-in / non-class return types that can never be a method receiver. We
 * store no `returnType` for these so resolution never tries to resolve a method
 * on `void` / `int` / etc.
 */
const CPP_NON_CLASS_RETURN = new Set([
  'void', 'bool', 'char', 'short', 'int', 'long', 'float', 'double', 'unsigned',
  'signed', 'size_t', 'ssize_t', 'auto', 'wchar_t', 'char8_t', 'char16_t',
  'char32_t', 'int8_t', 'int16_t', 'int32_t', 'int64_t', 'uint8_t', 'uint16_t',
  'uint32_t', 'uint64_t', 'intptr_t', 'uintptr_t', 'nullptr_t',
]);

/** Matches the opening of a smart-pointer / optional wrapper, up to and including its `<`. */
const CPP_POINTEE_WRAPPER_OPEN_RE =
  /\b(?:std\s*::\s*)?(?:unique_ptr|shared_ptr|weak_ptr|optional)\s*</;

/**
 * Return the first top-level template argument that starts at `start` (the
 * index just past an opening `<`). Nested `<…>` groups are kept intact, so
 * `Foo<int>, Deleter>` yields `Foo<int>`. Returns undefined when the argument
 * list is never closed.
 */
function firstCppTemplateArg(text: string, start: number): string | undefined {
  let depth = 0;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (ch === '<') {
      depth++;
    } else if (ch === '>') {
      if (depth === 0) return text.slice(start, i).trim();
      depth--;
    } else if (ch === ',' && depth === 0) {
      return text.slice(start, i).trim();
    }
  }
  return undefined;
}

/**
 * Replace every balanced `<…>` group (however deeply nested) with one space.
 * Returns undefined for unbalanced brackets, which cannot be a well-formed type.
 */
function blankCppTemplateGroups(text: string): string | undefined {
  let out = '';
  let depth = 0;
  for (const ch of text) {
    if (ch === '<') {
      depth++;
    } else if (ch === '>') {
      if (depth === 0) return undefined;
      depth--;
      if (depth === 0) out += ' ';
    } else if (depth === 0) {
      out += ch;
    }
  }
  return depth === 0 ? out : undefined;
}

/**
 * Normalize a C++ return type to the bare class name a method could be called
 * on.
 *
 * Smart-pointer / optional wrappers are unwrapped to their first template
 * argument (`std::unique_ptr<Widget>` → `Widget`,
 * `std::unique_ptr<Widget, Deleter>` → `Widget`), repeatedly, so
 * `optional<shared_ptr<T>>` resolves to `T`. cv-qualifiers, elaborated-type
 * keywords, `&`/`*`, namespace qualifiers and any remaining template arguments
 * are then stripped. Template arguments are matched with bracket counting, so
 * nested arguments (`Foo<Bar<int>>` → `Foo`) are handled.
 *
 * @param raw Return-type text as written in the source.
 * @returns The bare class name, or undefined for empty input, primitives /
 *   `void` / `auto`, unbalanced template brackets, or anything that does not
 *   reduce to a single identifier (e.g. `unsigned int`).
 */
export function normalizeCppReturnType(raw: string): string | undefined {
  let t = raw.trim();
  if (!t) return undefined;

  // Unwrap smart pointers / optional to their pointee (the thing you call `->`
  // on). Each pass replaces `t` with a strict substring, so the loop terminates.
  for (;;) {
    const wrapper = CPP_POINTEE_WRAPPER_OPEN_RE.exec(t);
    if (!wrapper) break;
    const pointee = firstCppTemplateArg(t, wrapper.index + wrapper[0].length);
    if (!pointee) break;
    t = pointee;
  }

  const withoutTemplates = blankCppTemplateGroups(
    t.replace(/\b(?:const|volatile|typename|struct|class|enum)\b/g, ' ')
  );
  if (withoutTemplates === undefined) return undefined;

  const bare = withoutTemplates
    .replace(/[*&]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  if (!bare) return undefined;

  // Trim the segment so spacing around `::` (`ns :: Foo`) does not defeat the
  // identifier check below.
  const last = bare.split('::').filter(Boolean).pop()?.trim();
  if (!last) return undefined;
  if (CPP_NON_CLASS_RETURN.has(last)) return undefined;
  if (!/^[A-Za-z_]\w*$/.test(last)) return undefined;
  return last;
}
/**
 * Remove C++ template arguments from a base-type reference so it matches the
 * bare name the class/struct template was DEFINED under.
 *
 * `template<typename T> class Base { … }` is indexed as `Base`, but a derived
 * class `class D : public Base<int>` records its base as `Base<int>` (and
 * `class Q : public ns::Tpl<int>` as `ns::Tpl<int>`). Neither matches
 * `Base` / `ns::Tpl` by name, so the `extends` edge never resolves and the
 * derived class looks like it inherits from nothing (#1043).
 *
 * Every `<…>` group is dropped by bracket counting, whatever its nesting or
 * position: `Base<int>` → `Base`, `ns::Tpl<Foo<int>>` → `ns::Tpl`,
 * `Outer<int>::Inner` → `Outer::Inner`. A `>` with no open `<` is discarded. A
 * name containing no `<` is returned exactly as given (not even trimmed).
 *
 * @param name Base-type reference as written in the source.
 * @returns The name with template arguments removed and outer whitespace trimmed.
 */
export function stripCppTemplateArgs(name: string): string {
  if (name.indexOf('<') < 0) return name;

  const kept: string[] = [];
  let nesting = 0;
  for (let i = 0; i < name.length; i++) {
    const c = name.charAt(i);
    switch (c) {
      case '<':
        nesting += 1;
        break;
      case '>':
        if (nesting > 0) nesting -= 1;
        break;
      default:
        // Only characters outside every template group survive.
        if (nesting === 0) kept.push(c);
    }
  }
  return kept.join('').trim();
}
/**
 * Read a function/method's return type from the `type` field of its
 * `function_definition` (`Metrics& Metrics::instance()` → `Metrics`) and
 * normalize it with `normalizeCppReturnType`.
 *
 * @param node Function definition node.
 * @param source Full source text the node was parsed from.
 * @returns The normalized return type, or undefined when the node has no
 *   `type` field (constructors, destructors, conversion operators) or the type
 *   normalizes to nothing.
 */
function extractCppReturnType(node: SyntaxNode, source: string): string | undefined {
  const returnTypeNode = getChildByField(node, 'type');
  return returnTypeNode
    ? normalizeCppReturnType(getNodeText(returnTypeNode, source))
    : undefined;
}
/** Extraction rules for C sources: node-type tables plus C-specific hooks. */
export const cExtractor: LanguageExtractor = {
  // Universal net: recover a real name from any macro-mangled function name.
  recoverMangledName: recoverMangledCppName,
  functionTypes: ['function_definition'],
  classTypes: [],
  methodTypes: [],
  interfaceTypes: [],
  structTypes: ['struct_specifier'],
  enumTypes: ['enum_specifier'],
  enumMemberTypes: ['enumerator'],
  typeAliasTypes: ['type_definition'], // typedef
  importTypes: ['preproc_include'],
  callTypes: ['call_expression'],
  variableTypes: ['declaration'],
  nameField: 'declarator',
  bodyField: 'body',
  paramsField: 'parameters',
  // A declaration counts as a constant when one of its named children is a
  // `type_qualifier` reading exactly "const" (`const` / `static const` at file
  // scope); anything else is a plain variable.
  isConst: (node) =>
    node.namedChildren.some(
      (qualifier: SyntaxNode) => qualifier.type === 'type_qualifier' && qualifier.text === 'const'
    ),
  getReturnType: extractCppReturnType,
  resolveTypeAliasKind: (node, _source) => {
    // `typedef enum { ... } name;` / `typedef struct { ... } name;`: the inner
    // specifier is anonymous, so the typedef name should become the enum/struct
    // node name. Only a specifier that has a body qualifies.
    const total = node.namedChildCount;
    for (let idx = 0; idx < total; idx++) {
      const spec = node.namedChild(idx);
      if (!spec) continue;
      const kind = spec.type;
      if (kind !== 'enum_specifier' && kind !== 'struct_specifier') continue;
      if (!getChildByField(spec, 'body')) continue;
      return kind === 'enum_specifier' ? 'enum' : 'struct';
    }
    return undefined;
  },
  extractImport: (node, source) => {
    const signature = source.substring(node.startIndex, node.endIndex).trim();
    const children = node.namedChildren;

    // `#include <stdio.h>` — strip the surrounding angle brackets.
    const angled = children.find((c: SyntaxNode) => c.type === 'system_lib_string');
    if (angled) {
      const moduleName = getNodeText(angled, source).replace(/^<|>$/g, '');
      return { moduleName, signature };
    }

    // `#include "myheader.h"` — the path is the literal's `string_content` child.
    const quoted = children.find((c: SyntaxNode) => c.type === 'string_literal');
    const content = quoted?.namedChildren.find((c: SyntaxNode) => c.type === 'string_content');
    if (!content) return null;
    return { moduleName: getNodeText(content, source), signature };
  },
};
/**
 * Recognize tree-sitter's misparse of a macro-annotated class/struct such as
 * `class MACRO Name { … }` or `class MACRO Name : public Base { … }` (#946).
 *
 * Not knowing `MACRO` is a macro, tree-sitter reads `class MACRO` as an
 * elaborated type specifier (a bodyless `class_specifier`/`struct_specifier`
 * whose "type name" is the macro) and the remainder as a function: `Name`
 * becomes the declarator and `{ … }` the function body. The declaration then
 * surfaces as a `function_definition` named after the class and spanning the
 * whole class body. (A base clause, when present, additionally lands in an
 * `ERROR` node, but the leading macro alone is enough to trigger the misparse.)
 *
 * Both of these structural signals must hold:
 *   - the `type` field is a class/struct specifier WITHOUT a
 *     `field_declaration_list` — an elaborated type, not an inline-defined
 *     return type like `struct P { int x; } makeP() { … }`; and
 *   - the declarator is not a `function_declarator`, which leaves the
 *     legal-but-rare `class Foo f() { … }` alone.
 *
 * The class body is mangled by the same misparse and cannot be recovered, so
 * callers drop the node instead of minting a misleading whole-body `function`.
 *
 * @param node Candidate `function_definition` node.
 * @returns true when the node matches the misparse shape described above.
 */
function isMacroMisparsedTypeDecl(node: SyntaxNode): boolean {
  const typeNode = getChildByField(node, 'type');
  const isClassOrStruct =
    typeNode?.type === 'class_specifier' || typeNode?.type === 'struct_specifier';
  if (!typeNode || !isClassOrStruct) return false;

  // A field list means the type is genuinely defined inline, not elaborated.
  const definesBody = typeNode.namedChildren.some(
    (c: SyntaxNode) => c.type === 'field_declaration_list'
  );
  if (definesBody) return false;

  // A missing declarator also counts as "not a function declarator".
  return getChildByField(node, 'declarator')?.type !== 'function_declarator';
}
/**
 * Matches the export/visibility macro in a `class/struct EXPORT_MACRO Name …`
 * definition header:
 *   1. the `class` / `struct` keyword,
 *   2. the whitespace after it,
 *   3. the macro — an ALL-CAPS token of at least two characters,
 * followed (lookahead only) by the type name, an optional `final`, and the `:`
 * or `{` that starts a base clause or body. The `(?!final\b)` guard stops
 * `final` from being taken as the type name, which would otherwise make an
 * ALL-CAPS class name look like a macro (`class URL final : Base {`).
 */
const CPP_EXPORT_MACRO_RE =
  /\b(class|struct)(\s+)([A-Z][A-Z0-9_]+)(?=\s+(?!final\b)[A-Za-z_]\w*(?:\s+final)?\s*[:{])/g;

/**
 * Blank an export/visibility macro in a `class/struct EXPORT_MACRO Name …`
 * *definition* header before parsing.
 *
 * Not knowing the macro, tree-sitter reads `class EXPORT_MACRO` as an
 * elaborated type specifier and the rest as a function, so the whole class —
 * name, base clause and members — drops out of the index (#946 catches the
 * resulting phantom function but can't recover the class). That breaks
 * type-hierarchy / inheritance-impact queries for Unreal-Engine (`*_API`),
 * Qt/Boost (`*_EXPORT`), LLVM (`*_ABI`), … classes. The macro is replaced with
 * the same number of spaces, so the output has the same length as the input
 * and every offset after the macro is unchanged. (#1061, follow-up to #946.)
 *
 * The match is deliberately narrow: the macro must sit *between*
 * `class`/`struct` and an identifier that is itself followed by `:` or `{`.
 * Ordinary uses (`int x = SOME_API;`) and elaborated-type declarations ending
 * in `;` / `=` / `[` (`struct FOO var;`) are untouched, and so is an ALL-CAPS
 * class that is merely marked `final` (`class URL final { … }`).
 *
 * @param source Raw C++ source text.
 * @returns The source with matching macros blanked; the input itself when it
 *   contains neither `class` nor `struct`.
 */
export function blankCppExportMacros(source: string): string {
  // Cheap bail-out: no keyword, nothing to match.
  if (!source.includes('class') && !source.includes('struct')) return source;
  return source.replace(
    CPP_EXPORT_MACRO_RE,
    (_match: string, keyword: string, gap: string, macro: string) =>
      keyword + gap + ' '.repeat(macro.length)
  );
}
/**
 * Curated inline-specifier / function-marker macros that are blanked before
 * parsing when they sit in front of a function's return type or name
 * (`FORCEINLINE FString GetName(…)`).
 *
 * Not knowing such a macro, tree-sitter can't reconcile
 * `MACRO <return-type> <name>(` — one type-like token too many — and falls
 * into error recovery: the macro becomes the return type and, for a
 * non-primitive return, the return type is glued onto the name (`GetName` →
 * `"FString GetName"`), so the function can't be found by name and its callers
 * don't link. The pattern is pervasive in Unreal Engine and in vendored
 * libraries that define their own inline macro (pugixml, Godot, Boost, …).
 *
 * Only these exact tokens are ever blanked — never an arbitrary all-caps
 * word — so a real return type such as `HRESULT DoIt()` is left alone. To
 * support another codebase's inline macro, add its exact token here.
 */
const CPP_INLINE_MACROS = [
  // Unreal Engine
  'FORCEINLINE_DEBUGGABLE', 'FORCENOINLINE', 'FORCEINLINE',
  // pugixml (ubiquitous vendored XML parser): `#define PUGI__FN inline` before
  // the return type, plus `PUGIXML_FUNCTION` (linkage macro) between the return
  // type and the name — the blank mechanism handles both positions.
  'PUGI__FN_NO_INLINE', 'PUGI__FN', 'PUGIXML_FUNCTION',
  // Godot
  '_ALWAYS_INLINE_', '_FORCE_INLINE_',
  // Boost
  'BOOST_FORCEINLINE', 'BOOST_NOINLINE',
  // Qt (per-method markers + inline)
  'Q_INVOKABLE', 'Q_SCRIPTABLE', 'Q_ALWAYS_INLINE', 'Q_SLOT', 'Q_SIGNAL',
  // Folly / Abseil / LLVM / V8 / Eigen / rapidjson
  'FOLLY_ALWAYS_INLINE', 'FOLLY_NOINLINE',
  'ABSL_ATTRIBUTE_ALWAYS_INLINE', 'ABSL_ATTRIBUTE_NOINLINE',
  'LLVM_ATTRIBUTE_ALWAYS_INLINE', 'LLVM_ATTRIBUTE_NOINLINE',
  'V8_INLINE', 'V8_NOINLINE',
  'EIGEN_STRONG_INLINE', 'EIGEN_ALWAYS_INLINE', 'EIGEN_DEVICE_FUNC',
  'RAPIDJSON_FORCEINLINE',
  // Common cross-ecosystem inline/attribute hints
  'ALWAYS_INLINE', 'FORCE_INLINE', 'NOINLINE',
] as const;

/**
 * A single alternation over every curated macro, longest token first so a
 * longer macro is preferred over one that is its prefix. The token must stand
 * alone (word boundaries on both sides) and be in specifier position: followed
 * by whitespace and then the start of an identifier. That lookahead leaves
 * expression uses (`x = FORCEINLINE ? …`) and longer words
 * (`FORCEINLINE_SOMETHINGELSE`) untouched.
 */
const CPP_INLINE_MACRO_RE = (() => {
  const longestFirst = [...CPP_INLINE_MACROS].sort((x, y) => y.length - x.length);
  return new RegExp('\\b(' + longestFirst.join('|') + ')\\b(?=\\s+[A-Za-z_])', 'g');
})();

/**
 * Replace each curated inline macro in specifier position with the same number
 * of spaces, so the output has the same length as the input and the
 * declaration parses as an ordinary function.
 *
 * @param source Raw C/C++ source text.
 * @returns The source with matching macros blanked; the input itself when none
 *   of the curated tokens occurs anywhere in it.
 */
export function blankCppInlineMacros(source: string): string {
  // Skip the regex pass entirely when no curated token appears in the text.
  const mentionsMacro = CPP_INLINE_MACROS.some((token) => source.includes(token));
  return mentionsMacro
    ? source.replace(CPP_INLINE_MACRO_RE, (hit) => ' '.repeat(hit.length))
    : source;
}
// Bare C/C++ type/qualifier tokens that must never be taken as a recovered
// function name (guards `recoverMangledCppName` against picking the return
// type or a keyword instead of the name).
const CPP_PRIMITIVE_NAMES = new Set([
  'bool', 'void', 'int', 'char', 'short', 'long', 'float', 'double', 'unsigned',
  'signed', 'wchar_t', 'char8_t', 'char16_t', 'char32_t', 'char_t', 'size_t',
  'auto', 'const', 'struct', 'class', 'enum', 'union', 'typename',
]);

/**
 * Universal fallback (any macro, no list) for a C/C++ function name that is
 * still mangled because an unknown macro sat next to the return type:
 * `MACRO Ret name(…)` / `Ret MACRO name(…)` misparse so the return type is
 * glued onto the name ("Ret name", "char_t* to_str(double v)"). The real
 * identifier is the last whitespace-separated token before the first `(` (or
 * the last token when there is no `(`). A pointer/reference declarator glued
 * to that token (`Ret *name`, `Ret &name`) is peeled off first.
 *
 * Only an ALREADY-mangled name is touched — one containing whitespace that
 * does not start with `operator` or `~` — so a well-formed name is returned
 * unchanged. The input is also returned unchanged when:
 *   - it has the `Ret (name)` parenthesized-name shape (ambiguous), or
 *   - the candidate is not a plain identifier, or is a bare primitive/keyword.
 *
 * @param name Function name as extracted from the parse tree.
 * @returns The recovered identifier, or `name` itself when no safe recovery applies.
 */
export function recoverMangledCppName(name: string): string {
  if (!/\s/.test(name) || name.startsWith('operator') || name.startsWith('~')) return name;
  if (/^\S+\s+\([A-Za-z_]\w*\)/.test(name)) return name; // `Ret (name)` idiom — leave alone

  const parenAt = name.indexOf('(');
  const beforeParams = parenAt === -1 ? name : name.slice(0, parenAt);
  const lastToken = beforeParams.trim().split(/\s+/).pop() ?? '';

  // `Ret *name` / `Ret &name`: the declarator sigil belongs to the type, not the name.
  const candidate = lastToken.replace(/^[*&]+/, '');
  if (!/^[A-Za-z_]\w*$/.test(candidate) || CPP_PRIMITIVE_NAMES.has(candidate)) return name;
  return candidate;
}
/**
 * C/C++ source pre-processing applied before tree-sitter parses the file.
 * Export macros in class/struct headers are blanked first, then inline macros
 * in front of function definitions. Both helpers substitute spaces of equal
 * length, so the result is offset-preserving.
 *
 * @param source Raw source text.
 * @returns Source text with both kinds of macro blanked.
 */
function preParseCppSource(source: string): string {
  const classHeadersRecovered = blankCppExportMacros(source);
  return blankCppInlineMacros(classHeadersRecovered);
}
/** Extraction rules for C++ sources: node-type tables plus C++-specific hooks. */
export const cppExtractor: LanguageExtractor = {
  // Recover macro-annotated class/struct definitions (`class MYMODULE_API Foo : Base`,
  // #1061/#946) and macro-prefixed functions (`FORCEINLINE FString Foo()`, #1093
  // follow-up) that tree-sitter otherwise misparses.
  preParse: preParseCppSource,
  // Universal net for any macro the curated blank list misses.
  recoverMangledName: recoverMangledCppName,
  functionTypes: ['function_definition'],
  classTypes: ['class_specifier'],
  // A bodiless `class_specifier` is a forward declaration (`class Foo;`) or an
  // elaborated type reference, not a definition. Skip it so dozens of forward
  // decls across headers don't mint phantom `class` nodes that crowd out — and
  // get picked as the blast-radius representative over — the single real
  // definition, exactly as bodiless struct/enum specifiers are already skipped. (#1093)
  skipBodilessClass: true,
  methodTypes: ['function_definition'],
  interfaceTypes: [],
  structTypes: ['struct_specifier'],
  enumTypes: ['enum_specifier'],
  enumMemberTypes: ['enumerator'],
  typeAliasTypes: ['type_definition', 'alias_declaration'], // typedef and using
  importTypes: ['preproc_include'],
  callTypes: ['call_expression'],
  variableTypes: ['declaration'],
  nameField: 'declarator',
  bodyField: 'body',
  paramsField: 'parameters',
  resolveName: extractCppQualifiedMethodName,
  getReceiverType: extractCppReceiverType,
  getReturnType: extractCppReturnType,
  getVisibility: (node) => {
    // The access level in force for a member is set by the NEAREST
    // `access_specifier` that precedes it among its siblings, so walk backwards
    // from the node. Scanning the parent from the top would report the class's
    // first section (`public:`) for every member, including those declared
    // under a later `private:` / `protected:`.
    for (let sibling = node.previousSibling; sibling; sibling = sibling.previousSibling) {
      if (sibling.type !== 'access_specifier') continue;
      const text = sibling.text;
      if (text.includes('public')) return 'public';
      if (text.includes('private')) return 'private';
      if (text.includes('protected')) return 'protected';
      // Unrecognized specifier text: report nothing rather than an older section.
      return undefined;
    }
    // No specifier precedes the node; the language default is not inferred here.
    return undefined;
  },
  resolveTypeAliasKind: (node, _source) => {
    // C++ typedef: `typedef enum { ... } name;` or `typedef struct { ... } name;`
    // Only a specifier that actually has a body promotes the alias.
    for (let i = 0; i < node.namedChildCount; i++) {
      const child = node.namedChild(i);
      if (!child) continue;
      if (child.type === 'enum_specifier' && getChildByField(child, 'body')) return 'enum';
      if (child.type === 'struct_specifier' && getChildByField(child, 'body')) return 'struct';
    }
    return undefined;
  },
  isMisparsedFunction: (name, node) => {
    // C++ macros like NLOHMANN_JSON_NAMESPACE_BEGIN cause tree-sitter to misparse
    // namespace blocks as function_definitions (e.g. name = "namespace detail").
    // Match `namespace` as a whole word only, so a genuine function whose name
    // merely begins with it (`namespace_lookup`, `namespaceOf`) is kept.
    if (/^namespace\b/.test(name)) return true;
    // Also filter C++ keywords that tree-sitter occasionally misinterprets as
    // function/method names (e.g. switch statements inside macro-confused scopes).
    const cppKeywords = ['switch', 'if', 'for', 'while', 'do', 'case', 'return'];
    if (cppKeywords.includes(name)) return true;
    // `class MACRO Name : public Base { … }` misparses to a function_definition
    // named after the class. `blankCppExportMacros` (preParse) recovers the
    // common ALL-CAPS export-macro shape; this drop is the fallback for any
    // residual misparse it doesn't blank — still no phantom function (#1061/#946).
    return isMacroMisparsedTypeDecl(node);
  },
  extractImport: (node, source) => {
    const importText = source.substring(node.startIndex, node.endIndex).trim();
    // C++ includes: #include <iostream>, #include "myheader.h"
    const systemLib = node.namedChildren.find((c: SyntaxNode) => c.type === 'system_lib_string');
    if (systemLib) {
      // Strip the surrounding angle brackets from the header name.
      return { moduleName: getNodeText(systemLib, source).replace(/^<|>$/g, ''), signature: importText };
    }
    const stringLiteral = node.namedChildren.find((c: SyntaxNode) => c.type === 'string_literal');
    if (stringLiteral) {
      // The quoted path is the literal's `string_content` child.
      const stringContent = stringLiteral.namedChildren.find((c: SyntaxNode) => c.type === 'string_content');
      if (stringContent) {
        return { moduleName: getNodeText(stringContent, source), signature: importText };
      }
    }
    return null;
  },
};
|