1
0
Эх сурвалжийг харах

fix(extraction): skip bodiless C++ forward declarations (#1093) (#1095)

A `class Foo;` forward declaration parses as a bodiless class_specifier.
extractStruct (#831) and extractEnum already skip their bodiless forms,
but extractClass did not — so every forward decl across dozens of headers
minted a phantom bodiless `class` node that competed with, and could be
picked as the blast-radius representative over, the single real definition.

Add an opt-in `skipBodilessClass` extractor flag (set only on cppExtractor)
and skip a bodiless class node when it's set, mirroring the struct/enum
skip. The flag keeps this C/C++-scoped: languages where a bodiless class is
a complete definition (Kotlin `class Empty`, Scala `case object`/`trait`)
leave it unset and are unaffected. The body is now resolved once at the top
of extractClass and reused for the member walk.

Regression tests cover the collapse to a single definition, elaborated-type
references creating no phantom, and Kotlin/Scala staying indexed.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Colby Mchenry 18 цаг өмнө
parent
commit
f856f7ae49

+ 4 - 0
CHANGELOG.md

@@ -9,6 +9,10 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+### Fixes
+
+- C++ forward declarations no longer crowd out the real class definition. A `class Foo;` forward declaration — common in large C++ and Unreal Engine codebases, where a heavily used class is forward-declared across dozens of headers — was indexed as its own class node every time it appeared. As a result, exploring that class returned mostly forward-declaration sites, and one of them could even be picked as the blast-radius representative, burying the actual definition and its members and callers. Bodiless class forward declarations are now skipped for C++, exactly as forward-declared structs and enums already were, so only the real definition is indexed. Languages where a class with no body is a complete definition — such as Kotlin's `class Empty` and Scala's `case object` — are unaffected. Thanks @luoyxy for the report and root-cause analysis. (#1093)
+
 
 ## [1.1.6] - 2026-06-30
 

+ 53 - 0
__tests__/extraction.test.ts

@@ -2811,6 +2811,59 @@ class MYGAME_API UMyComponent : public UActorComponent { };
     });
   });
 
+  describe('C++ forward declarations do not mint phantom class nodes (#1093)', () => {
+    // `class Foo;` parses as a bodiless class_specifier. Repeated across headers,
+    // each forward decl minted a phantom bodiless `class` node that crowded out —
+    // and could be picked as the blast-radius representative over — the single
+    // real definition. Bodiless struct/enum specifiers were already skipped;
+    // classes now are too, but ONLY for C/C++ (opt-in flag), never for languages
+    // where a bodiless class is a complete definition.
+    it('keeps only the real definition, dropping repeated forward decls', () => {
+      const code = `
+class APXCharacter;   // forward decl (header 1)
+class APXCharacter;   // forward decl (header 2)
+friend class APXCharacter;   // elaborated / friend forward reference
+
+class APXCharacter {  // the one real definition
+  int hp;
+  void takeDamage(int amount) { hp -= amount; }
+};
+`;
+      const result = extractFromSource('character.cpp', code);
+      const classes = result.nodes.filter(
+        (n) => n.kind === 'class' && n.name === 'APXCharacter'
+      );
+      // Exactly one class node, and it's the definition (carries the member).
+      expect(classes).toHaveLength(1);
+      // The template literal opens with a blank line, so counting lines from 1
+      // the definition's `class APXCharacter {` is on line 6 of the source.
+      expect(classes[0].startLine).toBe(6);
+      expect(
+        result.nodes.some((n) => n.kind === 'method' && n.name === 'takeDamage')
+      ).toBe(true);
+    });
+
+    it('elaborated type references in declarations create no phantom class', () => {
+      // `class Foo *p = nullptr;` is a variable declaration using an elaborated
+      // type, not a class definition. The input contains only that and a
+      // `class Foo;` forward decl — no definition — so no `Foo` class node may
+      // be minted at all.
+      const result = extractFromSource('use.cpp', 'class Foo;\nvoid f() { class Foo *p = nullptr; (void)p; }\n');
+      expect(result.nodes.filter((n) => n.kind === 'class' && n.name === 'Foo')).toHaveLength(0);
+    });
+
+    it('does NOT affect languages where a bodiless class is complete', () => {
+      // Kotlin `class Empty` and Scala `trait`/`case object`/`class` with no body
+      // are complete definitions — the C/C++-only skip must leave them indexed.
+      const kt = extractFromSource('Empty.kt', 'class Empty\nclass Full { val x = 1 }\n');
+      const ktClasses = kt.nodes.filter((n) => n.kind === 'class').map((n) => n.name);
+      expect(ktClasses).toContain('Empty');
+      expect(ktClasses).toContain('Full');
+
+      const scala = extractFromSource('M.scala', 'trait Marker\ncase object Red\nclass Foo\n');
+      // Accept any of the class-like kinds; only the presence of each name is
+      // asserted here, not which kind it was indexed as.
+      const scalaNames = scala.nodes
+        .filter((n) => ['class', 'trait', 'interface'].includes(n.kind))
+        .map((n) => n.name);
+      expect(scalaNames).toEqual(expect.arrayContaining(['Marker', 'Red', 'Foo']));
+    });
+  });
+
   describe('C++ templated base-class inheritance (#1043)', () => {
     // Inheriting from a template (`class D : public Base<int>`) recorded the base
     // ref as the full instantiation `Base<int>`, which never name-matched the

+ 6 - 0
src/extraction/languages/c-cpp.ts

@@ -245,6 +245,12 @@ export const cppExtractor: LanguageExtractor = {
   preParse: blankCppExportMacros,
   functionTypes: ['function_definition'],
   classTypes: ['class_specifier'],
+  // A bodiless `class_specifier` is a forward declaration (`class Foo;`) or an
+  // elaborated type reference, not a definition. Skip it so dozens of forward
+  // decls across headers don't mint phantom `class` nodes that crowd out — and
+  // get picked as the blast-radius representative over — the single real
+  // definition, exactly as bodiless struct/enum specifiers are already skipped. (#1093)
+  skipBodilessClass: true,
   methodTypes: ['function_definition'],
   interfaceTypes: [],
   structTypes: ['struct_specifier'],

+ 8 - 0
src/extraction/tree-sitter-types.ts

@@ -163,6 +163,14 @@ export interface LanguageExtractor {
   extraClassNodeTypes?: string[];
   /** Whether methods can be top-level without enclosing class (Go: true) */
   methodsAreTopLevel?: boolean;
+  /**
+   * Skip a bodiless class node as a forward declaration / elaborated type,
+   * mirroring the bodiless-struct/enum skip. Set only for languages where a
+   * bodiless `class` specifier is NOT a complete definition — C/C++
+   * (`class Foo;` is a forward decl). Leave unset for languages where a
+   * bodiless class IS complete (Kotlin `class Empty`, Scala `case object`). (#1093)
+   */
+  skipBodilessClass?: boolean;
   /** NodeKind to use for interface-like declarations (Rust: 'trait'). Default: 'interface' */
   interfaceKind?: NodeKind;
 

+ 10 - 3
src/extraction/tree-sitter.ts

@@ -1528,6 +1528,15 @@ export class TreeSitterExtractor {
   private extractClass(node: SyntaxNode, kind: NodeKind = 'class'): void {
     if (!this.extractor) return;
 
+    // Skip forward declarations / elaborated type references (`class Foo;`) in
+    // languages that opt in — bodiless there means "not a definition", so it
+    // would otherwise mint a phantom node competing with the real definition
+    // (#1093). Languages where a bodiless class is complete (Kotlin, Scala)
+    // leave the flag unset. Resolved once here and reused for the body walk.
+    const resolvedBody = this.extractor.resolveBody?.(node, this.extractor.bodyField)
+      ?? getChildByField(node, this.extractor.bodyField);
+    if (this.extractor.skipBodilessClass && !resolvedBody) return;
+
     const name = extractName(node, this.source, this.extractor);
     const docstring = getPrecedingDocstring(node, this.source);
     const visibility = this.extractor.getVisibility?.(node);
@@ -1551,9 +1560,7 @@ export class TreeSitterExtractor {
 
     // Push to stack and visit body
     this.nodeStack.push(classNode.id);
-    let body = this.extractor.resolveBody?.(node, this.extractor.bodyField)
-      ?? getChildByField(node, this.extractor.bodyField);
-    if (!body) body = node;
+    const body = resolvedBody ?? node;
 
     // Visit all children for methods and properties
     for (let i = 0; i < body.namedChildCount; i++) {