浏览代码

feat: Add C++ macro misparse handling and structural node extraction in function bodies

Addresses C++ macros like NLOHMANN_JSON_NAMESPACE_BEGIN that cause tree-sitter to misparse namespace blocks as function_definitions. Adds isMisparsedFunction hook to filter macro artifacts while still visiting their bodies to extract legitimate class/struct/enum definitions hidden inside the misparsed "function" scope.
Colby McHenry 2 月之前
父节点
当前提交
237fb3b206

+ 3 - 1
docs/SEARCH_QUALITY_LOOP.md

@@ -445,6 +445,7 @@ test().catch(console.error);
 | Search term dropped from query | Term is in the stop words list | `src/search/query-utils.ts: STOP_WORDS` |
 | `qualified_name` missing class for nested methods | Extraction not walking parent stack correctly | `src/extraction/tree-sitter.ts: visitNode()` |
 | Import edges missing | `extractImport` returns null for this syntax | `src/extraction/languages/<lang>.ts: extractImport` |
+| C++ classes/structs/enums missing from macro namespaces | Macros like `NLOHMANN_JSON_NAMESPACE_BEGIN` cause tree-sitter to misparse namespace blocks as `function_definition` | `src/extraction/languages/c-cpp.ts: isMisparsedFunction` filters bad names; `src/extraction/tree-sitter.ts: visitFunctionBody` extracts structural nodes |
 
 ## After Fixing Issues
 
@@ -524,12 +525,13 @@ if (receiverType) {
 - [x] **Java** — NOT needed. Methods nested in class body. Verified against Guava
 - [x] **Python** — NOT needed. Methods nested in class body. Verified against Flask
 - [x] **Rust** — `getReceiverType` walks up to parent `impl_item` to extract type name. Also adds `contains` edges from struct to impl methods. Verified against Deno
+- [x] **C** — NOT needed. No methods in C. Strong function/struct/enum extraction with excellent call edge density. Verified against Redis
+- [x] **C++** — NOT needed for header-only libs. `isMisparsedFunction` hook filters macro-caused misparse artifacts (e.g. `NLOHMANN_JSON_NAMESPACE_BEGIN`). `visitFunctionBody` now extracts structural nodes (classes/structs/enums) inside macro-confused "function" bodies. Verified against nlohmann/json. Note: out-of-class `Type::method()` definitions would need `getReceiverType` but are uncommon in header-only codebases.
 
 ### Needs Verification
 
 Check these — may need `getReceiverType` if methods are top-level in the AST:
 
-- [ ] C++ — out-of-class method definitions `Type::method()`
 - [ ] Kotlin — extension functions `fun Type.method()`
 
 Verify these DON'T need `getReceiverType` (methods nested in class body):

+ 9 - 0
src/extraction/languages/c-cpp.ts

@@ -88,6 +88,15 @@ export const cppExtractor: LanguageExtractor = {
     }
     return undefined;
   },
+  isMisparsedFunction: (name) => {
+    // C++ macros like NLOHMANN_JSON_NAMESPACE_BEGIN cause tree-sitter to misparse
+    // namespace blocks as function_definitions (e.g. name = "namespace detail").
+    // `namespace` is matched as a whole word so identifiers such as `namespaceFor`
+    // survive; bare C++ keywords reported as function/method names are dropped too.
+    if (/^namespace\b/.test(name)) return true;
+    const cppKeywords = ['switch', 'if', 'for', 'while', 'do', 'case', 'return'];
+    return cppKeywords.includes(name);
+  },
   extractImport: (node, source) => {
     const importText = source.substring(node.startIndex, node.endIndex).trim();
     // C++ includes: #include <iostream>, #include "myheader.h"

+ 9 - 0
src/extraction/tree-sitter-types.ts

@@ -183,4 +183,13 @@ export interface LanguageExtractor {
    * or undefined to keep it as a type alias.
    */
   resolveTypeAliasKind?: (node: SyntaxNode, source: string) => NodeKind | undefined;
+
+  /**
+   * Check if a function/method name is a misparse artifact that should be skipped.
+   * Used by the C++ extractor, where macros (e.g. NLOHMANN_JSON_NAMESPACE_BEGIN) cause
+   * tree-sitter to misparse namespace blocks as function_definitions. When this returns true,
+   * the function node is NOT created, but the body is still visited for calls and
+   * structural nodes (classes, structs, enums).
+   */
+  isMisparsedFunction?: (name: string, node: SyntaxNode) => boolean;
 }

+ 58 - 6
src/extraction/tree-sitter.ts

@@ -455,6 +455,17 @@ export class TreeSitterExtractor {
     }
     if (name === '<anonymous>') return; // Skip anonymous functions
 
+    // Check for misparse artifacts (e.g. C++ macros causing "namespace detail" functions)
+    // Skip the node but still visit the body for calls and structural nodes
+    if (this.extractor.isMisparsedFunction?.(name, node)) {
+      const body = this.extractor.resolveBody?.(node, this.extractor.bodyField)
+        ?? getChildByField(node, this.extractor.bodyField);
+      if (body) {
+        this.visitFunctionBody(body, '');
+      }
+      return;
+    }
+
     const docstring = getPrecedingDocstring(node, this.source);
     const signature = this.extractor.getSignature?.(node, this.source);
     const visibility = this.extractor.getVisibility?.(node);
@@ -542,6 +553,17 @@ export class TreeSitterExtractor {
     }
 
     const name = extractName(node, this.source, this.extractor);
+
+    // Check for misparse artifacts (e.g. C++ "switch" inside macro-confused class body)
+    if (this.extractor.isMisparsedFunction?.(name, node)) {
+      const body = this.extractor.resolveBody?.(node, this.extractor.bodyField)
+        ?? getChildByField(node, this.extractor.bodyField);
+      if (body) {
+        this.visitFunctionBody(body, '');
+      }
+      return;
+    }
+
     const docstring = getPrecedingDocstring(node, this.source);
     const signature = this.extractor.getSignature?.(node, this.source);
     const visibility = this.extractor.getVisibility?.(node);
@@ -1272,26 +1294,56 @@ export class TreeSitterExtractor {
   }
 
   /**
-   * Visit function body and extract calls
+   * Visit function body and extract calls (and structural nodes).
+   *
+   * In addition to call expressions, this also detects class/struct/enum/
+   * interface definitions inside function bodies. This handles two cases:
+   *   1. Local class/struct/enum definitions (valid in C++, Java, etc.)
+   *   2. C++ macro misparsing — macros like NLOHMANN_JSON_NAMESPACE_BEGIN cause
+   *      tree-sitter to interpret the namespace block as a function_definition,
+   *      hiding real class/struct/enum nodes inside the "function body".
    */
   private visitFunctionBody(body: SyntaxNode, _functionId: string): void {
     if (!this.extractor) return;
 
-    // Recursively find all call expressions
-    const visitForCalls = (node: SyntaxNode): void => {
-      if (this.extractor!.callTypes.includes(node.type)) {
+    const visitForCallsAndStructure = (node: SyntaxNode): void => {
+      const nodeType = node.type;
+
+      if (this.extractor!.callTypes.includes(nodeType)) {
         this.extractCall(node);
       }
 
+      // Extract structural nodes found inside function bodies.
+      // Each extract method visits its own children, so we return after extracting.
+      if (this.extractor!.classTypes.includes(nodeType)) {
+        const classification = this.extractor!.classifyClassNode?.(node) ?? 'class';
+        if (classification === 'struct') this.extractStruct(node);
+        else if (classification === 'enum') this.extractEnum(node);
+        else this.extractClass(node);
+        return;
+      }
+      if (this.extractor!.structTypes.includes(nodeType)) {
+        this.extractStruct(node);
+        return;
+      }
+      if (this.extractor!.enumTypes.includes(nodeType)) {
+        this.extractEnum(node);
+        return;
+      }
+      if (this.extractor!.interfaceTypes.includes(nodeType)) {
+        this.extractInterface(node);
+        return;
+      }
+
       for (let i = 0; i < node.namedChildCount; i++) {
         const child = node.namedChild(i);
         if (child) {
-          visitForCalls(child);
+          visitForCallsAndStructure(child);
         }
       }
     };
 
-    visitForCalls(body);
+    visitForCallsAndStructure(body);
   }
 
   /**