Ver Fonte

feat: Add content-based C++ detection for .h headers

Fixes C++ classes missing from .h files: extension-based detection maps .h to the 'c' language, which has no class extraction support. Adds a looksLikeCpp() heuristic that scans the first 8KB of a .h file for C++-specific patterns (namespace, class, template, access specifiers, virtual, using) and promotes the file to the 'cpp' language when any of them is found. Also ensures the cpp grammar is loaded whenever the c grammar is, so that .h files promoted during parsing can be handled.
Colby McHenry há 2 meses atrás
pai
commit
4a8d2f0396

+ 2 - 1
docs/SEARCH_QUALITY_LOOP.md

@@ -446,6 +446,7 @@ test().catch(console.error);
 | `qualified_name` missing class for nested methods | Extraction not walking parent stack correctly | `src/extraction/tree-sitter.ts: visitNode()` |
 | Import edges missing | `extractImport` returns null for this syntax | `src/extraction/languages/<lang>.ts: extractImport` |
 | C++ classes/structs/enums missing from macro namespaces | Macros like `NLOHMANN_JSON_NAMESPACE_BEGIN` cause tree-sitter to misparse namespace blocks as `function_definition` | `src/extraction/languages/c-cpp.ts: isMisparsedFunction` filters bad names; `src/extraction/tree-sitter.ts: visitFunctionBody` extracts structural nodes |
+| C++ classes missing from `.h` headers | `.h` files default to `c` language which has `classTypes: []` | `src/extraction/grammars.ts: looksLikeCpp()` — content-based heuristic promotes `.h` files to `cpp` when C++ patterns are detected |
 
 ## After Fixing Issues
 
@@ -526,7 +527,7 @@ if (receiverType) {
 - [x] **Python** — NOT needed. Methods nested in class body. Verified against Flask
 - [x] **Rust** — `getReceiverType` walks up to parent `impl_item` to extract type name. Also adds `contains` edges from struct to impl methods. Verified against Deno
 - [x] **C** — NOT needed. No methods in C. Strong function/struct/enum extraction with excellent call edge density. Verified against Redis
-- [x] **C++** — NOT needed for header-only libs. `isMisparsedFunction` hook filters macro-caused misparse artifacts (e.g. `NLOHMANN_JSON_NAMESPACE_BEGIN`). `visitFunctionBody` now extracts structural nodes (classes/structs/enums) inside macro-confused "function" bodies. Verified against nlohmann/json. Note: out-of-class `Type::method()` definitions would need `getReceiverType` but are uncommon in header-only codebases.
+- [x] **C++** — NOT needed for header-only libs. `isMisparsedFunction` hook filters macro-caused misparse artifacts (e.g. `NLOHMANN_JSON_NAMESPACE_BEGIN`). `visitFunctionBody` now extracts structural nodes (classes/structs/enums) inside macro-confused "function" bodies. Content-based `.h` detection (`looksLikeCpp` in `grammars.ts`) promotes C++ headers to `cpp` language so classes in `.h` files are extracted. Verified against nlohmann/json and gRPC. Note: out-of-class `Type::method()` definitions would need `getReceiverType` but are uncommon in header-only codebases.
 
 ### Needs Verification
 

+ 18 - 2
src/extraction/grammars.ts

@@ -174,9 +174,25 @@ export function getParser(language: Language): Parser | null {
 /**
  * Detect language from file extension
  */
-export function detectLanguage(filePath: string): Language {
+export function detectLanguage(filePath: string, source?: string): Language {
   const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase();
-  return EXTENSION_MAP[ext] || 'unknown';
+  const lang = EXTENSION_MAP[ext] || 'unknown';
+
+  // .h files could be C or C++ — check source content for C++ features
+  if (lang === 'c' && ext === '.h' && source) {
+    if (looksLikeCpp(source)) return 'cpp';
+  }
+
+  return lang;
+}
+
+/**
+ * Heuristic: does a .h file contain C++ constructs?
+ * Checks the first 8192 characters for patterns that strongly suggest C++ (best-effort: comments and strings are not stripped, so a C header can match).
+ */
+function looksLikeCpp(source: string): boolean {
+  const sample = source.substring(0, 8192);
+  return /\bnamespace\b|\bclass\s+\w+\s*[:{]|\btemplate\s*<|\b(?:public|private|protected)\s*:|\bvirtual\b|\busing\s+(?:namespace\b|\w+\s*=)/.test(sample);
 }
 
 /**

+ 13 - 5
src/extraction/index.ts

@@ -472,6 +472,10 @@ export class ExtractionOrchestrator {
 
     // Detect needed languages and load grammars in the parse worker
     const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
+    // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
+    if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
+      neededLanguages.push('cpp');
+    }
 
     // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
     // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
@@ -580,7 +584,7 @@ export class ExtractionOrchestrator {
     async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
       if (!WorkerClass) {
         // In-process fallback
-        return extractFromSource(filePath, content, detectLanguage(filePath));
+        return extractFromSource(filePath, content, detectLanguage(filePath, content));
       }
 
       // Recycle the worker before the next parse if we've hit the threshold.
@@ -706,7 +710,7 @@ export class ExtractionOrchestrator {
 
         // Store in database on main thread (SQLite is not thread-safe)
         if (result.nodes.length > 0 || result.errors.length === 0) {
-          const language = detectLanguage(filePath);
+          const language = detectLanguage(filePath, content);
           this.storeExtractionResult(filePath, content, language, stats, result);
         }
 
@@ -779,7 +783,7 @@ export class ExtractionOrchestrator {
         }
 
         if (result.nodes.length > 0 || result.errors.length === 0) {
-          const language = detectLanguage(filePath);
+          const language = detectLanguage(filePath, content);
           const stats = await fsp.stat(path.join(this.rootDir, filePath));
           this.storeExtractionResult(filePath, content, language, stats, result);
 
@@ -830,7 +834,7 @@ export class ExtractionOrchestrator {
           }
 
           if (result.nodes.length > 0 || result.errors.length === 0) {
-            const language = detectLanguage(filePath);
+            const language = detectLanguage(filePath, fullContent);
             const stats = await fsp.stat(path.join(this.rootDir, filePath));
             this.storeExtractionResult(filePath, fullContent, language, stats, result);
 
@@ -989,7 +993,7 @@ export class ExtractionOrchestrator {
     }
 
     // Detect language
-    const language = detectLanguage(relativePath);
+    const language = detectLanguage(relativePath, content);
     if (!isLanguageSupported(language)) {
       return {
         nodes: [],
@@ -1201,6 +1205,10 @@ export class ExtractionOrchestrator {
     // Load only grammars needed for changed files
     if (filesToIndex.length > 0) {
       const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
+      // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded
+      if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
+        neededLanguages.push('cpp');
+      }
       await loadGrammarsForLanguages(neededLanguages);
     }
 

+ 1 - 1
src/extraction/parse-worker.ts

@@ -20,7 +20,7 @@ parentPort!.on('message', async (msg: { type: string; id?: number; filePath?: st
   } else if (msg.type === 'parse') {
     const { id, filePath, content } = msg;
     try {
-      const language = detectLanguage(filePath!);
+      const language = detectLanguage(filePath!, content);
       const result: ExtractionResult = extractFromSource(filePath!, content!, language);
 
       // Periodic parser reset to reclaim WASM heap memory

+ 2 - 2
src/extraction/tree-sitter.ts

@@ -106,7 +106,7 @@ export class TreeSitterExtractor {
   constructor(filePath: string, source: string, language?: Language) {
     this.filePath = filePath;
     this.source = source;
-    this.language = language || detectLanguage(filePath);
+    this.language = language || detectLanguage(filePath, source);
     this.extractor = EXTRACTORS[this.language] || null;
   }
 
@@ -2087,7 +2087,7 @@ export function extractFromSource(
   source: string,
   language?: Language
 ): ExtractionResult {
-  const detectedLanguage = language || detectLanguage(filePath);
+  const detectedLanguage = language || detectLanguage(filePath, source);
   const fileExtension = path.extname(filePath).toLowerCase();
 
   // Use custom extractor for Svelte