Просмотр исходного кода

feat(extraction): add Objective-C language support (#165)

Adds tree-sitter-objc extractor for `.m`/`.mm` files and `.h` files
that content-sniff as Objective-C (`@interface`/`@implementation`/`@protocol`/`@synthesize`).

Extraction covers:
- `@interface` / `@implementation` (deduplicated into a single class node)
- `@protocol` (as `protocol` nodes via new `interfaceKind` config)
- Methods with full multi-part selectors (`doThing:with:`, not just `doThing`),
  including `+`/`-` static distinction
- `@property` declarations
- Inheritance (`extends`) and protocol conformance (`implements`)
- C-style `function_definition` and `#import` (both `<system>` and `"local"` forms)
- Call edges from both `call_expression` and `message_expression`;
  a `self`/`super` receiver is left out of the callee name (`[self doWork]` is recorded as `doWork`, `[obj greet]` as `obj.greet`)

Two new generic hooks on `LanguageExtractor` (`resolveName`,
`extractPropertyName`) handle the cases where the default name walk
doesn't fit; usable by future languages with similar shape.

Import resolver tries `.h`, `.m`, `.mm` for `objc` imports.

Validated on AFNetworking (84 files, 100% file coverage), RestKit
(282 files, 99.6%), and Texture (926 files, 100%, heavy `.mm`
content) — multi-keyword selectors preserved up to 7 parts, no parse
failures on ObjC++.

Known limitations (disclosed in README):
- Categories produce duplicate class nodes (one per category file)
- Chained/nested message sends record only the innermost method
- `[Class alloc]` patterns don't emit `instantiates` edges
- `@protocol Foo <Bar>` refinement lists not yet wired to `implements`
- Heavy C++ in `.mm` files may parse incompletely under the ObjC grammar
0x1306a94 4 недель назад
Родитель
Commit
61153f96ee

+ 5 - 0
.claude/skills/agent-eval/corpus.json

@@ -69,5 +69,10 @@
     { "name": "Knit", "repo": "https://github.com/Sleitnick/Knit", "size": "Small", "files": "~10", "question": "How does Knit register services and expose them to clients?" },
     { "name": "vide", "repo": "https://github.com/centau/vide", "size": "Small", "files": "~40", "question": "How does vide track reactive sources and re-run effects when state changes?" },
     { "name": "Fusion", "repo": "https://github.com/dphfox/Fusion", "size": "Medium", "files": "~115", "question": "How does Fusion build and update its reactive UI graph from state objects?" }
+  ],
+  "Objective-C": [
+    { "name": "Masonry", "repo": "https://github.com/SnapKit/Masonry", "size": "Small", "files": "~50", "question": "How does Masonry build and activate Auto Layout constraints from its block DSL?" },
+    { "name": "FMDB", "repo": "https://github.com/ccgus/fmdb", "size": "Medium", "files": "~80", "question": "How does FMDB execute a prepared SQL statement and bind parameters?" },
+    { "name": "SDWebImage", "repo": "https://github.com/SDWebImage/SDWebImage", "size": "Large", "files": "~400", "question": "How does SDWebImage download, cache, and decode an image for a UIImageView?" }
   ]
 }

+ 2 - 1
README.md

@@ -135,7 +135,7 @@ The gains scale with codebase size: on large repos the agent answers from the in
 | **Full-Text Search** | Find code by name instantly across your entire codebase, powered by FTS5 |
 | **Impact Analysis** | Trace callers, callees, and the full impact radius of any symbol before making changes |
 | **Always Fresh** | File watcher uses native OS events (FSEvents/inotify/ReadDirectoryChangesW) with debounced auto-sync — the graph stays current as you code, zero config |
-| **19+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Swift, Kotlin, Dart, Lua, Luau, Svelte, Liquid, Pascal/Delphi |
+| **20+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Objective-C, Swift, Kotlin, Dart, Lua, Luau, Svelte, Liquid, Pascal/Delphi |
 | **Framework-aware Routes** | Recognizes web-framework routing files and links URL patterns to their handlers across 14 frameworks |
 | **100% Local** | No data leaves your machine. No API keys. No external services. SQLite database only |
 
@@ -480,6 +480,7 @@ the MCP server and writing its instructions file:
 | Ruby | `.rb` | Full support |
 | C | `.c`, `.h` | Full support |
 | C++ | `.cpp`, `.hpp`, `.cc` | Full support |
+| Objective-C | `.m`, `.mm`, `.h` | Partial support (classes, protocols, methods, `@property`, `#import`, message sends; `.mm` ObjC++ may parse incompletely) |
 | Swift | `.swift` | Full support |
 | Kotlin | `.kt`, `.kts` | Full support |
 | Scala | `.scala`, `.sc` | Full support (classes, traits, methods, type aliases, Scala 3 enums) |

+ 108 - 0
__tests__/extraction.test.ts

@@ -93,6 +93,14 @@ describe('Language Detection', () => {
     expect(detectLanguage('main.dart')).toBe('dart');
   });
 
+  it('should detect Objective-C files', () => {
+    // .m and .mm are expected to resolve to 'objc' without any source content.
+    expect(detectLanguage('AppDelegate.m')).toBe('objc');
+    expect(detectLanguage('ViewController.mm')).toBe('objc');
+    // A .h file is classified from its content: an @interface marks it as Objective-C ...
+    const objcHeader = '@interface Foo : NSObject\n@end\n';
+    expect(detectLanguage('Foo.h', objcHeader)).toBe('objc');
+    // ... while a header without any Objective-C directive is expected to stay 'c'.
+    expect(detectLanguage('stdio.h', '#ifndef STDIO_H\nvoid printf();\n#endif\n')).toBe('c');
+  });
+
   it('should return unknown for unsupported extensions', () => {
     expect(detectLanguage('styles.css')).toBe('unknown');
     expect(detectLanguage('data.json')).toBe('unknown');
@@ -3900,3 +3908,103 @@ local count = 0
     });
   });
 });
+
+// =============================================================================
+// Objective-C
+// =============================================================================
+
+describe('Objective-C Extraction', () => {
+  // Shared fixture: one @interface/@implementation pair for MyClass (a property,
+  // a unary method, a two-part selector and a class method) followed by a plain
+  // C function that sends messages to MyClass.
+  const sample = `
+#import <Foundation/Foundation.h>
+#import "MyClass.h"
+
+@interface MyClass : NSObject <NSCopying>
+@property (nonatomic, copy) NSString *name;
+- (void)greet;
+- (void)doThing:(id)x with:(id)y;
++ (instancetype)shared;
+@end
+
+@implementation MyClass
+
+- (void)greet {
+    NSLog(@"Hello");
+    [self doWork];
+}
+
+- (void)doThing:(id)x with:(id)y {
+    [self notify:x];
+}
+
++ (instancetype)shared {
+    return [[MyClass alloc] init];
+}
+
+@end
+
+void helperFunction(int count) {
+    MyClass *obj = [MyClass shared];
+    [obj greet];
+}
+`;
+
+  it('should extract classes, methods, functions, and imports', () => {
+    const result = extractFromSource('App.m', sample);
+
+    // @interface and @implementation of MyClass must yield exactly one class node.
+    const classes = result.nodes.filter((n) => n.kind === 'class');
+    expect(classes.filter((c) => c.name === 'MyClass')).toHaveLength(1);
+
+    // Method names are full selectors, so the two-part selector keeps both colons.
+    const methods = result.nodes.filter((n) => n.kind === 'method');
+    expect(methods.map((m) => m.name).sort()).toEqual(['doThing:with:', 'greet', 'shared']);
+
+    // The `+` (class) method is the one expected to be flagged static.
+    const shared = methods.find((m) => m.name === 'shared');
+    expect(shared?.isStatic).toBe(true);
+
+    const properties = result.nodes.filter((n) => n.kind === 'property');
+    expect(properties.some((p) => p.name === 'name')).toBe(true);
+
+    const functions = result.nodes.filter((n) => n.kind === 'function');
+    expect(functions.some((f) => f.name === 'helperFunction')).toBe(true);
+
+    // Import names are expected without the surrounding <> or "" delimiters.
+    const imports = result.nodes.filter((n) => n.kind === 'import').map((n) => n.name);
+    expect(imports).toContain('Foundation/Foundation.h');
+    expect(imports).toContain('MyClass.h');
+  });
+
+  it('should record inheritance and protocol conformance', () => {
+    const result = extractFromSource('App.m', sample);
+    // `: NSObject` should surface as an 'extends' reference and `<NSCopying>` as 'implements'.
+    const extendsRefs = result.unresolvedReferences.filter((r) => r.referenceKind === 'extends');
+    const implementsRefs = result.unresolvedReferences.filter((r) => r.referenceKind === 'implements');
+    expect(extendsRefs.map((r) => r.referenceName)).toContain('NSObject');
+    expect(implementsRefs.map((r) => r.referenceName)).toContain('NSCopying');
+  });
+
+  it('should record message sends and C calls', () => {
+    const result = extractFromSource('App.m', sample);
+    const calls = result.unresolvedReferences
+      .filter((r) => r.referenceKind === 'calls')
+      .map((r) => r.referenceName);
+    // Expected shapes: a C call by bare name, a send to `self` by bare method name,
+    // and sends to any other receiver as `receiver.method`.
+    expect(calls).toEqual(expect.arrayContaining(['NSLog', 'doWork', 'MyClass.shared', 'obj.greet']));
+  });
+
+  it('should not classify pure C headers with @end in comments as objc', () => {
+    // `@end` alone (here inside a comment) must not be enough to treat a .h file as Objective-C.
+    const cHeader = '/* @end of file */\n#ifndef STDIO_H\nvoid printf(const char *);\n#endif\n';
+    expect(detectLanguage('stdio.h', cHeader)).toBe('c');
+  });
+
+  it('should extract protocol declarations', () => {
+    const code = `
+@protocol DataSource <NSObject>
+- (NSInteger)numberOfItems;
+@end
+`;
+    // The fixture is a .h file; the @protocol is expected to become a node of kind 'protocol'.
+    const result = extractFromSource('DataSource.h', code);
+    const protocol = result.nodes.find((n) => n.kind === 'protocol' && n.name === 'DataSource');
+    expect(protocol).toBeDefined();
+  });
+
+  it('should report Objective-C as supported', () => {
+    expect(isLanguageSupported('objc')).toBe(true);
+    expect(getSupportedLanguages()).toContain('objc');
+  });
+});

+ 14 - 1
src/extraction/grammars.ts

@@ -37,6 +37,7 @@ const WASM_GRAMMAR_FILES: Record<GrammarLanguage, string> = {
   scala: 'tree-sitter-scala.wasm',
   lua: 'tree-sitter-lua.wasm',
   luau: 'tree-sitter-luau.wasm',
+  objc: 'tree-sitter-objc.wasm',
 };
 
 /**
@@ -92,6 +93,8 @@ export const EXTENSION_MAP: Record<string, Language> = {
   '.sc': 'scala',
   '.lua': 'lua',
   '.luau': 'luau',
+  '.m': 'objc',
+  '.mm': 'objc',
 };
 
 /**
@@ -228,9 +231,10 @@ export function detectLanguage(filePath: string, source?: string): Language {
   const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase();
   const lang = EXTENSION_MAP[ext] || 'unknown';
 
-  // .h files could be C or C++ — check source content for C++ features
+  // .h files could be C, C++, or Objective-C — check source content
   if (lang === 'c' && ext === '.h' && source) {
     if (looksLikeCpp(source)) return 'cpp';
+    if (looksLikeObjc(source)) return 'objc';
   }
 
   return lang;
@@ -245,6 +249,14 @@ function looksLikeCpp(source: string): boolean {
   return /\bnamespace\b|\bclass\s+\w+\s*[:{]|\btemplate\s*<|\b(?:public|private|protected)\s*:|\bvirtual\b|\busing\s+(?:namespace\b|\w+\s*=)/.test(sample);
 }
 
+/**
+ * Heuristic for `.h` files: reports whether the start of the source contains
+ * an Objective-C directive.
+ *
+ * Only the first 8192 characters are inspected, and only `@interface`,
+ * `@implementation`, `@protocol` and `@synthesize` count as evidence.
+ */
+function looksLikeObjc(source: string): boolean {
+  const objcDirective = /@(?:interface|implementation|protocol|synthesize)\b/;
+  const head = source.slice(0, 8192);
+  return objcDirective.test(head);
+}
+
 /**
  * Check if a language is supported (has a grammar defined).
  * Returns true if the grammar exists, even if not yet loaded.
@@ -342,6 +354,7 @@ export function getLanguageDisplayName(language: Language): string {
     scala: 'Scala',
     lua: 'Lua',
     luau: 'Luau',
+    objc: 'Objective-C',
     yaml: 'YAML',
     twig: 'Twig',
     unknown: 'Unknown',

+ 2 - 0
src/extraction/languages/index.ts

@@ -25,6 +25,7 @@ import { pascalExtractor } from './pascal';
 import { scalaExtractor } from './scala';
 import { luaExtractor } from './lua';
 import { luauExtractor } from './luau';
+import { objcExtractor } from './objc';
 
 export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
   typescript: typescriptExtractor,
@@ -47,4 +48,5 @@ export const EXTRACTORS: Partial<Record<Language, LanguageExtractor>> = {
   scala: scalaExtractor,
   lua: luaExtractor,
   luau: luauExtractor,
+  objc: objcExtractor,
 };

+ 136 - 0
src/extraction/languages/objc.ts

@@ -0,0 +1,136 @@
+import type { Node as SyntaxNode } from 'web-tree-sitter';
+import { getChildByField, getNodeText } from '../tree-sitter-helpers';
+import type { ExtractorContext, LanguageExtractor } from '../tree-sitter-types';
+
+/**
+ * Returns the first named child of `node` whose type is `compound_statement`,
+ * or `null` when there is none.
+ */
+function findCompoundStatement(node: SyntaxNode): SyntaxNode | null {
+  let index = 0;
+  while (index < node.namedChildCount) {
+    const candidate = node.namedChild(index);
+    index += 1;
+    if (candidate && candidate.type === 'compound_statement') {
+      return candidate;
+    }
+  }
+  return null;
+}
+
+/**
+ * Builds the Objective-C selector for a method node.
+ *
+ * A method without `method_parameter` children yields its first identifier
+ * as-is (`greet`); otherwise every direct `identifier` child contributes one
+ * `name:` segment (`doThing:`, `doThing:with:`).
+ *
+ * Returns `undefined` for nodes that are neither `method_definition` nor
+ * `method_declaration`, and for method nodes with no identifier child.
+ */
+function extractObjcMethodName(node: SyntaxNode, source: string): string | undefined {
+  const isMethodNode = node.type === 'method_definition' || node.type === 'method_declaration';
+  if (!isMethodNode) return undefined;
+
+  // Single pass: collect the selector keywords and note whether any parameter exists.
+  const selectorParts: SyntaxNode[] = [];
+  let takesArguments = false;
+  for (const child of node.namedChildren) {
+    if (child.type === 'identifier') {
+      selectorParts.push(child);
+    } else if (child.type === 'method_parameter') {
+      takesArguments = true;
+    }
+  }
+
+  const leadingPart = selectorParts[0];
+  if (!leadingPart) return undefined;
+
+  if (takesArguments) {
+    return selectorParts.map((part) => `${getNodeText(part, source)}:`).join('');
+  }
+  return getNodeText(leadingPart, source);
+}
+
+/**
+ * Finds the declared name of an `@property`.
+ *
+ * Starts at the `struct_declarator` found under the property's
+ * `struct_declaration` child and repeatedly steps into the `declarator` field
+ * (or, failing that, the first `identifier` / `pointer_declarator` child)
+ * until an `identifier` is reached.
+ *
+ * Returns `null` when `node` is not a `property_declaration` or when the walk
+ * runs out of nodes before reaching an identifier.
+ */
+function extractObjcPropertyName(node: SyntaxNode, source: string): string | null {
+  if (node.type !== 'property_declaration') return null;
+
+  const declaration = node.namedChildren.find((c) => c.type === 'struct_declaration');
+  if (!declaration) return null;
+
+  const declarator = declaration.namedChildren.find((c) => c.type === 'struct_declarator');
+  if (!declarator) return null;
+
+  let cursor: SyntaxNode = declarator;
+  for (;;) {
+    const next: SyntaxNode | undefined =
+      getChildByField(cursor, 'declarator') ||
+      cursor.namedChildren.find((c) => c.type === 'identifier' || c.type === 'pointer_declarator');
+    if (!next) return null;
+    if (next.type === 'identifier') return getNodeText(next, source);
+    // Not an identifier yet (e.g. a pointer declarator): keep descending.
+    cursor = next;
+  }
+}
+
+/**
+ * Extractor configuration for Objective-C: the node-type tables consumed by
+ * the generic extractor plus the hooks for the shapes that do not fit the
+ * default walk (selectors, `@property` names, method bodies, `@implementation`).
+ */
+export const objcExtractor: LanguageExtractor = {
+  functionTypes: ['function_definition'],
+  // @interface is the only construct listed as a class type; @implementation
+  // is handled by the visitNode hook below, which reuses the existing class node.
+  classTypes: ['class_interface'],
+  methodTypes: ['method_definition'],
+  interfaceTypes: ['protocol_declaration'],
+  interfaceKind: 'protocol',
+  structTypes: ['struct_specifier'],
+  enumTypes: ['enum_specifier'],
+  enumMemberTypes: ['enumerator'],
+  typeAliasTypes: ['type_definition'],
+  importTypes: ['preproc_include'],
+  callTypes: ['call_expression', 'message_expression'],
+  variableTypes: ['declaration'],
+  propertyTypes: ['property_declaration'],
+  nameField: 'declarator',
+  bodyField: 'body',
+  paramsField: 'parameters',
+  resolveName: extractObjcMethodName,
+  extractPropertyName: extractObjcPropertyName,
+  // Prefer the body field; otherwise fall back to the first compound_statement child.
+  resolveBody: (node, bodyField) => getChildByField(node, bodyField) || findCompoundStatement(node),
+  // A typedef wrapping an enum/struct specifier that has a body is reported as that kind.
+  resolveTypeAliasKind: (node, _source) => {
+    for (let i = 0; i < node.namedChildCount; i++) {
+      const spec = node.namedChild(i);
+      if (!spec) continue;
+      const isEnum = spec.type === 'enum_specifier';
+      const isStruct = spec.type === 'struct_specifier';
+      if ((isEnum || isStruct) && getChildByField(spec, 'body')) {
+        return isEnum ? 'enum' : 'struct';
+      }
+    }
+    return undefined;
+  },
+  // Static when the node's text starts with `+` (optionally preceded by whitespace).
+  isStatic: (node) => /^\s*\+/.test(node.text),
+  // Handles @implementation: returns false for every other node type, true once
+  // the implementation node has been dealt with here.
+  visitNode: (node, ctx: ExtractorContext) => {
+    if (node.type !== 'class_implementation') return false;
+
+    const nameNode = node.namedChildren.find((c) => c.type === 'identifier');
+    if (!nameNode) return true;
+
+    const implName = getNodeText(nameNode, ctx.source);
+    // Reuse the class node already recorded for this file, creating one only if absent.
+    const declared = ctx.nodes.find(
+      (n) => n.kind === 'class' && n.filePath === ctx.filePath && n.name === implName
+    );
+    const owner = declared ?? ctx.createNode('class', implName, node, {});
+    if (!owner) return true;
+
+    ctx.pushScope(owner.id);
+    for (let i = 0; i < node.namedChildCount; i++) {
+      const section = node.namedChild(i);
+      if (section?.type !== 'implementation_definition') continue;
+      for (let j = 0; j < section.namedChildCount; j++) {
+        const member = section.namedChild(j);
+        if (member) ctx.visitNode(member);
+      }
+    }
+    ctx.popScope();
+    return true;
+  },
+  // Module name is the include target without its <> or "" delimiters; the
+  // signature is the trimmed source text of the whole directive.
+  extractImport: (node, source) => {
+    const signature = source.substring(node.startIndex, node.endIndex).trim();
+    const childOfType = (parent: SyntaxNode, type: string) =>
+      parent.namedChildren.find((c: SyntaxNode) => c.type === type);
+
+    const angled = childOfType(node, 'system_lib_string');
+    if (angled) {
+      const moduleName = getNodeText(angled, source).replace(/^<|>$/g, '');
+      return { moduleName, signature };
+    }
+
+    const quoted = childOfType(node, 'string_literal');
+    const content = quoted ? childOfType(quoted, 'string_content') : undefined;
+    return content ? { moduleName: getNodeText(content, source), signature } : null;
+  },
+};

+ 6 - 0
src/extraction/tree-sitter-types.ts

@@ -120,6 +120,12 @@ export interface LanguageExtractor {
 
   // --- Existing hooks ---
 
+  /** Override symbol name extraction (e.g. ObjC multi-part selectors). */
+  resolveName?: (node: SyntaxNode, source: string) => string | undefined;
+
+  /** Extract property name when the generic name walk fails (e.g. ObjC @property). */
+  extractPropertyName?: (node: SyntaxNode, source: string) => string | null;
+
   /** Extract signature from node */
   getSignature?: (node: SyntaxNode, source: string) => string | undefined;
   /** Extract visibility from node */

+ 62 - 6
src/extraction/tree-sitter.ts

@@ -35,6 +35,9 @@ export { generateNodeId } from './tree-sitter-helpers';
  * Extract the name from a node based on language
  */
 function extractName(node: SyntaxNode, source: string, extractor: LanguageExtractor): string {
+  const hookName = extractor.resolveName?.(node, source);
+  if (hookName) return hookName;
+
   // Try field name first
   const nameNode = getChildByField(node, extractor.nameField);
   if (nameNode) {
@@ -893,12 +896,12 @@ export class TreeSitterExtractor {
     const visibility = this.extractor.getVisibility?.(node);
     const isStatic = this.extractor.isStatic?.(node) ?? false;
 
-    // Property name is a direct identifier child
-    const nameNode = getChildByField(node, 'name')
-      || node.namedChildren.find(c => c.type === 'identifier');
-    if (!nameNode) return;
-
-    const name = getNodeText(nameNode, this.source);
+    const hookName = this.extractor.extractPropertyName?.(node, this.source);
+    const nameNode = hookName
+      ? null
+      : getChildByField(node, 'name') || node.namedChildren.find(c => c.type === 'identifier');
+    const name = hookName ?? (nameNode ? getNodeText(nameNode, this.source) : null);
+    if (!name) return;
 
     // Get property type from the type child (first named child that isn't modifier or identifier)
     const typeNode = node.namedChildren.find(
@@ -1463,6 +1466,23 @@ export class TreeSitterExtractor {
           calleeName = `${receiverName}.${methodName}`;
         }
       }
+    } else if (node.type === 'message_expression') {
+      const methodField = getChildByField(node, 'method');
+      if (methodField) {
+        const methodName = getNodeText(methodField, this.source);
+        const receiverField = getChildByField(node, 'receiver');
+        const SKIP_RECEIVERS = new Set(['self', 'super']);
+        if (receiverField && receiverField.type !== 'message_expression') {
+          const receiverName = getNodeText(receiverField, this.source);
+          if (receiverName && !SKIP_RECEIVERS.has(receiverName)) {
+            calleeName = `${receiverName}.${methodName}`;
+          } else {
+            calleeName = methodName;
+          }
+        } else {
+          calleeName = methodName;
+        }
+      }
     } else {
       const func = getChildByField(node, 'function') || node.namedChild(0);
 
@@ -1770,6 +1790,42 @@ export class TreeSitterExtractor {
    * Extract inheritance relationships
    */
   private extractInheritance(node: SyntaxNode, classId: string): void {
+    // Objective-C @interface MyClass : NSObject <ProtoA, ProtoB>
+    if (node.type === 'class_interface') {
+      const superclass = getChildByField(node, 'superclass');
+      if (superclass) {
+        const name = getNodeText(superclass, this.source);
+        this.unresolvedReferences.push({
+          fromNodeId: classId,
+          referenceName: name,
+          referenceKind: 'extends',
+          line: superclass.startPosition.row + 1,
+          column: superclass.startPosition.column,
+        });
+      }
+      for (let j = 0; j < node.namedChildCount; j++) {
+        const argList = node.namedChild(j);
+        if (argList?.type !== 'parameterized_arguments') continue;
+        for (let k = 0; k < argList.namedChildCount; k++) {
+          const typeName = argList.namedChild(k);
+          if (!typeName) continue;
+          const typeId = typeName.namedChildren.find(
+            (c: SyntaxNode) => c.type === 'type_identifier' || c.type === 'identifier'
+          );
+          if (!typeId) continue;
+          const protocolName = getNodeText(typeId, this.source);
+          this.unresolvedReferences.push({
+            fromNodeId: classId,
+            referenceName: protocolName,
+            referenceKind: 'implements',
+            line: typeId.startPosition.row + 1,
+            column: typeId.startPosition.column,
+          });
+        }
+      }
+      return;
+    }
+
     // Look for extends/implements clauses
     for (let i = 0; i < node.namedChildCount; i++) {
       const child = node.namedChild(i);

+ 1 - 0
src/resolution/import-resolver.ts

@@ -24,6 +24,7 @@ const EXTENSION_RESOLUTION: Record<string, string[]> = {
   csharp: ['.cs'],
   php: ['.php'],
   ruby: ['.rb'],
+  objc: ['.h', '.m', '.mm'],
 };
 
 /**

+ 1 - 0
src/types.ts

@@ -87,6 +87,7 @@ export const LANGUAGES = [
   'scala',
   'lua',
   'luau',
+  'objc',
   'yaml',
   'twig',
   'unknown',