generated-detection.ts 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. /**
  2. * Generated-file detection for symbol-disambiguation down-ranking.
  3. *
  4. * When a query like "Send" matches 17 symbols across protobuf scaffolding,
  5. * test mocks, and the hand-written implementation, the FTS ranker often
  6. * surfaces the generated stubs first because their names are identical
  7. * to the implementation's name (validated empirically on cosmos-sdk —
  8. * see project_go_multi_module_audit memory). Generated stubs frequently
  9. * have no body to trace from, so the agent ends up reading source anyway.
  10. *
  11. * This helper is a pure path-based classifier consulted at disambiguation
  12. * time (findSymbol / findAllSymbols / codegraph_search formatting), NOT
  13. * a hard filter — generated nodes are still in the graph and remain
  14. * reachable; they just rank LAST when there's a real implementation
  15. * with the same name.
  16. *
  17. * Scope: suffix patterns only. Most generated files follow the
  18. * `<basename>.<tool>.<ext>` convention (`.pb.go`, `_grpc.pb.go`,
  19. * `.g.dart`, `_pb2.py`), and that covers ~all of what we saw in the
  20. * Go audit. A future addition would be scanning for the canonical
  21. * `// Code generated by` header during extraction, for the rare files
  22. * that defy the suffix convention.
  23. */
  24. const GENERATED_PATTERNS: ReadonlyArray<RegExp> = [
  25. // Go — protobuf / gRPC / pulsar
  26. /\.pb\.go$/,
  27. /\.pulsar\.go$/,
  28. /_grpc\.pb\.go$/,
  29. // Go — mockgen output. Default emits `mock_<src>.go`; many projects
  30. // (cosmos-sdk uses `expected_*_mocks.go`) rename to `*_mock.go` /
  31. // `*_mocks.go`. Matching either suffix catches both conventions
  32. // without false-positive risk on hand-written sources.
  33. /_mock\.go$/,
  34. /_mocks\.go$/,
  35. /^mock_[^/]+\.go$/,
  36. // TypeScript / JavaScript — common codegen suffixes (Apollo / GraphQL
  37. // codegen, Prisma, Hasura, ts-proto, gRPC-web, swagger-codegen).
  38. /\.generated\.[jt]sx?$/,
  39. /\.gen\.[jt]sx?$/,
  40. /\.pb\.[jt]s$/,
  41. /_pb\.[jt]s$/,
  42. /_grpc_pb\.[jt]s$/,
  43. // Python — protobuf / gRPC / openapi-codegen
  44. /_pb2(_grpc)?\.py$/,
  45. /_pb2\.pyi$/,
  46. // C++ — protobuf
  47. /\.pb\.(cc|h)$/,
  48. // C# — protobuf / gRPC (protoc-gen-csharp puts output under obj/ but
  49. // many projects also commit *.g.cs and *Grpc.cs siblings)
  50. /\.g\.cs$/,
  51. /Grpc\.cs$/,
  52. // Java — protobuf / gRPC: protoc-gen-java emits `*OuterClass.java`,
  53. // protoc-gen-grpc-java emits `*Grpc.java`. The XxxImplBase abstract
  54. // class lives inside Xxx*Grpc.java.
  55. /OuterClass\.java$/,
  56. /Grpc\.java$/,
  57. // Swift — protobuf
  58. /\.pb\.swift$/,
  59. // Dart — build_runner / freezed / json_serializable / chopper
  60. /\.g\.dart$/,
  61. /\.freezed\.dart$/,
  62. /\.pb\.dart$/,
  63. /\.pbgrpc\.dart$/,
  64. /\.chopper\.dart$/,
  65. // Rust — common build.rs OUT_DIR outputs are usually outside the source
  66. // tree, but in-tree generated files often use `*.generated.rs`.
  67. /\.generated\.rs$/,
  68. ];
  69. /**
  70. * Whether `filePath` looks like a tool-generated source file based on
  71. * its filename. Path-only — does not read content. The result is a
  72. * relevance hint for disambiguation, not a hard claim.
  73. */
  74. export function isGeneratedFile(filePath: string): boolean {
  75. return GENERATED_PATTERNS.some((p) => p.test(filePath));
  76. }