explore-output-budget.test.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. /**
  2. * Adaptive output budget for codegraph_explore (#185).
  3. *
  4. * The explore tool used to apply a fixed 35KB output cap regardless of
  5. * project size, which on small codebases was a net loss vs. native
  6. * grep+Read. These tests pin the per-tier budget shape so future tuning
  7. * doesn't silently drift the small-project case back into bloat.
  8. */
  9. import { describe, it, expect, beforeAll, afterAll } from 'vitest';
  10. import * as fs from 'fs';
  11. import * as path from 'path';
  12. import * as os from 'os';
  13. import { getExploreOutputBudget, getExploreBudget, ToolHandler } from '../src/mcp/tools';
  14. import CodeGraph from '../src/index';
  15. describe('getExploreOutputBudget', () => {
  16. it('returns a strictly smaller total cap for small projects than for huge ones', () => {
  17. const small = getExploreOutputBudget(100);
  18. const huge = getExploreOutputBudget(30000);
  19. expect(small.maxOutputChars).toBeLessThan(huge.maxOutputChars);
  20. expect(small.defaultMaxFiles).toBeLessThan(huge.defaultMaxFiles);
  21. expect(small.maxCharsPerFile).toBeLessThan(huge.maxCharsPerFile);
  22. });
  23. it('caps total output well under 8000 tokens (~32k chars) on small projects', () => {
  24. const small = getExploreOutputBudget(100);
  25. expect(small.maxOutputChars).toBeLessThanOrEqual(20000);
  26. });
  27. it('keeps the historical 35k+ ceiling for medium-large projects so existing benchmarks do not regress', () => {
  28. const large = getExploreOutputBudget(10000);
  29. expect(large.maxOutputChars).toBeGreaterThanOrEqual(35000);
  30. });
  31. it('uses tier breakpoints matching getExploreBudget so call-count and output-budget agree on a project', () => {
  32. // Very-tiny tier (<150 files) gets a tighter cap than small (150-499) —
  33. // paired with tool gating to handle the MCP-overhead-dominates regime.
  34. const tier0a = getExploreOutputBudget(50);
  35. const tier0b = getExploreOutputBudget(149);
  36. expect(tier0a.maxOutputChars).toBe(tier0b.maxOutputChars);
  37. const tier1a = getExploreOutputBudget(150);
  38. const tier1b = getExploreOutputBudget(499);
  39. expect(tier1a.maxOutputChars).toBe(tier1b.maxOutputChars);
  40. // The <500 explore-call budget covers both very-tiny and small.
  41. expect(getExploreBudget(50)).toBe(getExploreBudget(499));
  42. const tier2a = getExploreOutputBudget(500);
  43. const tier2b = getExploreOutputBudget(4999);
  44. expect(tier2a.maxOutputChars).toBe(tier2b.maxOutputChars);
  45. expect(getExploreBudget(500)).toBe(getExploreBudget(4999));
  46. const tier3a = getExploreOutputBudget(5000);
  47. const tier3b = getExploreOutputBudget(14999);
  48. expect(tier3a.maxOutputChars).toBe(tier3b.maxOutputChars);
  49. // And crossing a breakpoint changes the cap.
  50. expect(tier0a.maxOutputChars).not.toBe(tier1a.maxOutputChars);
  51. expect(tier1a.maxOutputChars).not.toBe(tier2a.maxOutputChars);
  52. expect(tier2a.maxOutputChars).not.toBe(tier3a.maxOutputChars);
  53. });
  54. it('gates off "Additional relevant files", completeness signal, and budget note on small projects', () => {
  55. const small = getExploreOutputBudget(100);
  56. expect(small.includeAdditionalFiles).toBe(false);
  57. expect(small.includeCompletenessSignal).toBe(false);
  58. expect(small.includeBudgetNote).toBe(false);
  59. });
  60. it('keeps all meta-text on for projects that earn the breadth signal (>=500 files)', () => {
  61. const medium = getExploreOutputBudget(1000);
  62. expect(medium.includeAdditionalFiles).toBe(true);
  63. expect(medium.includeCompletenessSignal).toBe(true);
  64. expect(medium.includeBudgetNote).toBe(true);
  65. });
  66. it('keeps the Relationships section on for medium+ tiers — small tiers drop it to maximize body density', () => {
  67. // ITER2: relationships dropped on <500 tiers; on tiny repos the
  68. // per-call payload is the cost driver, so even "cheap" structural
  69. // signal adds up across follow-up turns. Re-enabled at ≥500 where
  70. // body budgets are roomy enough to absorb the 1-2KB overhead.
  71. expect(getExploreOutputBudget(50).includeRelationships).toBe(false);
  72. expect(getExploreOutputBudget(1000).includeRelationships).toBe(true);
  73. expect(getExploreOutputBudget(10000).includeRelationships).toBe(true);
  74. expect(getExploreOutputBudget(30000).includeRelationships).toBe(true);
  75. });
  76. it('caps the per-file header symbol list more tightly on small projects', () => {
  77. // Without this cap, a file like Alamofire's Session.swift produced
  78. // a 3.4KB symbol list in the `#### path — sym, sym, ...` header,
  79. // dwarfing the per-file body cap.
  80. const small = getExploreOutputBudget(100);
  81. const huge = getExploreOutputBudget(30000);
  82. expect(small.maxSymbolsInFileHeader).toBeLessThan(huge.maxSymbolsInFileHeader);
  83. expect(small.maxSymbolsInFileHeader).toBeGreaterThan(0);
  84. });
  85. it('uses a tighter clustering gap threshold on small projects to break runaway single clusters', () => {
  86. const small = getExploreOutputBudget(100);
  87. const huge = getExploreOutputBudget(30000);
  88. expect(small.gapThreshold).toBeLessThanOrEqual(huge.gapThreshold);
  89. });
  90. it('handles the boundary file counts exactly (off-by-one regression guard)', () => {
  91. // 149 -> very-tiny, 150 -> small
  92. expect(getExploreOutputBudget(149).maxOutputChars).toBe(getExploreOutputBudget(50).maxOutputChars);
  93. expect(getExploreOutputBudget(150).maxOutputChars).toBe(getExploreOutputBudget(200).maxOutputChars);
  94. // 499 -> small, 500 -> medium
  95. expect(getExploreOutputBudget(499).maxOutputChars).toBe(getExploreOutputBudget(200).maxOutputChars);
  96. expect(getExploreOutputBudget(500).maxOutputChars).toBe(getExploreOutputBudget(1000).maxOutputChars);
  97. // 4999 -> medium, 5000 -> large
  98. expect(getExploreOutputBudget(4999).maxOutputChars).toBe(getExploreOutputBudget(1000).maxOutputChars);
  99. expect(getExploreOutputBudget(5000).maxOutputChars).toBe(getExploreOutputBudget(10000).maxOutputChars);
  100. // 14999 -> large, 15000 -> xlarge
  101. expect(getExploreOutputBudget(14999).maxOutputChars).toBe(getExploreOutputBudget(10000).maxOutputChars);
  102. expect(getExploreOutputBudget(15000).maxOutputChars).toBe(getExploreOutputBudget(30000).maxOutputChars);
  103. });
  104. });
  105. /**
  106. * End-to-end check that the budget is actually applied by handleExplore.
  107. *
  108. * Builds a tiny synthetic project (6 files, so the very-tiny <150-file
  109. * tier), indexes it, and confirms the output:
  110. * - stays under that tier's maxOutputChars cap
  111. * - omits the meta-text that tier gates off (completeness signal,
  112. * budget note, "Additional relevant files")
  113. *
  114. * Regression guard for #185 — protects against future edits to handleExplore
  115. * silently re-introducing the fixed 35KB cap on small projects.
  116. */
  117. describe('codegraph_explore output respects the adaptive budget', () => {
  118. let testDir: string;
  119. let cg: CodeGraph;
  120. let handler: ToolHandler;
  121. beforeAll(async () => {
  122. testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-explore-budget-'));
  123. const srcDir = path.join(testDir, 'src');
  124. fs.mkdirSync(srcDir);
  125. // A handful of files with one fat target file. The fat file mimics the
  126. // Alamofire Session.swift case: many methods stacked on top of each other,
  127. // which collapsed into one giant cluster pre-#185.
  128. const fatLines: string[] = ['export class Session {'];
  129. for (let i = 0; i < 30; i++) {
  130. fatLines.push(` method${i}(arg: string): string {`);
  131. fatLines.push(` return this.helper${i}(arg) + "${i}";`);
  132. fatLines.push(` }`);
  133. fatLines.push(` private helper${i}(arg: string): string {`);
  134. fatLines.push(` return arg.repeat(${i + 1});`);
  135. fatLines.push(` }`);
  136. }
  137. fatLines.push('}');
  138. fs.writeFileSync(path.join(srcDir, 'session.ts'), fatLines.join('\n'));
  139. // A few small supporting files so the project has >1 indexed file.
  140. for (let i = 0; i < 5; i++) {
  141. fs.writeFileSync(
  142. path.join(srcDir, `support${i}.ts`),
  143. `import { Session } from './session';\nexport function callSession${i}(s: Session) { return s.method${i}('hi'); }\n`
  144. );
  145. }
  146. cg = CodeGraph.initSync(testDir, {
  147. config: { include: ['**/*.ts'], exclude: [] },
  148. });
  149. await cg.indexAll();
  150. handler = new ToolHandler(cg);
  151. });
  152. afterAll(() => {
  153. if (cg) cg.destroy();
  154. if (testDir && fs.existsSync(testDir)) {
  155. fs.rmSync(testDir, { recursive: true, force: true });
  156. }
  157. });
  158. it('keeps total output under the small-project cap', async () => {
  159. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  160. const text = result.content?.[0]?.text ?? '';
  161. const smallBudget = getExploreOutputBudget(100);
  162. // Allow a small overshoot for the trailing markers — the cap is enforced
  163. // per-file rather than as an absolute output ceiling.
  164. expect(text.length).toBeLessThan(smallBudget.maxOutputChars + 500);
  165. });
  166. it('omits the meta-text gated off for small projects', async () => {
  167. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  168. const text = result.content?.[0]?.text ?? '';
  169. expect(text).not.toContain('### Additional relevant files');
  170. expect(text).not.toContain('Complete source code is included above');
  171. expect(text).not.toContain('Explore budget:');
  172. });
  173. it('still includes the Relationships section — it is the cheapest structural signal', async () => {
  174. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  175. const text = result.content?.[0]?.text ?? '';
  176. // Either there are relationships, or no edges were significant — both are fine.
  177. // We just want to confirm we did not accidentally gate it off.
  178. const hasRelationships = text.includes('### Relationships');
  179. const sourceFollowsHeader = text.indexOf('### Source Code') > 0;
  180. expect(hasRelationships || sourceFollowsHeader).toBe(true);
  181. });
  182. it('prefixes source lines with line numbers by default (cat -n style)', async () => {
  183. delete process.env.CODEGRAPH_EXPLORE_LINENUMS;
  184. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  185. const text = result.content?.[0]?.text ?? '';
  186. // At least one fenced source line should look like `<digits>\t<code>`.
  187. expect(/\n\d+\t/.test(text)).toBe(true);
  188. });
  189. it('omits line numbers when CODEGRAPH_EXPLORE_LINENUMS=0', async () => {
  190. process.env.CODEGRAPH_EXPLORE_LINENUMS = '0';
  191. try {
  192. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  193. const text = result.content?.[0]?.text ?? '';
  194. // The synthetic source has no tab-prefixed numeric lines of its own,
  195. // so none should appear when the toggle is off.
  196. expect(/\n\d+\t(?:export| )/.test(text)).toBe(false);
  197. } finally {
  198. delete process.env.CODEGRAPH_EXPLORE_LINENUMS;
  199. }
  200. });
  201. it('uses language-neutral omission markers (no C-style // in the output)', async () => {
  202. // The gap/trimmed separators must not assume `//` is a comment — that's
  203. // wrong in Python, Ruby, etc. They render inside fenced source blocks.
  204. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  205. const text = result.content?.[0]?.text ?? '';
  206. expect(text).not.toContain('// ... (gap)');
  207. expect(text).not.toContain('// ... trimmed');
  208. });
  209. it('does not collapse a whole-file class into just its header (envelope filter)', async () => {
  210. // The synthetic `Session` class spans the entire file. Without the
  211. // envelope filter it would form one giant cluster that tail-trims to
  212. // the class declaration, hiding the methods. Confirm real method bodies
  213. // make it into the output. Regression guard for the #185 follow-up.
  214. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  215. const text = result.content?.[0]?.text ?? '';
  216. // A method body line (`methodN(arg: string)`) should appear, not just
  217. // the `export class Session {` opener.
  218. const hasMethodBody = /method\d+\(arg: string\)/.test(text);
  219. expect(hasMethodBody).toBe(true);
  220. });
  221. });