explore-output-budget.test.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. /**
  2. * Adaptive output budget for codegraph_explore (#185).
  3. *
  4. * The explore tool used to apply a fixed 35KB output cap regardless of
  5. * project size, which on small codebases was a net loss vs. native
  6. * grep+Read. These tests pin the per-tier budget shape so future tuning
  7. * doesn't silently drift the small-project case back into bloat.
  8. */
  9. import { describe, it, expect, beforeAll, afterAll } from 'vitest';
  10. import * as fs from 'fs';
  11. import * as path from 'path';
  12. import * as os from 'os';
  13. import { getExploreOutputBudget, getExploreBudget, ToolHandler } from '../src/mcp/tools';
  14. import CodeGraph from '../src/index';
  15. describe('getExploreOutputBudget', () => {
  16. it('returns a strictly smaller total cap for small projects than for huge ones', () => {
  17. const small = getExploreOutputBudget(100);
  18. const huge = getExploreOutputBudget(30000);
  19. expect(small.maxOutputChars).toBeLessThan(huge.maxOutputChars);
  20. expect(small.defaultMaxFiles).toBeLessThan(huge.defaultMaxFiles);
  21. expect(small.maxCharsPerFile).toBeLessThan(huge.maxCharsPerFile);
  22. });
  23. it('caps total output well under 8000 tokens (~32k chars) on small projects', () => {
  24. const small = getExploreOutputBudget(100);
  25. expect(small.maxOutputChars).toBeLessThanOrEqual(20000);
  26. });
  27. it('caps medium-large projects at the inline tool-result ceiling (~24k) so the result is never externalized', () => {
  28. // A bigger single response gets externalized by the host to a file the agent
  29. // Reads back (a 35k vscode explore did exactly that in the n=4 A/B) — adding a
  30. // read AND cache-write cost. So large repos get MORE CALLS (getExploreBudget),
  31. // not a fatter single response; the output cap stays under the inline limit.
  32. const large = getExploreOutputBudget(10000);
  33. expect(large.maxOutputChars).toBeLessThanOrEqual(25000);
  34. expect(large.maxOutputChars).toBeGreaterThanOrEqual(20000);
  35. });
  36. it('uses tier breakpoints matching getExploreBudget so call-count and output-budget agree on a project', () => {
  37. // Very-tiny tier (<150 files) gets a tighter cap than small (150-499) —
  38. // paired with tool gating to handle the MCP-overhead-dominates regime.
  39. const tier0a = getExploreOutputBudget(50);
  40. const tier0b = getExploreOutputBudget(149);
  41. expect(tier0a.maxOutputChars).toBe(tier0b.maxOutputChars);
  42. const tier1a = getExploreOutputBudget(150);
  43. const tier1b = getExploreOutputBudget(499);
  44. expect(tier1a.maxOutputChars).toBe(tier1b.maxOutputChars);
  45. // The <500 explore-call budget covers both very-tiny and small.
  46. expect(getExploreBudget(50)).toBe(getExploreBudget(499));
  47. const tier2a = getExploreOutputBudget(500);
  48. const tier2b = getExploreOutputBudget(4999);
  49. expect(tier2a.maxOutputChars).toBe(tier2b.maxOutputChars);
  50. expect(getExploreBudget(500)).toBe(getExploreBudget(4999));
  51. const tier3a = getExploreOutputBudget(5000);
  52. const tier3b = getExploreOutputBudget(14999);
  53. expect(tier3a.maxOutputChars).toBe(tier3b.maxOutputChars);
  54. // Small tiers step up (13k → 18k → 24k); medium and large SHARE the ~24k
  55. // inline ceiling — scaling with repo size now lives in the CALL budget
  56. // (getExploreBudget), not in a fatter single response.
  57. expect(tier0a.maxOutputChars).not.toBe(tier1a.maxOutputChars); // <150 vs <500
  58. expect(tier1a.maxOutputChars).not.toBe(tier2a.maxOutputChars); // <500 vs <5000
  59. expect(tier2a.maxOutputChars).toBe(tier3a.maxOutputChars); // <5000 == <15000 (inline cap)
  60. expect(getExploreBudget(5000)).toBeGreaterThan(getExploreBudget(4999)); // calls scale instead
  61. });
  62. it('gates off "Additional relevant files", completeness signal, and budget note on small projects', () => {
  63. const small = getExploreOutputBudget(100);
  64. expect(small.includeAdditionalFiles).toBe(false);
  65. expect(small.includeCompletenessSignal).toBe(false);
  66. expect(small.includeBudgetNote).toBe(false);
  67. });
  68. it('keeps all meta-text on for projects that earn the breadth signal (>=500 files)', () => {
  69. const medium = getExploreOutputBudget(1000);
  70. expect(medium.includeAdditionalFiles).toBe(true);
  71. expect(medium.includeCompletenessSignal).toBe(true);
  72. expect(medium.includeBudgetNote).toBe(true);
  73. });
  74. it('keeps the Relationships section on for medium+ tiers — small tiers drop it to maximize body density', () => {
  75. // ITER2: relationships dropped on <500 tiers; on tiny repos the
  76. // per-call payload is the cost driver, so even "cheap" structural
  77. // signal adds up across follow-up turns. Re-enabled at ≥500 where
  78. // body budgets are roomy enough to absorb the 1-2KB overhead.
  79. expect(getExploreOutputBudget(50).includeRelationships).toBe(false);
  80. expect(getExploreOutputBudget(1000).includeRelationships).toBe(true);
  81. expect(getExploreOutputBudget(10000).includeRelationships).toBe(true);
  82. expect(getExploreOutputBudget(30000).includeRelationships).toBe(true);
  83. });
  84. it('caps the per-file header symbol list more tightly on small projects', () => {
  85. // Without this cap, a file like Alamofire's Session.swift produced
  86. // a 3.4KB symbol list in the `#### path — sym, sym, ...` header,
  87. // dwarfing the per-file body cap.
  88. const small = getExploreOutputBudget(100);
  89. const huge = getExploreOutputBudget(30000);
  90. expect(small.maxSymbolsInFileHeader).toBeLessThan(huge.maxSymbolsInFileHeader);
  91. expect(small.maxSymbolsInFileHeader).toBeGreaterThan(0);
  92. });
  93. it('uses a tighter clustering gap threshold on small projects to break runaway single clusters', () => {
  94. const small = getExploreOutputBudget(100);
  95. const huge = getExploreOutputBudget(30000);
  96. expect(small.gapThreshold).toBeLessThanOrEqual(huge.gapThreshold);
  97. });
  98. it('handles the boundary file counts exactly (off-by-one regression guard)', () => {
  99. // 149 -> very-tiny, 150 -> small
  100. expect(getExploreOutputBudget(149).maxOutputChars).toBe(getExploreOutputBudget(50).maxOutputChars);
  101. expect(getExploreOutputBudget(150).maxOutputChars).toBe(getExploreOutputBudget(200).maxOutputChars);
  102. // 499 -> small, 500 -> medium
  103. expect(getExploreOutputBudget(499).maxOutputChars).toBe(getExploreOutputBudget(200).maxOutputChars);
  104. expect(getExploreOutputBudget(500).maxOutputChars).toBe(getExploreOutputBudget(1000).maxOutputChars);
  105. // 4999 -> medium, 5000 -> large
  106. expect(getExploreOutputBudget(4999).maxOutputChars).toBe(getExploreOutputBudget(1000).maxOutputChars);
  107. expect(getExploreOutputBudget(5000).maxOutputChars).toBe(getExploreOutputBudget(10000).maxOutputChars);
  108. // 14999 -> large, 15000 -> xlarge
  109. expect(getExploreOutputBudget(14999).maxOutputChars).toBe(getExploreOutputBudget(10000).maxOutputChars);
  110. expect(getExploreOutputBudget(15000).maxOutputChars).toBe(getExploreOutputBudget(30000).maxOutputChars);
  111. });
  112. });
  113. /**
  114. * End-to-end check that the budget is actually applied by handleExplore.
  115. *
  116. * Builds a tiny synthetic project (<500 files, so the small tier), indexes
  117. * it, and confirms the output:
  118. * - stays under the small-tier maxOutputChars cap
  119. * - omits the meta-text the small tier gates off (completeness signal,
  120. * budget note, "Additional relevant files")
  121. *
  122. * Regression guard for #185 — protects against future edits to handleExplore
  123. * silently re-introducing the fixed 35KB cap on small projects.
  124. */
  125. describe('codegraph_explore output respects the adaptive budget', () => {
  126. let testDir: string;
  127. let cg: CodeGraph;
  128. let handler: ToolHandler;
  129. beforeAll(async () => {
  130. testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-explore-budget-'));
  131. const srcDir = path.join(testDir, 'src');
  132. fs.mkdirSync(srcDir);
  133. // A handful of files with one fat target file. The fat file mimics the
  134. // Alamofire Session.swift case: many methods stacked on top of each other,
  135. // which collapsed into one giant cluster pre-#185.
  136. const fatLines: string[] = ['export class Session {'];
  137. for (let i = 0; i < 30; i++) {
  138. fatLines.push(` method${i}(arg: string): string {`);
  139. fatLines.push(` return this.helper${i}(arg) + "${i}";`);
  140. fatLines.push(` }`);
  141. fatLines.push(` private helper${i}(arg: string): string {`);
  142. fatLines.push(` return arg.repeat(${i + 1});`);
  143. fatLines.push(` }`);
  144. }
  145. fatLines.push('}');
  146. fs.writeFileSync(path.join(srcDir, 'session.ts'), fatLines.join('\n'));
  147. // A few small supporting files so the project has >1 indexed file.
  148. for (let i = 0; i < 5; i++) {
  149. fs.writeFileSync(
  150. path.join(srcDir, `support${i}.ts`),
  151. `import { Session } from './session';\nexport function callSession${i}(s: Session) { return s.method${i}('hi'); }\n`
  152. );
  153. }
  154. cg = CodeGraph.initSync(testDir, {
  155. config: { include: ['**/*.ts'], exclude: [] },
  156. });
  157. await cg.indexAll();
  158. handler = new ToolHandler(cg);
  159. });
  160. afterAll(() => {
  161. if (cg) cg.destroy();
  162. if (testDir && fs.existsSync(testDir)) {
  163. fs.rmSync(testDir, { recursive: true, force: true });
  164. }
  165. });
  166. it('keeps total output under the small-project cap', async () => {
  167. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  168. const text = result.content?.[0]?.text ?? '';
  169. const smallBudget = getExploreOutputBudget(100);
  170. // Allow a small overshoot for the trailing markers — the cap is enforced
  171. // per-file rather than as an absolute output ceiling.
  172. expect(text.length).toBeLessThan(smallBudget.maxOutputChars + 500);
  173. });
  174. it('omits the meta-text gated off for small projects', async () => {
  175. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  176. const text = result.content?.[0]?.text ?? '';
  177. expect(text).not.toContain('### Additional relevant files');
  178. expect(text).not.toContain('Complete source code is included above');
  179. expect(text).not.toContain('Explore budget:');
  180. });
  181. it('still includes the Relationships section — it is the cheapest structural signal', async () => {
  182. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  183. const text = result.content?.[0]?.text ?? '';
  184. // Either there are relationships, or no edges were significant — both are fine.
  185. // We just want to confirm we did not accidentally gate it off.
  186. const hasRelationships = text.includes('### Relationships');
  187. const sourceFollowsHeader = text.indexOf('### Source Code') > 0;
  188. expect(hasRelationships || sourceFollowsHeader).toBe(true);
  189. });
  190. it('prefixes source lines with line numbers by default (cat -n style)', async () => {
  191. delete process.env.CODEGRAPH_EXPLORE_LINENUMS;
  192. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  193. const text = result.content?.[0]?.text ?? '';
  194. // At least one fenced source line should look like `<digits>\t<code>`.
  195. expect(/\n\d+\t/.test(text)).toBe(true);
  196. });
  197. it('omits line numbers when CODEGRAPH_EXPLORE_LINENUMS=0', async () => {
  198. process.env.CODEGRAPH_EXPLORE_LINENUMS = '0';
  199. try {
  200. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  201. const text = result.content?.[0]?.text ?? '';
  202. // The synthetic source has no tab-prefixed numeric lines of its own,
  203. // so none should appear when the toggle is off.
  204. expect(/\n\d+\t(?:export| )/.test(text)).toBe(false);
  205. } finally {
  206. delete process.env.CODEGRAPH_EXPLORE_LINENUMS;
  207. }
  208. });
  209. it('uses language-neutral omission markers (no C-style // in the output)', async () => {
  210. // The gap/trimmed separators must not assume `//` is a comment — that's
  211. // wrong in Python, Ruby, etc. They render inside fenced source blocks.
  212. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  213. const text = result.content?.[0]?.text ?? '';
  214. expect(text).not.toContain('// ... (gap)');
  215. expect(text).not.toContain('// ... trimmed');
  216. });
  217. it('does not collapse a whole-file class into just its header (envelope filter)', async () => {
  218. // The synthetic `Session` class spans the entire file. Without the
  219. // envelope filter it would form one giant cluster that tail-trims to
  220. // the class declaration, hiding the methods. Confirm real method bodies
  221. // make it into the output. Regression guard for the #185 follow-up.
  222. const result = await handler.execute('codegraph_explore', { query: 'Session method helper' });
  223. const text = result.content?.[0]?.text ?? '';
  224. // A method body line (`methodN(arg: string)`) should appear, not just
  225. // the `export class Session {` opener.
  226. const hasMethodBody = /method\d+\(arg: string\)/.test(text);
  227. expect(hasMethodBody).toBe(true);
  228. });
  229. });