Quellcode durchsuchen

fix(security): index config files by key only, never surface values (#383) (#722)

Spring `application.{properties,yml}` keys (and Shopify Liquid `{% schema %}`
blocks) were storing the config VALUE in the node docstring, and
`codegraph_explore`'s source section re-read the raw `key = value` line off
disk — so a secret committed to a config file (DB password, API key, JDBC URL
with embedded credentials) could be pushed into an agent's context via
explore/node output without the agent ever opening the file.

Config-leaf nodes (`kind: 'constant'` in a config language) now surface the KEY
only, via a shared `isConfigLeafNode` predicate applied at both surfacing
paths: the value is dropped from extraction, `getCode`/`includeCode` returns
the key instead of the file line, and explore excludes config leaves from
source rendering. The predicate can't match real code (constants in real source
files carry a code language such as ts/java/go/…, never `yaml`/`properties`), so
`@Value`/`@ConfigurationProperties` resolution and impact analysis are
unaffected. Adds a regression test asserting a planted secret never appears in
`codegraph_explore` / `codegraph_node` output while the keys still resolve.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Colby Mchenry vor 2 Wochen
Ursprung
Commit
112e278b5c

+ 4 - 0
CHANGELOG.md

@@ -9,6 +9,10 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+### Security
+
+- CodeGraph now indexes Spring configuration files (`application.properties` / `application.yml`) by key only, and never includes their values in `codegraph_explore` or `codegraph_node` output. Previously a secret committed to one of these files — a database password, API key, or connection string with embedded credentials — could be surfaced to an AI agent that asked about nearby code, even though the agent never opened the file. The configuration keys are still indexed, so reference and impact analysis are unaffected; an agent that genuinely needs a value reads the file itself. Shopify Liquid `{% schema %}` blocks are likewise indexed by name only. (#383)
+
 ### New Features
 
 - New `codegraph upgrade` command updates CodeGraph to the latest release in place — it detects how you installed (the standalone `install.sh` / `install.ps1` bundle, npm, or npx) and does the right thing for each, on macOS, Linux, and Windows. Use `codegraph upgrade --check` to see whether an update is available without installing, or `codegraph upgrade <version>` to move to a specific version. After upgrading it reminds you to re-index your projects so they pick up the newer engine's improvements. (#679)

+ 102 - 0
__tests__/config-secret-redaction.test.ts

@@ -0,0 +1,102 @@
+/**
+ * #383 — CodeGraph indexes config KEYS but must never surface config VALUES.
+ *
+ * Spring `application.{yml,properties}` keys are indexed as `constant` nodes so
+ * `@Value` resolution works, but their values are routinely secrets (DB
+ * passwords, API keys, JDBC URLs with embedded creds). CodeGraph must surface
+ * the KEY and never the value — not in node metadata (docstring/signature),
+ * not via `codegraph_explore`'s verbatim source dump, and not via
+ * `codegraph_node` `includeCode`. An agent that genuinely needs a value can
+ * read the file itself (a deliberate pull); CodeGraph must never volunteer it.
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import CodeGraph from '../src/index';
+import { ToolHandler } from '../src/mcp/tools';
+
+const SECRET = 'sk-live-DO-NOT-LEAK-2f9a4c7e1b';
+
+describe('config secret redaction (#383)', () => {
+  let tmpDir: string;
+  let cg: CodeGraph;
+  let handler: ToolHandler;
+
+  beforeEach(async () => {
+    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-config-secret-'));
+    const javaDir = path.join(tmpDir, 'src/main/java/com/example');
+    const resDir = path.join(tmpDir, 'src/main/resources');
+    fs.mkdirSync(javaDir, { recursive: true });
+    fs.mkdirSync(resDir, { recursive: true });
+    // pom.xml triggers Spring detection so the resolver parses the config files.
+    fs.writeFileSync(
+      path.join(tmpDir, 'pom.xml'),
+      '<project><dependencies><dependency><groupId>org.springframework.boot</groupId><artifactId>spring-boot-starter</artifactId></dependency></dependencies></project>\n',
+    );
+    fs.writeFileSync(
+      path.join(resDir, 'application.properties'),
+      `server.port=8080\nspring.datasource.password=${SECRET}\n`,
+    );
+    fs.writeFileSync(
+      path.join(resDir, 'application.yml'),
+      `app:\n  api:\n    key: "${SECRET}"\n`,
+    );
+    fs.writeFileSync(
+      path.join(javaDir, 'DataConfig.java'),
+      'package com.example;\n' +
+        'import org.springframework.beans.factory.annotation.Value;\n' +
+        'public class DataConfig {\n' +
+        '  @Value("${spring.datasource.password}") private String dbPass;\n' +
+        '  @Value("${app.api.key}") private String apiKey;\n' +
+        '}\n',
+    );
+
+    cg = CodeGraph.initSync(tmpDir);
+    await cg.indexAll();
+    handler = new ToolHandler(cg);
+  });
+
+  afterEach(() => {
+    if (cg) cg.destroy();
+    if (fs.existsSync(tmpDir)) fs.rmSync(tmpDir, { recursive: true, force: true });
+  });
+
+  const configKeys = () =>
+    cg.getNodesByKind('constant').filter((n) => n.language === 'yaml' || n.language === 'properties');
+
+  it('still indexes config KEYS as nodes (resolution must not regress)', () => {
+    const byQn = (qn: string) => configKeys().find((n) => n.qualifiedName === qn);
+    expect(byQn('spring.datasource.password'), '.properties key indexed').toBeDefined();
+    expect(byQn('app.api.key'), 'yaml key indexed').toBeDefined();
+  });
+
+  it('never stores the secret VALUE in node metadata (docstring/signature/name)', () => {
+    const keys = configKeys();
+    expect(keys.length).toBeGreaterThan(0);
+    for (const n of keys) {
+      expect(n.docstring ?? '', `docstring of ${n.qualifiedName}`).not.toContain(SECRET);
+      expect(n.signature ?? '', `signature of ${n.qualifiedName}`).not.toContain(SECRET);
+      expect(n.name, `name of ${n.qualifiedName}`).not.toContain(SECRET);
+    }
+  });
+
+  it('codegraph_explore surfaces the config key but NEVER the secret value', async () => {
+    const res = await handler.execute('codegraph_explore', {
+      query: 'DataConfig dbPass apiKey spring.datasource.password app.api.key',
+    });
+    const text = res.content.map((c) => c.text).join('\n');
+    expect(text).toContain('password'); // intended to show the key is in scope. NOTE(review): 'password' also occurs in the query string and in DataConfig.java's @Value text, so this may pass without the config node being surfaced — confirm it is non-vacuous
+    expect(text).not.toContain(SECRET); // ...but the value is never dumped
+  });
+
+  it('codegraph_node includeCode returns the key, not the secret value', async () => {
+    const res = await handler.execute('codegraph_node', {
+      symbol: 'spring.datasource.password',
+      includeCode: true,
+    });
+    const text = res.content.map((c) => c.text).join('\n');
+    expect(text).toContain('password'); // intended to show the node was found. NOTE(review): the requested symbol itself contains 'password', so a response that merely echoes the symbol would also satisfy this — confirm
+    expect(text).not.toContain(SECRET); // value redacted from the code path
+  });
+});

+ 9 - 1
src/context/index.ts

@@ -24,7 +24,7 @@ import { QueryBuilder } from '../db/queries';
 import { GraphTraverser } from '../graph';
 import { formatContextAsMarkdown, formatContextAsJson } from './formatter';
 import { logDebug } from '../errors';
-import { validatePathWithinRoot } from '../utils';
+import { validatePathWithinRoot, isConfigLeafNode } from '../utils';
 import { isTestFile, extractSearchTerms, scorePathRelevance, getStemVariants, isDistinctiveIdentifier } from '../search/query-utils';
 import { LOW_CONFIDENCE_MARKER } from './markers';
 
@@ -1161,6 +1161,14 @@ export class ContextBuilder {
    * Extract code from a node's source file
    */
   private async extractNodeCode(node: Node): Promise<string | null> {
+    // SECURITY (#383): a config-leaf node's on-disk line is `key = <secret>`.
+    // Return the KEY only — never read the value off disk. This closes the
+    // includeCode / buildContext code-block path, mirroring the explore source
+    // renderer; an agent that genuinely needs a value can read the file itself.
+    if (isConfigLeafNode(node)) {
+      return node.signature || node.qualifiedName || node.name;
+    }
+
     const filePath = validatePathWithinRoot(this.projectRoot, node.filePath);
 
     if (!filePath || !fs.existsSync(filePath)) {

+ 4 - 1
src/extraction/liquid-extractor.ts

@@ -313,7 +313,10 @@ export class LiquidExtractor {
         endLine,
         startColumn: match.index - this.getLineStart(startLine),
         endColumn: 0,
-        docstring: schemaContent?.trim().substring(0, 200), // Store first 200 chars as docstring
+        // SECURITY (#383): don't dump the raw {% schema %} JSON (section settings
+        // + default values) into the docstring — the schema name is already in
+        // `name`, so the data block adds nothing but a potential leak of any
+        // IDs/endpoints/keys a developer placed in setting defaults.
         updatedAt: Date.now(),
       };
 

+ 6 - 1
src/mcp/tools.ts

@@ -26,7 +26,7 @@ import {
   existsSync,
   readFileSync,
 } from 'fs';
-import { clamp, validatePathWithinRoot, validateProjectPath } from '../utils';
+import { clamp, validatePathWithinRoot, validateProjectPath, isConfigLeafNode } from '../utils';
 import { isGeneratedFile } from '../extraction/generated-detection';
 import { resolve as resolvePath } from 'path';
 
@@ -1705,6 +1705,11 @@ export class ToolHandler {
     for (const node of subgraph.nodes.values()) {
       // Skip import/export nodes — they add noise without information
       if (node.kind === 'import' || node.kind === 'export') continue;
+      // SECURITY (#383): never render the on-disk source of a config-leaf
+      // (Spring application.{yml,properties} key) — its line is `key = <secret>`,
+      // so whole-file/cluster rendering here would push secrets into context
+      // unbidden. The key still appears in the flow/symbol listing above.
+      if (isConfigLeafNode(node)) continue;
 
       const group = fileGroups.get(node.filePath) || { nodes: [], score: 0 };
       group.nodes.push(node);

+ 6 - 1
src/resolution/frameworks/java.ts

@@ -335,7 +335,12 @@ function extractSpringConfig(
       endColumn: valueText.length,
       language: lang,
       signature: dottedKey,
-      docstring: valueText.slice(0, 200),
+      // SECURITY (#383): store the KEY only, never the value. Config files
+      // routinely hold secrets (DB passwords, API keys, JDBC URLs with embedded
+      // credentials), and surfacing the value here pushes it into agent context
+      // unbidden (it lands in codegraph_node/explore output via the docstring).
+      // The key is all `@Value`/`@ConfigurationProperties` resolution needs; an
+      // agent that genuinely needs a value can read the file directly.
       updatedAt: now,
     });
   };

+ 19 - 0
src/utils.ts

@@ -46,6 +46,25 @@ const SENSITIVE_PATHS = new Set([
   'c:\\', 'c:\\windows', 'c:\\windows\\system32',
 ]);
 
+/**
+ * Config "languages" whose nodes are pure key/value DATA lifted from a config
+ * file (e.g. Spring `application.{yml,properties}`), not source code.
+ *
+ * `isConfigLeafNode` tests a node's `language` against this set with an exact
+ * (case-sensitive) `Set.has` lookup, so entries must match the language tag
+ * the extractor writes onto the node.
+ */
+export const CONFIG_LEAF_LANGUAGES: ReadonlySet<string> = new Set(['yaml', 'properties']);
+
+/**
+ * Reports whether a node is a config leaf: a single key lifted out of a pure
+ * config/data file — `kind: 'constant'` in a {@link CONFIG_LEAF_LANGUAGES}
+ * language. Its on-disk line is `key = <value>`, and that value is routinely a
+ * secret (DB password, API key, JDBC URL with embedded creds). CodeGraph must
+ * surface the KEY only and never read/return the value, or it pushes secrets
+ * into agent context unbidden — the value isn't needed for resolution, and an
+ * agent that genuinely needs it can read the file directly. (#383)
+ *
+ * NOTE(review): only `yaml` and `properties` are matched. Nodes from other
+ * data-bearing sources (for example the Liquid `{% schema %}` blocks changed
+ * in liquid-extractor.ts) are not covered by this predicate — confirm that
+ * their on-disk source cannot reach the same rendering paths.
+ *
+ * @param node - Any object exposing `kind` and, optionally, `language`.
+ * @returns `true` only when `kind` is exactly `'constant'` and `language` is a
+ *   non-empty member of {@link CONFIG_LEAF_LANGUAGES}; `false` otherwise,
+ *   including when `language` is missing or an empty string.
+ */
+export function isConfigLeafNode(node: { kind: string; language?: string }): boolean {
+  return node.kind === 'constant' && !!node.language && CONFIG_LEAF_LANGUAGES.has(node.language);
+}
+
 /**
  * Validate that a resolved file path stays within the project root.
  * Prevents path traversal attacks (e.g. node.filePath = "../../etc/passwd").