Kaynağa Gözat

feat(v7): M5.5 P3——机检候选、备料反复读清单、drift 方差 delta

lingfengQAQ 16 saat önce
ebeveyn
işleme
e7028e338a

+ 1 - 1
v7/src/cache/rebuilder.js

@@ -46,7 +46,7 @@ export async function rebuildCache(repoPath, db) {
     // 6. 扫描角色卡 → 填充 entities 表
     await scanCharacters(repoPath, db)
 
-    // 7. fingerprints 表留空(特征提取随 M3+ 体检补)
+    // 7. fingerprints 由体检按需重算(M5.5),重建不填
 
     db.exec('COMMIT')
     return { ok: true, warnings, errors }

+ 4 - 2
v7/src/commands/report-style-drift.js

@@ -1,6 +1,6 @@
 /**
  * report-style-drift → 当前指纹 vs 基线的差异
- * M1 边界:能读 fingerprints 表并对比基线,但不实现特征提取(表 M3+ 体检填充)。
+ * fingerprints 表由体检填充(M5.5:基线段 + 最近周期章段各一行)。
  * 表为空 → 返回友好中文错误。契约:纯返回 {ok, output?, error?}(见 design §6.2)。
  */
 export async function run(args, options, ctx) {
@@ -14,7 +14,7 @@ export async function run(args, options, ctx) {
   if (baseline.length === 0 || recent.length === 0) {
     return {
       ok: false,
-      error: '缺少指纹数据:fingerprints 表为空或不全,请先运行体检以提取文体特征(M1 不实现特征提取)。',
+      error: '缺少指纹数据:fingerprints 表为空或不全,请先运行体检以提取文体特征。',
     }
   }
 
@@ -24,6 +24,8 @@ export async function run(args, options, ctx) {
     基线章段: [b.chapter_range_start, b.chapter_range_end],
     最近章段: [r.chapter_range_start, r.chapter_range_end],
     avg_sentence_length_delta: (r.avg_sentence_length ?? 0) - (b.avg_sentence_length ?? 0),
+    sentence_length_variance_delta:
+      (r.sentence_length_variance ?? 0) - (b.sentence_length_variance ?? 0),
     avg_paragraph_length_delta: (r.avg_paragraph_length ?? 0) - (b.avg_paragraph_length ?? 0),
     vocabulary_richness_delta: (r.vocabulary_richness ?? 0) - (b.vocabulary_richness ?? 0),
   }

+ 67 - 1
v7/src/mechanical-check/index.js

@@ -3,13 +3,20 @@ import path from 'node:path'
 import { parseFrontMatter } from '../storage/parsers/front-matter.js'
 import { BookConfigReader } from '../storage/adapters/BookConfigReader.js'
 import { parseThreadDeclarations, VERBS, OPENING_VERBS } from '../util/thread-declarations.js'
+import { styleMetrics } from '../style-stats/index.js'
 
 // front matter 章档案必填字段(§4.1 机器消费部分)
 const REQUIRED_FM = ['章号', '标题', '卷', '字数', '章定位', '钩子', '情绪定位']
 
+// 句式偏离容差(vs 基线指纹;硬编码合理默认,候选只提醒不拦截)
+const AVG_LEN_TOLERANCE = 0.3
+const VARIANCE_TOLERANCE = 0.5
+
 /**
  * 机检:零 token 可计数项(D2 七项 + 条目变动形式检查,spec 0.9 §8 第 5 步)。
- * 不过关(pass=false)= 存在阻断 issue。新专名/信息差关键词只出候选(candidates),不拦截。
+ * 不过关(pass=false)= 存在阻断 issue。新专名/信息差关键词/高频意象/句式偏离只出候选
+ * (candidates),不拦截。高频意象与句式偏离消费体检缓存(meta 清单/基线指纹),
+ * 体检产出、机检消费——机检不做全书扫描。
  * @param {{repoPath: string, cache: object}} ctx
  * @param {{chapterNum: number, draftPath: string}} args
  * @returns {Promise<{ok: boolean, pass: boolean, issues: object[], candidates: object[], error: string}>}
@@ -37,6 +44,8 @@ export async function mechanicalCheck(ctx, { chapterNum, draftPath }) {
     checkFrontMatter(parsed, fm, issues) // 6
     await checkSecretKeywords(body, cache, candidates) // 7(候选)
     await checkThreadDeclarations(fm, cache, issues) // 8(条目变动,只查形式)
+    await checkImageryHits(body, cache, candidates) // 9(候选,消费体检的高频意象清单)
+    await checkStyleDeviation(body, cache, candidates) // 10(候选,vs 基线指纹)
 
     return { ok: true, pass: issues.length === 0, issues, candidates, error: '' }
   } catch (err) {
@@ -209,3 +218,60 @@ async function checkThreadDeclarations(fm, cache, issues) {
     }
   }
 }
+
+/**
+ * Flags high-frequency imagery that reappears in this chapter's draft.
+ *
+ * Reads the cross-chapter phrase list that the health check stores under meta key
+ * `imagery_top` and, for every listed phrase found in `body`, appends a
+ * non-blocking candidate. The check is best-effort: a failed query, unparsable
+ * JSON, or a stored value that is not an array all mean "nothing to report".
+ *
+ * @param {string} body - Draft text to search.
+ * @param {{query: Function}} cache - Cache handle; `query(sql)` resolves to rows with a `value` column.
+ * @param {object[]} candidates - Output list, appended to in place.
+ * @returns {Promise<void>}
+ */
+async function checkImageryHits(body, cache, candidates) {
+  let top
+  try {
+    const rows = await cache.query("SELECT value FROM meta WHERE key = 'imagery_top'")
+    top = JSON.parse(rows[0]?.value || '[]')
+  } catch {
+    // Unreadable cache or malformed JSON: skip silently.
+    return
+  }
+  // Valid JSON that is not an array (null, an object, a number) would make the loop
+  // below throw and fail the caller; skip it like any other unusable cache value.
+  if (!Array.isArray(top)) return
+  for (const t of top) {
+    // Skip entries without a phrase; splitting on '' would count every character.
+    if (!t?.phrase) continue
+    // Number of non-overlapping occurrences of the phrase in the draft.
+    const hits = body.split(t.phrase).length - 1
+    if (hits > 0) {
+      candidates.push({
+        type: '高频意象',
+        value: t.phrase,
+        description: `「${t.phrase}」全书已用 ${t.count} 次,本章又用 ${hits} 次,建议换个写法`,
+      })
+    }
+  }
+}
+
+/**
+ * Compares this chapter's sentence statistics with the baseline fingerprint and
+ * appends non-blocking candidates when they drift too far.
+ *
+ * The baseline is the `is_baseline = 1` fingerprint row with the highest
+ * `chapter_range_end`. A candidate is added when average sentence length deviates
+ * by at least AVG_LEN_TOLERANCE, or sentence-length variance by at least
+ * VARIANCE_TOLERANCE, relative to that baseline. A failed query or a missing
+ * baseline row skips the check silently.
+ *
+ * @param {string} body - Draft text handed to `styleMetrics`.
+ * @param {{query: Function}} cache - Cache handle; `query(sql)` resolves to result rows.
+ * @param {object[]} candidates - Output list, appended to in place.
+ * @returns {Promise<void>}
+ */
+async function checkStyleDeviation(body, cache, candidates) {
+  let baseline
+  try {
+    const rows = await cache.query(
+      'SELECT avg_sentence_length, sentence_length_variance FROM fingerprints WHERE is_baseline = 1 ORDER BY chapter_range_end DESC LIMIT 1'
+    )
+    baseline = rows[0] || null
+  } catch {
+    return
+  }
+  if (!baseline) return
+
+  const metrics = styleMetrics(body)
+
+  // A baseline value that is not greater than zero cannot anchor a relative deviation.
+  const baseAvg = baseline.avg_sentence_length
+  if (baseAvg > 0) {
+    const ratio = (metrics.平均句长 - baseAvg) / baseAvg
+    const magnitude = Math.abs(ratio)
+    if (magnitude >= AVG_LEN_TOLERANCE) {
+      const direction = ratio > 0 ? '变长' : '变短'
+      candidates.push({
+        type: '句式偏离',
+        value: '平均句长',
+        description: `本章平均句长 ${metrics.平均句长.toFixed(1)} 字,基线 ${baseAvg.toFixed(1)} 字,偏了 ${Math.round(magnitude * 100)}%,句子比基线明显${direction}`,
+      })
+    }
+  }
+
+  const baseVariance = baseline.sentence_length_variance
+  if (baseVariance > 0) {
+    const ratio = (metrics.句长方差 - baseVariance) / baseVariance
+    const magnitude = Math.abs(ratio)
+    if (magnitude >= VARIANCE_TOLERANCE) {
+      const direction = ratio > 0 ? '更参差' : '更齐整'
+      candidates.push({
+        type: '句式偏离',
+        value: '句长方差',
+        description: `本章句长方差 ${metrics.句长方差.toFixed(1)},基线 ${baseVariance.toFixed(1)},偏了 ${Math.round(magnitude * 100)}%,句子长短比基线${direction}`,
+      })
+    }
+  }
+}

+ 20 - 1
v7/src/prep/index.js

@@ -77,6 +77,25 @@ export async function prepareChapterMaterials(ctx, { chapterNum }) {
       // 无文风铁律
     }
 
+    // 反复读清单:体检产出的跨章高频意象(meta imagery_top),提醒本章避免再用
+    let 反复读 = '(尚未体检,暂无数据——首次体检后自动填充)'
+    try {
+      const metaRows = await cache.query("SELECT value FROM meta WHERE key = 'imagery_top'")
+      if (metaRows.length) {
+        const top = JSON.parse(metaRows[0].value || '[]')
+        反复读 = top.length
+          ? top
+              .slice(0, 10)
+              .map(
+                (t) => `- 「${t.phrase}」全书已用 ${t.count} 次(${t.chapterCount} 章出现),本章避免再用`
+              )
+              .join('\n')
+          : '(最近一次体检没有查出跨章高频复用短语)'
+      }
+    } catch {
+      // meta 读不到按未体检处理
+    }
+
     const parts = [
       `# 第 ${chapterNum} 章写作材料`,
       '',
@@ -94,7 +113,7 @@ export async function prepareChapterMaterials(ctx, { chapterNum }) {
       '',
       反和解 ? `## 反和解规则\n${反和解}` : '## 反和解规则\n(无)',
       '',
-      '## 反复读清单\n(暂空,跨章高频意象统计随 M5.5 体检补)',
+      `## 反复读清单\n${反复读}`,
       '',
     ]
     const content = parts.join('\n')

+ 5 - 4
v7/test/commands/report-style-drift.test.js

@@ -14,20 +14,21 @@ test('report-style-drift 无指纹数据 → 友好错误(M1 边界,不做
   }
 })
 
-test('report-style-drift 有基线+最近指纹 → 返回差异(测对比逻辑)', async () => {
+test('report-style-drift 有基线+最近指纹 → 返回含句长方差在内的差异', async () => {
   const { ctx, cleanup } = await fixtureCtx()
   try {
-    // M1 不做特征提取,手工插入基线 + 最近指纹,验证对比逻辑
+    // 手工插入基线 + 最近指纹,验证对比逻辑(特征提取由体检测试覆盖)
     await ctx.cache.query(
-      "INSERT INTO fingerprints (chapter_range_start, chapter_range_end, is_baseline, avg_sentence_length, vocabulary_richness, fingerprint_data) VALUES (1, 30, 1, 20.0, 0.5, '{}')"
+      "INSERT INTO fingerprints (chapter_range_start, chapter_range_end, is_baseline, avg_sentence_length, sentence_length_variance, vocabulary_richness, fingerprint_data) VALUES (1, 30, 1, 20.0, 30.0, 0.5, '{}')"
     )
     await ctx.cache.query(
-      "INSERT INTO fingerprints (chapter_range_start, chapter_range_end, is_baseline, avg_sentence_length, vocabulary_richness, fingerprint_data) VALUES (31, 40, 0, 25.0, 0.6, '{}')"
+      "INSERT INTO fingerprints (chapter_range_start, chapter_range_end, is_baseline, avg_sentence_length, sentence_length_variance, vocabulary_richness, fingerprint_data) VALUES (31, 40, 0, 25.0, 42.5, 0.6, '{}')"
     )
     const r = await run([], {}, ctx)
     assert.equal(r.ok, true)
     const drift = JSON.parse(r.output)
     assert.ok(Math.abs(drift.avg_sentence_length_delta - 5.0) < 1e-9)
+    assert.ok(Math.abs(drift.sentence_length_variance_delta - 12.5) < 1e-9)
   } finally {
     await cleanup()
   }

+ 103 - 0
v7/test/mechanical-check/check.test.js

@@ -218,3 +218,106 @@ test('机检 声明行不合「动词 编号」格式 → 阻断', async () => {
     await cleanup()
   }
 })
+
+// —— 体检消费两候选(M5.5:高频意象命中 + 句式偏离 vs 基线指纹,均非阻断)——
+
+// Excerpt from the Thousand Character Classic: no character repeats, so sentences
+// cut from it by length will not accidentally trigger the repetition check.
+const 字池 =
+  '天地玄黄宇宙洪荒日月盈昃辰宿列张寒来暑往秋收冬藏闰余成岁律吕调阳云腾致雨露结为霜金生丽水玉出昆冈剑号巨阙珠称夜光果珍李柰菜重芥姜海咸河淡鳞潜羽翔龙师火帝鸟官人皇始制文字乃服衣裳推位让国有虞陶唐吊民伐罪周发殷汤坐朝问道垂拱平章爱育黎首臣伏戎羌遐迩一体率宾归王鸣凤在竹白驹食场'
+
+/**
+ * Builds a draft body whose sentences have exactly the requested lengths.
+ *
+ * Consecutive slices of 字池 are joined with '。' and a final '。' is appended.
+ *
+ * @param {number[]} lengths - Character count of each sentence, in order.
+ * @returns {string} The assembled text.
+ * @throws {RangeError} When the requested lengths need more characters than 字池 holds;
+ *   without this, later sentences would silently come out short or empty.
+ */
+function sentencesOfLengths(lengths) {
+  let pos = 0
+  const parts = []
+  for (const n of lengths) {
+    if (pos + n > 字池.length) {
+      throw new RangeError(
+        `sentencesOfLengths needs ${pos + n} characters but 字池 has only ${字池.length}`
+      )
+    }
+    parts.push(字池.slice(pos, pos + n))
+    pos += n
+  }
+  return parts.join('。') + '。'
+}
+
+/**
+ * Runs the mechanical check for chapter 3 against a context created by
+ * `repoCtx(null, files(draftBody, { extra }))`, after the optional `seed(ctx)`
+ * hook has had a chance to populate the cache.
+ *
+ * On success the caller receives `{ r, cleanup }` and is responsible for calling
+ * `cleanup`. If seeding or the check throws, cleanup runs here before the error
+ * is rethrown.
+ */
+async function runWithCache(draftBody, { extra, seed } = {}) {
+  const fixtureFiles = files(draftBody, { extra })
+  const { ctx, cleanup } = await repoCtx(null, fixtureFiles)
+  try {
+    if (seed) {
+      await seed(ctx)
+    }
+    const r = await mechanicalCheck(ctx, {
+      chapterNum: 3,
+      draftPath: path.join(ctx.repoPath, '工作区', '草稿-A.md'),
+    })
+    return { r, cleanup }
+  } catch (err) {
+    await cleanup()
+    throw err
+  }
+}
+
+// Seed factory: returns a hook that inserts one baseline fingerprint row
+// (chapters 1-2) with the given average sentence length and variance.
+const 基线指纹 = (avg, variance) => {
+  const insertSql =
+    "INSERT INTO fingerprints (chapter_range_start, chapter_range_end, is_baseline, avg_sentence_length, sentence_length_variance, avg_paragraph_length, common_phrase_frequency, vocabulary_richness, fingerprint_data) VALUES (1, 2, 1, ?, ?, 20, '{}', 0.5, '{}')"
+  return (ctx) => ctx.cache.run(insertSql, [avg, variance])
+}
+
+// Extra fixture file: a book.yaml that sets the per-chapter target word count to n.
+const 目标字数 = (n) => {
+  const yaml = `spec_version: "7.0"\n书名: 测\n每章目标字数: ${n}\n`
+  return { 'book.yaml': yaml }
+}
+
+// A phrase listed in the cached imagery list and reused in the draft becomes a
+// candidate, while `pass` stays true.
+test('机检 高频意象命中(体检缓存)→ 候选非阻断,pass 不受影响', async () => {
+  const imageryTop = [
+    { phrase: '空气仿佛凝固', count: 47, chapterCount: 12, firstChapter: 3, lastChapter: 40 },
+  ]
+  const seedImagery = (ctx) =>
+    ctx.cache.run("INSERT OR REPLACE INTO meta (key, value) VALUES ('imagery_top', ?)", [
+      JSON.stringify(imageryTop),
+    ])
+  // The draft uses the listed phrase twice.
+  const draft =
+    '林晚推门而入,空气仿佛凝固。她环视四周缓缓落座,空气仿佛凝固,无人开口说话,落针可闻此时无声。'
+  const { r: result, cleanup } = await runWithCache(draft, { seed: seedImagery })
+  try {
+    const hit = result.candidates.find((entry) => entry.type === '高频意象')
+    assert.ok(hit, JSON.stringify(result.candidates))
+    assert.equal(hit.value, '空气仿佛凝固')
+    assert.match(hit.description, /全书已用 47 次,本章又用 2 次/)
+    assert.equal(result.pass, true, JSON.stringify(result.issues))
+  } finally {
+    await cleanup()
+  }
+})
+
+// With nothing seeded, neither cache-backed check may emit a candidate.
+test('机检 无体检数据 → 高频意象/句式偏离静默跳过', async () => {
+  const { r: result, cleanup } = await run(正常正文)
+  try {
+    const cacheBacked = result.candidates.some(
+      (entry) => entry.type === '高频意象' || entry.type === '句式偏离'
+    )
+    assert.ok(!cacheBacked)
+  } finally {
+    await cleanup()
+  }
+})
+
+// Boundary case below the threshold: no style-deviation candidate is expected.
+test('机检 句式偏离边界:平均句长偏 29% 不报', async () => {
+  // One 12-character sentence plus nine of 13: mean 12.9 against a baseline of 10.
+  const draft = sentencesOfLengths([12, 13, 13, 13, 13, 13, 13, 13, 13, 13])
+  const options = { extra: 目标字数(130), seed: 基线指纹(10, 0) }
+  const { r: result, cleanup } = await runWithCache(draft, options)
+  try {
+    const flagged = result.candidates.some((entry) => entry.type === '句式偏离')
+    assert.ok(!flagged, JSON.stringify(result.candidates))
+  } finally {
+    await cleanup()
+  }
+})
+
+// Boundary case above the threshold: reported as a candidate, never as an issue.
+test('机检 句式偏离边界:平均句长偏 31% 报(非阻断)', async () => {
+  // Nine 13-character sentences plus one of 14: mean 13.1 against a baseline of 10.
+  const draft = sentencesOfLengths([13, 13, 13, 13, 13, 13, 13, 13, 13, 14])
+  const options = { extra: 目标字数(130), seed: 基线指纹(10, 0) }
+  const { r: result, cleanup } = await runWithCache(draft, options)
+  try {
+    const hit = result.candidates.find(
+      (entry) => entry.type === '句式偏离' && entry.value === '平均句长'
+    )
+    assert.ok(hit, JSON.stringify(result.candidates))
+    assert.match(hit.description, /偏了 31%/)
+    const blocking = result.issues.some((issue) => issue.check === '句式偏离')
+    assert.ok(!blocking, '句式偏离只进候选不进 issues')
+  } finally {
+    await cleanup()
+  }
+})
+
+// Variance drifts while the mean matches the baseline: only the variance candidate appears.
+test('机检 句长方差偏离 ≥50% 报,平均句长未偏不误报', async () => {
+  // Lengths 10 and 14: mean 12 equals the baseline; variance 4 against a baseline of 1.
+  const draft = sentencesOfLengths([10, 14])
+  const options = { extra: 目标字数(25), seed: 基线指纹(12, 1) }
+  const { r: result, cleanup } = await runWithCache(draft, options)
+  try {
+    const deviations = result.candidates.filter((entry) => entry.type === '句式偏离')
+    const varianceHit = deviations.find((entry) => entry.value === '句长方差')
+    assert.ok(varianceHit, JSON.stringify(result.candidates))
+    assert.ok(!deviations.some((entry) => entry.value === '平均句长'))
+  } finally {
+    await cleanup()
+  }
+})

+ 29 - 0
v7/test/prep/prepare.test.js

@@ -31,3 +31,32 @@ test('prepareChapterMaterials 组装本章写作材料(八组件锚点)并
     await cleanup()
   }
 })
+
+// Without any imagery data in the cache, the section shows the "not checked yet" placeholder.
+test('prepareChapterMaterials 反复读清单:未体检 → 人话占位', async () => {
+  const { ctx, cleanup } = await tempBookCtx()
+  try {
+    const prepared = await prepareChapterMaterials(ctx, { chapterNum: 3 })
+    assert.equal(prepared.ok, true)
+    const placeholder = /## 反复读清单\n(尚未体检,暂无数据——首次体检后自动填充)/
+    assert.match(prepared.content, placeholder)
+  } finally {
+    await cleanup()
+  }
+})
+
+// With an imagery list stored in meta, each phrase is rendered with its book-wide counts.
+test('prepareChapterMaterials 反复读清单:体检后 → top 高频意象带全书次数(AC4)', async () => {
+  const { ctx, cleanup } = await tempBookCtx()
+  try {
+    const imageryTop = [
+      { phrase: '空气仿佛凝固', count: 47, chapterCount: 12, firstChapter: 3, lastChapter: 40 },
+      { phrase: '眼底闪过一丝', count: 21, chapterCount: 9, firstChapter: 5, lastChapter: 38 },
+    ]
+    await ctx.cache.run("INSERT OR REPLACE INTO meta (key, value) VALUES ('imagery_top', ?)", [
+      JSON.stringify(imageryTop),
+    ])
+    const prepared = await prepareChapterMaterials(ctx, { chapterNum: 3 })
+    assert.equal(prepared.ok, true)
+    const expectedLines = [
+      /- 「空气仿佛凝固」全书已用 47 次(12 章出现),本章避免再用/,
+      /- 「眼底闪过一丝」全书已用 21 次(9 章出现),本章避免再用/,
+    ]
+    for (const pattern of expectedLines) {
+      assert.match(prepared.content, pattern)
+    }
+  } finally {
+    await cleanup()
+  }
+})