Просмотр исходного кода

feat(v7): M2 P2——机检 7 项可计数(统计项留 M3+ 体检)

mechanicalCheck → {pass, issues, candidates}:
- 阻断项:字数区间、禁词(文风铁律)、禁句式(正则)、本章内复读(6-gram≥3)、
  front matter 完整性(章号/标题/卷/字数/章定位/钩子/情绪定位)
- 非阻断候选:新专名比名册(对话提示词前 2-3 字 Han 启发式)、信息差关键词命中
- 统计项(句式体检/跨章高频意象)留 M3+ 体检,checks 结构预留扩展
8 项测试逐项正例(正常 + 7 检查),受控临时仓库。
lingfengQAQ 1 день назад
Родитель
Коммит
28e768050e
2 изменённых файла: 293 добавлено и 3 удалено
  1. 171 3
      v7/src/mechanical-check/index.js
  2. 122 0
      v7/test/mechanical-check/check.test.js

+ 171 - 3
v7/src/mechanical-check/index.js

@@ -1,3 +1,171 @@
-// 机检:零 token 的可计数项(字数、禁词、复读、句式体检、新专名比对等)。
-// 占位——真实实现见 M2。
-export {}
+import { promises as fs } from 'node:fs'
+import path from 'node:path'
+import { parseFrontMatter } from '../storage/parsers/front-matter.js'
+import { BookConfigReader } from '../storage/adapters/BookConfigReader.js'
+
+// Required front-matter fields of the chapter record (§4.1, the machine-consumed part).
+// checkFrontMatter reports every field listed here that is absent or empty.
+const REQUIRED_FM = ['章号', '标题', '卷', '字数', '章定位', '钩子', '情绪定位']
+
+/**
+ * 机检:零 token 可计数项(D2 七项)。不过关(pass=false)= 存在阻断 issue。
+ * 新专名/信息差关键词只出候选(candidates),不拦截。
+ * @param {{repoPath: string, cache: object}} ctx
+ * @param {{chapterNum: number, draftPath: string}} args
+ * @returns {Promise<{ok: boolean, pass: boolean, issues: object[], candidates: object[], error: string}>}
+ */
+export async function mechanicalCheck(ctx, { chapterNum, draftPath }) {
+  try {
+    const { repoPath, cache } = ctx
+    const raw = await fs.readFile(draftPath, 'utf8')
+    const parsed = parseFrontMatter(raw)
+    const body = parsed.ok ? parsed.body : raw
+    const fm = parsed.ok ? parsed.data : {}
+
+    const config = await new BookConfigReader(repoPath).read()
+    const bookConfig = config.ok ? config.data : {}
+    const style = await readStyleRules(repoPath)
+
+    const issues = []
+    const candidates = []
+
+    checkWordCount(body, bookConfig, issues) // 1
+    checkBannedWords(body, style.禁词, issues) // 2
+    checkBannedPatterns(body, style.禁句式, issues) // 3
+    checkRepetition(body, issues) // 4
+    await checkNewProperNouns(body, cache, candidates) // 5(候选)
+    checkFrontMatter(parsed, fm, issues) // 6
+    await checkSecretKeywords(body, cache, candidates) // 7(候选)
+
+    return { ok: true, pass: issues.length === 0, issues, candidates, error: '' }
+  } catch (err) {
+    return { ok: false, pass: false, issues: [], candidates: [], error: `机检失败:${err.message}` }
+  }
+}
+
+async function readStyleRules(repoPath) {
+  try {
+    const content = await fs.readFile(path.join(repoPath, '文风', '文风铁律.md'), 'utf8')
+    const parsed = parseFrontMatter(content)
+    if (parsed.ok) {
+      return { 禁词: parsed.data.禁词 || [], 禁句式: parsed.data.禁句式 || [] }
+    }
+  } catch {
+    // 无文风铁律
+  }
+  return { 禁词: [], 禁句式: [] }
+}
+
+function checkWordCount(body, config, issues) {
+  const target = config.每章目标字数 || 3000
+  const tol = 0.3
+  const count = [...body.replace(/\s+/g, '')].length
+  if (count < target * (1 - tol)) {
+    issues.push({ check: '字数', severity: 'medium', blocking: true, description: `字数 ${count} 低于目标 ${target} 下限` })
+  } else if (count > target * (1 + tol)) {
+    issues.push({ check: '字数', severity: 'medium', blocking: true, description: `字数 ${count} 高于目标 ${target} 上限` })
+  }
+}
+
+function checkBannedWords(body, banned, issues) {
+  for (const w of banned) {
+    if (w && body.includes(w)) {
+      issues.push({ check: '禁词', severity: 'high', blocking: true, description: `命中禁词「${w}」` })
+    }
+  }
+}
+
+function checkBannedPatterns(body, patterns, issues) {
+  for (const p of patterns) {
+    if (!p) continue
+    try {
+      if (new RegExp(p).test(body)) {
+        issues.push({ check: '禁句式', severity: 'high', blocking: true, description: `命中禁句式 /${p}/` })
+      }
+    } catch {
+      // 非法正则跳过(文风铁律里写错不该崩机检)
+    }
+  }
+}
+
+function checkRepetition(body, issues) {
+  const text = body.replace(/\s+/g, '')
+  const L = 6
+  const threshold = 3
+  if (text.length < L) return
+  const counts = new Map()
+  for (let i = 0; i + L <= text.length; i++) {
+    const g = text.slice(i, i + L)
+    counts.set(g, (counts.get(g) || 0) + 1)
+  }
+  for (const [g, c] of counts) {
+    if (c >= threshold) {
+      issues.push({ check: '复读', severity: 'medium', blocking: true, description: `短语「${g}」重复 ${c} 次` })
+      break
+    }
+  }
+}
+
+// 保守启发式:对话提示词(道/说/问…)前的 2-3 字 Han 视作疑似人名,比对名册(非阻断候选)
+async function checkNewProperNouns(body, cache, candidates) {
+  const known = new Set()
+  try {
+    for (const e of await cache.query('SELECT id FROM entities')) known.add(e.id)
+    for (const a of await cache.query('SELECT alias FROM entity_aliases')) known.add(a.alias)
+  } catch {
+    // 无缓存,跳过
+  }
+  const seen = new Set()
+  const re = /([一-龥]{2,3})(冷笑道|笑道|喝道|说道|问道|答道|道|说|喊|问)/g
+  let m
+  while ((m = re.exec(body))) {
+    const name = m[1]
+    if (!known.has(name) && !seen.has(name)) {
+      seen.add(name)
+      candidates.push({
+        type: '新专名',
+        value: name,
+        description: `正文出现疑似新专名「${name}」,名册未登记,请确认(新实体 or 笔误)`,
+      })
+    }
+  }
+}
+
+async function checkSecretKeywords(body, cache, candidates) {
+  let secrets = []
+  try {
+    secrets = await cache.query('SELECT id, keywords FROM secrets WHERE reader_knows = 0')
+  } catch {
+    return
+  }
+  for (const s of secrets) {
+    let kws = []
+    try {
+      kws = JSON.parse(s.keywords || '[]')
+    } catch {
+      kws = []
+    }
+    for (const kw of kws) {
+      if (kw && body.includes(kw)) {
+        candidates.push({
+          type: '信息差候选',
+          value: s.id,
+          description: `正文出现信息差「${s.id}」关键词「${kw}」,疑似泄密候选(不拦截,请人工确认)`,
+        })
+        break
+      }
+    }
+  }
+}
+
+function checkFrontMatter(parsed, fm, issues) {
+  if (!parsed.ok) {
+    issues.push({ check: 'front matter', severity: 'high', blocking: true, description: `front matter 解析失败:${parsed.error}` })
+    return
+  }
+  const missing = REQUIRED_FM.filter(
+    (k) => !(k in fm) || fm[k] === '' || fm[k] === null || fm[k] === undefined
+  )
+  if (missing.length) {
+    issues.push({ check: 'front matter', severity: 'high', blocking: true, description: `front matter 缺字段:${missing.join('、')}` })
+  }
+}

+ 122 - 0
v7/test/mechanical-check/check.test.js

@@ -0,0 +1,122 @@
+import { test } from 'node:test'
+import assert from 'node:assert/strict'
+import path from 'node:path'
+import { mechanicalCheck } from '../../src/mechanical-check/index.js'
+import { repoCtx } from '../commands/_helper.js'
+
+// Fixture: style rules file whose front matter bans one word (眸子一缩)
+// and one sentence pattern (the regex 不是.*而是).
+const 文风铁律 = `---
+禁词:
+  - 眸子一缩
+禁句式:
+  - '不是.*而是'
+---
+## 铁律
+节奏优先。
+`
+// Fixture: roster table with a single entry, 林晚 (alias 晚晚, type character).
+const 名册 = '| 正名 | 别名 | 类型 | 首现章 |\n|--|--|--|--|\n| 林晚 | 晚晚 | character | 1 |\n'
+// Fixture: secret file marked as unknown to the reader (读者知道: false) with the keyword 玉佩.
+const 信息差 = '---\n读者知道: false\n登记章: 1\n关键词:\n  - 玉佩\n---\n## 内容\n秘密。\n'
+
+// 组装一个含 front matter + 正文的草稿,并放进受控临时仓库
+function files(draftBody, { fm } = {}) {
+  const front =
+    fm ??
+    `章号: 3\n标题: 测试章\n卷: 1\n字数: ${[...draftBody.replace(/\s+/g, '')].length}\n章定位: 推进\n钩子: 危机钩-强\n情绪定位: 铺垫`
+  return {
+    'book.yaml': 'spec_version: "7.0"\n书名: 测\n每章目标字数: 50\n',
+    '文风/文风铁律.md': 文风铁律,
+    '定稿/设定/名册.md': 名册,
+    '定稿/设定/信息差/信息差-001-x.md': 信息差,
+    '工作区/草稿-A.md': `---\n${front}\n---\n${draftBody}`,
+  }
+}
+
+async function run(draftBody, opts) {
+  const { ctx, cleanup } = await repoCtx(null, files(draftBody, opts))
+  try {
+    const draftPath = path.join(ctx.repoPath, '工作区', '草稿-A.md')
+    const r = await mechanicalCheck(ctx, { chapterNum: 3, draftPath })
+    return { r, cleanup }
+  } catch (e) {
+    await cleanup()
+    throw e
+  }
+}
+
+const 正常正文 = '林晚立于大殿之前,握紧手中令牌,暗自下定决心,此番定要查明当年旧案,还师门公道。'
+
+test('机检 正常草稿 → pass=true,无阻断 issue', async () => {
+  const { r, cleanup } = await run(正常正文)
+  try {
+    assert.equal(r.ok, true)
+    assert.equal(r.pass, true, `不应有阻断 issue:${JSON.stringify(r.issues)}`)
+  } finally {
+    await cleanup()
+  }
+})
+
+test('机检 字数太短 → 阻断 issue(字数)', async () => {
+  const { r, cleanup } = await run('林晚。', { fm: '章号: 3\n标题: 测\n卷: 1\n字数: 2\n章定位: 推进\n钩子: 危机钩-强\n情绪定位: 铺垫' })
+  try {
+    assert.equal(r.pass, false)
+    assert.ok(r.issues.some((i) => i.check === '字数'))
+  } finally {
+    await cleanup()
+  }
+})
+
+test('机检 命中禁词 → 阻断 issue(禁词)', async () => {
+  const { r, cleanup } = await run(正常正文 + '他眸子一缩,盯着令牌看了又看,心头警兆大作久久难平。')
+  try {
+    assert.ok(r.issues.some((i) => i.check === '禁词'))
+    assert.equal(r.pass, false)
+  } finally {
+    await cleanup()
+  }
+})
+
+test('机检 命中禁句式正则 → 阻断 issue(禁句式)', async () => {
+  const { r, cleanup } = await run('这把剑不是凡铁而是上古神兵,林晚握着它,只觉一股暖流缓缓涌入四肢百骸之间。')
+  try {
+    assert.ok(r.issues.some((i) => i.check === '禁句式'))
+  } finally {
+    await cleanup()
+  }
+})
+
+test('机检 本章内复读 → 阻断 issue(复读)', async () => {
+  const { r, cleanup } = await run('空气仿佛凝固空气仿佛凝固空气仿佛凝固空气仿佛凝固,林晚站在原地一动不动。')
+  try {
+    assert.ok(r.issues.some((i) => i.check === '复读'))
+  } finally {
+    await cleanup()
+  }
+})
+
+test('机检 缺 front matter 字段 → 阻断 issue(front matter)', async () => {
+  const { r, cleanup } = await run(正常正文, { fm: '章号: 3\n标题: 测\n卷: 1\n字数: 40\n章定位: 推进' }) // 缺钩子/情绪定位
+  try {
+    assert.ok(r.issues.some((i) => i.check === 'front matter'))
+  } finally {
+    await cleanup()
+  }
+})
+
+test('机检 新专名比名册 → 候选(非阻断)', async () => {
+  const { r, cleanup } = await run('赵铁山道:“何人擅闯?”林晚抬眼望去,只见来人一身玄衣气度不凡令人不敢直视。')
+  try {
+    assert.ok(r.candidates.some((c) => c.type === '新专名' && c.value === '赵铁山'))
+    // 新专名非阻断
+    assert.ok(!r.issues.some((i) => i.check === '新专名'))
+  } finally {
+    await cleanup()
+  }
+})
+
+test('机检 信息差关键词命中 → 候选(非阻断)', async () => {
+  const { r, cleanup } = await run('林晚摩挲着那枚玉佩,心中疑云密布,却始终参不透其中藏着的惊天秘密究竟为何。')
+  try {
+    assert.ok(r.candidates.some((c) => c.type === '信息差候选'))
+  } finally {
+    await cleanup()
+  }
+})