index.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. import { promises as fs } from 'node:fs'
  2. import path from 'node:path'
  3. import { parseFrontMatter } from '../storage/parsers/front-matter.js'
  4. import { BookConfigReader } from '../storage/adapters/BookConfigReader.js'
  5. import { parseThreadDeclarations, VERBS, OPENING_VERBS } from '../util/thread-declarations.js'
  6. import { styleMetrics } from '../style-stats/index.js'
  7. // front matter 章档案必填字段(§4.1 机器消费部分)
  8. const REQUIRED_FM = ['章号', '标题', '卷', '字数', '章定位', '钩子', '情绪定位']
  9. // 句式偏离容差(vs 基线指纹;硬编码合理默认,候选只提醒不拦截)
  10. const AVG_LEN_TOLERANCE = 0.3
  11. const VARIANCE_TOLERANCE = 0.5
  12. /**
  13. * 机检:零 token 可计数项(D2 七项 + 条目变动形式检查,spec 0.9 §8 第 5 步)。
  14. * 不过关(pass=false)= 存在阻断 issue。新专名/信息差关键词/高频意象/句式偏离只出候选
  15. * (candidates),不拦截。高频意象与句式偏离消费体检缓存(meta 清单/基线指纹),
  16. * 体检产出、机检消费——机检不做全书扫描。
  17. * @param {{repoPath: string, cache: object}} ctx
  18. * @param {{chapterNum: number, draftPath: string}} args
  19. * @returns {Promise<{ok: boolean, pass: boolean, issues: object[], candidates: object[], error: string}>}
  20. */
  21. export async function mechanicalCheck(ctx, { chapterNum, draftPath }) {
  22. try {
  23. const { repoPath, cache } = ctx
  24. const raw = await fs.readFile(draftPath, 'utf8')
  25. const parsed = parseFrontMatter(raw)
  26. const body = parsed.ok ? parsed.body : raw
  27. const fm = parsed.ok ? parsed.data : {}
  28. const config = await new BookConfigReader(repoPath).read()
  29. const bookConfig = config.ok ? config.data : {}
  30. const style = await readStyleRules(repoPath)
  31. const issues = []
  32. const candidates = []
  33. checkWordCount(body, bookConfig, issues) // 1
  34. checkBannedWords(body, style.禁词, issues) // 2
  35. checkBannedPatterns(body, style.禁句式, issues) // 3
  36. checkRepetition(body, issues) // 4
  37. await checkNewProperNouns(body, cache, candidates) // 5(候选)
  38. checkFrontMatter(parsed, fm, issues) // 6
  39. await checkSecretKeywords(body, cache, candidates) // 7(候选)
  40. await checkThreadDeclarations(fm, cache, issues) // 8(条目变动,只查形式)
  41. await checkImageryHits(body, cache, candidates) // 9(候选,消费体检的高频意象清单)
  42. await checkStyleDeviation(body, cache, candidates) // 10(候选,vs 基线指纹)
  43. return { ok: true, pass: issues.length === 0, issues, candidates, error: '' }
  44. } catch (err) {
  45. return { ok: false, pass: false, issues: [], candidates: [], error: `机检失败:${err.message}` }
  46. }
  47. }
  48. async function readStyleRules(repoPath) {
  49. try {
  50. const content = await fs.readFile(path.join(repoPath, '文风', '文风铁律.md'), 'utf8')
  51. const parsed = parseFrontMatter(content)
  52. if (parsed.ok) {
  53. return { 禁词: parsed.data.禁词 || [], 禁句式: parsed.data.禁句式 || [] }
  54. }
  55. } catch {
  56. // 无文风铁律
  57. }
  58. return { 禁词: [], 禁句式: [] }
  59. }
  60. function checkWordCount(body, config, issues) {
  61. const target = config.每章目标字数 || 3000
  62. const tol = 0.3
  63. const count = [...body.replace(/\s+/g, '')].length
  64. if (count < target * (1 - tol)) {
  65. issues.push({ check: '字数', severity: 'medium', blocking: true, description: `字数 ${count} 低于目标 ${target} 下限` })
  66. } else if (count > target * (1 + tol)) {
  67. issues.push({ check: '字数', severity: 'medium', blocking: true, description: `字数 ${count} 高于目标 ${target} 上限` })
  68. }
  69. }
  70. function checkBannedWords(body, banned, issues) {
  71. for (const w of banned) {
  72. if (w && body.includes(w)) {
  73. issues.push({ check: '禁词', severity: 'high', blocking: true, description: `命中禁词「${w}」` })
  74. }
  75. }
  76. }
  77. function checkBannedPatterns(body, patterns, issues) {
  78. for (const p of patterns) {
  79. if (!p) continue
  80. try {
  81. if (new RegExp(p).test(body)) {
  82. issues.push({ check: '禁句式', severity: 'high', blocking: true, description: `命中禁句式 /${p}/` })
  83. }
  84. } catch {
  85. // 非法正则跳过(文风铁律里写错不该崩机检)
  86. }
  87. }
  88. }
  89. function checkRepetition(body, issues) {
  90. const text = body.replace(/\s+/g, '')
  91. const L = 6
  92. const threshold = 3
  93. if (text.length < L) return
  94. const counts = new Map()
  95. for (let i = 0; i + L <= text.length; i++) {
  96. const g = text.slice(i, i + L)
  97. counts.set(g, (counts.get(g) || 0) + 1)
  98. }
  99. for (const [g, c] of counts) {
  100. if (c >= threshold) {
  101. issues.push({ check: '复读', severity: 'medium', blocking: true, description: `短语「${g}」重复 ${c} 次` })
  102. break
  103. }
  104. }
  105. }
  106. // 保守启发式:对话提示词(道/说/问…)前的 2-3 字 Han 视作疑似人名,比对名册(非阻断候选)
  107. async function checkNewProperNouns(body, cache, candidates) {
  108. const known = new Set()
  109. try {
  110. for (const e of await cache.query('SELECT id FROM entities')) known.add(e.id)
  111. for (const a of await cache.query('SELECT alias FROM entity_aliases')) known.add(a.alias)
  112. } catch {
  113. // 无缓存,跳过
  114. }
  115. const seen = new Set()
  116. const re = /([一-龥]{2,3})(冷笑道|笑道|喝道|说道|问道|答道|道|说|喊|问)/g
  117. let m
  118. while ((m = re.exec(body))) {
  119. const name = m[1]
  120. if (!known.has(name) && !seen.has(name)) {
  121. seen.add(name)
  122. candidates.push({
  123. type: '新专名',
  124. value: name,
  125. description: `正文出现疑似新专名「${name}」,名册未登记,请确认(新实体 or 笔误)`,
  126. })
  127. }
  128. }
  129. }
  130. async function checkSecretKeywords(body, cache, candidates) {
  131. let secrets = []
  132. try {
  133. secrets = await cache.query('SELECT id, keywords FROM secrets WHERE reader_knows = 0')
  134. } catch {
  135. return
  136. }
  137. for (const s of secrets) {
  138. let kws = []
  139. try {
  140. kws = JSON.parse(s.keywords || '[]')
  141. } catch {
  142. kws = []
  143. }
  144. for (const kw of kws) {
  145. if (kw && body.includes(kw)) {
  146. candidates.push({
  147. type: '信息差候选',
  148. value: s.id,
  149. description: `正文出现信息差「${s.id}」关键词「${kw}」,疑似泄密候选(不拦截,请人工确认)`,
  150. })
  151. break
  152. }
  153. }
  154. }
  155. }
  156. function checkFrontMatter(parsed, fm, issues) {
  157. if (!parsed.ok) {
  158. issues.push({ check: 'front matter', severity: 'high', blocking: true, description: `front matter 解析失败:${parsed.error}` })
  159. return
  160. }
  161. const missing = REQUIRED_FM.filter(
  162. (k) => !(k in fm) || fm[k] === '' || fm[k] === null || fm[k] === undefined
  163. )
  164. if (missing.length) {
  165. issues.push({ check: 'front matter', severity: 'high', blocking: true, description: `front matter 缺字段:${missing.join('、')}` })
  166. }
  167. }
  168. // 条目变动形式检查(spec 0.9 §8 第 5 步;查 threads 表,零语义):
  169. // ①类型一致 ②开启类动词不得撞已有编号 ③非开启动词要求条目存在且状态=进行
  170. async function checkThreadDeclarations(fm, cache, issues) {
  171. const { declarations, malformed } = parseThreadDeclarations(fm)
  172. for (const bad of malformed) {
  173. issues.push({ check: '条目变动', severity: 'high', blocking: true, description: `条目声明格式应为「动词 编号」:${bad}` })
  174. }
  175. if (!declarations.length) return
  176. const known = new Map()
  177. try {
  178. for (const t of await cache.query('SELECT id, status FROM threads')) known.set(t.id, t.status)
  179. } catch {
  180. return // 无缓存,跳过(形式检查依赖条目表)
  181. }
  182. for (const d of declarations) {
  183. if (!d.id.startsWith(`${d.type}-`)) {
  184. issues.push({ check: '条目变动', severity: 'high', blocking: true, description: `「${d.type}」清单里出现异类编号「${d.id}」` })
  185. continue
  186. }
  187. if (!VERBS[d.type].includes(d.verb)) {
  188. issues.push({ check: '条目变动', severity: 'high', blocking: true, description: `「${d.type}」没有动词「${d.verb}」(${d.raw}),合法动词:${VERBS[d.type].join('/')}` })
  189. continue
  190. }
  191. const status = known.get(d.id)
  192. if (OPENING_VERBS.has(d.verb)) {
  193. if (status !== undefined) {
  194. issues.push({ check: '条目变动', severity: 'high', blocking: true, description: `「${d.raw}」:${d.id} 已存在(状态:${status}),开新条目须用新编号` })
  195. }
  196. } else if (status === undefined) {
  197. issues.push({ check: '条目变动', severity: 'high', blocking: true, description: `「${d.raw}」:${d.id} 不存在,疑似编号笔误` })
  198. } else if (status !== '进行') {
  199. issues.push({ check: '条目变动', severity: 'high', blocking: true, description: `「${d.raw}」:${d.id} 状态是「${status}」,不能再「${d.verb}」` })
  200. }
  201. }
  202. }
  203. // 体检产出的跨章高频意象清单(meta imagery_top):本章草稿命中 → 非阻断提醒;未体检过 → 静默跳过
  204. async function checkImageryHits(body, cache, candidates) {
  205. let top = []
  206. try {
  207. const rows = await cache.query("SELECT value FROM meta WHERE key = 'imagery_top'")
  208. top = JSON.parse(rows[0]?.value || '[]')
  209. } catch {
  210. return
  211. }
  212. for (const t of top) {
  213. if (!t?.phrase) continue
  214. const hits = body.split(t.phrase).length - 1
  215. if (hits > 0) {
  216. candidates.push({
  217. type: '高频意象',
  218. value: t.phrase,
  219. description: `「${t.phrase}」全书已用 ${t.count} 次,本章又用 ${hits} 次,建议换个写法`,
  220. })
  221. }
  222. }
  223. }
  224. // 本章句式 vs 基线指纹(体检 upsert 的基线行):平均句长偏 ≥30% 或句长方差偏 ≥50% → 非阻断提醒;无基线 → 静默跳过
  225. async function checkStyleDeviation(body, cache, candidates) {
  226. let base = null
  227. try {
  228. const rows = await cache.query(
  229. 'SELECT avg_sentence_length, sentence_length_variance FROM fingerprints WHERE is_baseline = 1 ORDER BY chapter_range_end DESC LIMIT 1'
  230. )
  231. base = rows[0] || null
  232. } catch {
  233. return
  234. }
  235. if (!base) return
  236. const m = styleMetrics(body)
  237. if (base.avg_sentence_length > 0) {
  238. const dev = (m.平均句长 - base.avg_sentence_length) / base.avg_sentence_length
  239. if (Math.abs(dev) >= AVG_LEN_TOLERANCE) {
  240. candidates.push({
  241. type: '句式偏离',
  242. value: '平均句长',
  243. description: `本章平均句长 ${m.平均句长.toFixed(1)} 字,基线 ${base.avg_sentence_length.toFixed(1)} 字,偏了 ${Math.round(Math.abs(dev) * 100)}%,句子比基线明显${dev > 0 ? '变长' : '变短'}`,
  244. })
  245. }
  246. }
  247. if (base.sentence_length_variance > 0) {
  248. const dev = (m.句长方差 - base.sentence_length_variance) / base.sentence_length_variance
  249. if (Math.abs(dev) >= VARIANCE_TOLERANCE) {
  250. candidates.push({
  251. type: '句式偏离',
  252. value: '句长方差',
  253. description: `本章句长方差 ${m.句长方差.toFixed(1)},基线 ${base.sentence_length_variance.toFixed(1)},偏了 ${Math.round(Math.abs(dev) * 100)}%,句子长短比基线${dev > 0 ? '更参差' : '更齐整'}`,
  254. })
  255. }
  256. }
  257. }