18 tuntia sitten · 0a3009010e
--- a/v7/src/style-stats/index.js
+++ b/v7/src/style-stats/index.js
@@ -0,0 +1,279 @@
 
				+/**
			
 
				+ * style-stats：体检统计算法（spec §9，M5.5）。纯函数、零依赖、无 IO；
			
 
				+ * 全部固定排序纯计数、无时间戳无随机——同输入任何时候重算结果逐字段一致
			
 
				+ * （「删缓存全量重建后指纹不变」的根基）。
			
 
				+ */
			
 
				+
			
 
				/**
				 * Cross-chapter imagery thresholds: a phrase is reported as high-frequency only
				 * when it occurs at least IMAGERY_MIN_COUNT times in the whole book AND shows up
				 * in at least IMAGERY_MIN_CHAPTERS chapters. The report is advisory (it never
				 * blocks anything), so sensible hard-coded defaults are enough.
				 */
				export const IMAGERY_MIN_COUNT = 10
				export const IMAGERY_MIN_CHAPTERS = 3

				// Shortest and longest n-gram length (in characters, inclusive) scanned by extractImagery.
				const MIN_GRAM = 4
				const MAX_GRAM = 8
				// Number of characters per window used by windowTTR.
				const TTR_WINDOW = 1000
			
 
				+
			
 
				/**
				 * Split text into sentences on Chinese sentence-ending punctuation (。！？；…).
				 *
				 * A run of terminators together with every closing quote that directly follows
				 * it (” ’ 』 」, possibly stacked) is consumed as a single delimiter. A quoted
				 * exclamation such as 「站住！」 therefore produces one sentence, and the next
				 * sentence never starts with a stray closing quote that would inflate its
				 * length. Terminators and absorbed closing quotes are not part of the result.
				 *
				 * @param {string} text Raw text; any falsy value yields an empty list.
				 * @returns {string[]} Trimmed, non-empty sentences in document order.
				 */
				export function splitSentences(text) {
				  if (!text) return []
				  return text
				    // `*` rather than `?`: absorb stacked closing quotes (！」』) as well as ’
				    .split(/[。！？；…]+[”’』」]*/)
				    .map((s) => s.trim())
				    .filter(Boolean)
				}
			
 
				+
			
 
				/**
				 * Split text into paragraphs. Every newline ends a paragraph (the one-line-per-
				 * paragraph convention of web fiction); runs of blank lines never produce empty
				 * paragraphs.
				 *
				 * @param {string} text Raw text; any falsy value yields an empty list.
				 * @returns {string[]} Trimmed, non-empty paragraphs in document order.
				 */
				export function splitParagraphs(text) {
				  if (!text) return []
				  const paragraphs = []
				  for (const chunk of text.split(/\n+/)) {
				    const trimmed = chunk.trim()
				    if (trimmed) paragraphs.push(trimmed)
				  }
				  return paragraphs
				}
			
 
				+
			
 
				/**
				 * Sentence and paragraph metrics for a piece of text. All lengths are counted
				 * in characters with whitespace removed.
				 *
				 * @param {string} text Body text.
				 * @param {Set<string>} exclude Roster names/aliases; sentence openers that are a
				 *   name prefix are left out of the opener statistics (a name is not a sentence
				 *   pattern).
				 * @returns {{句数, 平均句长, 句长方差, 段落数, 平均段长, 段落分布, 高频开头}}
				 *   sentence count, mean and variance of sentence length, paragraph count, mean
				 *   paragraph length, paragraph length distribution, and the most frequent
				 *   sentence openers.
				 */
				export function styleMetrics(text, exclude = new Set()) {
				  const sentences = splitSentences(text)
				  const paragraphs = splitParagraphs(text)
				  const sentenceLengths = sentences.map((s) => countChars(s))
				  const paragraphLengths = paragraphs.map((p) => countChars(p))

				  const 句数 = sentences.length
				  const 平均句长 = mean(sentenceLengths)
				  const 句长方差 = variance(sentenceLengths)
				  const 段落数 = paragraphs.length
				  const 平均段长 = mean(paragraphLengths)
				  const 段落分布 = paragraphDistribution(paragraphLengths)
				  const 高频开头 = topOpeners(sentences, exclude)

				  return { 句数, 平均句长, 句长方差, 段落数, 平均段长, 段落分布, 高频开头 }
				}
			
 
				+
			
 
				/**
				 * Cross-chapter high-frequency imagery: repeated n-grams (n = MIN_GRAM..MAX_GRAM)
				 * found inside contiguous runs of Chinese characters.
				 *
				 * Roster names/aliases of length >= 2 cut the character runs wherever they occur,
				 * so neither phrases containing a name nor fragments straddling a name can be
				 * reported: repeated "name + action" phrases are ordinary narration, not
				 * imagery repetition.
				 *
				 * @param {{num: number, text: string}[]} chapters Final chapters (number + body).
				 * @param {Set<string>} exclude Roster names/aliases.
				 * @param {{minCount?: number, minChapters?: number}} [opts] Thresholds; default
				 *   to IMAGERY_MIN_COUNT occurrences in IMAGERY_MIN_CHAPTERS chapters.
				 * @returns {{phrase: string, count: number, chapterCount: number, firstChapter: number, lastChapter: number}[]}
				 *   Full list ordered by count descending, then phrase in code-unit order
				 *   ascending; callers take the top N themselves.
				 */
				export function extractImagery(chapters, exclude = new Set(), opts = {}) {
				  const minCount = opts.minCount ?? IMAGERY_MIN_COUNT
				  const minChapters = opts.minChapters ?? IMAGERY_MIN_CHAPTERS
				  // Longest names first; ties broken by code-unit order so the cut order is fixed.
				  const names = [...exclude]
				    .filter((n) => typeof n === 'string' && n.length >= 2)
				    .sort((a, b) => b.length - a.length || cmp(a, b))

				  const runsByChapter = chapters.map((c) => ({
				    num: c.num,
				    runs: cjkRuns(c.text || '', names),
				  }))

				  const frequent = countFrequentGrams(runsByChapter, minCount)
				  const chapterSets = collectChapterSets(runsByChapter, frequent)

				  const candidates = []
				  for (const [phrase, count] of frequent) {
				    const nums = [...(chapterSets.get(phrase) || [])].sort((a, b) => a - b)
				    if (nums.length < minChapters) continue
				    candidates.push({
				      phrase,
				      count,
				      chapterCount: nums.length,
				      firstChapter: nums[0],
				      lastChapter: nums[nums.length - 1],
				    })
				  }

				  return dropCoveredFragments(candidates)
				}

				// Level-wise (Apriori-style) counting: phrase -> whole-book occurrences for every
				// n-gram reaching minCount. Level n only counts grams whose first n-1 characters
				// were frequent on the previous level, which keeps the candidate maps bounded.
				function countFrequentGrams(runsByChapter, minCount) {
				  const frequent = new Map()
				  let prevLevel = null
				  for (let n = MIN_GRAM; n <= MAX_GRAM; n++) {
				    const counts = new Map()
				    for (const { runs } of runsByChapter) {
				      for (const run of runs) {
				        for (let i = 0; i + n <= run.length; i++) {
				          if (prevLevel && !prevLevel.has(run.slice(i, i + n - 1))) continue
				          const gram = run.slice(i, i + n)
				          counts.set(gram, (counts.get(gram) || 0) + 1)
				        }
				      }
				    }
				    const level = new Set()
				    for (const [gram, count] of counts) {
				      if (count >= minCount) {
				        level.add(gram)
				        frequent.set(gram, count)
				      }
				    }
				    // Nothing frequent at this length means nothing longer can be frequent either.
				    if (!level.size) break
				    prevLevel = level
				  }
				  return frequent
				}

				// Second pass: phrase -> Set of chapter numbers it occurs in, for frequent phrases
				// only (the counting pass keeps no per-gram chapter sets, to limit memory).
				function collectChapterSets(runsByChapter, frequent) {
				  const byLen = new Map()
				  for (const gram of frequent.keys()) {
				    if (!byLen.has(gram.length)) byLen.set(gram.length, new Set())
				    byLen.get(gram.length).add(gram)
				  }
				  const chapterSets = new Map()
				  for (const { num, runs } of runsByChapter) {
				    for (const run of runs) {
				      for (const [len, grams] of byLen) {
				        for (let i = 0; i + len <= run.length; i++) {
				          const gram = run.slice(i, i + len)
				          if (!grams.has(gram)) continue
				          if (!chapterSets.has(gram)) chapterSets.set(gram, new Set())
				          chapterSets.get(gram).add(num)
				        }
				      }
				    }
				  }
				  return chapterSets
				}

				// Longest-first de-duplication: a phrase that is a substring of an already kept
				// phrase and occurs at most 1.25x as often is treated as a fragment of the same
				// image and dropped. Returns the survivors ordered by count descending, then
				// phrase ascending. Sorts `candidates` in place.
				function dropCoveredFragments(candidates) {
				  candidates.sort(
				    (a, b) => b.phrase.length - a.phrase.length || b.count - a.count || cmp(a.phrase, b.phrase)
				  )
				  const kept = []
				  for (const c of candidates) {
				    const covered = kept.some((k) => k.phrase.includes(c.phrase) && c.count <= k.count * 1.25)
				    if (!covered) kept.push(c)
				  }
				  kept.sort((a, b) => b.count - a.count || cmp(a.phrase, b.phrase))
				  return kept
				}
			
 
				+
			
 
				/**
				 * Style fingerprint for a chapter range: the five commonly queried columns plus
				 * a `fingerprint_data` object that repeats them and adds the paragraph
				 * distribution, frequent sentence openers, total character count and chapter
				 * count. Nothing is JSON-encoded here; serialization is left to the caller.
				 *
				 * @param {{num: number, text: string}[]} chapters
				 * @param {Set<string>} exclude Roster names/aliases.
				 * @returns {{avg_sentence_length: number, sentence_length_variance: number,
				 *   avg_paragraph_length: number, common_phrase_frequency: Object<string, number>,
				 *   vocabulary_richness: number, fingerprint_data: object}}
				 */
				export function extractFingerprint(chapters, exclude = new Set()) {
				  const combined = chapters.map((c) => c.text || '').join('\n\n')
				  const { 平均句长, 句长方差, 平均段长, 段落分布, 高频开头 } = styleMetrics(combined, exclude)

				  // Common phrases inside the range: the chapter-spread condition is relaxed to
				  // >= 1 chapter, and only the top 10 are kept.
				  const topPhrases = extractImagery(chapters, exclude, { minChapters: 1 }).slice(0, 10)
				  const common_phrase_frequency = Object.fromEntries(
				    topPhrases.map(({ phrase, count }) => [phrase, count])
				  )

				  const columns = {
				    avg_sentence_length: 平均句长,
				    sentence_length_variance: 句长方差,
				    avg_paragraph_length: 平均段长,
				    common_phrase_frequency,
				    vocabulary_richness: windowTTR(combined),
				  }
				  return {
				    ...columns,
				    fingerprint_data: {
				      ...columns,
				      段落分布,
				      高频开头,
				      总字数: countChars(combined),
				      章数: chapters.length,
				    },
				  }
				}
			
 
				+
			
 
				/**
				 * Vocabulary richness as a windowed type-token ratio. Everything except Chinese
				 * characters, ASCII letters and digits is stripped; the remainder is cut into
				 * consecutive, non-overlapping windows of TTR_WINDOW characters and the
				 * unique/length ratio of each window is averaged. The last window uses its
				 * actual (possibly shorter) length. A plain unique/total ratio depends on text
				 * length and cannot be compared across chapter ranges.
				 *
				 * @param {string} text
				 * @returns {number} Mean per-window ratio; 0 when no countable character remains.
				 */
				export function windowTTR(text) {
				  const kept = (text || '').replace(/[^一-龥A-Za-z0-9]/g, '')
				  const chars = [...kept]
				  if (!chars.length) return 0
				  const ratios = []
				  let start = 0
				  while (start < chars.length) {
				    const slice = chars.slice(start, start + TTR_WINDOW)
				    ratios.push(new Set(slice).size / slice.length)
				    start += TTR_WINDOW
				  }
				  return mean(ratios)
				}
			
 
				+
			
 
				+// —— 内部工具（全部确定性：固定遍历顺序、码元序比较，不用 locale）——
			
 
				+
			
 
				// Number of characters (code points) in `s` once all whitespace is removed.
				function countChars(s) {
				  const stripped = s.replace(/\s+/g, '')
				  return Array.from(stripped).length
				}
			
 
				+
			
 
				// Arithmetic mean of `nums`; 0 for an empty list.
				function mean(nums) {
				  const size = nums.length
				  if (!size) return 0
				  const total = nums.reduce((sum, x) => sum + x, 0)
				  return total / size
				}
			
 
				+
			
 
				// Population variance of `nums` (squared deviations divided by the element
				// count); 0 for an empty list.
				function variance(nums) {
				  const size = nums.length
				  if (!size) return 0
				  const center = mean(nums)
				  let squares = 0
				  squares = nums.reduce((acc, n) => acc + (n - center) * (n - center), squares)
				  return squares / size
				}
			
 
				+
			
 
				// Three-way comparison with the built-in < and > operators (code-unit order for
				// strings, no locale): -1, 1, or 0.
				function cmp(a, b) {
				  if (a < b) return -1
				  if (a > b) return 1
				  return 0
				}
			
 
				+
			
 
				// Contiguous runs of Chinese characters (anything else, punctuation included,
				// ends a run); each run is then cut at every occurrence of each name, applied in
				// the order given. Empty pieces are dropped.
				function cjkRuns(text, names) {
				  const segments = text.match(/[一-龥]+/g) || []
				  return names.reduce(
				    (runs, name) => runs.flatMap((run) => run.split(name).filter(Boolean)),
				    segments
				  )
				}
			
 
				+
			
 
				// Paragraph length histogram over four buckets: 短 (<= 50), 中 (<= 150),
				// 长 (<= 300) and 超长 (everything else). Each bucket reports its paragraph count
				// (段数) and its share of all paragraphs (占比, 0 when there are no paragraphs).
				function paragraphDistribution(lengths) {
				  const tally = { 短: 0, 中: 0, 长: 0, 超长: 0 }
				  for (const len of lengths) {
				    const bucket = len <= 50 ? '短' : len <= 150 ? '中' : len <= 300 ? '长' : '超长'
				    tally[bucket] += 1
				  }
				  const total = lengths.length
				  return Object.fromEntries(
				    Object.entries(tally).map(([name, n]) => [name, { 段数: n, 占比: total ? n / total : 0 }])
				  )
				}
			
 
				+
			
 
				// Top 5 sentence openers, where an opener is the first two Chinese characters of
				// a sentence after skipping any leading non-Chinese characters (quotes etc.).
				// Sentences without such an opener, and openers that are the first two characters
				// of a roster name (length >= 2), are ignored. Each entry reports the opener
				// (开头), its count (次数) and its share of all counted sentences (占比). Ordered
				// by count descending, then opener in code-unit order.
				function topOpeners(sentences, exclude) {
				  // Precompute the two-character prefixes once so each sentence costs one Set
				  // lookup instead of a scan over every name.
				  const namePrefixes = new Set()
				  for (const name of exclude) {
				    if (typeof name === 'string' && name.length >= 2) namePrefixes.add(name.slice(0, 2))
				  }
				  const counts = new Map()
				  let total = 0
				  for (const s of sentences) {
				    const m = s.match(/^[^一-龥]*([一-龥]{2})/)
				    if (!m) continue
				    const opener = m[1]
				    if (namePrefixes.has(opener)) continue
				    total++
				    counts.set(opener, (counts.get(opener) || 0) + 1)
				  }
				  return [...counts.entries()]
				    .sort((a, b) => b[1] - a[1] || cmp(a[0], b[0]))
				    .slice(0, 5)
				    .map(([开头, 次数]) => ({ 开头, 次数, 占比: total ? 次数 / total : 0 }))
				}
			
--- a/v7/test/style-stats/index.test.js
+++ b/v7/test/style-stats/index.test.js
@@ -0,0 +1,155 @@
 
				+import { test } from 'node:test'
			
 
				+import assert from 'node:assert/strict'
			
 
				+import {
			
 
				+  splitSentences,
			
 
				+  splitParagraphs,
			
 
				+  styleMetrics,
			
 
				+  extractImagery,
			
 
				+  extractFingerprint,
			
 
				+  windowTTR,
			
 
				+} from '../../src/style-stats/index.js'
			
 
				+
			
 
				+// —— 分句 / 分段 ——
			
 
				+
			
 
				test('分句 引号收尾/省略号/分号都断句，闭引号收编进句尾', () => {
				  const input = '「站住！」林晚喝道。她愣住……随即冷笑；转身便走。'
				  const expected = ['「站住', '林晚喝道', '她愣住', '随即冷笑', '转身便走']
				  assert.deepEqual(splitSentences(input), expected)
				})

				test('分句 空文本与无终结标点', () => {
				  // Falsy input yields no sentences
				  for (const empty of ['', null]) {
				    assert.deepEqual(splitSentences(empty), [])
				  }
				  // Text without any terminator comes back as a single sentence
				  const unfinished = '还没写完的半句'
				  assert.deepEqual(splitSentences(unfinished), [unfinished])
				})

				test('分段 换行断段，连续空行不产生空段', () => {
				  const paragraphs = splitParagraphs('第一段\n\n\n第二段\n第三段')
				  assert.deepEqual(paragraphs, ['第一段', '第二段', '第三段'])
				  assert.deepEqual(splitParagraphs(''), [])
				})
			
 
				+
			
 
				+// —— 句式指标（小样本手算对照）——
			
 
				+
			
 
				test('句式指标 均值方差手算对照：句长 [3,5,7] → 均 5 方差 8/3', () => {
				  const { 句数, 平均句长, 句长方差 } = styleMetrics('一二三。一二三四五。一二三四五六七。')
				  assert.equal(句数, 3)
				  assert.equal(平均句长, 5)
				  assert.ok(Math.abs(句长方差 - 8 / 3) < 1e-12)
				})

				test('句式指标 段落数/平均段长/段落分布', () => {
				  // Paragraph lengths are 6, 2 and 8 characters: all fall in the shortest bucket
				  const { 段落数, 平均段长, 段落分布 } = styleMetrics('第一段有六字\n\n短段\n第三段落有七个字')
				  assert.equal(段落数, 3)
				  assert.ok(Math.abs(平均段长 - 16 / 3) < 1e-12)
				  assert.equal(段落分布.短.段数, 3)
				  assert.equal(段落分布.短.占比, 1)
				  assert.equal(段落分布.超长.段数, 0)
				})

				test('句式指标 高频开头：人名前缀排除、句首引号跳过', () => {
				  const roster = new Set(['林晚'])
				  const text = '林晚冷笑。林晚转身。今日无事。今日有雨。“今日大吉。”'
				  const { 高频开头 } = styleMetrics(text, roster)
				  assert.deepEqual(高频开头, [{ 开头: '今日', 次数: 3, 占比: 1 }])
				})

				test('句式指标 空文本不炸，全零', () => {
				  const { 句数, 平均句长, 句长方差, 高频开头 } = styleMetrics('')
				  assert.equal(句数, 0)
				  assert.equal(平均句长, 0)
				  assert.equal(句长方差, 0)
				  assert.deepEqual(高频开头, [])
				})
			
 
				+
			
 
				+// —— 跨章高频意象 ——
			
 
				+
			
 
				// Fixture builder: chapters 1..3 whose text is `unit` repeated n1, n2 and n3 times.
				const imageryChapters = (n1, n2, n3, unit = '空气仿佛凝固，') =>
				  [n1, n2, n3].map((repeats, index) => ({
				    num: index + 1,
				    text: unit.repeat(repeats),
				  }))
			
 
				+
			
 
				test('意象 阈值边界：全书 10 次报、9 次不报', () => {
				  // 4 + 3 + 3 = 10 occurrences over three chapters: exactly on the threshold
				  const atThreshold = extractImagery(imageryChapters(4, 3, 3))
				  assert.deepEqual(atThreshold, [
				    { phrase: '空气仿佛凝固', count: 10, chapterCount: 3, firstChapter: 1, lastChapter: 3 },
				  ])
				  // 3 + 3 + 3 = 9 occurrences: one short of the threshold
				  const belowThreshold = extractImagery(imageryChapters(3, 3, 3))
				  assert.deepEqual(belowThreshold, [])
				})

				test('意象 跨章条件：12 次但只出现在 2 章 → 不出', () => {
				  const texts = ['空气仿佛凝固，'.repeat(6), '空气仿佛凝固，'.repeat(6), '风平浪静。']
				  const chapters = texts.map((text, index) => ({ num: index + 1, text }))
				  assert.deepEqual(extractImagery(chapters), [])
				})

				test('意象 专名排除：含「林晚」的短语与跨名碎片都不出', () => {
				  const roster = new Set(['林晚'])
				  const found = extractImagery(imageryChapters(4, 4, 4, '林晚冷笑一声，'), roster)
				  const dump = JSON.stringify(found)
				  assert.ok(found.every((item) => !item.phrase.includes('林晚')), dump)
				  assert.ok(found.every((item) => !item.phrase.includes('晚冷')), dump)
				  // With the name cut out, the generic collocation that remains is still
				  // counted (12 occurrences over 3 chapters)
				  assert.deepEqual(found.map((item) => item.phrase), ['冷笑一声'])
				})

				test('意象 最长优先去重：子串被父串覆盖，次数超父串 1.25 倍则独立保留', () => {
				  const chapters = [4, 3, 3].map((repeats, index) => ({
				    num: index + 1,
				    text: '空气仿佛凝固，'.repeat(repeats) + '水面仿佛凝固，',
				  }))
				  const summary = extractImagery(chapters).map(({ phrase, count }) => ({ phrase, count }))
				  // 仿佛凝固 occurs 13 times, more than 10 x 1.25, so it escapes the parent's
				  // coverage and is listed on its own; every other substring is covered by
				  // 空气仿佛凝固
				  assert.deepEqual(summary, [
				    { phrase: '仿佛凝固', count: 13 },
				    { phrase: '空气仿佛凝固', count: 10 },
				  ])
				})
			
 
				+
			
 
				+// —— 词汇丰富度（滑动窗口 TTR）——
			
 
				+
			
 
				test('TTR 窗口平均与朴素 TTR 用长短两文本区分', () => {
				  const closeTo = (actual, expected) => Math.abs(actual - expected) < 1e-12
				  // Exactly one full 1000-character window holding 10 distinct characters
				  const oneWindow = '一二三四五六七八九十'.repeat(100)
				  assert.ok(closeTo(windowTTR(oneWindow), 0.01))
				  // Second window: 4 characters, all distinct, so its ratio is 1.0
				  const twoWindows = oneWindow + '百千万亿'
				  assert.ok(closeTo(windowTTR(twoWindows), 0.505))
				  const naiveRatio = new Set([...twoWindows]).size / twoWindows.length
				  assert.ok(naiveRatio < 0.02, '朴素 TTR 被文本长度压扁，窗口平均不受影响')
				})

				test('TTR 剥标点空白；空文本为 0', () => {
				  assert.equal(windowTTR('一，二。三！\n'), 1)
				  // Nothing countable is left in either of these inputs
				  for (const blank of ['', '……！！']) {
				    assert.equal(windowTTR(blank), 0)
				  }
				})
			
 
				+
			
 
				+// —— 指纹 ——
			
 
				+
			
 
				test('指纹 五常用列 + fingerprint_data 完整对象；章段内短语条件放宽为 ≥1 章', () => {
				  const singleChapter = [{ num: 1, text: '空气仿佛凝固。'.repeat(10) }]
				  const {
				    avg_sentence_length,
				    sentence_length_variance,
				    common_phrase_frequency,
				    vocabulary_richness,
				    fingerprint_data,
				  } = extractFingerprint(singleChapter)

				  assert.equal(avg_sentence_length, 6)
				  assert.equal(sentence_length_variance, 0)
				  assert.equal(common_phrase_frequency['空气仿佛凝固'], 10)
				  assert.ok(vocabulary_richness > 0)

				  assert.equal(fingerprint_data.章数, 1)
				  assert.equal(fingerprint_data.总字数, 70)
				  assert.ok(fingerprint_data.段落分布)
				  assert.ok(Array.isArray(fingerprint_data.高频开头))
				})
			
 
				+
			
 
				+// —— 确定性（AC3 的根基）——
			
 
				+
			
 
				test('确定性 同输入两次调用结果深等', () => {
				  const tails = ['林晚冷笑一声。', '林晚转身离去。', '今日无事发生。']
				  const repeats = [4, 3, 3]
				  const chapters = tails.map((tail, index) => ({
				    num: index + 1,
				    text: '空气仿佛凝固，'.repeat(repeats[index]) + tail + '水面仿佛凝固。',
				  }))
				  const roster = new Set(['林晚', '晚晚'])

				  // Each computation is run twice on the same input and must match field by field
				  const computations = [
				    () => extractImagery(chapters, roster),
				    () => extractFingerprint(chapters, roster),
				    () => styleMetrics(chapters[0].text, roster),
				  ]
				  for (const compute of computations) {
				    assert.deepStrictEqual(compute(), compute())
				  }
				})