Selaa lähdekoodia

feat(v7): M5.5 P1——style-stats 统计算法(分句/意象/指纹)

lingfengQAQ 18 tuntia sitten
vanhempi
sitoutus
0a3009010e
2 muutettua tiedostoa jossa 434 lisäystä ja 0 poistoa
  1. 279 0
      v7/src/style-stats/index.js
  2. 155 0
      v7/test/style-stats/index.test.js

+ 279 - 0
v7/src/style-stats/index.js

@@ -0,0 +1,279 @@
+/**
+ * style-stats: statistics used by the style check-up (spec §9, M5.5). Pure functions with no
+ * imports and no IO. Everything is plain counting with fixed sort orders, no timestamps and no
+ * randomness, so recomputing from the same input yields identical values field by field
+ * (the stated basis for "fingerprints stay unchanged after dropping the cache and rebuilding").
+ */
+
+// Threshold for cross-chapter high-frequency imagery: a phrase must occur >= 10 times in the whole
+// book and in >= 3 chapters (the report only warns, it never blocks, so hard-coded defaults are fine).
+export const IMAGERY_MIN_COUNT = 10
+export const IMAGERY_MIN_CHAPTERS = 3
+
+// Shortest and longest n-gram length considered by extractImagery.
+const MIN_GRAM = 4
+const MAX_GRAM = 8
+// Window size, in characters, used by windowTTR.
+const TTR_WINDOW = 1000
+
+/**
+ * 分句:按中文句终标点切分,句尾收编随后的闭引号(「站住!」→ 一句)。
+ * @param {string} text
+ * @returns {string[]} 修剪后的非空句
+ */
+export function splitSentences(text) {
+  if (!text) return []
+  return text
+    .split(/[。!?;…]+[”』」]?/)
+    .map((s) => s.trim())
+    .filter(Boolean)
+}
+
+/**
+ * 分段:一段一换行的网文约定,空行同样断段。
+ * @param {string} text
+ * @returns {string[]} 修剪后的非空段
+ */
+export function splitParagraphs(text) {
+  if (!text) return []
+  return text
+    .split(/\n+/)
+    .map((p) => p.trim())
+    .filter(Boolean)
+}
+
+/**
+ * 句式指标(单章机检与章段体检复用,口径一致)。长度一律取去空白字符数。
+ * @param {string} text 正文
+ * @param {Set<string>} exclude 名册专名/别名——句首开头统计排除人名前缀(「林晚…」是人名不是句式)
+ * @returns {{句数, 平均句长, 句长方差, 段落数, 平均段长, 段落分布, 高频开头}}
+ */
+export function styleMetrics(text, exclude = new Set()) {
+  const sentences = splitSentences(text)
+  const lengths = sentences.map(countChars)
+  const paragraphs = splitParagraphs(text)
+  const pLengths = paragraphs.map(countChars)
+  return {
+    句数: sentences.length,
+    平均句长: mean(lengths),
+    句长方差: variance(lengths),
+    段落数: paragraphs.length,
+    平均段长: mean(pLengths),
+    段落分布: paragraphDistribution(pLengths),
+    高频开头: topOpeners(sentences, exclude),
+  }
+}
+
+/**
+ * Cross-chapter high-frequency imagery: n-grams (n = MIN_GRAM..MAX_GRAM, i.e. 4..8) taken inside
+ * runs of consecutive Chinese characters and counted level by level, Apriori style —
+ * level n only counts candidates whose first n-1 characters were already frequent on the
+ * previous level, which keeps the candidate maps small (the original author notes that counting
+ * every length in a single pass blows up on million-character inputs).
+ * Names/aliases of length >= 2 cut the character run wherever they occur, so neither phrases
+ * containing a name nor fragments straddling one (e.g. 「晚冷笑一声」) can be produced; repeated
+ * "name + action" phrases are treated as normal narration rather than repeated imagery.
+ * @param {{num: number, text: string}[]} chapters finalized chapters (chapter number + body text)
+ * @param {Set<string>} exclude roster names/aliases
+ * @param {{minCount?: number, minChapters?: number}} [opts] defaults: >= 10 occurrences in the
+ *   whole book and >= 3 chapters
+ * @returns {{phrase: string, count: number, chapterCount: number, firstChapter: number, lastChapter: number}[]}
+ *   the full list sorted by (count desc, code-unit order asc); callers take the top N
+ */
+export function extractImagery(chapters, exclude = new Set(), opts = {}) {
+  const minCount = opts.minCount ?? IMAGERY_MIN_COUNT
+  const minChapters = opts.minChapters ?? IMAGERY_MIN_CHAPTERS
+  // Longest names first, ties broken by cmp, so the cutting order is fixed and a long name is
+  // removed before any shorter name it contains.
+  const names = [...exclude]
+    .filter((n) => typeof n === 'string' && n.length >= 2)
+    .sort((a, b) => b.length - a.length || cmp(a, b))
+
+  const runsByChapter = chapters.map((c) => ({
+    num: c.num,
+    runs: cjkRuns(c.text || '', names),
+  }))
+
+  // Level-wise counting: phrase → number of occurrences in the whole book
+  const frequent = new Map()
+  let prevLevel = null
+  for (let n = MIN_GRAM; n <= MAX_GRAM; n++) {
+    const counts = new Map()
+    for (const { runs } of runsByChapter) {
+      for (const run of runs) {
+        for (let i = 0; i + n <= run.length; i++) {
+          // Skip candidates whose (n-1)-character prefix was not frequent on the previous level.
+          if (prevLevel && !prevLevel.has(run.slice(i, i + n - 1))) continue
+          const g = run.slice(i, i + n)
+          counts.set(g, (counts.get(g) || 0) + 1)
+        }
+      }
+    }
+    const level = new Set()
+    for (const [g, c] of counts) {
+      if (c >= minCount) {
+        level.add(g)
+        frequent.set(g, c)
+      }
+    }
+    // Nothing frequent at this length means nothing longer can pass the prefix check either.
+    if (!level.size) break
+    prevLevel = level
+  }
+
+  // Second pass only collects the chapter distribution of the frequent phrases
+  // (the first pass keeps no per-gram chapter sets, to limit memory).
+  const byLen = new Map()
+  for (const g of frequent.keys()) {
+    if (!byLen.has(g.length)) byLen.set(g.length, new Set())
+    byLen.get(g.length).add(g)
+  }
+  const chapterSets = new Map()
+  for (const { num, runs } of runsByChapter) {
+    for (const run of runs) {
+      for (const [len, set] of byLen) {
+        for (let i = 0; i + len <= run.length; i++) {
+          const g = run.slice(i, i + len)
+          if (!set.has(g)) continue
+          if (!chapterSets.has(g)) chapterSets.set(g, new Set())
+          chapterSets.get(g).add(num)
+        }
+      }
+    }
+  }
+
+  const candidates = []
+  for (const [phrase, count] of frequent) {
+    const nums = [...(chapterSets.get(phrase) || [])].sort((a, b) => a - b)
+    if (nums.length < minChapters) continue
+    candidates.push({
+      phrase,
+      count,
+      chapterCount: nums.length,
+      firstChapter: nums[0],
+      lastChapter: nums[nums.length - 1],
+    })
+  }
+
+  // Longest-first dedup: a substring whose count is <= 1.25 × the count of a kept longer phrase
+  // is treated as a fragment of the same image (「气仿佛凝固」 is covered by 「空气仿佛凝固」).
+  candidates.sort(
+    (a, b) => b.phrase.length - a.phrase.length || b.count - a.count || cmp(a.phrase, b.phrase)
+  )
+  const kept = []
+  for (const c of candidates) {
+    const covered = kept.some((k) => k.phrase.includes(c.phrase) && c.count <= k.count * 1.25)
+    if (!covered) kept.push(c)
+  }
+  // Final output order: count desc, then code-unit order of the phrase.
+  kept.sort((a, b) => b.count - a.count || cmp(a.phrase, b.phrase))
+  return kept
+}
+
+/**
+ * 文体指纹:章段 → fingerprints 表五常用列 + fingerprint_data 完整对象(含段落分布/高频开头/总字数/章数)。
+ * 返回对象不含 JSON 字符串,序列化在落库处做。
+ * @param {{num: number, text: string}[]} chapters
+ * @param {Set<string>} exclude
+ */
+export function extractFingerprint(chapters, exclude = new Set()) {
+  const combined = chapters.map((c) => c.text || '').join('\n\n')
+  const metrics = styleMetrics(combined, exclude)
+  // 章段内常用短语:跨章条件放宽为 ≥1(章段内统计),取 top10
+  const imagery = extractImagery(chapters, exclude, { minChapters: 1 })
+  const common_phrase_frequency = {}
+  for (const { phrase, count } of imagery.slice(0, 10)) common_phrase_frequency[phrase] = count
+  const vocabulary_richness = windowTTR(combined)
+  const fingerprint_data = {
+    avg_sentence_length: metrics.平均句长,
+    sentence_length_variance: metrics.句长方差,
+    avg_paragraph_length: metrics.平均段长,
+    common_phrase_frequency,
+    vocabulary_richness,
+    段落分布: metrics.段落分布,
+    高频开头: metrics.高频开头,
+    总字数: countChars(combined),
+    章数: chapters.length,
+  }
+  return {
+    avg_sentence_length: metrics.平均句长,
+    sentence_length_variance: metrics.句长方差,
+    avg_paragraph_length: metrics.平均段长,
+    common_phrase_frequency,
+    vocabulary_richness,
+    fingerprint_data,
+  }
+}
+
+/**
+ * 词汇丰富度:滑动窗口 TTR——剥标点空白后按 1000 字窗口求 unique/窗长 的平均(末窗用实际长度)。
+ * 朴素 unique/total 对文本长度敏感,跨章段不可比。
+ * @param {string} text
+ * @returns {number}
+ */
+export function windowTTR(text) {
+  const chars = [...(text || '').replace(/[^一-龥A-Za-z0-9]/g, '')]
+  if (!chars.length) return 0
+  const ttrs = []
+  for (let i = 0; i < chars.length; i += TTR_WINDOW) {
+    const win = chars.slice(i, i + TTR_WINDOW)
+    ttrs.push(new Set(win).size / win.length)
+  }
+  return mean(ttrs)
+}
+
+// —— 内部工具(全部确定性:固定遍历顺序、码元序比较,不用 locale)——
+
+function countChars(s) {
+  return [...s.replace(/\s+/g, '')].length
+}
+
+function mean(nums) {
+  if (!nums.length) return 0
+  return nums.reduce((a, b) => a + b, 0) / nums.length
+}
+
+function variance(nums) {
+  if (!nums.length) return 0
+  const m = mean(nums)
+  return nums.reduce((acc, n) => acc + (n - m) * (n - m), 0) / nums.length
+}
+
+function cmp(a, b) {
+  return a < b ? -1 : a > b ? 1 : 0
+}
+
+// 连续中文字符段(标点断开),段内再按专名出现处切断
+function cjkRuns(text, names) {
+  const raw = text.match(/[一-龥]+/g) || []
+  if (!names.length) return raw
+  let runs = raw
+  for (const name of names) {
+    const next = []
+    for (const run of runs) {
+      if (run.includes(name)) next.push(...run.split(name).filter(Boolean))
+      else next.push(run)
+    }
+    runs = next
+  }
+  return runs
+}
+
+function paragraphDistribution(lengths) {
+  const total = lengths.length
+  const buckets = { 短: 0, 中: 0, 长: 0, 超长: 0 }
+  for (const n of lengths) {
+    if (n <= 50) buckets.短++
+    else if (n <= 150) buckets.中++
+    else if (n <= 300) buckets.长++
+    else buckets.超长++
+  }
+  const dist = {}
+  for (const [k, v] of Object.entries(buckets)) {
+    dist[k] = { 段数: v, 占比: total ? v / total : 0 }
+  }
+  return dist
+}
+
+// 句首 2 字聚合 top5;跳过句首非中文字符(引号等),人名前缀不算句式开头
+function topOpeners(sentences, exclude) {
+  const names = [...exclude].filter((n) => typeof n === 'string' && n.length >= 2)
+  const counts = new Map()
+  let total = 0
+  for (const s of sentences) {
+    const m = s.match(/^[^一-龥]*([一-龥]{2})/)
+    if (!m) continue
+    const opener = m[1]
+    if (names.some((name) => name.startsWith(opener))) continue
+    total++
+    counts.set(opener, (counts.get(opener) || 0) + 1)
+  }
+  return [...counts.entries()]
+    .sort((a, b) => b[1] - a[1] || cmp(a[0], b[0]))
+    .slice(0, 5)
+    .map(([开头, 次数]) => ({ 开头, 次数, 占比: total ? 次数 / total : 0 }))
+}

+ 155 - 0
v7/test/style-stats/index.test.js

@@ -0,0 +1,155 @@
+import { test } from 'node:test'
+import assert from 'node:assert/strict'
+import {
+  splitSentences,
+  splitParagraphs,
+  styleMetrics,
+  extractImagery,
+  extractFingerprint,
+  windowTTR,
+} from '../../src/style-stats/index.js'
+
+// —— Sentence splitting / paragraph splitting ——
+
+// Terminators inside quotes, ellipses and semicolons all end a sentence; the closing quote that
+// follows a terminator is consumed with it, so it does not appear in the output.
+test('分句 引号收尾/省略号/分号都断句,闭引号收编进句尾', () => {
+  const r = splitSentences('「站住!」林晚喝道。她愣住……随即冷笑;转身便走。')
+  assert.deepEqual(r, ['「站住', '林晚喝道', '她愣住', '随即冷笑', '转身便走'])
+})
+
+// Empty/null input yields no sentences; text without a terminator is returned as one sentence.
+test('分句 空文本与无终结标点', () => {
+  assert.deepEqual(splitSentences(''), [])
+  assert.deepEqual(splitSentences(null), [])
+  assert.deepEqual(splitSentences('还没写完的半句'), ['还没写完的半句'])
+})
+
+// Line breaks split paragraphs; consecutive blank lines never produce empty paragraphs.
+test('分段 换行断段,连续空行不产生空段', () => {
+  assert.deepEqual(splitParagraphs('第一段\n\n\n第二段\n第三段'), ['第一段', '第二段', '第三段'])
+  assert.deepEqual(splitParagraphs(''), [])
+})
+
+// —— Style metrics (small samples checked against hand calculation) ——
+
+// Sentence lengths [3, 5, 7] → mean 5, population variance 8/3.
+test('句式指标 均值方差手算对照:句长 [3,5,7] → 均 5 方差 8/3', () => {
+  const m = styleMetrics('一二三。一二三四五。一二三四五六七。')
+  assert.equal(m.句数, 3)
+  assert.equal(m.平均句长, 5)
+  assert.ok(Math.abs(m.句长方差 - 8 / 3) < 1e-12)
+})
+
+// Three paragraphs of 6, 2 and 8 characters: mean 16/3, all in the "short" bucket.
+test('句式指标 段落数/平均段长/段落分布', () => {
+  const m = styleMetrics('第一段有六字\n\n短段\n第三段落有七个字')
+  assert.equal(m.段落数, 3)
+  assert.ok(Math.abs(m.平均段长 - 16 / 3) < 1e-12)
+  assert.equal(m.段落分布.短.段数, 3)
+  assert.equal(m.段落分布.短.占比, 1)
+  assert.equal(m.段落分布.超长.段数, 0)
+})
+
+// Openers that are a name prefix are excluded; a leading quote is skipped before taking the opener.
+test('句式指标 高频开头:人名前缀排除、句首引号跳过', () => {
+  const m = styleMetrics('林晚冷笑。林晚转身。今日无事。今日有雨。“今日大吉。”', new Set(['林晚']))
+  assert.deepEqual(m.高频开头, [{ 开头: '今日', 次数: 3, 占比: 1 }])
+})
+
+// Empty text must not throw and yields zeros.
+test('句式指标 空文本不炸,全零', () => {
+  const m = styleMetrics('')
+  assert.equal(m.句数, 0)
+  assert.equal(m.平均句长, 0)
+  assert.equal(m.句长方差, 0)
+  assert.deepEqual(m.高频开头, [])
+})
+
+// —— Cross-chapter high-frequency imagery ——
+
+// Builds chapters 1..3 whose text is `unit` repeated n1 / n2 / n3 times respectively.
+const imageryChapters = (n1, n2, n3, unit = '空气仿佛凝固,') => [
+  { num: 1, text: unit.repeat(n1) },
+  { num: 2, text: unit.repeat(n2) },
+  { num: 3, text: unit.repeat(n3) },
+]
+
+// Threshold boundary: 10 occurrences in the whole book are reported, 9 are not.
+test('意象 阈值边界:全书 10 次报、9 次不报', () => {
+  const hit = extractImagery(imageryChapters(4, 3, 3))
+  assert.deepEqual(hit, [
+    { phrase: '空气仿佛凝固', count: 10, chapterCount: 3, firstChapter: 1, lastChapter: 3 },
+  ])
+  assert.deepEqual(extractImagery(imageryChapters(3, 3, 3)), [])
+})
+
+// Chapter-spread condition: 12 occurrences confined to 2 chapters are not reported.
+test('意象 跨章条件:12 次但只出现在 2 章 → 不出', () => {
+  const chapters = [
+    { num: 1, text: '空气仿佛凝固,'.repeat(6) },
+    { num: 2, text: '空气仿佛凝固,'.repeat(6) },
+    { num: 3, text: '风平浪静。' },
+  ]
+  assert.deepEqual(extractImagery(chapters), [])
+})
+
+// Name exclusion: neither phrases containing 「林晚」 nor fragments straddling the name appear.
+test('意象 专名排除:含「林晚」的短语与跨名碎片都不出', () => {
+  const r = extractImagery(imageryChapters(4, 4, 4, '林晚冷笑一声,'), new Set(['林晚']))
+  assert.ok(!r.some((x) => x.phrase.includes('林晚')), JSON.stringify(r))
+  assert.ok(!r.some((x) => x.phrase.includes('晚冷')), JSON.stringify(r))
+  // The generic collocation left after cutting out the name is still counted (12 times across 3 chapters)
+  assert.deepEqual(r.map((x) => x.phrase), ['冷笑一声'])
+})
+
+// Longest-first dedup: a substring is covered by its parent phrase unless its count exceeds
+// 1.25 × the parent's count, in which case it is kept as its own entry.
+test('意象 最长优先去重:子串被父串覆盖,次数超父串 1.25 倍则独立保留', () => {
+  const chapters = [
+    { num: 1, text: '空气仿佛凝固,'.repeat(4) + '水面仿佛凝固,' },
+    { num: 2, text: '空气仿佛凝固,'.repeat(3) + '水面仿佛凝固,' },
+    { num: 3, text: '空气仿佛凝固,'.repeat(3) + '水面仿佛凝固,' },
+  ]
+  const r = extractImagery(chapters)
+  // 仿佛凝固 occurs 13 times > 10 × 1.25, so it escapes coverage by its parent and is kept on its own;
+  // every other substring is covered by 空气仿佛凝固
+  assert.deepEqual(
+    r.map((x) => ({ phrase: x.phrase, count: x.count })),
+    [
+      { phrase: '仿佛凝固', count: 13 },
+      { phrase: '空气仿佛凝固', count: 10 },
+    ]
+  )
+})
+
+// —— Vocabulary richness (windowed TTR) ——
+
+// Contrasts the window average with the naive distinct/total ratio using a short and a longer text.
+test('TTR 窗口平均与朴素 TTR 用长短两文本区分', () => {
+  const base = '一二三四五六七八九十'.repeat(100) // exactly one 1000-character window
+  assert.ok(Math.abs(windowTTR(base) - 0.01) < 1e-12)
+  const longer = base + '百千万亿' // second window: 4 distinct characters → TTR 1.0
+  assert.ok(Math.abs(windowTTR(longer) - 0.505) < 1e-12)
+  const naive = new Set([...longer]).size / longer.length
+  assert.ok(naive < 0.02, '朴素 TTR 被文本长度压扁,窗口平均不受影响')
+})
+
+// Punctuation and whitespace are stripped before counting; empty or punctuation-only text gives 0.
+test('TTR 剥标点空白;空文本为 0', () => {
+  assert.equal(windowTTR('一,二。三!\n'), 1)
+  assert.equal(windowTTR(''), 0)
+  assert.equal(windowTTR('……!!'), 0)
+})
+
+// —— Fingerprint ——
+
+// Checks the five top-level fields and the full fingerprint_data object; for the phrase list the
+// chapter-spread requirement is relaxed to >= 1 chapter, so a single chapter is enough.
+test('指纹 五常用列 + fingerprint_data 完整对象;章段内短语条件放宽为 ≥1 章', () => {
+  const chapters = [{ num: 1, text: '空气仿佛凝固。'.repeat(10) }]
+  const fp = extractFingerprint(chapters)
+  assert.equal(fp.avg_sentence_length, 6)
+  assert.equal(fp.sentence_length_variance, 0)
+  assert.equal(fp.common_phrase_frequency['空气仿佛凝固'], 10)
+  assert.ok(fp.vocabulary_richness > 0)
+  assert.equal(fp.fingerprint_data.章数, 1)
+  assert.equal(fp.fingerprint_data.总字数, 70)
+  assert.ok(fp.fingerprint_data.段落分布)
+  assert.ok(Array.isArray(fp.fingerprint_data.高频开头))
+})
+
+// —— Determinism (noted by the author as the basis of AC3) ——
+
+// Calling each function twice with the same input must give deeply equal results.
+test('确定性 同输入两次调用结果深等', () => {
+  const chapters = [
+    { num: 1, text: '空气仿佛凝固,'.repeat(4) + '林晚冷笑一声。水面仿佛凝固。' },
+    { num: 2, text: '空气仿佛凝固,'.repeat(3) + '林晚转身离去。水面仿佛凝固。' },
+    { num: 3, text: '空气仿佛凝固,'.repeat(3) + '今日无事发生。水面仿佛凝固。' },
+  ]
+  const ex = new Set(['林晚', '晚晚'])
+  assert.deepStrictEqual(extractImagery(chapters, ex), extractImagery(chapters, ex))
+  assert.deepStrictEqual(extractFingerprint(chapters, ex), extractFingerprint(chapters, ex))
+  assert.deepStrictEqual(styleMetrics(chapters[0].text, ex), styleMetrics(chapters[0].text, ex))
+})