context_pack_builder.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. #!/usr/bin/env python3
"""
Context Pack Builder v5.2

Generates a structured context pack for chapter writing, replacing direct
reads of state.json.

Changes in v5.2:
- Uses the v5.1 index_manager schema (entities.id, aliases, current_json)
- No longer depends on the entity_kv table; reads the current_json column instead
- No longer depends on the entity_aliases table; uses the aliases table instead
- Chapter summaries are now read from .webnovel/summaries/chNNNN.md

Output schema:
{
    "meta": {"chapter": 45, "project_root": "...", "version": "5.2"},
    "core": {
        "chapter_outline": "outline text for this chapter",
        "protagonist_snapshot": {...},
        "recent_summaries": [{...}, ...],
        "recent_meta": [{...}, ...]
    },
    "scene": {
        "location_context": {...},
        "appearing_characters": [{entity_id, name, tier, snapshot}, ...],
        "urgent_foreshadowing": [{...}, ...]
    },
    "global": {
        "worldview_skeleton": "...",
        "power_system_skeleton": "...",
        "style_contract_ref": "..."
    },
    "alerts": {
        "disambiguation_warnings": [...],
        "disambiguation_pending": [...]
    }
}

Usage:
    python context_pack_builder.py --chapter 45 --project-root /path/to/project
    python context_pack_builder.py --chapter 45 --output /tmp/context_pack.json
"""
  33. import json
  34. import os
  35. import sys
  36. import argparse
  37. import re
  38. import sqlite3
  39. from pathlib import Path
  40. from typing import Optional, Dict, List, Any
  41. # 导入项目工具
  42. from project_locator import resolve_project_root
  43. from chapter_paths import find_chapter_file
  44. # 导入配置
  45. try:
  46. from data_modules.config import get_config, DataModulesConfig
  47. except ImportError:
  48. from scripts.data_modules.config import get_config, DataModulesConfig
  49. class ContextPackBuilder:
  50. """上下文包构建器"""
  51. def __init__(self, project_root: Path = None):
  52. if project_root is None:
  53. try:
  54. project_root = resolve_project_root()
  55. except FileNotFoundError:
  56. project_root = Path.cwd()
  57. else:
  58. project_root = Path(project_root)
  59. self.project_root = project_root
  60. self.config = get_config(project_root)
  61. self.state_file = project_root / ".webnovel" / "state.json"
  62. self.index_db = project_root / ".webnovel" / "index.db"
  63. self.summaries_dir = project_root / ".webnovel" / "summaries"
  64. self.outline_dir = project_root / "大纲"
  65. self.settings_dir = project_root / "设定集"
  66. self.chapters_dir = project_root / "正文"
  67. self._conn: Optional[sqlite3.Connection] = None
  68. def _conn_index(self) -> Optional[sqlite3.Connection]:
  69. if self._conn is not None:
  70. return self._conn
  71. if not self.index_db.exists():
  72. return None
  73. conn = sqlite3.connect(str(self.index_db))
  74. conn.row_factory = sqlite3.Row
  75. self._conn = conn
  76. return conn
  77. def build(self, chapter_num: int) -> Dict[str, Any]:
  78. """构建完整上下文包"""
  79. state = self._load_state()
  80. return {
  81. "meta": {
  82. "chapter": chapter_num,
  83. "project_root": str(self.project_root),
  84. "version": "5.2"
  85. },
  86. "core": self._build_core(chapter_num),
  87. "scene": self._build_scene(chapter_num),
  88. "global": self._build_global(),
  89. "alerts": self._build_alerts(state)
  90. }
  91. def _build_core(self, chapter_num: int) -> Dict[str, Any]:
  92. """核心上下文:大纲、主角状态、近期摘要"""
  93. state = self._load_state()
  94. return {
  95. "chapter_outline": self._get_chapter_outline(chapter_num),
  96. "protagonist_snapshot": self._get_protagonist_snapshot(state),
  97. "recent_summaries": self._get_recent_summaries(
  98. chapter_num, window=self.config.context_recent_summaries_window
  99. ),
  100. "recent_meta": self._get_recent_chapter_meta(chapter_num, window=3),
  101. }
  102. def _build_scene(self, chapter_num: int) -> Dict[str, Any]:
  103. """场景上下文:地点、出场角色、紧急伏笔"""
  104. state = self._load_state()
  105. # 从大纲推断本章地点和角色
  106. outline = self._get_chapter_outline(chapter_num)
  107. predicted_location = self._predict_location(outline, state)
  108. predicted_characters = self._predict_characters(outline, state)
  109. return {
  110. "location_context": predicted_location,
  111. "appearing_characters": predicted_characters,
  112. "urgent_foreshadowing": self._get_urgent_foreshadowing(state, chapter_num)
  113. }
  114. def _build_global(self) -> Dict[str, Any]:
  115. """全局上下文:世界观、力量体系、风格契约"""
  116. return {
  117. "worldview_skeleton": self._load_skeleton("世界观"),
  118. "power_system_skeleton": self._load_skeleton("力量体系"),
  119. "style_contract_ref": self._get_style_contract_ref()
  120. }
  121. def _build_alerts(self, state: Dict) -> Dict[str, Any]:
  122. """风险提示:消歧警告、待确认项(v5.0)"""
  123. slice_size = self.config.context_alerts_slice
  124. return {
  125. "disambiguation_warnings": state.get("disambiguation_warnings", [])[-slice_size:],
  126. "disambiguation_pending": state.get("disambiguation_pending", [])[-slice_size:]
  127. }
  128. # ================== 辅助方法 ==================
  129. def _load_state(self) -> Dict:
  130. """加载 state.json"""
  131. if not self.state_file.exists():
  132. return {}
  133. with open(self.state_file, 'r', encoding='utf-8') as f:
  134. return json.load(f)
  135. def _get_chapter_outline(self, chapter_num: int) -> str:
  136. """获取本章大纲"""
  137. # 尝试多种大纲文件格式
  138. patterns = [
  139. f"第{chapter_num}章*.md",
  140. f"第{chapter_num:02d}章*.md",
  141. f"第{chapter_num:03d}章*.md",
  142. f"第{chapter_num:04d}章*.md",
  143. f"章纲/第{chapter_num}章*.md",
  144. f"章纲/第{chapter_num:02d}章*.md",
  145. ]
  146. for pattern in patterns:
  147. matches = list(self.outline_dir.glob(pattern))
  148. if matches:
  149. with open(matches[0], 'r', encoding='utf-8') as f:
  150. return f.read()
  151. # 尝试从卷纲中提取
  152. volume_outline = self._extract_from_volume_outline(chapter_num)
  153. if volume_outline:
  154. return volume_outline
  155. return f"[大纲未找到: 第{chapter_num}章]"
  156. def _extract_from_volume_outline(self, chapter_num: int) -> Optional[str]:
  157. """从卷纲中提取章节大纲"""
  158. volume_files = list(self.outline_dir.glob("卷纲*.md")) + list(self.outline_dir.glob("*卷*.md"))
  159. for vf in volume_files:
  160. with open(vf, 'r', encoding='utf-8') as f:
  161. content = f.read()
  162. # 查找章节标记(兼容空格/中英文冒号/不同标题级别)
  163. # 常见格式:### 第 1 章:标题 或 ### 第1章: 标题
  164. heading_pattern = (
  165. rf"(?m)^#+\s*第\s*{chapter_num}\s*章[::][^\n]*\n"
  166. rf".*?(?=^#+\s*第\s*\d+\s*章|^##\s|\Z)"
  167. )
  168. match = re.search(heading_pattern, content, re.DOTALL)
  169. if match:
  170. return match.group(0).strip()
  171. # 兼容无标题级别的格式:第 1 章 标题
  172. plain_pattern = (
  173. rf"(?m)^第\s*{chapter_num}\s*章[^\n]*\n"
  174. rf".*?(?=^第\s*\d+\s*章|\Z)"
  175. )
  176. match = re.search(plain_pattern, content, re.DOTALL)
  177. if match:
  178. return match.group(0).strip()
  179. return None
  180. def _get_protagonist_snapshot(self, state: Dict) -> Dict:
  181. """获取主角状态快照"""
  182. protagonist = state.get("protagonist_state", {}) or {}
  183. power = protagonist.get("power", {}) or {}
  184. location = protagonist.get("location", {}) or {}
  185. snapshot: Dict[str, Any] = {
  186. "entity_id": str(protagonist.get("entity_id", "") or "").strip(),
  187. "name": str(protagonist.get("name", "") or "").strip() or "主角",
  188. "realm": str(power.get("realm", "") or "").strip(),
  189. "layer": power.get("layer", 0),
  190. "bottleneck": str(power.get("bottleneck", "") or "").strip(),
  191. "golden_finger": protagonist.get("golden_finger", {}) or {},
  192. "location": str(location.get("current", "") or "").strip(),
  193. }
  194. # 可选:从 index.db 补齐(以 entity_id 为准)
  195. protagonist_id = snapshot.get("entity_id", "")
  196. conn = self._conn_index()
  197. if protagonist_id and conn is not None:
  198. # v5.1 schema: entities 表使用 id 字段,current_json 存储状态
  199. row = conn.execute(
  200. "SELECT canonical_name, current_json FROM entities WHERE id = ? LIMIT 1",
  201. (protagonist_id,),
  202. ).fetchone()
  203. if row:
  204. if row["canonical_name"]:
  205. snapshot["name"] = row["canonical_name"]
  206. # 从 current_json 解析状态
  207. if row["current_json"]:
  208. try:
  209. current = json.loads(row["current_json"])
  210. if isinstance(current.get("realm"), str) and current.get("realm"):
  211. snapshot["realm"] = current["realm"]
  212. if current.get("layer") is not None and current.get("layer") != "":
  213. snapshot["layer"] = current["layer"]
  214. if isinstance(current.get("bottleneck"), str) and current.get("bottleneck"):
  215. snapshot["bottleneck"] = current["bottleneck"]
  216. if isinstance(current.get("location"), str) and current.get("location"):
  217. snapshot["location"] = current["location"]
  218. except (json.JSONDecodeError, TypeError):
  219. pass
  220. return snapshot
  221. def _get_recent_summaries(self, chapter_num: int, window: int = 3) -> List[Dict]:
  222. """获取最近 N 章的摘要"""
  223. summaries = []
  224. start = max(1, chapter_num - window)
  225. for ch in range(start, chapter_num):
  226. summary = self._load_summary_file(ch)
  227. if summary:
  228. summaries.append(summary)
  229. continue
  230. # 兼容降级:若摘要文件不存在,尝试从章节正文提取
  231. chapter_file = find_chapter_file(self.project_root, ch)
  232. if chapter_file and chapter_file.exists():
  233. fallback = self._extract_summary_from_chapter(chapter_file, ch)
  234. if fallback:
  235. summaries.append(fallback)
  236. return summaries
  237. def _extract_summary_from_chapter(self, chapter_file: Path, chapter_num: int) -> Optional[Dict]:
  238. """从章节文件中提取摘要"""
  239. with open(chapter_file, 'r', encoding='utf-8') as f:
  240. content = f.read()
  241. # 查找摘要区块
  242. summary_match = re.search(r'## 本章摘要\s*\r?\n(.*?)(?=\r?\n##|$)', content, re.DOTALL)
  243. if summary_match:
  244. summary_text = summary_match.group(1).strip()
  245. return {
  246. "chapter": chapter_num,
  247. "summary": summary_text
  248. }
  249. # 没有摘要,返回章节标题
  250. title_match = re.match(r'^#\s*(.+)', content)
  251. title = title_match.group(1).strip() if title_match else f"第{chapter_num}章"
  252. return {
  253. "chapter": chapter_num,
  254. "title": title,
  255. "summary": None
  256. }
  257. def _load_summary_file(self, chapter_num: int) -> Optional[Dict]:
  258. """从 .webnovel/summaries/chNNNN.md 读取摘要"""
  259. summary_path = self.summaries_dir / f"ch{chapter_num:04d}.md"
  260. if not summary_path.exists():
  261. return None
  262. text = summary_path.read_text(encoding="utf-8")
  263. # 解析 YAML 头部(--- ... ---)
  264. meta: Dict[str, Any] = {}
  265. fm_match = re.match(r"^---\s*\r?\n(.*?)\r?\n---\s*\r?\n", text, re.DOTALL)
  266. if fm_match:
  267. fm = fm_match.group(1)
  268. for line in fm.splitlines():
  269. if ":" not in line:
  270. continue
  271. key, _, value = line.partition(":")
  272. key = key.strip()
  273. value = value.strip()
  274. if not key:
  275. continue
  276. # 简单解析列表
  277. if value.startswith("[") and value.endswith("]"):
  278. items = [v.strip().strip('\"').strip("'") for v in value[1:-1].split(",") if v.strip()]
  279. meta[key] = items
  280. else:
  281. meta[key] = value.strip('\"').strip("'")
  282. # 提取剧情摘要段落
  283. summary_match = re.search(r"##\s*剧情摘要\s*\r?\n(.*?)(?=\r?\n##|\Z)", text, re.DOTALL)
  284. summary_text = summary_match.group(1).strip() if summary_match else ""
  285. result = {
  286. "chapter": chapter_num,
  287. "summary": summary_text
  288. }
  289. # 附加部分元数据(可选)
  290. for k in ["hook_type", "hook_strength", "time", "location"]:
  291. if k in meta:
  292. result[k] = meta[k]
  293. return result
  294. def _get_recent_chapter_meta(self, chapter_num: int, window: int = 3) -> List[Dict[str, Any]]:
  295. """读取最近 N 章的 chapter_meta(用于模式重复检查)"""
  296. state = self._load_state()
  297. meta = state.get("chapter_meta", {}) or {}
  298. items: List[Dict[str, Any]] = []
  299. for ch in range(max(1, chapter_num - window), chapter_num):
  300. key_candidates = [f"{ch:04d}", str(ch)]
  301. entry = None
  302. for key in key_candidates:
  303. if key in meta:
  304. entry = meta.get(key)
  305. break
  306. if entry:
  307. items.append({"chapter": ch, **entry})
  308. return items
  309. def _predict_location(self, outline: str, state: Dict) -> Dict:
  310. """从大纲推断地点(优先使用 index.db 别名表)"""
  311. conn = self._conn_index()
  312. if conn is None:
  313. return {"name": "未知地点", "desc": ""}
  314. # v5.1 schema: 使用 aliases 表(替代 entity_aliases)
  315. rows = conn.execute(
  316. "SELECT alias, entity_id FROM aliases WHERE entity_type = ?",
  317. ("地点",),
  318. ).fetchall()
  319. if not rows:
  320. return {"name": "未知地点", "desc": ""}
  321. # 先匹配更长的别名,降低误命中
  322. candidates = sorted(
  323. ((r["alias"], r["entity_id"]) for r in rows if r["alias"]),
  324. key=lambda x: len(x[0]),
  325. reverse=True,
  326. )
  327. for alias, entity_id in candidates:
  328. if len(alias) < 2:
  329. continue
  330. if alias not in outline:
  331. continue
  332. # v5.1 schema: entities 表使用 id 字段
  333. e = conn.execute(
  334. "SELECT canonical_name, desc FROM entities WHERE id = ? LIMIT 1",
  335. (entity_id,),
  336. ).fetchone()
  337. return {
  338. "entity_id": entity_id,
  339. "name": (e["canonical_name"] if e else "") or alias,
  340. "desc": (e["desc"] if e else "") or "",
  341. "match": alias,
  342. }
  343. return {"name": "未知地点", "desc": ""}
  344. def _predict_characters(self, outline: str, state: Dict) -> List[Dict]:
  345. """从大纲推断出场角色(优先使用 index.db 别名表)"""
  346. conn = self._conn_index()
  347. if conn is None:
  348. return []
  349. # v5.1 schema: 使用 aliases 表(替代 entity_aliases)
  350. rows = conn.execute(
  351. "SELECT alias, entity_id FROM aliases WHERE entity_type = ?",
  352. ("角色",),
  353. ).fetchall()
  354. if not rows:
  355. return []
  356. matched_ids: set[str] = set()
  357. for r in rows:
  358. alias = r["alias"] or ""
  359. if len(alias) < 2:
  360. continue
  361. if alias in outline:
  362. matched_ids.add(r["entity_id"])
  363. if not matched_ids:
  364. return []
  365. tier_order = {"核心": 0, "支线": 1, "装饰": 2, "": 3}
  366. matched: List[Dict[str, Any]] = []
  367. for entity_id in matched_ids:
  368. # v5.1 schema: entities 表使用 id 字段,current_json 存储状态
  369. e = conn.execute(
  370. "SELECT canonical_name, tier, current_json FROM entities WHERE id = ? LIMIT 1",
  371. (entity_id,),
  372. ).fetchone()
  373. if not e:
  374. continue
  375. # 从 current_json 解析快照
  376. snapshot = {}
  377. if e["current_json"]:
  378. try:
  379. snapshot = json.loads(e["current_json"])
  380. except (json.JSONDecodeError, TypeError):
  381. pass
  382. matched.append(
  383. {
  384. "entity_id": entity_id,
  385. "name": e["canonical_name"] or entity_id,
  386. "tier": e["tier"] or "",
  387. "snapshot": snapshot,
  388. }
  389. )
  390. matched.sort(key=lambda x: tier_order.get(x.get("tier", ""), 3))
  391. return matched[:self.config.context_max_appearing_characters]
  392. def _get_urgent_foreshadowing(self, state: Dict, chapter_num: int) -> List[Dict]:
  393. """获取紧急伏笔(优先使用 index.db 伏笔索引)"""
  394. conn = self._conn_index()
  395. if conn is not None:
  396. try:
  397. rows = conn.execute(
  398. "SELECT content, introduced_chapter, resolved_chapter, status, urgency, location "
  399. "FROM foreshadowing_index WHERE status = '未回收' ORDER BY urgency DESC LIMIT 5"
  400. ).fetchall()
  401. return [dict(r) for r in rows] if rows else []
  402. except sqlite3.Error:
  403. pass
  404. # fallback:项目未建索引时直接读取 state.json
  405. plot_threads = state.get("plot_threads", {}) or {}
  406. items = plot_threads.get("foreshadowing", []) or []
  407. urgent: List[Dict[str, Any]] = []
  408. for fs in items:
  409. if not isinstance(fs, dict):
  410. continue
  411. status = str(fs.get("status", "")).strip()
  412. if status in {"已回收"}:
  413. continue
  414. planted_chapter = fs.get("planted_chapter") or fs.get("introduced_chapter") or 0
  415. target_chapter = fs.get("target_chapter") or fs.get("target") or 0
  416. try:
  417. planted_chapter = int(planted_chapter)
  418. except (TypeError, ValueError):
  419. planted_chapter = 0
  420. try:
  421. target_chapter = int(target_chapter) if target_chapter else 0
  422. except (TypeError, ValueError):
  423. target_chapter = 0
  424. chapters_pending = chapter_num - planted_chapter if planted_chapter else 0
  425. # 使用配置的紧急度阈值
  426. cfg = self.config
  427. if chapters_pending > cfg.foreshadowing_urgency_pending_high:
  428. urgency = cfg.foreshadowing_urgency_score_high
  429. elif chapters_pending > cfg.foreshadowing_urgency_pending_medium:
  430. urgency = cfg.foreshadowing_urgency_score_medium
  431. elif target_chapter and chapter_num >= target_chapter - cfg.foreshadowing_urgency_target_proximity:
  432. urgency = cfg.foreshadowing_urgency_score_target
  433. else:
  434. urgency = cfg.foreshadowing_urgency_score_low
  435. if urgency >= cfg.foreshadowing_urgency_threshold_show:
  436. urgent.append(
  437. {
  438. "content": fs.get("content") or fs.get("description") or "",
  439. "planted_chapter": planted_chapter,
  440. "target_chapter": target_chapter,
  441. "tier": fs.get("tier", ""),
  442. "urgency": urgency,
  443. }
  444. )
  445. urgent.sort(key=lambda x: x.get("urgency", 0), reverse=True)
  446. return urgent[:self.config.context_max_urgent_foreshadowing]
  447. def _load_skeleton(self, setting_type: str) -> str:
  448. """加载设定骨架"""
  449. patterns = [
  450. f"{setting_type}.md",
  451. f"{setting_type}/*.md",
  452. f"*{setting_type}*.md"
  453. ]
  454. for pattern in patterns:
  455. matches = list(self.settings_dir.glob(pattern))
  456. if matches:
  457. # 如果是目录,合并所有文件
  458. if matches[0].is_dir():
  459. content = []
  460. for f in sorted(matches[0].glob("*.md")):
  461. with open(f, 'r', encoding='utf-8') as file:
  462. content.append(f"## {f.stem}\n{file.read()}")
  463. return "\n\n".join(content)
  464. else:
  465. with open(matches[0], 'r', encoding='utf-8') as f:
  466. return f.read()
  467. return f"[{setting_type}设定未找到]"
  468. def _get_style_contract_ref(self) -> str:
  469. """获取风格契约引用"""
  470. style_file = self.settings_dir / "风格契约.md"
  471. if style_file.exists():
  472. with open(style_file, 'r', encoding='utf-8') as f:
  473. return f.read()
  474. # 检查其他可能的位置
  475. for pattern in ["风格*.md", "写作风格*.md", "style*.md"]:
  476. matches = list(self.settings_dir.glob(pattern))
  477. if matches:
  478. with open(matches[0], 'r', encoding='utf-8') as f:
  479. return f.read()
  480. return "[风格契约未定义]"
  481. def main():
  482. parser = argparse.ArgumentParser(description="Context Pack Builder v5.2")
  483. parser.add_argument("--chapter", type=int, required=True, help="章节编号")
  484. parser.add_argument("--project-root", metavar="PATH", help="项目根目录")
  485. parser.add_argument("--output", metavar="FILE", help="输出文件路径(默认输出到 stdout)")
  486. parser.add_argument("--pretty", action="store_true", help="格式化 JSON 输出")
  487. args = parser.parse_args()
  488. # 构建上下文包
  489. builder = ContextPackBuilder(project_root=args.project_root)
  490. context_pack = builder.build(args.chapter)
  491. # 输出
  492. if args.pretty:
  493. output = json.dumps(context_pack, ensure_ascii=False, indent=2)
  494. else:
  495. output = json.dumps(context_pack, ensure_ascii=False)
  496. if args.output:
  497. with open(args.output, 'w', encoding='utf-8') as f:
  498. f.write(output)
  499. print(f"✅ 上下文包已保存到: {args.output}")
  500. else:
  501. print(output)
  502. if __name__ == "__main__":
  503. # Windows UTF-8 编码修复
  504. if sys.platform == 'win32':
  505. import io
  506. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  507. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
  508. main()