archive_manager.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594
  1. #!/usr/bin/env python3
  2. """
  3. state.json 数据归档管理脚本
  4. 目标:防止 state.json 无限增长,确保 200 万字长跑稳定运行
  5. 功能:
  6. 1. 智能归档长期未使用的数据(角色/伏笔/审查报告)
  7. 2. 自动触发条件检测(文件大小/章节数)
  8. 3. 安全备份与恢复机制
  9. 4. 归档数据可随时恢复
  10. 归档策略:
  11. - 角色:超过 50 章未出场的次要角色 → archive/characters.json
  12. - 伏笔:status="已回收" 且超过 20 章的伏笔 → archive/plot_threads.json
  13. - 审查报告:超过 50 章的旧报告 → archive/reviews.json
  14. 使用方式:
  15. # 自动归档检查(推荐在 update_state.py 之后调用)
  16. python archive_manager.py --auto-check
  17. # 强制归档(忽略触发条件)
  18. python archive_manager.py --force
  19. # 恢复特定角色
  20. python archive_manager.py --restore-character "李雪"
  21. # 查看归档统计
  22. python archive_manager.py --stats
  23. # Dry-run 模式(仅显示将被归档的数据)
  24. python archive_manager.py --auto-check --dry-run
  25. """
  26. import json
  27. import os
  28. import sys
  29. import argparse
  30. from datetime import datetime
  31. from pathlib import Path
  32. # ============================================================================
  33. # 安全修复:导入安全工具函数(P1 MEDIUM)
  34. # ============================================================================
  35. from security_utils import create_secure_directory, atomic_write_json
  36. from project_locator import resolve_project_root
  37. # Windows UTF-8 编码修复
  38. if sys.platform == 'win32':
  39. import io
  40. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  41. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
  42. class ArchiveManager:
  43. """state.json 数据归档管理器"""
  44. def __init__(self, project_root=None):
  45. if project_root is None:
  46. # 默认使用当前目录
  47. project_root = Path.cwd()
  48. else:
  49. project_root = Path(project_root)
  50. self.state_file = project_root / ".webnovel" / "state.json"
  51. self.archive_dir = project_root / ".webnovel" / "archive"
  52. # ============================================================================
  53. # 安全修复:使用安全目录创建函数(P1 MEDIUM)
  54. # 原代码: self.archive_dir.mkdir(parents=True, exist_ok=True)
  55. # 漏洞: 未设置权限,使用OS默认(可能为755,允许同组用户读取)
  56. # ============================================================================
  57. create_secure_directory(str(self.archive_dir))
  58. # 归档文件路径
  59. self.characters_archive = self.archive_dir / "characters.json"
  60. self.plot_threads_archive = self.archive_dir / "plot_threads.json"
  61. self.reviews_archive = self.archive_dir / "reviews.json"
  62. # 归档规则配置
  63. self.config = {
  64. "character_inactive_threshold": 50, # 角色超过 50 章未出场视为不活跃
  65. "plot_resolved_threshold": 20, # 已回收伏笔超过 20 章后归档
  66. "review_old_threshold": 50, # 审查报告超过 50 章后归档
  67. "file_size_trigger_mb": 1.0, # state.json 超过 1.0MB 触发强制归档
  68. "chapter_trigger": 10 # 每 10 章检查一次
  69. }
  70. def load_state(self):
  71. """加载 state.json"""
  72. if not self.state_file.exists():
  73. print(f"❌ state.json 不存在: {self.state_file}")
  74. sys.exit(1)
  75. with open(self.state_file, 'r', encoding='utf-8') as f:
  76. return json.load(f)
  77. def save_state(self, state):
  78. """保存 state.json(原子化写入)"""
  79. # 使用集中式原子写入(自动备份)
  80. atomic_write_json(self.state_file, state, use_lock=True, backup=True)
  81. print(f"✅ state.json 已原子化更新")
  82. def load_archive(self, archive_file):
  83. """加载归档文件"""
  84. if not archive_file.exists():
  85. return []
  86. with open(archive_file, 'r', encoding='utf-8') as f:
  87. return json.load(f)
  88. def save_archive(self, archive_file, data):
  89. """保存归档文件"""
  90. with open(archive_file, 'w', encoding='utf-8') as f:
  91. json.dump(data, f, ensure_ascii=False, indent=2)
  92. def check_trigger_conditions(self, state):
  93. """检查是否需要触发归档"""
  94. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  95. # 条件 1: 文件大小超过阈值
  96. file_size_mb = self.state_file.stat().st_size / (1024 * 1024)
  97. size_trigger = file_size_mb >= self.config["file_size_trigger_mb"]
  98. # 条件 2: 章节数是触发间隔的倍数
  99. chapter_trigger = (current_chapter % self.config["chapter_trigger"]) == 0 and current_chapter > 0
  100. return {
  101. "should_archive": size_trigger or chapter_trigger,
  102. "file_size_mb": file_size_mb,
  103. "current_chapter": current_chapter,
  104. "size_trigger": size_trigger,
  105. "chapter_trigger": chapter_trigger
  106. }
  107. def identify_inactive_characters(self, state):
  108. """识别不活跃的次要角色 (v5.0 entities_v3 格式)"""
  109. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  110. # v5.0: 从 entities_v3.角色 获取角色列表
  111. entities_v3 = state.get("entities_v3", {})
  112. characters_dict = entities_v3.get("角色", {})
  113. threshold = self.config["character_inactive_threshold"]
  114. inactive = []
  115. for char_id, char in characters_dict.items():
  116. # 只归档次要角色(tier="装饰" 或 tier="支线")
  117. tier = str(char.get("tier", "")).strip()
  118. if tier == "核心":
  119. continue
  120. # 检查最后出场章节
  121. last_appearance = char.get("last_appearance", 0)
  122. try:
  123. last_appearance = int(last_appearance)
  124. except (TypeError, ValueError):
  125. last_appearance = 0
  126. if last_appearance <= 0:
  127. continue
  128. inactive_chapters = current_chapter - last_appearance
  129. if inactive_chapters >= threshold:
  130. # 构造兼容结构
  131. char_data = {
  132. "id": char_id,
  133. "name": char.get("canonical_name", char_id),
  134. "tier": tier,
  135. "last_appearance_chapter": last_appearance
  136. }
  137. char_data.update(char)
  138. inactive.append({
  139. "character": char_data,
  140. "inactive_chapters": inactive_chapters,
  141. "last_appearance": last_appearance
  142. })
  143. return inactive
  144. def identify_resolved_plot_threads(self, state):
  145. """识别可归档的已回收伏笔"""
  146. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  147. plot_threads = state.get("plot_threads", {}) or {}
  148. foreshadowing = plot_threads.get("foreshadowing", []) or []
  149. resolved_legacy = plot_threads.get("resolved", []) or []
  150. threshold = self.config["plot_resolved_threshold"]
  151. archivable = []
  152. # 新格式:plot_threads.foreshadowing(用 status 标识是否已回收)
  153. if isinstance(foreshadowing, list):
  154. for item in foreshadowing:
  155. if not isinstance(item, dict):
  156. continue
  157. status = str(item.get("status", "")).strip()
  158. if status not in ["已回收", "resolved"]:
  159. continue
  160. try:
  161. resolved_chapter = int(item.get("resolved_chapter", 0))
  162. except (TypeError, ValueError):
  163. continue
  164. chapters_since_resolved = current_chapter - resolved_chapter
  165. if chapters_since_resolved >= threshold:
  166. archivable.append({
  167. "thread": item,
  168. "chapters_since_resolved": chapters_since_resolved,
  169. "resolved_chapter": resolved_chapter
  170. })
  171. # 旧格式兼容:plot_threads.resolved(直接存已回收列表)
  172. if isinstance(resolved_legacy, list):
  173. for item in resolved_legacy:
  174. if not isinstance(item, dict):
  175. continue
  176. try:
  177. resolved_chapter = int(item.get("resolved_chapter", 0))
  178. except (TypeError, ValueError):
  179. continue
  180. chapters_since_resolved = current_chapter - resolved_chapter
  181. if chapters_since_resolved >= threshold:
  182. archivable.append({
  183. "thread": item,
  184. "chapters_since_resolved": chapters_since_resolved,
  185. "resolved_chapter": resolved_chapter
  186. })
  187. return archivable
  188. def identify_old_reviews(self, state):
  189. """识别可归档的旧审查报告"""
  190. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  191. reviews = state.get("review_checkpoints", [])
  192. threshold = self.config["review_old_threshold"]
  193. def _parse_end_chapter(review: dict) -> int:
  194. # 新格式:{"chapters":"5-6","report":"...","reviewed_at":"..."}
  195. chapters = review.get("chapters")
  196. if isinstance(chapters, str):
  197. parts = [p.strip() for p in chapters.replace("—", "-").split("-") if p.strip()]
  198. if parts:
  199. try:
  200. return int(parts[-1])
  201. except ValueError:
  202. pass
  203. # 旧格式:{"chapter_range":[5,6], "date":"..."}
  204. cr = review.get("chapter_range")
  205. if isinstance(cr, (list, tuple)) and len(cr) >= 2:
  206. try:
  207. return int(cr[1])
  208. except (TypeError, ValueError):
  209. pass
  210. # 兜底:从 report 文件名里抓 "Ch5-6" 或 "第005-006"
  211. report = review.get("report")
  212. if isinstance(report, str):
  213. import re
  214. m = re.search(r"Ch(\d+)[-–—](\d+)", report)
  215. if m:
  216. try:
  217. return int(m.group(2))
  218. except ValueError:
  219. pass
  220. m = re.search(r"第(\d+)[-–—](\d+)章", report)
  221. if m:
  222. try:
  223. return int(m.group(2))
  224. except ValueError:
  225. pass
  226. return 0
  227. old_reviews = []
  228. for review in reviews:
  229. review_chapter = _parse_end_chapter(review)
  230. chapters_since_review = current_chapter - review_chapter
  231. if chapters_since_review >= threshold:
  232. old_reviews.append({
  233. "review": review,
  234. "chapters_since_review": chapters_since_review,
  235. "review_chapter": review_chapter
  236. })
  237. return old_reviews
  238. def archive_characters(self, inactive_list, dry_run=False):
  239. """归档不活跃角色(Priority 2 修复:与索引集成)"""
  240. if not inactive_list:
  241. return 0
  242. # 加载现有归档
  243. archived = self.load_archive(self.characters_archive)
  244. # 添加时间戳
  245. timestamp = datetime.now().isoformat()
  246. for item in inactive_list:
  247. item["character"]["archived_at"] = timestamp
  248. archived.append(item["character"])
  249. # ✅ Priority 2 修复:同步更新索引状态(而非删除)
  250. if not dry_run:
  251. try:
  252. # 导入索引模块
  253. import sys
  254. from pathlib import Path
  255. script_dir = Path(__file__).parent
  256. sys.path.insert(0, str(script_dir))
  257. from structured_index import StructuredIndex
  258. # 更新索引状态为 'archived'
  259. project_root = self.state_file.parent.parent
  260. index = StructuredIndex(str(project_root))
  261. index.mark_character_archived(item["character"]["name"], timestamp)
  262. except Exception as e:
  263. # 索引更新失败不影响归档流程
  264. print(f"⚠️ 索引状态更新失败(不影响归档): {e}")
  265. if not dry_run:
  266. self.save_archive(self.characters_archive, archived)
  267. return len(inactive_list)
  268. def archive_plot_threads(self, resolved_list, dry_run=False):
  269. """归档已回收伏笔"""
  270. if not resolved_list:
  271. return 0
  272. # 加载现有归档
  273. archived = self.load_archive(self.plot_threads_archive)
  274. # 添加时间戳
  275. timestamp = datetime.now().isoformat()
  276. for item in resolved_list:
  277. item["thread"]["archived_at"] = timestamp
  278. archived.append(item["thread"])
  279. if not dry_run:
  280. self.save_archive(self.plot_threads_archive, archived)
  281. return len(resolved_list)
  282. def archive_reviews(self, old_reviews_list, dry_run=False):
  283. """归档旧审查报告"""
  284. if not old_reviews_list:
  285. return 0
  286. # 加载现有归档
  287. archived = self.load_archive(self.reviews_archive)
  288. # 添加时间戳
  289. timestamp = datetime.now().isoformat()
  290. for item in old_reviews_list:
  291. item["review"]["archived_at"] = timestamp
  292. archived.append(item["review"])
  293. if not dry_run:
  294. self.save_archive(self.reviews_archive, archived)
  295. return len(old_reviews_list)
  296. def remove_from_state(self, state, inactive_chars, resolved_threads, old_reviews):
  297. """从 state.json 中移除已归档的数据 (v5.0 entities_v3 格式)"""
  298. # 移除不活跃角色 (v5.0: 从 entities_v3.角色 中移除)
  299. if inactive_chars:
  300. char_ids = {item["character"].get("id") for item in inactive_chars}
  301. entities_v3 = state.get("entities_v3", {})
  302. characters_dict = entities_v3.get("角色", {})
  303. for char_id in char_ids:
  304. if char_id in characters_dict:
  305. del characters_dict[char_id]
  306. # 移除已归档的伏笔
  307. if resolved_threads:
  308. thread_ids = {
  309. (item.get("thread", {}) or {}).get("content") or (item.get("thread", {}) or {}).get("description")
  310. for item in resolved_threads
  311. }
  312. thread_ids = {t for t in thread_ids if isinstance(t, str) and t.strip()}
  313. plot_threads = state.get("plot_threads", {}) or {}
  314. if isinstance(plot_threads.get("foreshadowing"), list):
  315. plot_threads["foreshadowing"] = [
  316. t for t in plot_threads["foreshadowing"]
  317. if not isinstance(t, dict) or (t.get("content") or t.get("description")) not in thread_ids
  318. ]
  319. if isinstance(plot_threads.get("resolved"), list):
  320. plot_threads["resolved"] = [
  321. t for t in plot_threads["resolved"]
  322. if not isinstance(t, dict) or (t.get("content") or t.get("description")) not in thread_ids
  323. ]
  324. state["plot_threads"] = plot_threads
  325. # 移除旧审查报告
  326. if old_reviews:
  327. review_keys = set()
  328. for item in old_reviews:
  329. review = item.get("review", {}) or {}
  330. key = review.get("report") or review.get("reviewed_at") or review.get("date")
  331. if isinstance(key, str) and key.strip():
  332. review_keys.add(key)
  333. state["review_checkpoints"] = [
  334. review for review in state.get("review_checkpoints", [])
  335. if (review.get("report") or review.get("reviewed_at") or review.get("date")) not in review_keys
  336. ]
  337. return state
  338. def run_auto_check(self, force=False, dry_run=False):
  339. """自动归档检查"""
  340. state = self.load_state()
  341. # 检查触发条件
  342. trigger = self.check_trigger_conditions(state)
  343. if not force and not trigger["should_archive"]:
  344. print("✅ 无需归档(触发条件未满足)")
  345. print(f" 文件大小: {trigger['file_size_mb']:.2f} MB (阈值: {self.config['file_size_trigger_mb']} MB)")
  346. print(f" 当前章节: {trigger['current_chapter']} (每 {self.config['chapter_trigger']} 章触发)")
  347. return
  348. print("🔍 开始归档检查...")
  349. print(f" 文件大小: {trigger['file_size_mb']:.2f} MB")
  350. print(f" 当前章节: {trigger['current_chapter']}")
  351. # 识别可归档数据
  352. inactive_chars = self.identify_inactive_characters(state)
  353. resolved_threads = self.identify_resolved_plot_threads(state)
  354. old_reviews = self.identify_old_reviews(state)
  355. # 输出统计
  356. print(f"\n📊 归档统计:")
  357. print(f" 不活跃角色: {len(inactive_chars)}")
  358. print(f" 已回收伏笔: {len(resolved_threads)}")
  359. print(f" 旧审查报告: {len(old_reviews)}")
  360. if not (inactive_chars or resolved_threads or old_reviews):
  361. print("\n✅ 无需归档(无符合条件的数据)")
  362. return
  363. # Dry-run 模式
  364. if dry_run:
  365. print("\n🔍 [Dry-run] 将被归档的数据:")
  366. if inactive_chars:
  367. print("\n 不活跃角色:")
  368. for item in inactive_chars[:5]: # 只显示前 5 个
  369. print(f" - {item['character']['name']} (超过 {item['inactive_chapters']} 章未出场)")
  370. if resolved_threads:
  371. print("\n 已回收伏笔:")
  372. for item in resolved_threads[:5]:
  373. desc = item["thread"].get("content") or item["thread"].get("description") or ""
  374. print(f" - {str(desc)[:30]}... (已回收 {item['chapters_since_resolved']} 章)")
  375. if old_reviews:
  376. print("\n 旧审查报告:")
  377. for item in old_reviews[:5]:
  378. print(f" - Ch{item['review_chapter']} ({item['chapters_since_review']} 章前)")
  379. return
  380. # 执行归档
  381. chars_archived = self.archive_characters(inactive_chars, dry_run=dry_run)
  382. threads_archived = self.archive_plot_threads(resolved_threads, dry_run=dry_run)
  383. reviews_archived = self.archive_reviews(old_reviews, dry_run=dry_run)
  384. # 从 state.json 中移除
  385. state = self.remove_from_state(state, inactive_chars, resolved_threads, old_reviews)
  386. self.save_state(state)
  387. # 最终统计
  388. print(f"\n✅ 归档完成:")
  389. print(f" 角色归档: {chars_archived} → {self.characters_archive.name}")
  390. print(f" 伏笔归档: {threads_archived} → {self.plot_threads_archive.name}")
  391. print(f" 报告归档: {reviews_archived} → {self.reviews_archive.name}")
  392. # 显示归档后的文件大小
  393. new_size_mb = self.state_file.stat().st_size / (1024 * 1024)
  394. saved_mb = trigger["file_size_mb"] - new_size_mb
  395. print(f"\n💾 文件大小: {trigger['file_size_mb']:.2f} MB → {new_size_mb:.2f} MB (节省 {saved_mb:.2f} MB)")
  396. def restore_character(self, name):
  397. """恢复归档的角色(Priority 2 修复:同步恢复索引状态)"""
  398. archived = self.load_archive(self.characters_archive)
  399. state = self.load_state()
  400. # 查找角色
  401. char_to_restore = None
  402. for char in archived:
  403. if char["name"] == name:
  404. char_to_restore = char
  405. break
  406. if not char_to_restore:
  407. print(f"❌ 归档中未找到角色: {name}")
  408. return
  409. # 移除 archived_at 字段
  410. char_to_restore.pop("archived_at", None)
  411. # ✅ 原子性修复:先从归档中移除,再添加到 state.json
  412. # 理由:即使崩溃,数据仍在归档中,可重新恢复,不会丢失或重复
  413. archived = [char for char in archived if char["name"] != name]
  414. self.save_archive(self.characters_archive, archived)
  415. # 恢复到 state.json (v5.0: 添加到 entities_v3.角色)
  416. if "entities_v3" not in state:
  417. state["entities_v3"] = {"角色": {}, "地点": {}, "物品": {}, "势力": {}, "招式": {}}
  418. if "角色" not in state["entities_v3"]:
  419. state["entities_v3"]["角色"] = {}
  420. char_id = char_to_restore.get("id", char_to_restore.get("name", "unknown"))
  421. state["entities_v3"]["角色"][char_id] = {
  422. "canonical_name": char_to_restore.get("name", char_id),
  423. "tier": char_to_restore.get("tier", "装饰"),
  424. "desc": char_to_restore.get("desc", ""),
  425. "current": char_to_restore.get("current", {}),
  426. "first_appearance": char_to_restore.get("first_appearance", 0),
  427. "last_appearance": char_to_restore.get("last_appearance", 0),
  428. "history": char_to_restore.get("history", [])
  429. }
  430. self.save_state(state)
  431. # ✅ Priority 2 修复:同步恢复索引状态为 'active'
  432. try:
  433. import sys
  434. from pathlib import Path
  435. script_dir = Path(__file__).parent
  436. sys.path.insert(0, str(script_dir))
  437. from structured_index import StructuredIndex
  438. project_root = self.state_file.parent.parent
  439. index = StructuredIndex(str(project_root))
  440. index.mark_character_active(name)
  441. except Exception as e:
  442. print(f"⚠️ 索引状态恢复失败(不影响数据恢复): {e}")
  443. print(f"✅ 角色已恢复: {name}")
  444. def show_stats(self):
  445. """显示归档统计"""
  446. chars = self.load_archive(self.characters_archive)
  447. threads = self.load_archive(self.plot_threads_archive)
  448. reviews = self.load_archive(self.reviews_archive)
  449. print("📊 归档统计:")
  450. print(f" 角色归档: {len(chars)}")
  451. print(f" 伏笔归档: {len(threads)}")
  452. print(f" 报告归档: {len(reviews)}")
  453. # 计算归档文件大小
  454. total_size = 0
  455. for archive_file in [self.characters_archive, self.plot_threads_archive, self.reviews_archive]:
  456. if archive_file.exists():
  457. total_size += archive_file.stat().st_size
  458. print(f" 归档大小: {total_size / 1024:.2f} KB")
  459. # 显示 state.json 大小
  460. state_size_mb = self.state_file.stat().st_size / (1024 * 1024)
  461. print(f"\n💾 state.json 当前大小: {state_size_mb:.2f} MB")
  462. def main():
  463. parser = argparse.ArgumentParser(description="state.json 数据归档管理")
  464. parser.add_argument("--auto-check", action="store_true", help="自动归档检查")
  465. parser.add_argument("--force", action="store_true", help="强制归档(忽略触发条件)")
  466. parser.add_argument("--dry-run", action="store_true", help="Dry-run 模式(仅显示将被归档的数据)")
  467. parser.add_argument("--restore-character", metavar="NAME", help="恢复归档的角色")
  468. parser.add_argument("--stats", action="store_true", help="显示归档统计")
  469. parser.add_argument("--project-root", metavar="PATH", help="项目根目录(默认为当前目录)")
  470. args = parser.parse_args()
  471. # 创建管理器(支持从仓库根目录运行)
  472. project_root = args.project_root
  473. if project_root is None and not (Path.cwd() / ".webnovel" / "state.json").exists():
  474. try:
  475. project_root = str(resolve_project_root())
  476. except FileNotFoundError:
  477. project_root = None
  478. manager = ArchiveManager(project_root=project_root)
  479. # 执行操作
  480. if args.auto_check or args.force:
  481. manager.run_auto_check(force=args.force, dry_run=args.dry_run)
  482. elif args.restore_character:
  483. manager.restore_character(args.restore_character)
  484. elif args.stats:
  485. manager.show_stats()
  486. else:
  487. parser.print_help()
  488. if __name__ == "__main__":
  489. main()