archive_manager.py.backup_20260102 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. #!/usr/bin/env python3
  2. """
  3. state.json 数据归档管理脚本
  4. 目标:防止 state.json 无限增长,确保 200 万字长跑稳定运行
  5. 功能:
  6. 1. 智能归档长期未使用的数据(角色/伏笔/审查报告)
  7. 2. 自动触发条件检测(文件大小/章节数)
  8. 3. 安全备份与恢复机制
  9. 4. 归档数据可随时恢复
  10. 归档策略:
  11. - 角色:超过 50 章未出场的次要角色 → archive/characters.json
  12. - 伏笔:status="已回收" 且超过 20 章的伏笔 → archive/plot_threads.json
  13. - 审查报告:超过 20 章的旧报告 → archive/reviews.json
  14. 使用方式:
  15. # 自动归档检查(推荐在 update_state.py 之后调用)
  16. python archive_manager.py --auto-check
  17. # 强制归档(忽略触发条件)
  18. python archive_manager.py --force
  19. # 恢复特定角色
  20. python archive_manager.py --restore-character "李雪"
  21. # 查看归档统计
  22. python archive_manager.py --stats
  23. # Dry-run 模式(仅显示将被归档的数据)
  24. python archive_manager.py --auto-check --dry-run
  25. """
  26. import json
  27. import os
  28. import sys
  29. import argparse
  30. from datetime import datetime
  31. from pathlib import Path
  32. # Windows UTF-8 编码修复
  33. if sys.platform == 'win32':
  34. import io
  35. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  36. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
  37. class ArchiveManager:
  38. """state.json 数据归档管理器"""
  39. def __init__(self, project_root=None):
  40. if project_root is None:
  41. # 默认使用当前目录
  42. project_root = Path.cwd()
  43. else:
  44. project_root = Path(project_root)
  45. self.state_file = project_root / ".webnovel" / "state.json"
  46. self.archive_dir = project_root / ".webnovel" / "archive"
  47. # 确保归档目录存在
  48. self.archive_dir.mkdir(parents=True, exist_ok=True)
  49. # 归档文件路径
  50. self.characters_archive = self.archive_dir / "characters.json"
  51. self.plot_threads_archive = self.archive_dir / "plot_threads.json"
  52. self.reviews_archive = self.archive_dir / "reviews.json"
  53. # 归档规则配置
  54. self.config = {
  55. "character_inactive_threshold": 50, # 角色超过 50 章未出场视为不活跃
  56. "plot_resolved_threshold": 20, # 已回收伏笔超过 20 章后归档
  57. "review_old_threshold": 20, # 审查报告超过 20 章后归档(从 50 降至 20)
  58. "file_size_trigger_mb": 0.5, # state.json 超过 0.5MB 触发归档(从 1.0 降至 0.5)
  59. "chapter_trigger": 10 # 每 10 章检查一次
  60. }
  61. def load_state(self):
  62. """加载 state.json"""
  63. if not self.state_file.exists():
  64. print(f"❌ state.json 不存在: {self.state_file}")
  65. sys.exit(1)
  66. with open(self.state_file, 'r', encoding='utf-8') as f:
  67. return json.load(f)
  68. def save_state(self, state):
  69. """保存 state.json(带备份)"""
  70. # 备份原文件
  71. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  72. backup_file = self.state_file.parent / f"state.backup_{timestamp}.json"
  73. if self.state_file.exists():
  74. import shutil
  75. shutil.copy2(self.state_file, backup_file)
  76. # 写入新文件
  77. with open(self.state_file, 'w', encoding='utf-8') as f:
  78. json.dump(state, f, ensure_ascii=False, indent=2)
  79. print(f"✅ state.json 已更新(备份: {backup_file.name})")
  80. def load_archive(self, archive_file):
  81. """加载归档文件"""
  82. if not archive_file.exists():
  83. return []
  84. with open(archive_file, 'r', encoding='utf-8') as f:
  85. return json.load(f)
  86. def save_archive(self, archive_file, data):
  87. """保存归档文件"""
  88. with open(archive_file, 'w', encoding='utf-8') as f:
  89. json.dump(data, f, ensure_ascii=False, indent=2)
  90. def check_trigger_conditions(self, state):
  91. """检查是否需要触发归档"""
  92. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  93. # 条件 1: 文件大小超过阈值
  94. file_size_mb = self.state_file.stat().st_size / (1024 * 1024)
  95. size_trigger = file_size_mb >= self.config["file_size_trigger_mb"]
  96. # 条件 2: 章节数是触发间隔的倍数
  97. chapter_trigger = (current_chapter % self.config["chapter_trigger"]) == 0 and current_chapter > 0
  98. return {
  99. "should_archive": size_trigger or chapter_trigger,
  100. "file_size_mb": file_size_mb,
  101. "current_chapter": current_chapter,
  102. "size_trigger": size_trigger,
  103. "chapter_trigger": chapter_trigger
  104. }
  105. def identify_inactive_characters(self, state):
  106. """识别不活跃的次要角色"""
  107. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  108. characters = state.get("entities", {}).get("characters", [])
  109. threshold = self.config["character_inactive_threshold"]
  110. inactive = []
  111. for char in characters:
  112. # 只归档次要角色(importance="minor")
  113. if char.get("importance") != "minor":
  114. continue
  115. # 检查最后出场章节
  116. last_appearance = char.get("last_appearance_chapter", 0)
  117. inactive_chapters = current_chapter - last_appearance
  118. if inactive_chapters >= threshold:
  119. inactive.append({
  120. "character": char,
  121. "inactive_chapters": inactive_chapters,
  122. "last_appearance": last_appearance
  123. })
  124. return inactive
  125. def identify_resolved_plot_threads(self, state):
  126. """识别可归档的已回收伏笔"""
  127. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  128. plot_threads = state.get("plot_threads", {}).get("active", [])
  129. resolved = state.get("plot_threads", {}).get("resolved", [])
  130. threshold = self.config["plot_resolved_threshold"]
  131. archivable = []
  132. for thread in resolved:
  133. resolved_chapter = thread.get("resolved_chapter", 0)
  134. chapters_since_resolved = current_chapter - resolved_chapter
  135. if chapters_since_resolved >= threshold:
  136. archivable.append({
  137. "thread": thread,
  138. "chapters_since_resolved": chapters_since_resolved,
  139. "resolved_chapter": resolved_chapter
  140. })
  141. return archivable
  142. def identify_old_reviews(self, state):
  143. """识别可归档的旧审查报告"""
  144. current_chapter = state.get("progress", {}).get("current_chapter", 0)
  145. reviews = state.get("review_checkpoints", [])
  146. threshold = self.config["review_old_threshold"]
  147. old_reviews = []
  148. for review in reviews:
  149. review_chapter = review.get("chapter_range", [0, 0])[1] # 取结束章节
  150. chapters_since_review = current_chapter - review_chapter
  151. if chapters_since_review >= threshold:
  152. old_reviews.append({
  153. "review": review,
  154. "chapters_since_review": chapters_since_review,
  155. "review_chapter": review_chapter
  156. })
  157. return old_reviews
  158. def archive_characters(self, inactive_list, dry_run=False):
  159. """归档不活跃角色(Priority 2 修复:与索引集成)"""
  160. if not inactive_list:
  161. return 0
  162. # 加载现有归档
  163. archived = self.load_archive(self.characters_archive)
  164. # 添加时间戳
  165. timestamp = datetime.now().isoformat()
  166. for item in inactive_list:
  167. item["character"]["archived_at"] = timestamp
  168. archived.append(item["character"])
  169. # ✅ Priority 2 修复:同步更新索引状态(而非删除)
  170. if not dry_run:
  171. try:
  172. # 导入索引模块
  173. import sys
  174. from pathlib import Path
  175. script_dir = Path(__file__).parent
  176. sys.path.insert(0, str(script_dir))
  177. from structured_index import StructuredIndex
  178. # 更新索引状态为 'archived'
  179. project_root = self.state_file.parent.parent
  180. index = StructuredIndex(str(project_root))
  181. index.mark_character_archived(item["character"]["name"], timestamp)
  182. except Exception as e:
  183. # 索引更新失败不影响归档流程
  184. print(f"⚠️ 索引状态更新失败(不影响归档): {e}")
  185. if not dry_run:
  186. self.save_archive(self.characters_archive, archived)
  187. return len(inactive_list)
  188. def archive_plot_threads(self, resolved_list, dry_run=False):
  189. """归档已回收伏笔"""
  190. if not resolved_list:
  191. return 0
  192. # 加载现有归档
  193. archived = self.load_archive(self.plot_threads_archive)
  194. # 添加时间戳
  195. timestamp = datetime.now().isoformat()
  196. for item in resolved_list:
  197. item["thread"]["archived_at"] = timestamp
  198. archived.append(item["thread"])
  199. if not dry_run:
  200. self.save_archive(self.plot_threads_archive, archived)
  201. return len(resolved_list)
  202. def archive_reviews(self, old_reviews_list, dry_run=False):
  203. """归档旧审查报告"""
  204. if not old_reviews_list:
  205. return 0
  206. # 加载现有归档
  207. archived = self.load_archive(self.reviews_archive)
  208. # 添加时间戳
  209. timestamp = datetime.now().isoformat()
  210. for item in old_reviews_list:
  211. item["review"]["archived_at"] = timestamp
  212. archived.append(item["review"])
  213. if not dry_run:
  214. self.save_archive(self.reviews_archive, archived)
  215. return len(old_reviews_list)
  216. def remove_from_state(self, state, inactive_chars, resolved_threads, old_reviews):
  217. """从 state.json 中移除已归档的数据"""
  218. # 移除不活跃角色
  219. if inactive_chars:
  220. char_names = {item["character"]["name"] for item in inactive_chars}
  221. state["entities"]["characters"] = [
  222. char for char in state["entities"]["characters"]
  223. if char["name"] not in char_names
  224. ]
  225. # 移除已归档的伏笔
  226. if resolved_threads:
  227. thread_ids = {item["thread"]["description"] for item in resolved_threads}
  228. state["plot_threads"]["resolved"] = [
  229. thread for thread in state["plot_threads"]["resolved"]
  230. if thread["description"] not in thread_ids
  231. ]
  232. # 移除旧审查报告
  233. if old_reviews:
  234. review_dates = {item["review"]["date"] for item in old_reviews}
  235. state["review_checkpoints"] = [
  236. review for review in state["review_checkpoints"]
  237. if review["date"] not in review_dates
  238. ]
  239. return state
  240. def run_auto_check(self, force=False, dry_run=False):
  241. """自动归档检查"""
  242. state = self.load_state()
  243. # 检查触发条件
  244. trigger = self.check_trigger_conditions(state)
  245. if not force and not trigger["should_archive"]:
  246. print("✅ 无需归档(触发条件未满足)")
  247. print(f" 文件大小: {trigger['file_size_mb']:.2f} MB (阈值: {self.config['file_size_trigger_mb']} MB)")
  248. print(f" 当前章节: {trigger['current_chapter']} (每 {self.config['chapter_trigger']} 章触发)")
  249. return
  250. print("🔍 开始归档检查...")
  251. print(f" 文件大小: {trigger['file_size_mb']:.2f} MB")
  252. print(f" 当前章节: {trigger['current_chapter']}")
  253. # 识别可归档数据
  254. inactive_chars = self.identify_inactive_characters(state)
  255. resolved_threads = self.identify_resolved_plot_threads(state)
  256. old_reviews = self.identify_old_reviews(state)
  257. # 输出统计
  258. print(f"\n📊 归档统计:")
  259. print(f" 不活跃角色: {len(inactive_chars)}")
  260. print(f" 已回收伏笔: {len(resolved_threads)}")
  261. print(f" 旧审查报告: {len(old_reviews)}")
  262. if not (inactive_chars or resolved_threads or old_reviews):
  263. print("\n✅ 无需归档(无符合条件的数据)")
  264. return
  265. # Dry-run 模式
  266. if dry_run:
  267. print("\n🔍 [Dry-run] 将被归档的数据:")
  268. if inactive_chars:
  269. print("\n 不活跃角色:")
  270. for item in inactive_chars[:5]: # 只显示前 5 个
  271. print(f" - {item['character']['name']} (超过 {item['inactive_chapters']} 章未出场)")
  272. if resolved_threads:
  273. print("\n 已回收伏笔:")
  274. for item in resolved_threads[:5]:
  275. print(f" - {item['thread']['description'][:30]}... (已回收 {item['chapters_since_resolved']} 章)")
  276. if old_reviews:
  277. print("\n 旧审查报告:")
  278. for item in old_reviews[:5]:
  279. print(f" - Ch{item['review_chapter']} ({item['chapters_since_review']} 章前)")
  280. return
  281. # 执行归档
  282. chars_archived = self.archive_characters(inactive_chars, dry_run=dry_run)
  283. threads_archived = self.archive_plot_threads(resolved_threads, dry_run=dry_run)
  284. reviews_archived = self.archive_reviews(old_reviews, dry_run=dry_run)
  285. # 从 state.json 中移除
  286. state = self.remove_from_state(state, inactive_chars, resolved_threads, old_reviews)
  287. self.save_state(state)
  288. # 最终统计
  289. print(f"\n✅ 归档完成:")
  290. print(f" 角色归档: {chars_archived} → {self.characters_archive.name}")
  291. print(f" 伏笔归档: {threads_archived} → {self.plot_threads_archive.name}")
  292. print(f" 报告归档: {reviews_archived} → {self.reviews_archive.name}")
  293. # 显示归档后的文件大小
  294. new_size_mb = self.state_file.stat().st_size / (1024 * 1024)
  295. saved_mb = trigger["file_size_mb"] - new_size_mb
  296. print(f"\n💾 文件大小: {trigger['file_size_mb']:.2f} MB → {new_size_mb:.2f} MB (节省 {saved_mb:.2f} MB)")
  297. def restore_character(self, name):
  298. """恢复归档的角色(Priority 2 修复:同步恢复索引状态)"""
  299. archived = self.load_archive(self.characters_archive)
  300. state = self.load_state()
  301. # 查找角色
  302. char_to_restore = None
  303. for char in archived:
  304. if char["name"] == name:
  305. char_to_restore = char
  306. break
  307. if not char_to_restore:
  308. print(f"❌ 归档中未找到角色: {name}")
  309. return
  310. # 移除 archived_at 字段
  311. char_to_restore.pop("archived_at", None)
  312. # ✅ 原子性修复:先从归档中移除,再添加到 state.json
  313. # 理由:即使崩溃,数据仍在归档中,可重新恢复,不会丢失或重复
  314. archived = [char for char in archived if char["name"] != name]
  315. self.save_archive(self.characters_archive, archived)
  316. # 恢复到 state.json
  317. state["entities"]["characters"].append(char_to_restore)
  318. self.save_state(state)
  319. # ✅ Priority 2 修复:同步恢复索引状态为 'active'
  320. try:
  321. import sys
  322. from pathlib import Path
  323. script_dir = Path(__file__).parent
  324. sys.path.insert(0, str(script_dir))
  325. from structured_index import StructuredIndex
  326. project_root = self.state_file.parent.parent
  327. index = StructuredIndex(str(project_root))
  328. index.mark_character_active(name)
  329. except Exception as e:
  330. print(f"⚠️ 索引状态恢复失败(不影响数据恢复): {e}")
  331. print(f"✅ 角色已恢复: {name}")
  332. def show_stats(self):
  333. """显示归档统计"""
  334. chars = self.load_archive(self.characters_archive)
  335. threads = self.load_archive(self.plot_threads_archive)
  336. reviews = self.load_archive(self.reviews_archive)
  337. print("📊 归档统计:")
  338. print(f" 角色归档: {len(chars)}")
  339. print(f" 伏笔归档: {len(threads)}")
  340. print(f" 报告归档: {len(reviews)}")
  341. # 计算归档文件大小
  342. total_size = 0
  343. for archive_file in [self.characters_archive, self.plot_threads_archive, self.reviews_archive]:
  344. if archive_file.exists():
  345. total_size += archive_file.stat().st_size
  346. print(f" 归档大小: {total_size / 1024:.2f} KB")
  347. # 显示 state.json 大小
  348. state_size_mb = self.state_file.stat().st_size / (1024 * 1024)
  349. print(f"\n💾 state.json 当前大小: {state_size_mb:.2f} MB")
  350. def main():
  351. parser = argparse.ArgumentParser(description="state.json 数据归档管理")
  352. parser.add_argument("--auto-check", action="store_true", help="自动归档检查")
  353. parser.add_argument("--force", action="store_true", help="强制归档(忽略触发条件)")
  354. parser.add_argument("--dry-run", action="store_true", help="Dry-run 模式(仅显示将被归档的数据)")
  355. parser.add_argument("--restore-character", metavar="NAME", help="恢复归档的角色")
  356. parser.add_argument("--stats", action="store_true", help="显示归档统计")
  357. parser.add_argument("--project-root", metavar="PATH", help="项目根目录(默认为当前目录)")
  358. args = parser.parse_args()
  359. # 创建管理器
  360. manager = ArchiveManager(project_root=args.project_root)
  361. # 执行操作
  362. if args.auto_check or args.force:
  363. manager.run_auto_check(force=args.force, dry_run=args.dry_run)
  364. elif args.restore_character:
  365. manager.restore_character(args.restore_character)
  366. elif args.stats:
  367. manager.show_stats()
  368. else:
  369. parser.print_help()
  370. if __name__ == "__main__":
  371. main()