extract_entities.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. #!/usr/bin/env python3
  2. """
  3. [NEW_ENTITY] 标签提取与同步脚本
  4. 功能:
  5. 1. 扫描指定章节正文,提取所有 [NEW_ENTITY] 标签
  6. 2. 解析实体类型(角色/地点/物品/势力/招式)
  7. 3. 同步到设定集对应文件
  8. 4. 更新 state.json 中的相关记录
  9. 5. 支持自动化模式和交互式模式
  10. 使用方式:
  11. python extract_entities.py <章节文件> [--auto] [--dry-run]
  12. 示例:
  13. python extract_entities.py ../../../正文/第0001章.md # 交互式模式
  14. python extract_entities.py ../../../正文/第0001章.md --auto # 自动化模式
  15. python extract_entities.py ../../../正文/第0001章.md --dry-run # 仅预览不写入
  16. """
  17. import re
  18. import json
  19. import os
  20. import sys
  21. from pathlib import Path
  22. from datetime import datetime
  23. from typing import List, Dict, Tuple
  24. # Windows 编码兼容性修复
  25. if sys.platform == 'win32':
  26. import io
  27. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  28. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
  29. # 实体类型与目标文件映射
  30. ENTITY_TYPE_MAP = {
  31. "角色": "设定集/角色库/{category}/{name}.md",
  32. "地点": "设定集/世界观.md", # 追加到世界观地理章节
  33. "物品": "设定集/物品库/{name}.md",
  34. "势力": "设定集/世界观.md", # 追加到势力章节
  35. "招式": "设定集/力量体系.md", # 追加到招式章节
  36. "其他": "设定集/其他设定/{name}.md"
  37. }
  38. # 角色分类规则
  39. ROLE_CATEGORY_MAP = {
  40. "主角": "主要角色",
  41. "配角": "次要角色",
  42. "反派": "反派角色",
  43. "路人": "次要角色"
  44. }
  45. def extract_new_entities(file_path: str) -> List[Dict]:
  46. """
  47. 从章节文件中提取所有 [NEW_ENTITY] 标签
  48. 标签格式:
  49. [NEW_ENTITY: 角色, 李雪, 天云宗外门弟子,主角的青梅竹马]
  50. [NEW_ENTITY: 地点, 血煞秘境, 危险的试炼之地,内有金丹期凶兽]
  51. [NEW_ENTITY: 物品, 天雷果, 可提升雷属性修炼速度的灵果]
  52. Returns:
  53. List[Dict]: [{"type": "角色", "name": "李雪", "desc": "...", "line": 123}, ...]
  54. """
  55. entities = []
  56. with open(file_path, 'r', encoding='utf-8') as f:
  57. for line_num, line in enumerate(f, 1):
  58. # 匹配 [NEW_ENTITY: 类型, 名称, 描述]
  59. matches = re.findall(
  60. r'\[NEW_ENTITY:\s*([^,]+),\s*([^,]+),\s*([^\]]+)\]',
  61. line
  62. )
  63. for match in matches:
  64. entity_type = match[0].strip()
  65. entity_name = match[1].strip()
  66. entity_desc = match[2].strip()
  67. entities.append({
  68. "type": entity_type,
  69. "name": entity_name,
  70. "desc": entity_desc,
  71. "line": line_num,
  72. "source_file": file_path
  73. })
  74. return entities
  75. def categorize_character(desc: str) -> str:
  76. """
  77. 根据描述判断角色分类
  78. 规则:
  79. - 包含"主角"/"林天" → 主要角色
  80. - 包含"反派"/"敌对"/"血煞门" → 反派角色
  81. - 其他 → 次要角色
  82. """
  83. if "主角" in desc or "重要" in desc:
  84. return "主要角色"
  85. elif "反派" in desc or "敌对" in desc or "血煞" in desc:
  86. return "反派角色"
  87. else:
  88. return "次要角色"
  89. def generate_character_card(entity: Dict, category: str) -> str:
  90. """生成角色卡 Markdown 内容"""
  91. return f"""# {entity['name']}
  92. > **首次登场**: {entity.get('source_file', '未知')}(第 {entity.get('line', '?')} 行)
  93. > **创建时间**: {datetime.now().strftime('%Y-%m-%d')}
  94. ## 基本信息
  95. - **姓名**: {entity['name']}
  96. - **性别**: 待补充
  97. - **年龄**: 待补充
  98. - **身份**: {entity['desc']}
  99. - **所属势力**: 待补充
  100. ## 实力设定
  101. - **当前境界**: 待补充
  102. - **擅长招式**: 待补充
  103. - **特殊能力**: 待补充
  104. ## 性格特点
  105. {entity['desc']}
  106. ## 外貌描述
  107. 待补充
  108. ## 人际关系
  109. - **与主角**: 待补充
  110. ## 重要剧情
  111. - 【第 X 章】{entity['desc']}
  112. ## 备注
  113. 自动提取自 [NEW_ENTITY] 标签,请补充完善。
  114. """
  115. def update_world_view(entity: Dict, target_file: str, section: str):
  116. """更新世界观.md(追加地点/势力信息)"""
  117. if not os.path.exists(target_file):
  118. # 创建基础模板
  119. content = f"""# 世界观
  120. ## 地理
  121. ## 势力
  122. ## 历史背景
  123. """
  124. with open(target_file, 'w', encoding='utf-8') as f:
  125. f.write(content)
  126. # 读取现有内容
  127. with open(target_file, 'r', encoding='utf-8') as f:
  128. content = f.read()
  129. # 追加到对应章节
  130. if section == "地理":
  131. entry = f"""
  132. ### {entity['name']}
  133. {entity['desc']}
  134. > 首次登场: {entity.get('source_file', '未知')}
  135. """
  136. elif section == "势力":
  137. entry = f"""
  138. ### {entity['name']}
  139. {entity['desc']}
  140. > 首次登场: {entity.get('source_file', '未知')}
  141. """
  142. # 在对应章节后追加
  143. pattern = f"## {section}"
  144. if pattern in content:
  145. content = content.replace(pattern, f"{pattern}\n{entry}")
  146. else:
  147. content += f"\n## {section}\n{entry}"
  148. with open(target_file, 'w', encoding='utf-8') as f:
  149. f.write(content)
  150. def update_power_system(entity: Dict, target_file: str):
  151. """更新力量体系.md(追加招式)"""
  152. if not os.path.exists(target_file):
  153. content = f"""# 力量体系
  154. ## 境界划分
  155. ## 修炼方法
  156. ## 招式库
  157. """
  158. with open(target_file, 'w', encoding='utf-8') as f:
  159. f.write(content)
  160. with open(target_file, 'r', encoding='utf-8') as f:
  161. content = f.read()
  162. entry = f"""
  163. ### {entity['name']}
  164. {entity['desc']}
  165. > 首次登场: {entity.get('source_file', '未知')}
  166. """
  167. if "## 招式库" in content:
  168. content = content.replace("## 招式库", f"## 招式库\n{entry}")
  169. else:
  170. content += f"\n## 招式库\n{entry}"
  171. with open(target_file, 'w', encoding='utf-8') as f:
  172. f.write(content)
  173. def update_state_json(entities: List[Dict], state_file: str):
  174. """更新 state.json 中的实体记录"""
  175. with open(state_file, 'r', encoding='utf-8') as f:
  176. state = json.load(f)
  177. # 确保存在实体列表
  178. if 'entities' not in state:
  179. state['entities'] = {
  180. "characters": [],
  181. "locations": [],
  182. "items": [],
  183. "factions": [],
  184. "techniques": []
  185. }
  186. for entity in entities:
  187. entity_type = entity['type']
  188. if entity_type == "角色":
  189. if entity['name'] not in [c.get('name') for c in state['entities']['characters']]:
  190. state['entities']['characters'].append({
  191. "name": entity['name'],
  192. "desc": entity['desc'],
  193. "category": categorize_character(entity['desc']),
  194. "first_appearance": entity.get('source_file', ''),
  195. "added_at": datetime.now().strftime('%Y-%m-%d')
  196. })
  197. elif entity_type == "地点":
  198. if entity['name'] not in [l.get('name') for l in state['entities']['locations']]:
  199. state['entities']['locations'].append({
  200. "name": entity['name'],
  201. "desc": entity['desc'],
  202. "first_appearance": entity.get('source_file', ''),
  203. "added_at": datetime.now().strftime('%Y-%m-%d')
  204. })
  205. elif entity_type == "物品":
  206. if entity['name'] not in [i.get('name') for i in state['entities']['items']]:
  207. state['entities']['items'].append({
  208. "name": entity['name'],
  209. "desc": entity['desc'],
  210. "first_appearance": entity.get('source_file', ''),
  211. "added_at": datetime.now().strftime('%Y-%m-%d')
  212. })
  213. elif entity_type == "势力":
  214. if entity['name'] not in [f.get('name') for f in state['entities']['factions']]:
  215. state['entities']['factions'].append({
  216. "name": entity['name'],
  217. "desc": entity['desc'],
  218. "first_appearance": entity.get('source_file', ''),
  219. "added_at": datetime.now().strftime('%Y-%m-%d')
  220. })
  221. elif entity_type == "招式":
  222. if entity['name'] not in [t.get('name') for t in state['entities']['techniques']]:
  223. state['entities']['techniques'].append({
  224. "name": entity['name'],
  225. "desc": entity['desc'],
  226. "first_appearance": entity.get('source_file', ''),
  227. "added_at": datetime.now().strftime('%Y-%m-%d')
  228. })
  229. # 备份旧文件
  230. backup_file = state_file.replace('.json', f'.backup_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json')
  231. os.rename(state_file, backup_file)
  232. # 写入新文件
  233. with open(state_file, 'w', encoding='utf-8') as f:
  234. json.dump(state, f, ensure_ascii=False, indent=2)
  235. print(f"✅ 已备份旧状态文件到: {backup_file}")
  236. def sync_entity_to_settings(entity: Dict, project_root: str, auto_mode: bool = False) -> bool:
  237. """
  238. 将实体同步到设定集
  239. Returns:
  240. bool: 是否成功同步
  241. """
  242. entity_type = entity['type']
  243. entity_name = entity['name']
  244. if entity_type == "角色":
  245. category = categorize_character(entity['desc'])
  246. category_dir = ROLE_CATEGORY_MAP.get(category.split('/')[0], "次要角色")
  247. target_dir = Path(project_root) / f"设定集/角色库/{category_dir}"
  248. target_dir.mkdir(parents=True, exist_ok=True)
  249. target_file = target_dir / f"{entity_name}.md"
  250. if target_file.exists():
  251. print(f"⚠️ 角色卡已存在: {target_file}")
  252. if not auto_mode:
  253. choice = input("是否覆盖?(y/n): ")
  254. if choice.lower() != 'y':
  255. return False
  256. with open(target_file, 'w', encoding='utf-8') as f:
  257. f.write(generate_character_card(entity, category))
  258. print(f"✅ 已创建角色卡: {target_file}")
  259. return True
  260. elif entity_type == "地点":
  261. target_file = Path(project_root) / "设定集/世界观.md"
  262. update_world_view(entity, str(target_file), "地理")
  263. print(f"✅ 已更新世界观(地理): {entity_name}")
  264. return True
  265. elif entity_type == "势力":
  266. target_file = Path(project_root) / "设定集/世界观.md"
  267. update_world_view(entity, str(target_file), "势力")
  268. print(f"✅ 已更新世界观(势力): {entity_name}")
  269. return True
  270. elif entity_type == "招式":
  271. target_file = Path(project_root) / "设定集/力量体系.md"
  272. update_power_system(entity, str(target_file))
  273. print(f"✅ 已更新力量体系(招式): {entity_name}")
  274. return True
  275. elif entity_type == "物品":
  276. target_dir = Path(project_root) / "设定集/物品库"
  277. target_dir.mkdir(parents=True, exist_ok=True)
  278. target_file = target_dir / f"{entity_name}.md"
  279. if target_file.exists():
  280. print(f"⚠️ 物品卡已存在: {target_file}")
  281. if not auto_mode:
  282. choice = input("是否覆盖?(y/n): ")
  283. if choice.lower() != 'y':
  284. return False
  285. content = f"""# {entity_name}
  286. > **首次登场**: {entity.get('source_file', '未知')}
  287. > **创建时间**: {datetime.now().strftime('%Y-%m-%d')}
  288. ## 基本信息
  289. {entity['desc']}
  290. ## 详细设定
  291. 待补充
  292. ## 相关剧情
  293. - 【第 X 章】首次出现
  294. ## 备注
  295. 自动提取自 [NEW_ENTITY] 标签,请补充完善。
  296. """
  297. with open(target_file, 'w', encoding='utf-8') as f:
  298. f.write(content)
  299. print(f"✅ 已创建物品卡: {target_file}")
  300. return True
  301. else:
  302. print(f"⚠️ 未知实体类型: {entity_type}")
  303. return False
  304. def main():
  305. if len(sys.argv) < 2:
  306. print("用法: python extract_entities.py <章节文件> [--auto] [--dry-run]")
  307. print("示例: python extract_entities.py ../../../正文/第0001章.md")
  308. sys.exit(1)
  309. chapter_file = sys.argv[1]
  310. auto_mode = '--auto' in sys.argv
  311. dry_run = '--dry-run' in sys.argv
  312. if not os.path.exists(chapter_file):
  313. print(f"❌ 文件不存在: {chapter_file}")
  314. sys.exit(1)
  315. # 提取实体
  316. print(f"📖 正在扫描: {chapter_file}")
  317. entities = extract_new_entities(chapter_file)
  318. if not entities:
  319. print("✅ 未发现 [NEW_ENTITY] 标签")
  320. return
  321. print(f"\n🔍 发现 {len(entities)} 个新实体:")
  322. for i, entity in enumerate(entities, 1):
  323. print(f" {i}. [{entity['type']}] {entity['name']} - {entity['desc'][:30]}...")
  324. if dry_run:
  325. print("\n⚠️ Dry-run 模式,不执行实际写入")
  326. return
  327. # 确定项目根目录
  328. project_root = Path(chapter_file).parent.parent
  329. state_file = project_root / ".webnovel/state.json"
  330. if not state_file.exists():
  331. print(f"❌ 状态文件不存在: {state_file}")
  332. print("请先运行 /webnovel-init 初始化项目")
  333. sys.exit(1)
  334. # 同步实体到设定集
  335. print(f"\n📝 开始同步到设定集...")
  336. success_count = 0
  337. for entity in entities:
  338. if sync_entity_to_settings(entity, str(project_root), auto_mode):
  339. success_count += 1
  340. # 更新 state.json
  341. print(f"\n💾 更新 state.json...")
  342. update_state_json(entities, str(state_file))
  343. print(f"\n✅ 完成!成功同步 {success_count}/{len(entities)} 个实体")
  344. if not auto_mode:
  345. print("\n💡 建议:")
  346. print(" 1. 检查生成的角色卡/物品卡,补充详细设定")
  347. print(" 2. 查看 世界观.md 和 力量体系.md 的更新")
  348. print(" 3. 确认 .webnovel/state.json 中的实体记录")
  349. if __name__ == "__main__":
  350. main()