index_manager.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Index Manager - 索引管理模块
  5. 管理 index.db (SQLite) 的读写操作:
  6. - 章节元数据索引
  7. - 实体出场记录
  8. - 场景索引
  9. - 快速查询接口
  10. """
  11. import sqlite3
  12. import json
  13. from pathlib import Path
  14. from typing import Dict, List, Optional, Any, Tuple
  15. from dataclasses import dataclass
  16. from contextlib import contextmanager
  17. from .config import get_config
  18. @dataclass
  19. class ChapterMeta:
  20. """章节元数据"""
  21. chapter: int
  22. title: str
  23. location: str
  24. word_count: int
  25. characters: List[str]
  26. summary: str = ""
  27. @dataclass
  28. class SceneMeta:
  29. """场景元数据"""
  30. chapter: int
  31. scene_index: int
  32. start_line: int
  33. end_line: int
  34. location: str
  35. summary: str
  36. characters: List[str]
  37. class IndexManager:
  38. """索引管理器"""
  39. def __init__(self, config=None):
  40. self.config = config or get_config()
  41. self._init_db()
  42. def _init_db(self):
  43. """初始化数据库表"""
  44. self.config.ensure_dirs()
  45. with self._get_conn() as conn:
  46. cursor = conn.cursor()
  47. # 章节表
  48. cursor.execute("""
  49. CREATE TABLE IF NOT EXISTS chapters (
  50. chapter INTEGER PRIMARY KEY,
  51. title TEXT,
  52. location TEXT,
  53. word_count INTEGER,
  54. characters TEXT,
  55. summary TEXT,
  56. created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
  57. )
  58. """)
  59. # 场景表
  60. cursor.execute("""
  61. CREATE TABLE IF NOT EXISTS scenes (
  62. id INTEGER PRIMARY KEY AUTOINCREMENT,
  63. chapter INTEGER,
  64. scene_index INTEGER,
  65. start_line INTEGER,
  66. end_line INTEGER,
  67. location TEXT,
  68. summary TEXT,
  69. characters TEXT,
  70. UNIQUE(chapter, scene_index)
  71. )
  72. """)
  73. # 实体出场表
  74. cursor.execute("""
  75. CREATE TABLE IF NOT EXISTS appearances (
  76. id INTEGER PRIMARY KEY AUTOINCREMENT,
  77. entity_id TEXT,
  78. chapter INTEGER,
  79. mentions TEXT,
  80. confidence REAL,
  81. UNIQUE(entity_id, chapter)
  82. )
  83. """)
  84. # 创建索引
  85. cursor.execute("CREATE INDEX IF NOT EXISTS idx_scenes_chapter ON scenes(chapter)")
  86. cursor.execute("CREATE INDEX IF NOT EXISTS idx_appearances_entity ON appearances(entity_id)")
  87. cursor.execute("CREATE INDEX IF NOT EXISTS idx_appearances_chapter ON appearances(chapter)")
  88. conn.commit()
  89. @contextmanager
  90. def _get_conn(self):
  91. """获取数据库连接"""
  92. conn = sqlite3.connect(str(self.config.index_db))
  93. conn.row_factory = sqlite3.Row
  94. try:
  95. yield conn
  96. finally:
  97. conn.close()
  98. # ==================== 章节操作 ====================
  99. def add_chapter(self, meta: ChapterMeta):
  100. """添加/更新章节元数据"""
  101. with self._get_conn() as conn:
  102. cursor = conn.cursor()
  103. cursor.execute("""
  104. INSERT OR REPLACE INTO chapters
  105. (chapter, title, location, word_count, characters, summary)
  106. VALUES (?, ?, ?, ?, ?, ?)
  107. """, (
  108. meta.chapter,
  109. meta.title,
  110. meta.location,
  111. meta.word_count,
  112. json.dumps(meta.characters, ensure_ascii=False),
  113. meta.summary
  114. ))
  115. conn.commit()
  116. def get_chapter(self, chapter: int) -> Optional[Dict]:
  117. """获取章节元数据"""
  118. with self._get_conn() as conn:
  119. cursor = conn.cursor()
  120. cursor.execute("SELECT * FROM chapters WHERE chapter = ?", (chapter,))
  121. row = cursor.fetchone()
  122. if row:
  123. return self._row_to_dict(row, parse_json=["characters"])
  124. return None
  125. def get_recent_chapters(self, limit: int = None) -> List[Dict]:
  126. """获取最近章节"""
  127. if limit is None:
  128. limit = self.config.query_recent_chapters_limit
  129. with self._get_conn() as conn:
  130. cursor = conn.cursor()
  131. cursor.execute("""
  132. SELECT * FROM chapters
  133. ORDER BY chapter DESC
  134. LIMIT ?
  135. """, (limit,))
  136. return [self._row_to_dict(row, parse_json=["characters"]) for row in cursor.fetchall()]
  137. # ==================== 场景操作 ====================
  138. def add_scenes(self, chapter: int, scenes: List[SceneMeta]):
  139. """添加章节场景"""
  140. with self._get_conn() as conn:
  141. cursor = conn.cursor()
  142. # 先删除该章节旧场景
  143. cursor.execute("DELETE FROM scenes WHERE chapter = ?", (chapter,))
  144. # 插入新场景
  145. for scene in scenes:
  146. cursor.execute("""
  147. INSERT INTO scenes
  148. (chapter, scene_index, start_line, end_line, location, summary, characters)
  149. VALUES (?, ?, ?, ?, ?, ?, ?)
  150. """, (
  151. scene.chapter,
  152. scene.scene_index,
  153. scene.start_line,
  154. scene.end_line,
  155. scene.location,
  156. scene.summary,
  157. json.dumps(scene.characters, ensure_ascii=False)
  158. ))
  159. conn.commit()
  160. def get_scenes(self, chapter: int) -> List[Dict]:
  161. """获取章节场景"""
  162. with self._get_conn() as conn:
  163. cursor = conn.cursor()
  164. cursor.execute("""
  165. SELECT * FROM scenes
  166. WHERE chapter = ?
  167. ORDER BY scene_index
  168. """, (chapter,))
  169. return [self._row_to_dict(row, parse_json=["characters"]) for row in cursor.fetchall()]
  170. def search_scenes_by_location(self, location: str, limit: int = None) -> List[Dict]:
  171. """按地点搜索场景"""
  172. if limit is None:
  173. limit = self.config.query_scenes_by_location_limit
  174. with self._get_conn() as conn:
  175. cursor = conn.cursor()
  176. cursor.execute("""
  177. SELECT * FROM scenes
  178. WHERE location LIKE ?
  179. ORDER BY chapter DESC
  180. LIMIT ?
  181. """, (f"%{location}%", limit))
  182. return [self._row_to_dict(row, parse_json=["characters"]) for row in cursor.fetchall()]
  183. # ==================== 出场记录操作 ====================
  184. def record_appearance(
  185. self,
  186. entity_id: str,
  187. chapter: int,
  188. mentions: List[str],
  189. confidence: float = 1.0
  190. ):
  191. """记录实体出场"""
  192. with self._get_conn() as conn:
  193. cursor = conn.cursor()
  194. cursor.execute("""
  195. INSERT OR REPLACE INTO appearances
  196. (entity_id, chapter, mentions, confidence)
  197. VALUES (?, ?, ?, ?)
  198. """, (
  199. entity_id,
  200. chapter,
  201. json.dumps(mentions, ensure_ascii=False),
  202. confidence
  203. ))
  204. conn.commit()
  205. def get_entity_appearances(self, entity_id: str, limit: int = None) -> List[Dict]:
  206. """获取实体出场记录"""
  207. if limit is None:
  208. limit = self.config.query_entity_appearances_limit
  209. with self._get_conn() as conn:
  210. cursor = conn.cursor()
  211. cursor.execute("""
  212. SELECT * FROM appearances
  213. WHERE entity_id = ?
  214. ORDER BY chapter DESC
  215. LIMIT ?
  216. """, (entity_id, limit))
  217. return [self._row_to_dict(row, parse_json=["mentions"]) for row in cursor.fetchall()]
  218. def get_recent_appearances(self, limit: int = None) -> List[Dict]:
  219. """获取最近出场的实体"""
  220. if limit is None:
  221. limit = self.config.query_recent_appearances_limit
  222. with self._get_conn() as conn:
  223. cursor = conn.cursor()
  224. cursor.execute("""
  225. SELECT entity_id, MAX(chapter) as last_chapter, COUNT(*) as total
  226. FROM appearances
  227. GROUP BY entity_id
  228. ORDER BY last_chapter DESC
  229. LIMIT ?
  230. """, (limit,))
  231. return [dict(row) for row in cursor.fetchall()]
  232. def get_chapter_appearances(self, chapter: int) -> List[Dict]:
  233. """获取某章所有出场实体"""
  234. with self._get_conn() as conn:
  235. cursor = conn.cursor()
  236. cursor.execute("""
  237. SELECT * FROM appearances
  238. WHERE chapter = ?
  239. ORDER BY confidence DESC
  240. """, (chapter,))
  241. return [self._row_to_dict(row, parse_json=["mentions"]) for row in cursor.fetchall()]
  242. # ==================== 批量操作 ====================
  243. def process_chapter_data(
  244. self,
  245. chapter: int,
  246. title: str,
  247. location: str,
  248. word_count: int,
  249. entities: List[Dict],
  250. scenes: List[Dict]
  251. ) -> Dict[str, int]:
  252. """
  253. 处理章节数据,批量写入索引
  254. 返回写入统计
  255. """
  256. stats = {"chapters": 0, "scenes": 0, "appearances": 0}
  257. # 提取出场角色
  258. characters = [e.get("id") for e in entities if e.get("type") == "角色"]
  259. # 写入章节元数据
  260. self.add_chapter(ChapterMeta(
  261. chapter=chapter,
  262. title=title,
  263. location=location,
  264. word_count=word_count,
  265. characters=characters,
  266. summary="" # 可后续由 Data Agent 生成
  267. ))
  268. stats["chapters"] = 1
  269. # 写入场景
  270. scene_metas = []
  271. for s in scenes:
  272. scene_metas.append(SceneMeta(
  273. chapter=chapter,
  274. scene_index=s.get("index", 0),
  275. start_line=s.get("start_line", 0),
  276. end_line=s.get("end_line", 0),
  277. location=s.get("location", ""),
  278. summary=s.get("summary", ""),
  279. characters=s.get("characters", [])
  280. ))
  281. self.add_scenes(chapter, scene_metas)
  282. stats["scenes"] = len(scene_metas)
  283. # 写入出场记录
  284. for entity in entities:
  285. entity_id = entity.get("id")
  286. if entity_id and entity_id != "NEW":
  287. self.record_appearance(
  288. entity_id=entity_id,
  289. chapter=chapter,
  290. mentions=entity.get("mentions", []),
  291. confidence=entity.get("confidence", 1.0)
  292. )
  293. stats["appearances"] += 1
  294. return stats
  295. # ==================== 辅助方法 ====================
  296. def _row_to_dict(self, row: sqlite3.Row, parse_json: List[str] = None) -> Dict:
  297. """将 Row 转换为字典"""
  298. d = dict(row)
  299. if parse_json:
  300. for key in parse_json:
  301. if key in d and d[key]:
  302. try:
  303. d[key] = json.loads(d[key])
  304. except json.JSONDecodeError:
  305. pass
  306. return d
  307. def get_stats(self) -> Dict[str, int]:
  308. """获取索引统计"""
  309. with self._get_conn() as conn:
  310. cursor = conn.cursor()
  311. cursor.execute("SELECT COUNT(*) FROM chapters")
  312. chapters = cursor.fetchone()[0]
  313. cursor.execute("SELECT COUNT(*) FROM scenes")
  314. scenes = cursor.fetchone()[0]
  315. cursor.execute("SELECT COUNT(DISTINCT entity_id) FROM appearances")
  316. entities = cursor.fetchone()[0]
  317. cursor.execute("SELECT MAX(chapter) FROM chapters")
  318. max_chapter = cursor.fetchone()[0] or 0
  319. return {
  320. "chapters": chapters,
  321. "scenes": scenes,
  322. "entities": entities,
  323. "max_chapter": max_chapter
  324. }
  325. # ==================== CLI 接口 ====================
  326. def main():
  327. import argparse
  328. parser = argparse.ArgumentParser(description="Index Manager CLI")
  329. parser.add_argument("--project-root", type=str, help="项目根目录")
  330. subparsers = parser.add_subparsers(dest="command")
  331. # 获取统计
  332. subparsers.add_parser("stats")
  333. # 查询章节
  334. chapter_parser = subparsers.add_parser("get-chapter")
  335. chapter_parser.add_argument("--chapter", type=int, required=True)
  336. # 查询最近出场
  337. recent_parser = subparsers.add_parser("recent-appearances")
  338. recent_parser.add_argument("--limit", type=int, default=None)
  339. # 查询实体出场
  340. entity_parser = subparsers.add_parser("entity-appearances")
  341. entity_parser.add_argument("--entity", required=True)
  342. entity_parser.add_argument("--limit", type=int, default=None)
  343. # 搜索场景
  344. search_parser = subparsers.add_parser("search-scenes")
  345. search_parser.add_argument("--location", required=True)
  346. search_parser.add_argument("--limit", type=int, default=None)
  347. # 处理章节数据 (写入)
  348. process_parser = subparsers.add_parser("process-chapter")
  349. process_parser.add_argument("--chapter", type=int, required=True)
  350. process_parser.add_argument("--title", required=True)
  351. process_parser.add_argument("--location", required=True)
  352. process_parser.add_argument("--word-count", type=int, required=True)
  353. process_parser.add_argument("--entities", required=True, help="JSON 格式的实体列表")
  354. process_parser.add_argument("--scenes", required=True, help="JSON 格式的场景列表")
  355. args = parser.parse_args()
  356. # 初始化
  357. config = None
  358. if args.project_root:
  359. from .config import DataModulesConfig
  360. config = DataModulesConfig.from_project_root(args.project_root)
  361. manager = IndexManager(config)
  362. if args.command == "stats":
  363. stats = manager.get_stats()
  364. print(json.dumps(stats, ensure_ascii=False, indent=2))
  365. elif args.command == "get-chapter":
  366. chapter = manager.get_chapter(args.chapter)
  367. if chapter:
  368. print(json.dumps(chapter, ensure_ascii=False, indent=2))
  369. else:
  370. print(f"未找到章节: {args.chapter}")
  371. elif args.command == "recent-appearances":
  372. appearances = manager.get_recent_appearances(args.limit)
  373. for a in appearances:
  374. print(f"{a['entity_id']}: 最后出场第 {a['last_chapter']} 章, 共 {a['total']} 次")
  375. elif args.command == "entity-appearances":
  376. appearances = manager.get_entity_appearances(args.entity, args.limit)
  377. print(f"{args.entity} 出场记录:")
  378. for a in appearances:
  379. print(f" 第 {a['chapter']} 章: {a['mentions']}")
  380. elif args.command == "search-scenes":
  381. scenes = manager.search_scenes_by_location(args.location, args.limit)
  382. for s in scenes:
  383. print(f"第 {s['chapter']} 章 场景 {s['scene_index']}: {s['location']}")
  384. print(f" {s['summary'][:50]}...")
  385. elif args.command == "process-chapter":
  386. entities = json.loads(args.entities)
  387. scenes = json.loads(args.scenes)
  388. stats = manager.process_chapter_data(
  389. chapter=args.chapter,
  390. title=args.title,
  391. location=args.location,
  392. word_count=args.word_count,
  393. entities=entities,
  394. scenes=scenes
  395. )
  396. print(f"✓ 已处理第 {args.chapter} 章")
  397. print(f" 章节: {stats['chapters']}, 场景: {stats['scenes']}, 出场记录: {stats['appearances']}")
  398. if __name__ == "__main__":
  399. main()