| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Index Manager - 索引管理模块
- 管理 index.db (SQLite) 的读写操作:
- - 章节元数据索引
- - 实体出场记录
- - 场景索引
- - 快速查询接口
- """
- import sqlite3
- import json
- from pathlib import Path
- from typing import Dict, List, Optional, Any, Tuple
- from dataclasses import dataclass
- from contextlib import contextmanager
- from .config import get_config
@dataclass
class ChapterMeta:
    """Metadata record for one chapter.

    The declaration order of the fields below is also the positional
    order accepted by the generated constructor.
    """

    # Chapter number.
    chapter: int
    # Chapter title.
    title: str
    # Location associated with the chapter.
    location: str
    # Word count of the chapter.
    word_count: int
    # Character identifiers/names as a list of strings.
    characters: List[str]
    # Optional summary text; empty when none is supplied.
    summary: str = ""
@dataclass
class SceneMeta:
    """Metadata record for one scene inside a chapter.

    All fields are required; their declaration order is the positional
    order accepted by the generated constructor.
    """

    # Chapter number the scene is stored under.
    chapter: int
    # Index of the scene within its chapter.
    scene_index: int
    # First line of the scene.
    start_line: int
    # Last line of the scene.
    end_line: int
    # Location associated with the scene.
    location: str
    # Summary text of the scene.
    summary: str
    # Character identifiers/names as a list of strings.
    characters: List[str]
class IndexManager:
    """Read/write access layer for the SQLite index database.

    The database file is ``config.index_db``. It holds three tables:
    ``chapters`` (per-chapter metadata), ``scenes`` (scene ranges per
    chapter) and ``appearances`` (one row per entity per chapter).
    List-valued columns (``characters``, ``mentions``) are stored as JSON
    text and decoded again when rows are read back.
    """

    def __init__(self, config=None):
        """Bind a configuration and make sure the schema exists.

        Args:
            config: Object providing ``index_db``, ``ensure_dirs()`` and the
                ``query_*_limit`` defaults read by the query methods. When
                falsy, the result of ``get_config()`` is used instead.
        """
        self.config = config or get_config()
        self._init_db()

    def _init_db(self):
        """Create the tables and indexes if they do not exist yet."""
        self.config.ensure_dirs()
        with self._get_conn() as conn:
            cursor = conn.cursor()
            # Chapter table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS chapters (
                    chapter INTEGER PRIMARY KEY,
                    title TEXT,
                    location TEXT,
                    word_count INTEGER,
                    characters TEXT,
                    summary TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            # Scene table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS scenes (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    chapter INTEGER,
                    scene_index INTEGER,
                    start_line INTEGER,
                    end_line INTEGER,
                    location TEXT,
                    summary TEXT,
                    characters TEXT,
                    UNIQUE(chapter, scene_index)
                )
            """)
            # Entity appearance table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS appearances (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    entity_id TEXT,
                    chapter INTEGER,
                    mentions TEXT,
                    confidence REAL,
                    UNIQUE(entity_id, chapter)
                )
            """)
            # Secondary indexes for the lookups used by the query methods
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_scenes_chapter ON scenes(chapter)")
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_appearances_entity ON appearances(entity_id)")
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_appearances_chapter ON appearances(chapter)")
            conn.commit()

    @contextmanager
    def _get_conn(self):
        """Yield a connection to the index database and always close it.

        Rows are returned as ``sqlite3.Row``. This helper never commits;
        writers must call ``conn.commit()`` themselves before leaving the
        ``with`` block.
        """
        conn = sqlite3.connect(str(self.config.index_db))
        conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()

    # ==================== Chapter operations ====================

    def add_chapter(self, meta: "ChapterMeta"):
        """Insert or replace the metadata row of one chapter.

        An existing row with the same chapter number is replaced as a
        whole, so every column (including ``summary``) is overwritten.

        Args:
            meta: Chapter metadata; ``meta.characters`` is stored as JSON.
        """
        row = (
            meta.chapter,
            meta.title,
            meta.location,
            meta.word_count,
            json.dumps(meta.characters, ensure_ascii=False),
            meta.summary,
        )
        with self._get_conn() as conn:
            conn.execute("""
                INSERT OR REPLACE INTO chapters
                (chapter, title, location, word_count, characters, summary)
                VALUES (?, ?, ?, ?, ?, ?)
            """, row)
            conn.commit()

    def get_chapter(self, chapter: int) -> Optional[Dict]:
        """Return the metadata of ``chapter`` as a dict, or ``None`` if absent."""
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM chapters WHERE chapter = ?", (chapter,))
            row = cursor.fetchone()
            if row:
                return self._row_to_dict(row, parse_json=["characters"])
            return None

    def get_recent_chapters(self, limit: Optional[int] = None) -> List[Dict]:
        """Return chapter rows ordered by chapter number, highest first.

        Args:
            limit: Maximum number of rows; ``None`` falls back to
                ``config.query_recent_chapters_limit``.
        """
        if limit is None:
            limit = self.config.query_recent_chapters_limit
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT * FROM chapters
                ORDER BY chapter DESC
                LIMIT ?
            """, (limit,))
            return [self._row_to_dict(row, parse_json=["characters"]) for row in cursor.fetchall()]

    # ==================== Scene operations ====================

    def add_scenes(self, chapter: int, scenes: List["SceneMeta"]):
        """Replace the stored scenes of ``chapter`` with ``scenes``.

        All existing scene rows of ``chapter`` are deleted first. Each new
        row is stored under its own ``scene.chapter`` value, so callers are
        expected to pass scenes that belong to ``chapter``.

        Args:
            chapter: Chapter whose previous scenes are removed.
            scenes: Scene records to insert; ``characters`` is stored as JSON.
        """
        rows = [
            (
                scene.chapter,
                scene.scene_index,
                scene.start_line,
                scene.end_line,
                scene.location,
                scene.summary,
                json.dumps(scene.characters, ensure_ascii=False),
            )
            for scene in scenes
        ]
        with self._get_conn() as conn:
            cursor = conn.cursor()
            # Drop the chapter's old scenes, then insert the new set.
            cursor.execute("DELETE FROM scenes WHERE chapter = ?", (chapter,))
            cursor.executemany("""
                INSERT INTO scenes
                (chapter, scene_index, start_line, end_line, location, summary, characters)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, rows)
            conn.commit()

    def get_scenes(self, chapter: int) -> List[Dict]:
        """Return the scenes of ``chapter`` ordered by ``scene_index``."""
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT * FROM scenes
                WHERE chapter = ?
                ORDER BY scene_index
            """, (chapter,))
            return [self._row_to_dict(row, parse_json=["characters"]) for row in cursor.fetchall()]

    def search_scenes_by_location(self, location: str, limit: Optional[int] = None) -> List[Dict]:
        """Return scenes whose location contains ``location`` as a substring.

        The text is matched literally: ``%`` and ``_`` in ``location`` are
        escaped so they are not interpreted as LIKE wildcards.

        Args:
            location: Substring to look for in the ``location`` column.
            limit: Maximum number of rows; ``None`` falls back to
                ``config.query_scenes_by_location_limit``.

        Returns:
            Matching scene rows, highest chapter number first.
        """
        if limit is None:
            limit = self.config.query_scenes_by_location_limit
        # Escape the escape character first, then the two LIKE wildcards.
        escaped = (
            location.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT * FROM scenes
                WHERE location LIKE ? ESCAPE '\\'
                ORDER BY chapter DESC
                LIMIT ?
            """, (f"%{escaped}%", limit))
            return [self._row_to_dict(row, parse_json=["characters"]) for row in cursor.fetchall()]

    # ==================== Appearance operations ====================

    def record_appearance(
        self,
        entity_id: str,
        chapter: int,
        mentions: List[str],
        confidence: float = 1.0
    ):
        """Insert or replace the appearance row of an entity in a chapter.

        There is at most one row per ``(entity_id, chapter)`` pair; writing
        the same pair again replaces the previous row.

        Args:
            entity_id: Identifier of the entity.
            chapter: Chapter number in which the entity appears.
            mentions: Mention strings, stored as JSON.
            confidence: Confidence value stored with the row.
        """
        with self._get_conn() as conn:
            conn.execute("""
                INSERT OR REPLACE INTO appearances
                (entity_id, chapter, mentions, confidence)
                VALUES (?, ?, ?, ?)
            """, (
                entity_id,
                chapter,
                json.dumps(mentions, ensure_ascii=False),
                confidence,
            ))
            conn.commit()

    def get_entity_appearances(self, entity_id: str, limit: Optional[int] = None) -> List[Dict]:
        """Return the appearance rows of one entity, latest chapter first.

        Args:
            entity_id: Identifier of the entity.
            limit: Maximum number of rows; ``None`` falls back to
                ``config.query_entity_appearances_limit``.
        """
        if limit is None:
            limit = self.config.query_entity_appearances_limit
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT * FROM appearances
                WHERE entity_id = ?
                ORDER BY chapter DESC
                LIMIT ?
            """, (entity_id, limit))
            return [self._row_to_dict(row, parse_json=["mentions"]) for row in cursor.fetchall()]

    def get_recent_appearances(self, limit: Optional[int] = None) -> List[Dict]:
        """Return one summary row per entity, most recently seen first.

        Each dict has the keys ``entity_id``, ``last_chapter`` (highest
        chapter the entity appears in) and ``total`` (number of rows).

        Args:
            limit: Maximum number of entities; ``None`` falls back to
                ``config.query_recent_appearances_limit``.
        """
        if limit is None:
            limit = self.config.query_recent_appearances_limit
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT entity_id, MAX(chapter) as last_chapter, COUNT(*) as total
                FROM appearances
                GROUP BY entity_id
                ORDER BY last_chapter DESC
                LIMIT ?
            """, (limit,))
            return [dict(row) for row in cursor.fetchall()]

    def get_chapter_appearances(self, chapter: int) -> List[Dict]:
        """Return all appearance rows of ``chapter``, highest confidence first."""
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT * FROM appearances
                WHERE chapter = ?
                ORDER BY confidence DESC
            """, (chapter,))
            return [self._row_to_dict(row, parse_json=["mentions"]) for row in cursor.fetchall()]

    # ==================== Batch operations ====================

    def process_chapter_data(
        self,
        chapter: int,
        title: str,
        location: str,
        word_count: int,
        entities: List[Dict],
        scenes: List[Dict]
    ) -> Dict[str, int]:
        """Write chapter metadata, scenes and appearances for one chapter.

        Args:
            chapter: Chapter number.
            title: Chapter title.
            location: Chapter location.
            word_count: Chapter word count.
            entities: Entity dicts; the keys read are ``id``, ``type``,
                ``mentions`` and ``confidence``.
            scenes: Scene dicts; the keys read are ``index``,
                ``start_line``, ``end_line``, ``location``, ``summary``
                and ``characters``.

        Returns:
            Counts of written rows under the keys ``chapters``, ``scenes``
            and ``appearances``.
        """
        stats = {"chapters": 0, "scenes": 0, "appearances": 0}

        # Ids of the entities whose type marks them as characters.
        characters = [e.get("id") for e in entities if e.get("type") == "角色"]

        # Chapter metadata; the summary is written empty here and can be
        # filled in later.
        self.add_chapter(ChapterMeta(
            chapter=chapter,
            title=title,
            location=location,
            word_count=word_count,
            characters=characters,
            summary=""
        ))
        stats["chapters"] = 1

        # Scenes. A scene without an explicit "index" falls back to its
        # position in the list: a constant default would make every such
        # scene collide on UNIQUE(chapter, scene_index).
        scene_metas = [
            SceneMeta(
                chapter=chapter,
                scene_index=s.get("index", position),
                start_line=s.get("start_line", 0),
                end_line=s.get("end_line", 0),
                location=s.get("location", ""),
                summary=s.get("summary", ""),
                characters=s.get("characters", [])
            )
            for position, s in enumerate(scenes)
        ]
        self.add_scenes(chapter, scene_metas)
        stats["scenes"] = len(scene_metas)

        # Appearances; entities without an id, or with the placeholder
        # id "NEW", are skipped.
        for entity in entities:
            entity_id = entity.get("id")
            if entity_id and entity_id != "NEW":
                self.record_appearance(
                    entity_id=entity_id,
                    chapter=chapter,
                    mentions=entity.get("mentions", []),
                    confidence=entity.get("confidence", 1.0)
                )
                stats["appearances"] += 1
        return stats

    # ==================== Helpers ====================

    def _row_to_dict(self, row: sqlite3.Row, parse_json: Optional[List[str]] = None) -> Dict:
        """Convert a row to a dict, decoding the listed JSON columns.

        Args:
            row: Row to convert.
            parse_json: Column names whose non-empty values are decoded
                with ``json.loads``.

        Returns:
            The row as a plain dict. A listed column whose value is not
            valid JSON keeps its raw stored value.
        """
        d = dict(row)
        for key in parse_json or ():
            if d.get(key):
                try:
                    d[key] = json.loads(d[key])
                except json.JSONDecodeError:
                    # Best effort: leave the raw text in place.
                    pass
        return d

    def get_stats(self) -> Dict[str, int]:
        """Return row counts and the highest indexed chapter number.

        Returns:
            Dict with ``chapters`` and ``scenes`` (row counts), ``entities``
            (distinct entity ids in ``appearances``) and ``max_chapter``
            (0 when no chapter is stored).
        """
        with self._get_conn() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM chapters")
            chapters = cursor.fetchone()[0]
            cursor.execute("SELECT COUNT(*) FROM scenes")
            scenes = cursor.fetchone()[0]
            cursor.execute("SELECT COUNT(DISTINCT entity_id) FROM appearances")
            entities = cursor.fetchone()[0]
            cursor.execute("SELECT MAX(chapter) FROM chapters")
            max_chapter = cursor.fetchone()[0] or 0
            return {
                "chapters": chapters,
                "scenes": scenes,
                "entities": entities,
                "max_chapter": max_chapter
            }
- # ==================== CLI 接口 ====================
def main():
    """Command-line entry point for querying and updating the index.

    Subcommands: ``stats``, ``get-chapter``, ``recent-appearances``,
    ``entity-appearances``, ``search-scenes`` and ``process-chapter``.
    When no subcommand is given, the help text is printed and the
    function returns without opening the database.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Index Manager CLI")
    parser.add_argument("--project-root", type=str, help="项目根目录")
    subparsers = parser.add_subparsers(dest="command")

    # Index statistics
    subparsers.add_parser("stats")

    # Look up one chapter
    chapter_parser = subparsers.add_parser("get-chapter")
    chapter_parser.add_argument("--chapter", type=int, required=True)

    # Most recently seen entities
    recent_parser = subparsers.add_parser("recent-appearances")
    recent_parser.add_argument("--limit", type=int, default=None)

    # Appearances of one entity
    entity_parser = subparsers.add_parser("entity-appearances")
    entity_parser.add_argument("--entity", required=True)
    entity_parser.add_argument("--limit", type=int, default=None)

    # Scene search by location
    search_parser = subparsers.add_parser("search-scenes")
    search_parser.add_argument("--location", required=True)
    search_parser.add_argument("--limit", type=int, default=None)

    # Write the data of one chapter
    process_parser = subparsers.add_parser("process-chapter")
    process_parser.add_argument("--chapter", type=int, required=True)
    process_parser.add_argument("--title", required=True)
    process_parser.add_argument("--location", required=True)
    process_parser.add_argument("--word-count", type=int, required=True)
    process_parser.add_argument("--entities", required=True, help="JSON 格式的实体列表")
    process_parser.add_argument("--scenes", required=True, help="JSON 格式的场景列表")

    args = parser.parse_args()

    # Without a subcommand there is nothing to do: show usage instead of
    # opening (and possibly creating) the database for no reason.
    if args.command is None:
        parser.print_help()
        return

    # Build the manager, optionally bound to an explicit project root.
    config = None
    if args.project_root:
        from .config import DataModulesConfig
        config = DataModulesConfig.from_project_root(args.project_root)
    manager = IndexManager(config)

    if args.command == "stats":
        stats = manager.get_stats()
        print(json.dumps(stats, ensure_ascii=False, indent=2))
    elif args.command == "get-chapter":
        chapter = manager.get_chapter(args.chapter)
        if chapter:
            print(json.dumps(chapter, ensure_ascii=False, indent=2))
        else:
            print(f"未找到章节: {args.chapter}")
    elif args.command == "recent-appearances":
        appearances = manager.get_recent_appearances(args.limit)
        for a in appearances:
            print(f"{a['entity_id']}: 最后出场第 {a['last_chapter']} 章, 共 {a['total']} 次")
    elif args.command == "entity-appearances":
        appearances = manager.get_entity_appearances(args.entity, args.limit)
        print(f"{args.entity} 出场记录:")
        for a in appearances:
            print(f" 第 {a['chapter']} 章: {a['mentions']}")
    elif args.command == "search-scenes":
        scenes = manager.search_scenes_by_location(args.location, args.limit)
        for s in scenes:
            print(f"第 {s['chapter']} 章 场景 {s['scene_index']}: {s['location']}")
            # The summary column may be NULL; treat that as empty text.
            summary = s['summary'] or ""
            print(f" {summary[:50]}...")
    elif args.command == "process-chapter":
        entities = json.loads(args.entities)
        scenes = json.loads(args.scenes)
        stats = manager.process_chapter_data(
            chapter=args.chapter,
            title=args.title,
            location=args.location,
            word_count=args.word_count,
            entities=entities,
            scenes=scenes
        )
        print(f"✓ 已处理第 {args.chapter} 章")
        print(f" 章节: {stats['chapters']}, 场景: {stats['scenes']}, 出场记录: {stats['appearances']}")


if __name__ == "__main__":
    main()
|