entity_linker.py 9.4 KB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Entity Linker - helper module for entity disambiguation (v5.4)

Provides entity-disambiguation helpers for the Data Agent:
- confidence evaluation
- alias index management (via the aliases table in index.db)
- recording of disambiguation results

Changes in v5.1 (carried over to v5.4):
- alias storage migrated from state.json to the index.db aliases table
- alias reads and writes go through IndexManager
- direct manipulation of state.json removed
"""
  14. from typing import Dict, List, Optional, Tuple
  15. from dataclasses import dataclass, field
  16. from .config import get_config
  17. from .index_manager import IndexManager
  18. from .observability import safe_log_tool_call
@dataclass
class DisambiguationResult:
    """Outcome of disambiguating a single entity mention.

    Instances are built by ``EntityLinker.process_uncertain``; field order is
    part of the generated ``__init__`` signature and must not be changed.
    """

    # The mention text that was being resolved.
    mention: str
    # The entity the mention was linked to; ``process_uncertain`` stores None
    # here when the suggestion is not adopted.
    entity_id: Optional[str]
    # Confidence score the decision was based on.
    confidence: float
    # Candidate entity IDs that were considered (fresh list per instance).
    candidates: List[str] = field(default_factory=list)
    # True when the suggested entity was accepted.
    adopted: bool = False
    # Human-readable warning attached to the decision, if any.
    warning: Optional[str] = None
  28. class EntityLinker:
  29. """实体链接器 - 辅助 Data Agent 进行实体消歧 (v5.1 SQLite,v5.4 沿用)"""
  30. def __init__(self, config=None):
  31. self.config = config or get_config()
  32. self._index_manager = IndexManager(self.config)
  33. # ==================== 别名管理 (v5.1 SQLite,v5.4 沿用) ====================
  34. def register_alias(self, entity_id: str, alias: str, entity_type: str = "角色") -> bool:
  35. """注册新别名(v5.1 引入:写入 index.db aliases 表)"""
  36. if not alias or not entity_id:
  37. return False
  38. return self._index_manager.register_alias(alias, entity_id, entity_type)
  39. def lookup_alias(self, mention: str, entity_type: str = None) -> Optional[str]:
  40. """查找别名对应的实体ID(返回第一个匹配,可选按类型过滤)"""
  41. entries = self._index_manager.get_entities_by_alias(mention)
  42. if not entries:
  43. return None
  44. if entity_type:
  45. for entry in entries:
  46. if entry.get("type") == entity_type:
  47. return entry.get("id")
  48. return None
  49. else:
  50. return entries[0].get("id") if entries else None
  51. def lookup_alias_all(self, mention: str) -> List[Dict]:
  52. """查找别名对应的所有实体(一对多)"""
  53. entries = self._index_manager.get_entities_by_alias(mention)
  54. return [{"type": e.get("type"), "id": e.get("id")} for e in entries]
  55. def get_all_aliases(self, entity_id: str, entity_type: str = None) -> List[str]:
  56. """获取实体的所有别名"""
  57. return self._index_manager.get_entity_aliases(entity_id)
  58. # ==================== 置信度判断 ====================
  59. def evaluate_confidence(self, confidence: float) -> Tuple[str, bool, Optional[str]]:
  60. """
  61. 评估置信度,返回 (action, adopt, warning)
  62. - action: "auto" | "warn" | "manual"
  63. - adopt: 是否采用
  64. - warning: 警告信息
  65. """
  66. if confidence >= self.config.extraction_confidence_high:
  67. return ("auto", True, None)
  68. elif confidence >= self.config.extraction_confidence_medium:
  69. return ("warn", True, f"中置信度匹配 (confidence: {confidence:.2f})")
  70. else:
  71. return ("manual", False, f"需人工确认 (confidence: {confidence:.2f})")
  72. def process_uncertain(
  73. self,
  74. mention: str,
  75. candidates: List[str],
  76. suggested: str,
  77. confidence: float,
  78. context: str = ""
  79. ) -> DisambiguationResult:
  80. """
  81. 处理不确定的实体匹配
  82. 返回消歧结果,包含是否采用、警告信息等
  83. """
  84. action, adopt, warning = self.evaluate_confidence(confidence)
  85. result = DisambiguationResult(
  86. mention=mention,
  87. entity_id=suggested if adopt else None,
  88. confidence=confidence,
  89. candidates=candidates,
  90. adopted=adopt,
  91. warning=warning
  92. )
  93. return result
  94. # ==================== 批量处理 ====================
  95. def process_extraction_result(
  96. self,
  97. uncertain_items: List[Dict]
  98. ) -> Tuple[List[DisambiguationResult], List[str]]:
  99. """
  100. 处理 AI 提取结果中的 uncertain 项
  101. 返回 (results, warnings)
  102. """
  103. results = []
  104. warnings = []
  105. for item in uncertain_items:
  106. result = self.process_uncertain(
  107. mention=item.get("mention", ""),
  108. candidates=item.get("candidates", []),
  109. suggested=item.get("suggested", ""),
  110. confidence=item.get("confidence", 0.0),
  111. context=item.get("context", "")
  112. )
  113. results.append(result)
  114. if result.warning:
  115. warnings.append(f"{result.mention} → {result.entity_id}: {result.warning}")
  116. return results, warnings
  117. def register_new_entities(
  118. self,
  119. new_entities: List[Dict]
  120. ) -> List[str]:
  121. """
  122. 注册新实体的别名 (v5.1 引入,v5.4 沿用)
  123. 返回注册的实体ID列表
  124. """
  125. registered = []
  126. for entity in new_entities:
  127. entity_id = entity.get("suggested_id") or entity.get("id")
  128. if not entity_id or entity_id == "NEW":
  129. continue
  130. entity_type = entity.get("type", "角色")
  131. # 注册主名称
  132. name = entity.get("name", "")
  133. if name:
  134. self.register_alias(entity_id, name, entity_type)
  135. # 注册提及方式
  136. for mention in entity.get("mentions", []):
  137. if mention and mention != name:
  138. self.register_alias(entity_id, mention, entity_type)
  139. registered.append(entity_id)
  140. return registered
  141. # ==================== CLI 接口 ====================
  142. def main():
  143. import argparse
  144. import sys
  145. from .cli_output import print_success, print_error
  146. from .cli_args import normalize_global_project_root
  147. from .index_manager import IndexManager
  148. parser = argparse.ArgumentParser(description="Entity Linker CLI (v5.4 SQLite)")
  149. parser.add_argument("--project-root", type=str, help="项目根目录")
  150. subparsers = parser.add_subparsers(dest="command")
  151. # 注册别名
  152. register_parser = subparsers.add_parser("register-alias")
  153. register_parser.add_argument("--entity", required=True, help="实体ID")
  154. register_parser.add_argument("--alias", required=True, help="别名")
  155. register_parser.add_argument("--type", default="角色", help="实体类型(默认:角色)")
  156. # 查找别名
  157. lookup_parser = subparsers.add_parser("lookup")
  158. lookup_parser.add_argument("--mention", required=True, help="提及文本")
  159. lookup_parser.add_argument("--type", help="按类型过滤")
  160. # 查找所有匹配(一对多)
  161. lookup_all_parser = subparsers.add_parser("lookup-all")
  162. lookup_all_parser.add_argument("--mention", required=True, help="提及文本")
  163. # 列出别名
  164. list_parser = subparsers.add_parser("list-aliases")
  165. list_parser.add_argument("--entity", required=True, help="实体ID")
  166. list_parser.add_argument("--type", help="实体类型")
  167. argv = normalize_global_project_root(sys.argv[1:])
  168. args = parser.parse_args(argv)
  169. # 初始化
  170. config = None
  171. if args.project_root:
  172. # 允许传入“工作区根目录”,统一解析到真正的 book project_root(必须包含 .webnovel/state.json)
  173. from project_locator import resolve_project_root
  174. from .config import DataModulesConfig
  175. resolved_root = resolve_project_root(args.project_root)
  176. config = DataModulesConfig.from_project_root(resolved_root)
  177. linker = EntityLinker(config)
  178. logger = IndexManager(config)
  179. tool_name = f"entity_linker:{args.command or 'unknown'}"
  180. def emit_success(data=None, message: str = "ok"):
  181. print_success(data, message=message)
  182. safe_log_tool_call(logger, tool_name=tool_name, success=True)
  183. def emit_error(code: str, message: str, suggestion: str | None = None):
  184. print_error(code, message, suggestion=suggestion)
  185. safe_log_tool_call(
  186. logger,
  187. tool_name=tool_name,
  188. success=False,
  189. error_code=code,
  190. error_message=message,
  191. )
  192. if args.command == "register-alias":
  193. entity_type = getattr(args, "type", "角色")
  194. success = linker.register_alias(args.entity, args.alias, entity_type)
  195. if success:
  196. emit_success({"entity": args.entity, "alias": args.alias, "type": entity_type}, message="alias_registered")
  197. else:
  198. emit_error("ALIAS_EXISTS", "注册失败或已存在")
  199. elif args.command == "lookup":
  200. entity_type = getattr(args, "type", None)
  201. entity_id = linker.lookup_alias(args.mention, entity_type)
  202. if entity_id:
  203. emit_success({"mention": args.mention, "entity": entity_id}, message="lookup")
  204. else:
  205. emit_error("NOT_FOUND", f"未找到别名: {args.mention}")
  206. elif args.command == "lookup-all":
  207. matches = linker.lookup_alias_all(args.mention)
  208. emit_success(matches, message="lookup_all")
  209. elif args.command == "list-aliases":
  210. entity_type = getattr(args, "type", None)
  211. aliases = linker.get_all_aliases(args.entity, entity_type)
  212. emit_success(aliases, message="aliases")
  213. else:
  214. emit_error("UNKNOWN_COMMAND", "未指定有效命令", suggestion="请查看 --help")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()