config.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Data Modules - 配置文件
  5. API 配置通过环境变量读取(支持 .env 文件):
  6. - EMBED_BASE_URL, EMBED_MODEL, EMBED_API_KEY
  7. - RERANK_BASE_URL, RERANK_MODEL, RERANK_API_KEY
  8. """
  9. import os
  10. from pathlib import Path
  11. from dataclasses import dataclass, field
  12. from typing import Optional
  13. from .context_weights import TEMPLATE_WEIGHTS_DYNAMIC_DEFAULT
  14. # 加载 .env 文件
  15. def _load_dotenv():
  16. """从项目根目录加载 .env 文件"""
  17. # 尝试多个可能的位置
  18. possible_paths = [
  19. Path.cwd() / ".env",
  20. Path(__file__).parent.parent.parent.parent / ".env", # .claude/scripts/data_modules -> 项目根目录
  21. ]
  22. for env_path in possible_paths:
  23. if env_path.exists():
  24. with open(env_path, "r", encoding="utf-8") as f:
  25. for line in f:
  26. line = line.strip()
  27. if line and not line.startswith("#") and "=" in line:
  28. key, _, value = line.partition("=")
  29. key = key.strip()
  30. value = value.strip()
  31. # 只在环境变量未设置时才从 .env 加载
  32. if key and key not in os.environ:
  33. os.environ[key] = value
  34. break
# Run once at import time so values from .env are already in os.environ
# when the os.getenv()-based defaults below are evaluated.
_load_dotenv()
  36. def _default_context_template_weights_dynamic() -> dict[str, dict[str, dict[str, float]]]:
  37. return {
  38. stage: {
  39. template: dict(weights)
  40. for template, weights in templates.items()
  41. }
  42. for stage, templates in TEMPLATE_WEIGHTS_DYNAMIC_DEFAULT.items()
  43. }
  44. @dataclass
  45. class DataModulesConfig:
  46. """数据模块配置"""
  47. # ================= 项目路径 =================
  48. project_root: Path = field(default_factory=lambda: Path.cwd())
  49. @property
  50. def webnovel_dir(self) -> Path:
  51. return self.project_root / ".webnovel"
  52. @property
  53. def state_file(self) -> Path:
  54. return self.webnovel_dir / "state.json"
  55. @property
  56. def index_db(self) -> Path:
  57. return self.webnovel_dir / "index.db"
  58. # v5.1 引入: alias_index_file 已废弃,别名存储在 index.db aliases 表
  59. @property
  60. def chapters_dir(self) -> Path:
  61. return self.project_root / "正文"
  62. @property
  63. def settings_dir(self) -> Path:
  64. return self.project_root / "设定集"
  65. @property
  66. def outline_dir(self) -> Path:
  67. return self.project_root / "大纲"
  68. # ================= Embedding API 配置 =================
  69. embed_api_type: str = "openai"
  70. embed_base_url: str = field(default_factory=lambda: os.getenv("EMBED_BASE_URL", "https://api-inference.modelscope.cn/v1"))
  71. embed_model: str = field(default_factory=lambda: os.getenv("EMBED_MODEL", "Qwen/Qwen3-Embedding-8B"))
  72. embed_api_key: str = field(default_factory=lambda: os.getenv("EMBED_API_KEY", ""))
  73. @property
  74. def embed_url(self) -> str:
  75. return self.embed_base_url
  76. # ================= Rerank API 配置 =================
  77. rerank_api_type: str = "openai"
  78. rerank_base_url: str = field(default_factory=lambda: os.getenv("RERANK_BASE_URL", "https://api.jina.ai/v1"))
  79. rerank_model: str = field(default_factory=lambda: os.getenv("RERANK_MODEL", "jina-reranker-v3"))
  80. rerank_api_key: str = field(default_factory=lambda: os.getenv("RERANK_API_KEY", ""))
  81. @property
  82. def rerank_url(self) -> str:
  83. return self.rerank_base_url
  84. # ================= 并发配置 =================
  85. embed_concurrency: int = 64
  86. rerank_concurrency: int = 32
  87. embed_batch_size: int = 64
  88. # ================= 超时配置 =================
  89. cold_start_timeout: int = 300
  90. normal_timeout: int = 180
  91. # ================= 重试配置 =================
  92. api_max_retries: int = 3 # 最大重试次数
  93. api_retry_delay: float = 1.0 # 初始重试延迟(秒),使用指数退避
  94. # ================= 检索配置 =================
  95. vector_top_k: int = 30
  96. bm25_top_k: int = 20
  97. rerank_top_n: int = 10
  98. rrf_k: int = 60
  99. vector_full_scan_max_vectors: int = 500
  100. vector_prefilter_bm25_candidates: int = 200
  101. vector_prefilter_recent_candidates: int = 200
  102. # ================= Graph-RAG 配置 =================
  103. graph_rag_enabled: bool = False
  104. graph_rag_expand_hops: int = 1
  105. graph_rag_max_expanded_entities: int = 30
  106. graph_rag_candidate_limit: int = 150
  107. graph_rag_boost_same_entity: float = 0.2
  108. graph_rag_boost_related_entity: float = 0.1
  109. graph_rag_boost_recency: float = 0.05
  110. relationship_graph_from_index_enabled: bool = True
  111. # ================= 实体提取配置 =================
  112. extraction_confidence_high: float = 0.8
  113. extraction_confidence_medium: float = 0.5
  114. # ================= 列表截断限制 =================
  115. max_disambiguation_warnings: int = 500
  116. max_disambiguation_pending: int = 1000
  117. max_state_changes: int = 2000
  118. context_recent_summaries_window: int = 3
  119. context_recent_meta_window: int = 3
  120. context_alerts_slice: int = 10
  121. context_max_appearing_characters: int = 10
  122. context_max_urgent_foreshadowing: int = 5
  123. context_story_skeleton_interval: int = 20
  124. context_story_skeleton_max_samples: int = 5
  125. context_story_skeleton_snippet_chars: int = 400
  126. context_extra_section_budget: int = 800
  127. context_ranker_enabled: bool = True
  128. context_ranker_recency_weight: float = 0.7
  129. context_ranker_frequency_weight: float = 0.3
  130. context_ranker_hook_bonus: float = 0.2
  131. context_ranker_length_bonus_cap: float = 0.2
  132. context_ranker_alert_critical_keywords: tuple[str, ...] = (
  133. "冲突",
  134. "矛盾",
  135. "critical",
  136. "break",
  137. "违规",
  138. "断裂",
  139. )
  140. context_ranker_debug: bool = False
  141. context_reader_signal_enabled: bool = True
  142. context_reader_signal_recent_limit: int = 5
  143. context_reader_signal_window_chapters: int = 20
  144. context_reader_signal_review_window: int = 5
  145. context_reader_signal_include_debt: bool = False
  146. context_genre_profile_enabled: bool = True
  147. context_genre_profile_max_refs: int = 8
  148. context_genre_profile_fallback: str = "shuangwen"
  149. context_compact_text_enabled: bool = True
  150. context_compact_min_budget: int = 120
  151. context_compact_head_ratio: float = 0.65
  152. context_writing_guidance_enabled: bool = True
  153. context_writing_guidance_max_items: int = 6
  154. context_writing_guidance_low_score_threshold: float = 75.0
  155. context_writing_guidance_hook_diversify: bool = True
  156. context_methodology_enabled: bool = True
  157. context_methodology_genre_whitelist: tuple[str, ...] = ("*",)
  158. context_methodology_label: str = "digital-serial-v1"
  159. context_writing_checklist_enabled: bool = True
  160. context_writing_checklist_min_items: int = 3
  161. context_writing_checklist_max_items: int = 6
  162. context_writing_checklist_default_weight: float = 1.0
  163. context_writing_score_persist_enabled: bool = True
  164. context_writing_score_include_reader_trend: bool = True
  165. context_writing_score_trend_window: int = 10
  166. context_rag_assist_enabled: bool = True
  167. context_rag_assist_top_k: int = 4
  168. context_rag_assist_min_outline_chars: int = 40
  169. context_rag_assist_max_query_chars: int = 120
  170. context_dynamic_budget_enabled: bool = True
  171. context_dynamic_budget_early_chapter: int = 30
  172. context_dynamic_budget_late_chapter: int = 120
  173. context_dynamic_budget_early_core_bonus: float = 0.08
  174. context_dynamic_budget_early_scene_bonus: float = 0.04
  175. context_dynamic_budget_late_global_bonus: float = 0.08
  176. context_dynamic_budget_late_scene_penalty: float = 0.06
  177. context_template_weights_dynamic: dict[str, dict[str, dict[str, float]]] = field(
  178. default_factory=_default_context_template_weights_dynamic
  179. )
  180. context_genre_profile_support_composite: bool = True
  181. context_genre_profile_max_genres: int = 2
  182. context_genre_profile_separators: tuple[str, ...] = (
  183. "+",
  184. "/",
  185. "|",
  186. ",",
  187. ",",
  188. "、",
  189. )
  190. export_recent_changes_slice: int = 20
  191. export_disambiguation_slice: int = 20
  192. # ================= 查询默认限制 =================
  193. query_recent_chapters_limit: int = 10
  194. query_scenes_by_location_limit: int = 20
  195. query_entity_appearances_limit: int = 50
  196. query_recent_appearances_limit: int = 20
  197. # ================= 伏笔紧急度 =================
  198. foreshadowing_urgency_pending_high: int = 100
  199. foreshadowing_urgency_pending_medium: int = 50
  200. foreshadowing_urgency_target_proximity: int = 5
  201. foreshadowing_urgency_score_high: int = 100
  202. foreshadowing_urgency_score_medium: int = 60
  203. foreshadowing_urgency_score_target: int = 80
  204. foreshadowing_urgency_score_low: int = 20
  205. foreshadowing_urgency_threshold_show: int = 60
  206. foreshadowing_tier_weight_core: float = 3.0
  207. foreshadowing_tier_weight_sub: float = 2.0
  208. foreshadowing_tier_weight_decor: float = 1.0
  209. # ================= 角色活跃度 =================
  210. character_absence_warning: int = 30
  211. character_absence_critical: int = 100
  212. character_candidates_limit: int = 800
  213. # ================= Strand Weave 节奏 =================
  214. strand_quest_max_consecutive: int = 5
  215. strand_fire_max_gap: int = 10
  216. strand_constellation_max_gap: int = 15
  217. strand_quest_ratio_min: int = 55
  218. strand_quest_ratio_max: int = 65
  219. strand_fire_ratio_min: int = 20
  220. strand_fire_ratio_max: int = 30
  221. strand_constellation_ratio_min: int = 10
  222. strand_constellation_ratio_max: int = 20
  223. # ================= 爽点节奏 =================
  224. pacing_segment_size: int = 100
  225. pacing_words_per_point_excellent: int = 1000
  226. pacing_words_per_point_good: int = 1500
  227. pacing_words_per_point_acceptable: int = 2000
  228. # ================= RAG 存储 =================
  229. @property
  230. def rag_db(self) -> Path:
  231. return self.webnovel_dir / "rag.db"
  232. @property
  233. def vector_db(self) -> Path:
  234. return self.webnovel_dir / "vectors.db"
  235. def ensure_dirs(self):
  236. self.webnovel_dir.mkdir(parents=True, exist_ok=True)
  237. @classmethod
  238. def from_project_root(cls, project_root: str | Path) -> "DataModulesConfig":
  239. return cls(project_root=Path(project_root))
# Module-level default config. Starts as None; created on first use by
# get_config() and replaced by set_project_root().
_default_config: Optional[DataModulesConfig] = None
  241. def get_config(project_root: Optional[Path] = None) -> DataModulesConfig:
  242. global _default_config
  243. if project_root is not None:
  244. return DataModulesConfig.from_project_root(project_root)
  245. if _default_config is None:
  246. _default_config = DataModulesConfig()
  247. return _default_config
  248. def set_project_root(project_root: str | Path):
  249. global _default_config
  250. _default_config = DataModulesConfig.from_project_root(project_root)