Procházet zdrojové kódy

fix: address v5.1 review issues from Codex

Fixes:
- [HIGH] Stop writing entities_v3/alias_index/state_changes/structured_relationships
  to state.json when SQLite sync is enabled (prevents re-bloat after migration);
  any existing copies of these fields are removed and _migrated_to_sqlite is set
- [MED] Add canonical_name field to export_to_entities_v3_format (name is kept as a compatibility alias with the same value)
- [MED] Fix falsy state values (0/""/False) being skipped when syncing a state change into the entity's current (check `is not None` instead of truthiness)
- [MED] Add update_metadata param to upsert_entity for full metadata updates (defaults to False, which keeps the previous current/last_appearance-only update)
- [MED] Record appearances for new entities in process_chapter_entities
lingfengQAQ před 5 měsíci
rodič
revize
9000769386

+ 38 - 12
.claude/scripts/data_modules/index_manager.py

@@ -394,12 +394,13 @@ class IndexManager:
 
     # ==================== v5.1 实体操作 ====================
 
-    def upsert_entity(self, entity: EntityMeta) -> bool:
+    def upsert_entity(self, entity: EntityMeta, update_metadata: bool = False) -> bool:
         """
         插入或更新实体 (智能合并)
 
         - 新实体: 直接插入
         - 已存在: 更新 current_json, last_appearance, updated_at
+        - update_metadata=True: 同时更新 canonical_name/tier/desc/is_protagonist/is_archived
 
         返回是否为新实体
         """
@@ -422,17 +423,42 @@ class IndexManager:
                 # 合并 current (新值覆盖旧值)
                 merged_current = {**old_current, **entity.current}
 
-                cursor.execute("""
-                    UPDATE entities SET
-                        current_json = ?,
-                        last_appearance = ?,
-                        updated_at = CURRENT_TIMESTAMP
-                    WHERE id = ?
-                """, (
-                    json.dumps(merged_current, ensure_ascii=False),
-                    entity.last_appearance,
-                    entity.id
-                ))
+                if update_metadata:
+                    # 完整更新(包括元数据)
+                    cursor.execute("""
+                        UPDATE entities SET
+                            canonical_name = ?,
+                            tier = ?,
+                            desc = ?,
+                            current_json = ?,
+                            last_appearance = ?,
+                            is_protagonist = ?,
+                            is_archived = ?,
+                            updated_at = CURRENT_TIMESTAMP
+                        WHERE id = ?
+                    """, (
+                        entity.canonical_name,
+                        entity.tier,
+                        entity.desc,
+                        json.dumps(merged_current, ensure_ascii=False),
+                        entity.last_appearance,
+                        1 if entity.is_protagonist else 0,
+                        1 if entity.is_archived else 0,
+                        entity.id
+                    ))
+                else:
+                    # 只更新 current 和 last_appearance
+                    cursor.execute("""
+                        UPDATE entities SET
+                            current_json = ?,
+                            last_appearance = ?,
+                            updated_at = CURRENT_TIMESTAMP
+                        WHERE id = ?
+                    """, (
+                        json.dumps(merged_current, ensure_ascii=False),
+                        entity.last_appearance,
+                        entity.id
+                    ))
                 conn.commit()
                 return False
             else:

+ 15 - 2
.claude/scripts/data_modules/sql_state_manager.py

@@ -342,6 +342,17 @@ class SQLStateManager:
             # 统计别名
             stats["aliases"] += 1 + len(entity_data.aliases)
 
+            # 记录新实体的首次出场(解决 appearances 缺失问题)
+            mentions = entity.get("mentions", [])
+            if not mentions:
+                mentions = [entity_data.name]  # 至少包含实体名
+            self._index_manager.record_appearance(
+                entity_id=suggested_id,
+                chapter=chapter,
+                mentions=mentions,
+                confidence=entity.get("confidence", 1.0)
+            )
+
         # 3. 处理状态变化
         for change in state_changes:
             entity_id = change.get("entity_id")
@@ -361,7 +372,8 @@ class SQLStateManager:
             # 同步更新实体的 current
             field_name = change.get("field")
             new_value = change.get("new", change.get("new_value"))
-            if field_name and new_value:
+            # 注意:new_value 可能是 0/""/False 等 falsy 值,需要用 is not None 判断
+            if field_name and new_value is not None:
                 self._index_manager.update_entity_current(entity_id, {field_name: new_value})
 
         # 4. 处理新关系
@@ -414,7 +426,8 @@ class SQLStateManager:
             entities = self.get_entities_by_type(entity_type, include_archived=True)
             for e in entities:
                 entity_dict = {
-                    "name": e.get("canonical_name"),
+                    "canonical_name": e.get("canonical_name"),
+                    "name": e.get("canonical_name"),  # 兼容性别名
                     "tier": e.get("tier", "装饰"),
                     "aliases": e.get("aliases", []),
                     "desc": e.get("desc", ""),

+ 129 - 116
.claude/scripts/data_modules/state_manager.py

@@ -264,123 +264,136 @@ class StateManager:
 
                     progress["last_updated"] = self._now_progress_timestamp()
 
-                # entities_v3(按补丁应用)
-                entities_v3 = disk_state.get("entities_v3", {})
-                if not isinstance(entities_v3, dict):
-                    entities_v3 = {}
-                    disk_state["entities_v3"] = entities_v3
-                for t in self.ENTITY_TYPES:
-                    if not isinstance(entities_v3.get(t), dict):
-                        entities_v3[t] = {}
-
-                for (entity_type, entity_id), patch in self._pending_entity_patches.items():
-                    bucket = entities_v3.setdefault(entity_type, {})
-                    if not isinstance(bucket, dict):
-                        bucket = {}
-                        entities_v3[entity_type] = bucket
-
-                    entity = bucket.get(entity_id)
-                    if not isinstance(entity, dict):
-                        entity = {}
-                        bucket[entity_id] = entity
-
-                    # 新建实体时:只填充缺失字段,避免覆盖并发写入的更完整信息
-                    if patch.base_entity:
-                        for k, v in patch.base_entity.items():
-                            if k not in entity:
-                                entity[k] = v
-                            elif isinstance(entity.get(k), dict) and isinstance(v, dict):
-                                # 递归填充缺失
-                                for kk, vv in v.items():
-                                    if kk not in entity[k]:
-                                        entity[k][kk] = vv
-
-                    # top-level updates(明确写入)
-                    for k, v in patch.top_updates.items():
-                        entity[k] = v
-
-                    # current updates(明确写入)
-                    if patch.current_updates:
-                        current = entity.get("current")
-                        if not isinstance(current, dict):
-                            current = {}
-                            entity["current"] = current
-                        current.update(patch.current_updates)
-
-                    # appearance updates(first=min(non-zero), last=max)
-                    if patch.appearance_chapter is not None:
-                        chapter = int(patch.appearance_chapter)
-                        try:
-                            first = int(entity.get("first_appearance", 0) or 0)
-                        except (TypeError, ValueError):
-                            first = 0
-                        try:
-                            last = int(entity.get("last_appearance", 0) or 0)
-                        except (TypeError, ValueError):
-                            last = 0
-
-                        if first <= 0:
-                            entity["first_appearance"] = chapter
-                        else:
-                            entity["first_appearance"] = min(first, chapter)
-                        entity["last_appearance"] = max(last, chapter)
-
-                # alias_index(一对多:合并去重)
-                alias_index = disk_state.get("alias_index", {})
-                if not isinstance(alias_index, dict):
-                    alias_index = {}
-                    disk_state["alias_index"] = alias_index
-
-                for alias, entries in self._pending_alias_entries.items():
-                    if not alias:
-                        continue
-                    existing = alias_index.get(alias)
-                    if not isinstance(existing, list):
-                        existing = []
-                        alias_index[alias] = existing
-
-                    for entry in entries:
-                        et = entry.get("type")
-                        eid = entry.get("id")
-                        if not et or not eid:
-                            continue
-                        if any(e.get("type") == et and e.get("id") == eid for e in existing if isinstance(e, dict)):
-                            continue
-                        existing.append({"type": et, "id": eid})
-
-                # state_changes(追加)
-                if self._pending_state_changes:
-                    changes = disk_state.get("state_changes")
-                    if not isinstance(changes, list):
-                        changes = []
-                        disk_state["state_changes"] = changes
-                    changes.extend(self._pending_state_changes)
-
-                # structured_relationships(追加去重)
-                if self._pending_structured_relationships:
-                    rels = disk_state.get("structured_relationships")
-                    if not isinstance(rels, list):
-                        rels = []
-                        disk_state["structured_relationships"] = rels
-
-                    def _rel_key(r: Dict[str, Any]) -> tuple:
-                        return (
-                            r.get("from_entity"),
-                            r.get("to_entity"),
-                            r.get("type"),
-                            r.get("description"),
-                            r.get("chapter"),
-                        )
-
-                    existing_keys = {_rel_key(r) for r in rels if isinstance(r, dict)}
-                    for r in self._pending_structured_relationships:
-                        if not isinstance(r, dict):
+                # v5.1: 检查是否已迁移到 SQLite
+                # 如果启用了 SQLite 同步,则不再写入大数据字段到 state.json
+                _migrated = self._enable_sqlite_sync and self._sql_state_manager is not None
+
+                if not _migrated:
+                    # ==================== 旧模式:写入 state.json ====================
+                    # entities_v3(按补丁应用)
+                    entities_v3 = disk_state.get("entities_v3", {})
+                    if not isinstance(entities_v3, dict):
+                        entities_v3 = {}
+                        disk_state["entities_v3"] = entities_v3
+                    for t in self.ENTITY_TYPES:
+                        if not isinstance(entities_v3.get(t), dict):
+                            entities_v3[t] = {}
+
+                    for (entity_type, entity_id), patch in self._pending_entity_patches.items():
+                        bucket = entities_v3.setdefault(entity_type, {})
+                        if not isinstance(bucket, dict):
+                            bucket = {}
+                            entities_v3[entity_type] = bucket
+
+                        entity = bucket.get(entity_id)
+                        if not isinstance(entity, dict):
+                            entity = {}
+                            bucket[entity_id] = entity
+
+                        # 新建实体时:只填充缺失字段,避免覆盖并发写入的更完整信息
+                        if patch.base_entity:
+                            for k, v in patch.base_entity.items():
+                                if k not in entity:
+                                    entity[k] = v
+                                elif isinstance(entity.get(k), dict) and isinstance(v, dict):
+                                    # 递归填充缺失
+                                    for kk, vv in v.items():
+                                        if kk not in entity[k]:
+                                            entity[k][kk] = vv
+
+                        # top-level updates(明确写入)
+                        for k, v in patch.top_updates.items():
+                            entity[k] = v
+
+                        # current updates(明确写入)
+                        if patch.current_updates:
+                            current = entity.get("current")
+                            if not isinstance(current, dict):
+                                current = {}
+                                entity["current"] = current
+                            current.update(patch.current_updates)
+
+                        # appearance updates(first=min(non-zero), last=max)
+                        if patch.appearance_chapter is not None:
+                            chapter = int(patch.appearance_chapter)
+                            try:
+                                first = int(entity.get("first_appearance", 0) or 0)
+                            except (TypeError, ValueError):
+                                first = 0
+                            try:
+                                last = int(entity.get("last_appearance", 0) or 0)
+                            except (TypeError, ValueError):
+                                last = 0
+
+                            if first <= 0:
+                                entity["first_appearance"] = chapter
+                            else:
+                                entity["first_appearance"] = min(first, chapter)
+                            entity["last_appearance"] = max(last, chapter)
+
+                    # alias_index(一对多:合并去重)
+                    alias_index = disk_state.get("alias_index", {})
+                    if not isinstance(alias_index, dict):
+                        alias_index = {}
+                        disk_state["alias_index"] = alias_index
+
+                    for alias, entries in self._pending_alias_entries.items():
+                        if not alias:
                             continue
-                        k = _rel_key(r)
-                        if k in existing_keys:
-                            continue
-                        rels.append(r)
-                        existing_keys.add(k)
+                        existing = alias_index.get(alias)
+                        if not isinstance(existing, list):
+                            existing = []
+                            alias_index[alias] = existing
+
+                        for entry in entries:
+                            et = entry.get("type")
+                            eid = entry.get("id")
+                            if not et or not eid:
+                                continue
+                            if any(e.get("type") == et and e.get("id") == eid for e in existing if isinstance(e, dict)):
+                                continue
+                            existing.append({"type": et, "id": eid})
+
+                    # state_changes(追加)
+                    if self._pending_state_changes:
+                        changes = disk_state.get("state_changes")
+                        if not isinstance(changes, list):
+                            changes = []
+                            disk_state["state_changes"] = changes
+                        changes.extend(self._pending_state_changes)
+
+                    # structured_relationships(追加去重)
+                    if self._pending_structured_relationships:
+                        rels = disk_state.get("structured_relationships")
+                        if not isinstance(rels, list):
+                            rels = []
+                            disk_state["structured_relationships"] = rels
+
+                        def _rel_key(r: Dict[str, Any]) -> tuple:
+                            return (
+                                r.get("from_entity"),
+                                r.get("to_entity"),
+                                r.get("type"),
+                                r.get("description"),
+                                r.get("chapter"),
+                            )
+
+                        existing_keys = {_rel_key(r) for r in rels if isinstance(r, dict)}
+                        for r in self._pending_structured_relationships:
+                            if not isinstance(r, dict):
+                                continue
+                            k = _rel_key(r)
+                            if k in existing_keys:
+                                continue
+                            rels.append(r)
+                            existing_keys.add(k)
+                else:
+                    # ==================== v5.1 模式:移除大数据字段 ====================
+                    # 确保 state.json 中不存在这些膨胀字段
+                    for field in ["entities_v3", "alias_index", "state_changes", "structured_relationships"]:
+                        disk_state.pop(field, None)
+                    # 标记已迁移
+                    disk_state["_migrated_to_sqlite"] = True
 
                 # disambiguation_warnings(追加去重 + 截断)
                 if self._pending_disambiguation_warnings: