Преглед изворног кода

fix: prevent vector projection chunk collisions

lingfengQAQ пре 1 месец
родитељ
комит
578dd33f6a

+ 58 - 0
webnovel-writer/scripts/data_modules/tests/test_vector_projection_writer.py

@@ -58,6 +58,64 @@ def test_collect_chunks_from_commit():
     assert len(chunks) == 2
     assert chunks[0]["chunk_type"] == "event"
     assert chunks[1]["chunk_type"] == "entity_delta"
+    assert chunks[0]["chunk_id"] != chunks[1]["chunk_id"]
+
+
+def test_collect_chunks_assigns_unique_ids_for_same_chapter_events():  # same-type events in one chapter must not share a chunk_id
+    writer = VectorProjectionWriter.__new__(VectorProjectionWriter)  # allocate the writer without running __init__
+    payload = {
+        "meta": {"chapter": 47, "status": "accepted"},
+        "accepted_events": [
+            {  # first event: no "event_id" key is supplied
+                "event_type": "character_state_changed",
+                "chapter": 47,
+                "subject": "韩立",
+                "payload": {"field": "状态", "new": "警觉"},
+            },
+            {  # second event: same type and chapter, different subject and payload
+                "event_type": "character_state_changed",
+                "chapter": 47,
+                "subject": "陈巧倩",
+                "payload": {"field": "状态", "new": "迟疑"},
+            },
+        ],
+        "entity_deltas": [],
+    }
+
+    chunks = writer._collect_chunks(payload)
+
+    assert len(chunks) == 2  # one chunk per accepted event
+    assert len({chunk["chunk_id"] for chunk in chunks}) == 2  # the two IDs are distinct
+    assert all(chunk["scene_index"] == 0 for chunk in chunks)  # both chunks keep scene_index 0, so it cannot tell them apart
+
+
+def test_collect_chunks_keeps_event_id_stable_when_order_changes():  # chunk_ids must not depend on the order of accepted_events
+    writer = VectorProjectionWriter.__new__(VectorProjectionWriter)  # allocate the writer without running __init__
+    event_a = {
+        "event_id": "evt-a",  # explicit ID supplied on the event
+        "event_type": "character_state_changed",
+        "chapter": 47,
+        "subject": "韩立",
+        "payload": {"field": "状态", "new": "警觉"},
+    }
+    event_b = {
+        "event_id": "evt-b",  # explicit ID supplied on the event
+        "event_type": "character_state_changed",
+        "chapter": 47,
+        "subject": "陈巧倩",
+        "payload": {"field": "状态", "new": "迟疑"},
+    }
+
+    first = writer._collect_chunks(  # events passed in a, b order
+        {"meta": {"chapter": 47}, "accepted_events": [event_a, event_b], "entity_deltas": []}
+    )
+    second = writer._collect_chunks(  # same events passed in b, a order
+        {"meta": {"chapter": 47}, "accepted_events": [event_b, event_a], "entity_deltas": []}
+    )
+
+    first_ids = {chunk["content"]: chunk["chunk_id"] for chunk in first}  # key by content so list order is ignored
+    second_ids = {chunk["content"]: chunk["chunk_id"] for chunk in second}  # same mapping for the reversed run
+    assert first_ids == second_ids  # each content maps to the same chunk_id in both runs
 
 
 def test_rejected_commit_returns_not_applied():

+ 26 - 0
webnovel-writer/scripts/data_modules/vector_projection_writer.py

@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import asyncio
+import hashlib
 import logging
 from pathlib import Path
 from typing import Any, Dict, List
@@ -33,13 +34,18 @@ class VectorProjectionWriter:
         chunks: List[Dict[str, Any]] = []
         chapter = int(commit_payload.get("meta", {}).get("chapter") or 0)
 
+        chunk_counts: Dict[str, int] = {}
+
         for event in commit_payload.get("accepted_events") or []:
             if not isinstance(event, dict):
                 continue
             text = self._event_to_text(event)
             if text:
                 evt_chapter = int(event.get("chapter") or chapter)
+                event_key = event.get("event_id") or f"{event.get('event_type')}:{event.get('subject')}:{text}"
+                chunk_id = self._unique_chunk_id(chunk_counts, "event", evt_chapter, event_key)
                 chunks.append({
+                    "chunk_id": chunk_id,
                     "chapter": evt_chapter,
                     "scene_index": 0,
                     "content": text,
@@ -54,7 +60,10 @@ class VectorProjectionWriter:
             text = self._delta_to_text(delta)
             if text:
                 d_chapter = int(delta.get("chapter") or chapter)
+                delta_key = delta.get("delta_id") or delta.get("entity_id") or text
+                chunk_id = self._unique_chunk_id(chunk_counts, "entity_delta", d_chapter, delta_key)
                 chunks.append({
+                    "chunk_id": chunk_id,
                     "chapter": d_chapter,
                     "scene_index": 0,
                     "content": text,
@@ -65,6 +74,23 @@ class VectorProjectionWriter:
 
         return chunks
 
+    def _unique_chunk_id(  # return the base ID on first use, or "<base>_<n>" on its n-th use
+        self,
+        counts: Dict[str, int],  # occurrences seen so far per base ID; updated in place
+        kind: str,
+        chapter: int,
+        key: Any,
+    ) -> str:
+        base_id = self._chunk_id(kind, chapter, key)  # ID derived from kind, chapter and key
+        occurrence = counts.get(base_id, 0) + 1  # 1 for the first time this base ID is seen
+        counts[base_id] = occurrence  # record the use so the next duplicate gets a higher number
+        return base_id if occurrence == 1 else f"{base_id}_{occurrence}"  # duplicates get suffixes _2, _3, ...
+
+    def _chunk_id(self, kind: str, chapter: int, key: Any) -> str:  # build "ch<chapter>_<kind>_<digest>"
+        raw = f"{kind}:{chapter}:{key}"  # key is formatted via f-string, so any type is stringified
+        digest = hashlib.sha1(raw.encode("utf-8")).hexdigest()[:12]  # first 12 hex chars of the SHA-1 of the UTF-8 bytes
+        return f"ch{chapter:04d}_{kind}_{digest}"  # chapter is zero-padded to at least 4 digits
+
     def _event_to_text(self, event: dict) -> str:
         chapter = int(event.get("chapter") or 0)
         subject = str(event.get("subject") or "").strip()