genre_profile_builder.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Genre profile parsing helpers for ContextManager.
  5. """
  6. from __future__ import annotations
  7. import re
  8. from typing import List
  9. from .genre_aliases import normalize_genre_token
  10. def parse_genre_tokens(
  11. genre_raw: str,
  12. *,
  13. support_composite: bool,
  14. separators: tuple[str, ...],
  15. ) -> List[str]:
  16. text = str(genre_raw or "").strip()
  17. if not text:
  18. return []
  19. if not support_composite:
  20. normalized_single = normalize_genre_token(text)
  21. return [normalized_single] if normalized_single else [text]
  22. pattern = "|".join(re.escape(str(token)) for token in separators if str(token))
  23. if not pattern:
  24. normalized_single = normalize_genre_token(text)
  25. return [normalized_single] if normalized_single else [text]
  26. tokens = [chunk.strip() for chunk in re.split(pattern, text) if chunk and chunk.strip()]
  27. deduped: List[str] = []
  28. seen = set()
  29. for token in tokens:
  30. normalized_token = normalize_genre_token(token)
  31. if not normalized_token:
  32. continue
  33. lower = normalized_token.lower()
  34. if lower in seen:
  35. continue
  36. seen.add(lower)
  37. deduped.append(normalized_token)
  38. if deduped:
  39. return deduped
  40. fallback_token = normalize_genre_token(text)
  41. return [fallback_token] if fallback_token else [text]
  42. def extract_genre_section(text: str, genre: str) -> str:
  43. if not text:
  44. return ""
  45. lines = text.splitlines()
  46. capture: List[str] = []
  47. active = False
  48. target = genre.strip().lower()
  49. for line in lines:
  50. normalized = line.strip().lower()
  51. if normalized.startswith("## ") or normalized.startswith("### "):
  52. if active:
  53. break
  54. active = target in normalized
  55. if active:
  56. capture.append(line)
  57. continue
  58. if active:
  59. capture.append(line)
  60. if capture:
  61. return "\n".join(capture).strip()
  62. return "\n".join(lines[:80]).strip()
  63. def extract_markdown_refs(text: str, max_items: int = 8) -> List[str]:
  64. if not text:
  65. return []
  66. refs: List[str] = []
  67. for line in text.splitlines():
  68. row = line.strip().lstrip("-*").strip()
  69. if not row or row.startswith("#"):
  70. continue
  71. refs.append(row)
  72. if len(refs) >= max(1, max_items):
  73. break
  74. return refs
  75. def build_composite_genre_hints(genres: List[str], refs: List[str]) -> List[str]:
  76. if len(genres) <= 1:
  77. return []
  78. primary = genres[0]
  79. secondaries = genres[1:]
  80. hints: List[str] = []
  81. hints.append(
  82. f"以“{primary}”作为主引擎推进主线,每章至少保留1处“{'/'.join(secondaries)}”特征表达。"
  83. )
  84. if refs:
  85. hints.append(f"复合题材执行参考:{refs[0]}")
  86. hints.append("主辅题材冲突时,优先保证主题材读者承诺,辅题材用于制造新鲜感。")
  87. return hints