#!/usr/bin/env python3
"""
retention_curve_checker.py - Retention checker for video scripts, based on MrBeast's methodology.

Checks (based on the retention theory MrBeast has shared publicly):
1. First-30-seconds hook: is there a clear hook that grabs the viewer?
2. Re-engagement moments: is there an escalation or twist every 3-5 minutes?
3. Ending CTA/cliffhanger: does the ending carry a call to action or a cliffhanger?
4. Boring parts: are there long "dead zone" passages with no action?
5. Escalation structure: does the content keep escalating (MrBeast's core idea)?

Usage:
    python retention_curve_checker.py script.txt
    python retention_curve_checker.py script.txt -o report.md
    python retention_curve_checker.py script.txt --duration 15

Input format: a plain-text script file
"""
import argparse
import re
import sys
from pathlib import Path
  20. # ---------- 常量 ----------
  21. # 平均语速(中文约250字/分钟,英文约150词/分钟)
  22. WORDS_PER_MINUTE_EN = 150
  23. CHARS_PER_MINUTE_ZH = 250
  24. # Hook检查的关键词/模式
  25. HOOK_PATTERNS = [
  26. r"\b(today|right now|in this video|let me show|watch what happens)\b",
  27. r"\b(challenge|bet|dare|surprise|secret|reveal|biggest|craziest)\b",
  28. r"\$[\d,]+",
  29. r"\b\d+\s*(hours?|days?|people|dollars)\b",
  30. r"[!?]{1,}",
  31. # 中文hook
  32. r"(今天|现在|接下来|你绝对|不敢相信|挑战|赌|秘密|最大的|最疯狂的)",
  33. ]
  34. # Re-engagement信号词
  35. REENGAGEMENT_PATTERNS = [
  36. r"\b(but wait|it gets (better|worse|crazier)|plot twist|here'?s where)\b",
  37. r"\b(next|now|then|suddenly|finally|the (biggest|craziest|best) part)\b",
  38. r"\b(level \d|round \d|phase \d|stage \d|part \d)\b",
  39. r"\b(upgrade|double|triple|10x|100x|even more|even bigger)\b",
  40. # 中文
  41. r"(但是等等|更疯狂的是|转折来了|接下来|突然|最关键的|升级|加倍|翻倍)",
  42. r"(第[一二三四五六七八九十\d]+[轮关回])",
  43. ]
  44. # CTA和悬念模式
  45. CTA_PATTERNS = [
  46. r"\b(subscribe|like|comment|share|click|next video|see you)\b",
  47. r"\b(what do you think|let me know|tell me)\b",
  48. r"\b(next time|coming soon|stay tuned|part \d|to be continued)\b",
  49. # 中文
  50. r"(关注|点赞|评论|分享|下期|下次|下一个视频|你觉得呢|告诉我|敬请期待)",
  51. ]
  52. # Boring part检测:连续长段落无动作词
  53. ACTION_WORDS = [
  54. r"\b(explode|run|jump|scream|crash|build|destroy|open|reveal|surprise)\b",
  55. r"\b(win|lose|fail|succeed|break|smash|launch|drop|fly|race)\b",
  56. r"[!?]",
  57. r"\$[\d,]+",
  58. # 中文
  59. r"(爆炸|跑|跳|尖叫|建造|打开|揭晓|惊喜|赢|输|失败|打破|发射)",
  60. ]
  61. def detect_language(text: str) -> str:
  62. """简单检测文本主要语言"""
  63. zh_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
  64. en_words = len(re.findall(r"[a-zA-Z]+", text))
  65. return "zh" if zh_chars > en_words else "en"
  66. def estimate_duration(text: str, lang: str) -> float:
  67. """估算脚本时长(分钟)"""
  68. if lang == "zh":
  69. char_count = len(re.findall(r"[\u4e00-\u9fff]", text))
  70. return char_count / CHARS_PER_MINUTE_ZH
  71. else:
  72. word_count = len(text.split())
  73. return word_count / WORDS_PER_MINUTE_EN
  74. def split_into_segments(text: str, segment_minutes: float, lang: str) -> list[str]:
  75. """将脚本按时长切成段落"""
  76. lines = text.splitlines()
  77. if lang == "zh":
  78. chars_per_seg = int(CHARS_PER_MINUTE_ZH * segment_minutes)
  79. segments = []
  80. current = []
  81. current_len = 0
  82. for line in lines:
  83. line_len = len(re.findall(r"[\u4e00-\u9fff]", line))
  84. if current_len + line_len > chars_per_seg and current:
  85. segments.append("\n".join(current))
  86. current = [line]
  87. current_len = line_len
  88. else:
  89. current.append(line)
  90. current_len += line_len
  91. if current:
  92. segments.append("\n".join(current))
  93. return segments
  94. else:
  95. words_per_seg = int(WORDS_PER_MINUTE_EN * segment_minutes)
  96. segments = []
  97. current = []
  98. current_len = 0
  99. for line in lines:
  100. line_len = len(line.split())
  101. if current_len + line_len > words_per_seg and current:
  102. segments.append("\n".join(current))
  103. current = [line]
  104. current_len = line_len
  105. else:
  106. current.append(line)
  107. current_len += line_len
  108. if current:
  109. segments.append("\n".join(current))
  110. return segments
  111. def check_hook(text: str, first_n_chars: int = 500) -> dict:
  112. """检查前30秒是否有hook"""
  113. opening = text[:first_n_chars]
  114. matches = []
  115. for pattern in HOOK_PATTERNS:
  116. found = re.findall(pattern, opening, re.IGNORECASE)
  117. if found:
  118. matches.extend(found)
  119. score = min(len(matches), 5) # 0-5分
  120. return {
  121. "score": score,
  122. "max": 5,
  123. "matches": matches[:5],
  124. "opening_preview": opening[:200].replace("\n", " "),
  125. }
  126. def check_reengagement(segments: list[str]) -> dict:
  127. """检查每个段落是否有re-engagement moment"""
  128. results = []
  129. for i, seg in enumerate(segments):
  130. matches = []
  131. for pattern in REENGAGEMENT_PATTERNS:
  132. found = re.findall(pattern, seg, re.IGNORECASE)
  133. if found:
  134. matches.extend(found)
  135. results.append({
  136. "segment": i + 1,
  137. "has_reengagement": len(matches) > 0,
  138. "matches": matches[:3],
  139. })
  140. segments_with = sum(1 for r in results if r["has_reengagement"])
  141. total = len(results)
  142. score = round(segments_with / total * 5) if total > 0 else 0
  143. return {
  144. "score": score,
  145. "max": 5,
  146. "segments_with": segments_with,
  147. "total_segments": total,
  148. "details": results,
  149. }
  150. def check_ending(text: str, last_n_chars: int = 500) -> dict:
  151. """检查结尾是否有CTA或悬念"""
  152. ending = text[-last_n_chars:]
  153. matches = []
  154. for pattern in CTA_PATTERNS:
  155. found = re.findall(pattern, ending, re.IGNORECASE)
  156. if found:
  157. matches.extend(found)
  158. score = min(len(matches), 5)
  159. return {
  160. "score": score,
  161. "max": 5,
  162. "matches": matches[:5],
  163. "ending_preview": ending[-200:].replace("\n", " "),
  164. }
  165. def check_boring_parts(segments: list[str]) -> dict:
  166. """检查是否有无动作的「死区」"""
  167. boring_segments = []
  168. for i, seg in enumerate(segments):
  169. action_count = 0
  170. for pattern in ACTION_WORDS:
  171. action_count += len(re.findall(pattern, seg, re.IGNORECASE))
  172. # 一个段落如果动作词密度太低就标记
  173. word_count = max(len(seg.split()), 1)
  174. density = action_count / word_count * 100
  175. if density < 0.5: # 低于0.5%动作词密度
  176. boring_segments.append({
  177. "segment": i + 1,
  178. "preview": seg[:100].replace("\n", " ") + "...",
  179. "action_density": round(density, 2),
  180. })
  181. # 没有boring段落得5分,每有一个扣1分
  182. score = max(0, 5 - len(boring_segments))
  183. return {
  184. "score": score,
  185. "max": 5,
  186. "boring_count": len(boring_segments),
  187. "details": boring_segments,
  188. }
  189. def check_escalation(segments: list[str]) -> dict:
  190. """检查内容是否持续升级(MrBeast核心理念:每一分钟都要比上一分钟更精彩)"""
  191. escalation_words = [
  192. r"\b(more|bigger|better|crazier|harder|faster|extreme|ultimate|final)\b",
  193. r"\b(upgrade|level up|raise|increase|double|triple|max|peak)\b",
  194. r"(更大|更好|更疯狂|更难|升级|加码|翻倍|终极|最终)",
  195. ]
  196. scores_per_seg = []
  197. for seg in segments:
  198. count = 0
  199. for pattern in escalation_words:
  200. count += len(re.findall(pattern, seg, re.IGNORECASE))
  201. scores_per_seg.append(count)
  202. # 理想情况:后半部分的升级词应该多于前半部分
  203. if len(scores_per_seg) >= 2:
  204. mid = len(scores_per_seg) // 2
  205. first_half = sum(scores_per_seg[:mid])
  206. second_half = sum(scores_per_seg[mid:])
  207. is_escalating = second_half >= first_half
  208. else:
  209. is_escalating = True
  210. total_escalation = sum(scores_per_seg)
  211. score = min(total_escalation, 3) + (2 if is_escalating else 0)
  212. return {
  213. "score": min(score, 5),
  214. "max": 5,
  215. "is_escalating": is_escalating,
  216. "escalation_per_segment": scores_per_seg,
  217. }
  218. def generate_report(filepath: str, text: str, duration_override: float = None) -> str:
  219. """生成完整检查报告"""
  220. lang = detect_language(text)
  221. duration = duration_override or estimate_duration(text, lang)
  222. segments = split_into_segments(text, 3.0, lang) # 每3分钟一段
  223. hook = check_hook(text)
  224. reengagement = check_reengagement(segments)
  225. ending = check_ending(text)
  226. boring = check_boring_parts(segments)
  227. escalation = check_escalation(segments)
  228. total_score = hook["score"] + reengagement["score"] + ending["score"] + boring["score"] + escalation["score"]
  229. max_score = 25
  230. lines = []
  231. lines.append("# Retention检查报告\n")
  232. lines.append(f"**文件**: {filepath}")
  233. lines.append(f"**语言**: {'中文' if lang == 'zh' else '英文'}")
  234. lines.append(f"**预估时长**: {duration:.1f} 分钟")
  235. lines.append(f"**分段数**: {len(segments)} 段(每3分钟)")
  236. lines.append(f"**总分**: {total_score}/{max_score}\n")
  237. # 评级
  238. if total_score >= 20:
  239. grade = "A - 优秀,retention结构扎实"
  240. elif total_score >= 15:
  241. grade = "B - 良好,有改进空间"
  242. elif total_score >= 10:
  243. grade = "C - 及格,需要重点优化"
  244. else:
  245. grade = "D - 需要大幅改写"
  246. lines.append(f"**评级**: {grade}\n")
  247. # 1. Hook检查
  248. lines.append("## 1. 前30秒Hook ({}/{})\n".format(hook["score"], hook["max"]))
  249. if hook["score"] >= 3:
  250. lines.append("开头有明确的hook元素。")
  251. elif hook["score"] >= 1:
  252. lines.append("开头有部分hook,但力度不够。")
  253. else:
  254. lines.append("**警告**: 开头缺少hook,观众可能在前10秒流失。")
  255. if hook["matches"]:
  256. lines.append(f"\n检测到的hook元素: {', '.join(str(m) for m in hook['matches'][:5])}")
  257. lines.append(f"\n> 开头预览: {hook['opening_preview']}")
  258. lines.append("")
  259. lines.append("**MrBeast原则**: 前30秒必须让观众知道「这个视频值得看完」。要么展示最终成果的预告,要么直接抛出不可抗拒的悬念。\n")
  260. # 2. Re-engagement
  261. lines.append("## 2. Re-engagement Moments ({}/{})\n".format(reengagement["score"], reengagement["max"]))
  262. lines.append(f"在 {reengagement['total_segments']} 个段落中,{reengagement['segments_with']} 个有re-engagement信号。\n")
  263. for detail in reengagement["details"]:
  264. status = "有" if detail["has_reengagement"] else "**缺失**"
  265. lines.append(f"- 段落 {detail['segment']}: {status}")
  266. if detail["matches"]:
  267. lines.append(f" 信号词: {', '.join(str(m) for m in detail['matches'])}")
  268. lines.append("")
  269. lines.append("**MrBeast原则**: 每3-5分钟必须有一个「重新抓住观众」的时刻。可以是新的挑战升级、意外转折、或者stakes提高。\n")
  270. # 3. 结尾
  271. lines.append("## 3. 结尾CTA/悬念 ({}/{})\n".format(ending["score"], ending["max"]))
  272. if ending["score"] >= 3:
  273. lines.append("结尾有明确的CTA或悬念。")
  274. elif ending["score"] >= 1:
  275. lines.append("结尾有部分CTA元素,可以更强。")
  276. else:
  277. lines.append("**警告**: 结尾平淡,缺少行动号召或下期预告。")
  278. lines.append(f"\n> 结尾预览: ...{ending['ending_preview']}")
  279. lines.append("")
  280. # 4. Boring Parts
  281. lines.append("## 4. Boring Parts检测 ({}/{})\n".format(boring["score"], boring["max"]))
  282. if boring["boring_count"] == 0:
  283. lines.append("未检测到明显的「死区」段落。")
  284. else:
  285. lines.append(f"检测到 **{boring['boring_count']}** 个低动作密度段落:\n")
  286. for b in boring["details"]:
  287. lines.append(f"- 段落 {b['segment']} (动作密度: {b['action_density']}%): {b['preview']}")
  288. lines.append("")
  289. lines.append("**MrBeast原则**: 「If it's boring, cut it.」没有动作、没有张力的段落就是观众点走的时刻。\n")
  290. # 5. 递进结构
  291. lines.append("## 5. 递进结构 ({}/{})\n".format(escalation["score"], escalation["max"]))
  292. if escalation["is_escalating"]:
  293. lines.append("内容呈现递进趋势,后半段升级词多于前半段。")
  294. else:
  295. lines.append("**警告**: 后半段的升级感不如前半段,可能导致观众中途流失。")
  296. lines.append(f"\n各段落升级词数量: {escalation['escalation_per_segment']}")
  297. lines.append("")
  298. lines.append("**MrBeast原则**: 视频的每一分钟都应该比上一分钟更精彩。观众的期望在持续上升,内容必须跟上。\n")
  299. # 改进建议
  300. lines.append("## 改进建议\n")
  301. if hook["score"] < 3:
  302. lines.append("1. **强化开头**: 考虑用「结果前置」策略,在前5秒展示视频最震撼的画面,然后回到起点讲故事。")
  303. if reengagement["segments_with"] < reengagement["total_segments"] * 0.6:
  304. missing = [d["segment"] for d in reengagement["details"] if not d["has_reengagement"]]
  305. lines.append(f"2. **补充转折点**: 段落 {missing} 缺少re-engagement,考虑加入新挑战、意外事件或stakes升级。")
  306. if ending["score"] < 3:
  307. lines.append("3. **强化结尾**: 加入明确的CTA(关注/点赞)或下期预告,让观众有理由回来。")
  308. if boring["boring_count"] > 0:
  309. lines.append(f"4. **删减死区**: {boring['boring_count']}个段落动作密度过低,考虑压缩或加入视觉/动作元素。")
  310. if not escalation["is_escalating"]:
  311. lines.append("5. **重排结构**: 把最精彩的内容放在后半段,确保观众感受到持续升级。")
  312. return "\n".join(lines)
  313. def main():
  314. parser = argparse.ArgumentParser(
  315. description="基于MrBeast方法论的视频脚本retention检查器",
  316. formatter_class=argparse.RawDescriptionHelpFormatter,
  317. epilog="示例:\n python retention_curve_checker.py script.txt\n python retention_curve_checker.py script.txt --duration 15 -o report.md",
  318. )
  319. parser.add_argument("input", help="视频脚本文本文件")
  320. parser.add_argument("-o", "--output", help="输出报告文件路径")
  321. parser.add_argument("--duration", type=float, help="手动指定视频时长(分钟),覆盖自动估算")
  322. args = parser.parse_args()
  323. path = Path(args.input)
  324. if not path.exists():
  325. print(f"[ERROR] 文件不存在: {args.input}", file=sys.stderr)
  326. sys.exit(1)
  327. text = path.read_text(encoding="utf-8")
  328. if not text.strip():
  329. print(f"[ERROR] 文件为空: {args.input}", file=sys.stderr)
  330. sys.exit(1)
  331. report = generate_report(args.input, text, args.duration)
  332. if args.output:
  333. Path(args.output).write_text(report, encoding="utf-8")
  334. print(f"[OK] 报告已保存到: {args.output}")
  335. else:
  336. print(report)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()