analyze_titles.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
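#!/usr/bin/env python3
"""
analyze_titles.py - Analyze title patterns of a YouTube channel's videos.

Dimensions analyzed:
- Title length distribution
- Frequency and kinds of numbers used
- High-frequency words (stop words removed)
- Title formula classification
  (challenge / number / suspense / comparison / emotion)
- Punctuation and capitalization patterns

Usage:
    python analyze_titles.py titles.txt
    python analyze_titles.py titles.txt -o report.md
    python analyze_titles.py titles.txt --top 30

Input format: plain-text file, one title per line
Output format: Markdown analysis report
"""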
  17. import argparse
  18. import re
  19. import sys
  20. from collections import Counter
  21. from pathlib import Path
  22. # 英文停用词(轻量级,不依赖nltk)
  23. STOP_WORDS = {
  24. "i", "me", "my", "we", "our", "you", "your", "he", "him", "his", "she",
  25. "her", "it", "its", "they", "them", "their", "a", "an", "the", "and",
  26. "but", "or", "for", "nor", "on", "at", "to", "from", "by", "in", "of",
  27. "with", "is", "am", "are", "was", "were", "be", "been", "being", "have",
  28. "has", "had", "do", "does", "did", "will", "would", "shall", "should",
  29. "may", "might", "must", "can", "could", "not", "no", "so", "if", "then",
  30. "than", "that", "this", "these", "those", "what", "which", "who", "whom",
  31. "how", "when", "where", "why", "all", "each", "every", "both", "few",
  32. "more", "most", "other", "some", "such", "only", "same", "too", "very",
  33. "just", "about", "above", "after", "again", "also", "any", "because",
  34. "before", "between", "during", "into", "through", "up", "down", "out",
  35. "over", "under", "here", "there", "now", "get", "got", "go", "going",
  36. "went", "make", "made", "like", "even", "still", "back", "us",
  37. }
  38. # 标题公式分类的关键词/模式
  39. TITLE_PATTERNS = {
  40. "挑战型": [
  41. r"\b(challenge|survive|last|endure|spent|living|tried|attempt)\b",
  42. r"\b(\d+)\s*(hours?|days?|minutes?)\b",
  43. r"\bi\s+(survived|built|made|ate|bought|opened|spent)\b",
  44. ],
  45. "数字型": [
  46. r"^\$[\d,]+",
  47. r"\$[\d,]+\s+vs\.?\s+\$[\d,]+",
  48. r"\b\d{2,}\b",
  49. r"\b(100|1000|10000|million|billion)\b",
  50. ],
  51. "悬念型": [
  52. r"\.\.\.",
  53. r"\?$",
  54. r"\b(mystery|secret|hidden|never|impossible|insane|unbelievable)\b",
  55. r"\b(what happens|you won't believe|no one|nobody)\b",
  56. ],
  57. "对比型": [
  58. r"\bvs\.?\b",
  59. r"\bversus\b",
  60. r"\$[\d,]+\s+vs\.?\s+\$[\d,]+",
  61. r"\b(cheap|expensive|worst|best|biggest|smallest)\b.*\bvs\.?\b",
  62. r"\b(world'?s?\s+(largest|smallest|most|least|biggest|cheapest))\b",
  63. ],
  64. "情感型": [
  65. r"\b(emotional|crying|tears|heartwarming|giving|donated|surprise)\b",
  66. r"!{2,}",
  67. r"\b(amazing|incredible|insane|crazy|epic|extreme)\b",
  68. ],
  69. }
  70. def load_titles(filepath: str) -> list[str]:
  71. """从文本文件加载标题,每行一个"""
  72. path = Path(filepath)
  73. if not path.exists():
  74. print(f"[ERROR] 文件不存在: {filepath}", file=sys.stderr)
  75. sys.exit(1)
  76. titles = [
  77. line.strip()
  78. for line in path.read_text(encoding="utf-8").splitlines()
  79. if line.strip()
  80. ]
  81. if not titles:
  82. print(f"[ERROR] 文件为空: {filepath}", file=sys.stderr)
  83. sys.exit(1)
  84. return titles
  85. def analyze_length(titles: list[str]) -> dict:
  86. """分析标题长度分布"""
  87. lengths = [len(t) for t in titles]
  88. word_counts = [len(t.split()) for t in titles]
  89. return {
  90. "char_avg": sum(lengths) / len(lengths),
  91. "char_min": min(lengths),
  92. "char_max": max(lengths),
  93. "char_median": sorted(lengths)[len(lengths) // 2],
  94. "word_avg": sum(word_counts) / len(word_counts),
  95. "word_min": min(word_counts),
  96. "word_max": max(word_counts),
  97. "brackets": [
  98. sum(1 for l in lengths if l <= 30),
  99. sum(1 for l in lengths if 30 < l <= 50),
  100. sum(1 for l in lengths if 50 < l <= 70),
  101. sum(1 for l in lengths if l > 70),
  102. ],
  103. }
  104. def analyze_numbers(titles: list[str]) -> dict:
  105. """分析数字使用情况"""
  106. has_number = [t for t in titles if re.search(r"\d", t)]
  107. has_dollar = [t for t in titles if "$" in t]
  108. numbers_found = []
  109. for t in titles:
  110. numbers_found.extend(int(n.replace(",", "")) for n in re.findall(r"[\d,]+", t) if n.replace(",", "").isdigit())
  111. return {
  112. "with_number_pct": len(has_number) / len(titles) * 100,
  113. "with_dollar_pct": len(has_dollar) / len(titles) * 100,
  114. "common_numbers": Counter(numbers_found).most_common(10),
  115. }
  116. def analyze_words(titles: list[str], top_n: int = 20) -> list[tuple[str, int]]:
  117. """提取高频词汇(去除停用词)"""
  118. words = []
  119. for t in titles:
  120. tokens = re.findall(r"[a-zA-Z]+", t.lower())
  121. words.extend(w for w in tokens if w not in STOP_WORDS and len(w) > 1)
  122. return Counter(words).most_common(top_n)
  123. def classify_titles(titles: list[str]) -> dict[str, list[str]]:
  124. """按标题公式分类"""
  125. results = {cat: [] for cat in TITLE_PATTERNS}
  126. for t in titles:
  127. for cat, patterns in TITLE_PATTERNS.items():
  128. if any(re.search(p, t, re.IGNORECASE) for p in patterns):
  129. results[cat].append(t)
  130. break # 每个标题只归入第一个匹配的类别
  131. results["其他"] = [
  132. t for t in titles
  133. if not any(t in v for v in results.values())
  134. ]
  135. return results
  136. def analyze_punctuation(titles: list[str]) -> dict:
  137. """分析标点和大写模式"""
  138. return {
  139. "ends_exclamation": sum(1 for t in titles if t.endswith("!")),
  140. "ends_question": sum(1 for t in titles if t.endswith("?")),
  141. "ends_ellipsis": sum(1 for t in titles if t.endswith("...")),
  142. "has_all_caps_word": sum(1 for t in titles if re.search(r"\b[A-Z]{2,}\b", t)),
  143. "has_emoji": sum(1 for t in titles if re.search(r"[\U0001F600-\U0001F9FF]", t)),
  144. }
  145. def generate_report(titles: list[str], top_n: int) -> str:
  146. """生成Markdown分析报告"""
  147. total = len(titles)
  148. length_stats = analyze_length(titles)
  149. number_stats = analyze_numbers(titles)
  150. top_words = analyze_words(titles, top_n)
  151. categories = classify_titles(titles)
  152. punct_stats = analyze_punctuation(titles)
  153. lines = []
  154. lines.append(f"# YouTube标题分析报告\n")
  155. lines.append(f"共分析 **{total}** 个标题\n")
  156. # 长度分布
  157. lines.append("## 1. 标题长度分布\n")
  158. lines.append("| 指标 | 字符数 | 词数 |")
  159. lines.append("|------|--------|------|")
  160. lines.append(f"| 平均 | {length_stats['char_avg']:.1f} | {length_stats['word_avg']:.1f} |")
  161. lines.append(f"| 最短 | {length_stats['char_min']} | {length_stats['word_min']} |")
  162. lines.append(f"| 最长 | {length_stats['char_max']} | {length_stats['word_max']} |")
  163. lines.append(f"| 中位数 | {length_stats['char_median']} | - |")
  164. lines.append("")
  165. b = length_stats["brackets"]
  166. lines.append(f"- 30字符以内: {b[0]} ({b[0]/total*100:.1f}%)")
  167. lines.append(f"- 31-50字符: {b[1]} ({b[1]/total*100:.1f}%)")
  168. lines.append(f"- 51-70字符: {b[2]} ({b[2]/total*100:.1f}%)")
  169. lines.append(f"- 70字符以上: {b[3]} ({b[3]/total*100:.1f}%)")
  170. lines.append("")
  171. # 数字使用
  172. lines.append("## 2. 数字使用\n")
  173. lines.append(f"- 含数字的标题: {number_stats['with_number_pct']:.1f}%")
  174. lines.append(f"- 含$金额的标题: {number_stats['with_dollar_pct']:.1f}%")
  175. if number_stats["common_numbers"]:
  176. lines.append("\n常见数字:")
  177. for num, count in number_stats["common_numbers"]:
  178. lines.append(f" - {num:,}: 出现 {count} 次")
  179. lines.append("")
  180. # 高频词汇
  181. lines.append(f"## 3. 高频词汇 (Top {top_n})\n")
  182. lines.append("| 排名 | 词汇 | 出现次数 |")
  183. lines.append("|------|------|----------|")
  184. for i, (word, count) in enumerate(top_words, 1):
  185. lines.append(f"| {i} | {word} | {count} |")
  186. lines.append("")
  187. # 标题公式分类
  188. lines.append("## 4. 标题公式分类\n")
  189. lines.append("| 类型 | 数量 | 占比 | 示例 |")
  190. lines.append("|------|------|------|------|")
  191. for cat in ["挑战型", "数字型", "悬念型", "对比型", "情感型", "其他"]:
  192. items = categories.get(cat, [])
  193. pct = len(items) / total * 100 if total else 0
  194. example = items[0][:50] + "..." if items and len(items[0]) > 50 else (items[0] if items else "-")
  195. lines.append(f"| {cat} | {len(items)} | {pct:.1f}% | {example} |")
  196. lines.append("")
  197. # 标点与格式
  198. lines.append("## 5. 标点与格式特征\n")
  199. lines.append(f"- 感叹号结尾: {punct_stats['ends_exclamation']} ({punct_stats['ends_exclamation']/total*100:.1f}%)")
  200. lines.append(f"- 问号结尾: {punct_stats['ends_question']} ({punct_stats['ends_question']/total*100:.1f}%)")
  201. lines.append(f"- 省略号结尾: {punct_stats['ends_ellipsis']} ({punct_stats['ends_ellipsis']/total*100:.1f}%)")
  202. lines.append(f"- 含全大写词: {punct_stats['has_all_caps_word']} ({punct_stats['has_all_caps_word']/total*100:.1f}%)")
  203. lines.append("")
  204. # 洞察
  205. lines.append("## 6. 关键洞察\n")
  206. # 自动生成一些洞察
  207. if number_stats["with_number_pct"] > 60:
  208. lines.append("- **数字驱动**: 超过60%的标题使用数字,数字是核心吸引力元素")
  209. if number_stats["with_dollar_pct"] > 30:
  210. lines.append("- **金钱叙事**: 大量使用$金额,制造价值感和规模感")
  211. dominant_cat = max(
  212. [(cat, len(items)) for cat, items in categories.items() if cat != "其他"],
  213. key=lambda x: x[1],
  214. )
  215. lines.append(f"- **主导公式**: 「{dominant_cat[0]}」是使用最多的标题类型 ({dominant_cat[1]}/{total})")
  216. if length_stats["char_avg"] < 50:
  217. lines.append("- **简洁风格**: 平均标题长度不到50字符,倾向短标题")
  218. else:
  219. lines.append("- **详细风格**: 平均标题超过50字符,倾向描述性标题")
  220. return "\n".join(lines)
  221. def main():
  222. parser = argparse.ArgumentParser(
  223. description="分析YouTube频道视频标题模式",
  224. formatter_class=argparse.RawDescriptionHelpFormatter,
  225. epilog="示例:\n python analyze_titles.py mrbeast_titles.txt\n python analyze_titles.py titles.txt -o report.md --top 30",
  226. )
  227. parser.add_argument("input", help="标题文本文件(每行一个标题)")
  228. parser.add_argument("-o", "--output", help="输出报告文件路径(默认打印到终端)")
  229. parser.add_argument("--top", type=int, default=20, help="显示的高频词数量(默认20)")
  230. args = parser.parse_args()
  231. titles = load_titles(args.input)
  232. report = generate_report(titles, args.top)
  233. if args.output:
  234. Path(args.output).write_text(report, encoding="utf-8")
  235. print(f"[OK] 报告已保存到: {args.output}")
  236. else:
  237. print(report)
  238. if __name__ == "__main__":
  239. main()