| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273 |
- #!/usr/bin/env python3
- """
- analyze_titles.py - 分析YouTube频道视频标题模式
- 分析维度:
- - 标题长度分布
- - 数字使用频率与类型
- - 高频词汇(去除停用词)
- - 标题公式分类(挑战型/数字型/悬念型/对比型/情感型)
- - 标点符号与大写模式
- 用法:
- python analyze_titles.py titles.txt
- python analyze_titles.py titles.txt -o report.md
- python analyze_titles.py titles.txt --top 30
- 输入格式: 纯文本文件,每行一个标题
- 输出格式: Markdown分析报告
- """
- import argparse
- import re
- import sys
- from collections import Counter
- from pathlib import Path
- # 英文停用词(轻量级,不依赖nltk)
- STOP_WORDS = {
- "i", "me", "my", "we", "our", "you", "your", "he", "him", "his", "she",
- "her", "it", "its", "they", "them", "their", "a", "an", "the", "and",
- "but", "or", "for", "nor", "on", "at", "to", "from", "by", "in", "of",
- "with", "is", "am", "are", "was", "were", "be", "been", "being", "have",
- "has", "had", "do", "does", "did", "will", "would", "shall", "should",
- "may", "might", "must", "can", "could", "not", "no", "so", "if", "then",
- "than", "that", "this", "these", "those", "what", "which", "who", "whom",
- "how", "when", "where", "why", "all", "each", "every", "both", "few",
- "more", "most", "other", "some", "such", "only", "same", "too", "very",
- "just", "about", "above", "after", "again", "also", "any", "because",
- "before", "between", "during", "into", "through", "up", "down", "out",
- "over", "under", "here", "there", "now", "get", "got", "go", "going",
- "went", "make", "made", "like", "even", "still", "back", "us",
- }
- # 标题公式分类的关键词/模式
- TITLE_PATTERNS = {
- "挑战型": [
- r"\b(challenge|survive|last|endure|spent|living|tried|attempt)\b",
- r"\b(\d+)\s*(hours?|days?|minutes?)\b",
- r"\bi\s+(survived|built|made|ate|bought|opened|spent)\b",
- ],
- "数字型": [
- r"^\$[\d,]+",
- r"\$[\d,]+\s+vs\.?\s+\$[\d,]+",
- r"\b\d{2,}\b",
- r"\b(100|1000|10000|million|billion)\b",
- ],
- "悬念型": [
- r"\.\.\.",
- r"\?$",
- r"\b(mystery|secret|hidden|never|impossible|insane|unbelievable)\b",
- r"\b(what happens|you won't believe|no one|nobody)\b",
- ],
- "对比型": [
- r"\bvs\.?\b",
- r"\bversus\b",
- r"\$[\d,]+\s+vs\.?\s+\$[\d,]+",
- r"\b(cheap|expensive|worst|best|biggest|smallest)\b.*\bvs\.?\b",
- r"\b(world'?s?\s+(largest|smallest|most|least|biggest|cheapest))\b",
- ],
- "情感型": [
- r"\b(emotional|crying|tears|heartwarming|giving|donated|surprise)\b",
- r"!{2,}",
- r"\b(amazing|incredible|insane|crazy|epic|extreme)\b",
- ],
- }
- def load_titles(filepath: str) -> list[str]:
- """从文本文件加载标题,每行一个"""
- path = Path(filepath)
- if not path.exists():
- print(f"[ERROR] 文件不存在: {filepath}", file=sys.stderr)
- sys.exit(1)
- titles = [
- line.strip()
- for line in path.read_text(encoding="utf-8").splitlines()
- if line.strip()
- ]
- if not titles:
- print(f"[ERROR] 文件为空: {filepath}", file=sys.stderr)
- sys.exit(1)
- return titles
- def analyze_length(titles: list[str]) -> dict:
- """分析标题长度分布"""
- lengths = [len(t) for t in titles]
- word_counts = [len(t.split()) for t in titles]
- return {
- "char_avg": sum(lengths) / len(lengths),
- "char_min": min(lengths),
- "char_max": max(lengths),
- "char_median": sorted(lengths)[len(lengths) // 2],
- "word_avg": sum(word_counts) / len(word_counts),
- "word_min": min(word_counts),
- "word_max": max(word_counts),
- "brackets": [
- sum(1 for l in lengths if l <= 30),
- sum(1 for l in lengths if 30 < l <= 50),
- sum(1 for l in lengths if 50 < l <= 70),
- sum(1 for l in lengths if l > 70),
- ],
- }
- def analyze_numbers(titles: list[str]) -> dict:
- """分析数字使用情况"""
- has_number = [t for t in titles if re.search(r"\d", t)]
- has_dollar = [t for t in titles if "$" in t]
- numbers_found = []
- for t in titles:
- numbers_found.extend(int(n.replace(",", "")) for n in re.findall(r"[\d,]+", t) if n.replace(",", "").isdigit())
- return {
- "with_number_pct": len(has_number) / len(titles) * 100,
- "with_dollar_pct": len(has_dollar) / len(titles) * 100,
- "common_numbers": Counter(numbers_found).most_common(10),
- }
- def analyze_words(titles: list[str], top_n: int = 20) -> list[tuple[str, int]]:
- """提取高频词汇(去除停用词)"""
- words = []
- for t in titles:
- tokens = re.findall(r"[a-zA-Z]+", t.lower())
- words.extend(w for w in tokens if w not in STOP_WORDS and len(w) > 1)
- return Counter(words).most_common(top_n)
- def classify_titles(titles: list[str]) -> dict[str, list[str]]:
- """按标题公式分类"""
- results = {cat: [] for cat in TITLE_PATTERNS}
- for t in titles:
- for cat, patterns in TITLE_PATTERNS.items():
- if any(re.search(p, t, re.IGNORECASE) for p in patterns):
- results[cat].append(t)
- break # 每个标题只归入第一个匹配的类别
- results["其他"] = [
- t for t in titles
- if not any(t in v for v in results.values())
- ]
- return results
- def analyze_punctuation(titles: list[str]) -> dict:
- """分析标点和大写模式"""
- return {
- "ends_exclamation": sum(1 for t in titles if t.endswith("!")),
- "ends_question": sum(1 for t in titles if t.endswith("?")),
- "ends_ellipsis": sum(1 for t in titles if t.endswith("...")),
- "has_all_caps_word": sum(1 for t in titles if re.search(r"\b[A-Z]{2,}\b", t)),
- "has_emoji": sum(1 for t in titles if re.search(r"[\U0001F600-\U0001F9FF]", t)),
- }
- def generate_report(titles: list[str], top_n: int) -> str:
- """生成Markdown分析报告"""
- total = len(titles)
- length_stats = analyze_length(titles)
- number_stats = analyze_numbers(titles)
- top_words = analyze_words(titles, top_n)
- categories = classify_titles(titles)
- punct_stats = analyze_punctuation(titles)
- lines = []
- lines.append(f"# YouTube标题分析报告\n")
- lines.append(f"共分析 **{total}** 个标题\n")
- # 长度分布
- lines.append("## 1. 标题长度分布\n")
- lines.append("| 指标 | 字符数 | 词数 |")
- lines.append("|------|--------|------|")
- lines.append(f"| 平均 | {length_stats['char_avg']:.1f} | {length_stats['word_avg']:.1f} |")
- lines.append(f"| 最短 | {length_stats['char_min']} | {length_stats['word_min']} |")
- lines.append(f"| 最长 | {length_stats['char_max']} | {length_stats['word_max']} |")
- lines.append(f"| 中位数 | {length_stats['char_median']} | - |")
- lines.append("")
- b = length_stats["brackets"]
- lines.append(f"- 30字符以内: {b[0]} ({b[0]/total*100:.1f}%)")
- lines.append(f"- 31-50字符: {b[1]} ({b[1]/total*100:.1f}%)")
- lines.append(f"- 51-70字符: {b[2]} ({b[2]/total*100:.1f}%)")
- lines.append(f"- 70字符以上: {b[3]} ({b[3]/total*100:.1f}%)")
- lines.append("")
- # 数字使用
- lines.append("## 2. 数字使用\n")
- lines.append(f"- 含数字的标题: {number_stats['with_number_pct']:.1f}%")
- lines.append(f"- 含$金额的标题: {number_stats['with_dollar_pct']:.1f}%")
- if number_stats["common_numbers"]:
- lines.append("\n常见数字:")
- for num, count in number_stats["common_numbers"]:
- lines.append(f" - {num:,}: 出现 {count} 次")
- lines.append("")
- # 高频词汇
- lines.append(f"## 3. 高频词汇 (Top {top_n})\n")
- lines.append("| 排名 | 词汇 | 出现次数 |")
- lines.append("|------|------|----------|")
- for i, (word, count) in enumerate(top_words, 1):
- lines.append(f"| {i} | {word} | {count} |")
- lines.append("")
- # 标题公式分类
- lines.append("## 4. 标题公式分类\n")
- lines.append("| 类型 | 数量 | 占比 | 示例 |")
- lines.append("|------|------|------|------|")
- for cat in ["挑战型", "数字型", "悬念型", "对比型", "情感型", "其他"]:
- items = categories.get(cat, [])
- pct = len(items) / total * 100 if total else 0
- example = items[0][:50] + "..." if items and len(items[0]) > 50 else (items[0] if items else "-")
- lines.append(f"| {cat} | {len(items)} | {pct:.1f}% | {example} |")
- lines.append("")
- # 标点与格式
- lines.append("## 5. 标点与格式特征\n")
- lines.append(f"- 感叹号结尾: {punct_stats['ends_exclamation']} ({punct_stats['ends_exclamation']/total*100:.1f}%)")
- lines.append(f"- 问号结尾: {punct_stats['ends_question']} ({punct_stats['ends_question']/total*100:.1f}%)")
- lines.append(f"- 省略号结尾: {punct_stats['ends_ellipsis']} ({punct_stats['ends_ellipsis']/total*100:.1f}%)")
- lines.append(f"- 含全大写词: {punct_stats['has_all_caps_word']} ({punct_stats['has_all_caps_word']/total*100:.1f}%)")
- lines.append("")
- # 洞察
- lines.append("## 6. 关键洞察\n")
- # 自动生成一些洞察
- if number_stats["with_number_pct"] > 60:
- lines.append("- **数字驱动**: 超过60%的标题使用数字,数字是核心吸引力元素")
- if number_stats["with_dollar_pct"] > 30:
- lines.append("- **金钱叙事**: 大量使用$金额,制造价值感和规模感")
- dominant_cat = max(
- [(cat, len(items)) for cat, items in categories.items() if cat != "其他"],
- key=lambda x: x[1],
- )
- lines.append(f"- **主导公式**: 「{dominant_cat[0]}」是使用最多的标题类型 ({dominant_cat[1]}/{total})")
- if length_stats["char_avg"] < 50:
- lines.append("- **简洁风格**: 平均标题长度不到50字符,倾向短标题")
- else:
- lines.append("- **详细风格**: 平均标题超过50字符,倾向描述性标题")
- return "\n".join(lines)
- def main():
- parser = argparse.ArgumentParser(
- description="分析YouTube频道视频标题模式",
- formatter_class=argparse.RawDescriptionHelpFormatter,
- epilog="示例:\n python analyze_titles.py mrbeast_titles.txt\n python analyze_titles.py titles.txt -o report.md --top 30",
- )
- parser.add_argument("input", help="标题文本文件(每行一个标题)")
- parser.add_argument("-o", "--output", help="输出报告文件路径(默认打印到终端)")
- parser.add_argument("--top", type=int, default=20, help="显示的高频词数量(默认20)")
- args = parser.parse_args()
- titles = load_titles(args.input)
- report = generate_report(titles, args.top)
- if args.output:
- Path(args.output).write_text(report, encoding="utf-8")
- print(f"[OK] 报告已保存到: {args.output}")
- else:
- print(report)
- if __name__ == "__main__":
- main()
|