#!/usr/bin/env python3
"""
analyze_titles.py - 分析YouTube频道视频标题模式

分析维度：
  - 标题长度分布
  - 数字使用频率与类型
  - 高频词汇（去除停用词）
  - 标题公式分类（挑战型/数字型/悬念型/对比型/情感型）
  - 标点符号与大写模式

用法:
  python analyze_titles.py titles.txt
  python analyze_titles.py titles.txt -o report.md
  python analyze_titles.py titles.txt --top 30

输入格式: 纯文本文件，每行一个标题
输出格式: Markdown分析报告
"""

import argparse
import re
import statistics
import sys
from collections import Counter
from pathlib import Path

# English stop words (lightweight built-in list so that nltk is not required).
# All entries are lowercase; analyze_words lowercases each title before
# testing membership against this set.
STOP_WORDS = {
    "i", "me", "my", "we", "our", "you", "your", "he", "him", "his", "she",
    "her", "it", "its", "they", "them", "their", "a", "an", "the", "and",
    "but", "or", "for", "nor", "on", "at", "to", "from", "by", "in", "of",
    "with", "is", "am", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "do", "does", "did", "will", "would", "shall", "should",
    "may", "might", "must", "can", "could", "not", "no", "so", "if", "then",
    "than", "that", "this", "these", "those", "what", "which", "who", "whom",
    "how", "when", "where", "why", "all", "each", "every", "both", "few",
    "more", "most", "other", "some", "such", "only", "same", "too", "very",
    "just", "about", "above", "after", "again", "also", "any", "because",
    "before", "between", "during", "into", "through", "up", "down", "out",
    "over", "under", "here", "there", "now", "get", "got", "go", "going",
    "went", "make", "made", "like", "even", "still", "back", "us",
}

# Keyword/regex patterns for each title-formula category.
#
# classify_titles searches every pattern with re.IGNORECASE and assigns a
# title to the FIRST category (in this dict's insertion order) that has any
# matching pattern, so the order of the keys below is a priority order.
TITLE_PATTERNS = {
    # "Challenge" titles: challenge/survival verbs, durations, "I <did X>".
    "挑战型": [
        r"\b(challenge|survive|last|endure|spent|living|tried|attempt)\b",
        r"\b(\d+)\s*(hours?|days?|minutes?)\b",
        r"\bi\s+(survived|built|made|ate|bought|opened|spent)\b",
    ],
    # "Number" titles: leading dollar amounts, "$X vs $Y", any number with
    # two or more digits, or big-number words.
    "数字型": [
        r"^\$[\d,]+",
        r"\$[\d,]+\s+vs\.?\s+\$[\d,]+",
        r"\b\d{2,}\b",
        r"\b(100|1000|10000|million|billion)\b",
    ],
    # "Suspense" titles: ellipsis, trailing question mark, mystery wording.
    "悬念型": [
        r"\.\.\.",
        r"\?$",
        r"\b(mystery|secret|hidden|never|impossible|insane|unbelievable)\b",
        r"\b(what happens|you won't believe|no one|nobody)\b",
    ],
    # "Comparison" titles: vs/versus and superlatives.
    # NOTE(review): the "$X vs $Y" pattern below also appears under "数字型",
    # which is checked earlier, so this copy can never be the deciding match.
    "对比型": [
        r"\bvs\.?\b",
        r"\bversus\b",
        r"\$[\d,]+\s+vs\.?\s+\$[\d,]+",
        r"\b(cheap|expensive|worst|best|biggest|smallest)\b.*\bvs\.?\b",
        r"\b(world'?s?\s+(largest|smallest|most|least|biggest|cheapest))\b",
    ],
    # "Emotional" titles: emotional vocabulary, repeated "!", hype adjectives.
    # NOTE(review): "insane" is also listed under "悬念型", which is checked
    # earlier, so a title containing it is never classified here because of
    # that word alone.
    "情感型": [
        r"\b(emotional|crying|tears|heartwarming|giving|donated|surprise)\b",
        r"!{2,}",
        r"\b(amazing|incredible|insane|crazy|epic|extreme)\b",
    ],
}


def load_titles(filepath: str) -> list[str]:
    """Load titles from a UTF-8 text file, one title per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped. A leading UTF-8 byte-order mark (as written by some Windows
    editors) is discarded so it does not end up glued to the first title.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The non-empty, stripped titles in file order.

    Raises:
        SystemExit: With exit code 1 (after printing an ``[ERROR]`` line to
            stderr) when the file does not exist, cannot be read or decoded,
            or contains no titles.
    """
    path = Path(filepath)
    if not path.exists():
        print(f"[ERROR] 文件不存在: {filepath}", file=sys.stderr)
        sys.exit(1)
    try:
        # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM.
        text = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as exc:
        # Covers directories, permission problems and non-UTF-8 content,
        # which would otherwise surface as a raw traceback.
        print(f"[ERROR] 无法读取文件: {filepath} ({exc})", file=sys.stderr)
        sys.exit(1)
    titles = [
        stripped
        for line in text.splitlines()
        if (stripped := line.strip())
    ]
    if not titles:
        print(f"[ERROR] 文件为空: {filepath}", file=sys.stderr)
        sys.exit(1)
    return titles


def analyze_length(titles: list[str]) -> dict:
    """Compute character- and word-length statistics for the titles.

    Args:
        titles: Titles to analyze. Must be non-empty; an empty list raises
            ZeroDivisionError when the averages are computed.

    Returns:
        A dict with:
          - ``char_avg`` / ``char_min`` / ``char_max``: character counts.
          - ``char_median``: the true median character count (the mean of
            the two middle values when the number of titles is even, so it
            may be a float).
          - ``word_avg`` / ``word_min`` / ``word_max``: whitespace-separated
            word counts.
          - ``brackets``: four counts of titles whose character length is
            <=30, 31-50, 51-70 and >70, in that order.
    """
    char_lengths = [len(title) for title in titles]
    word_counts = [len(title.split()) for title in titles]

    # Single pass over the lengths to fill the four length buckets.
    brackets = [0, 0, 0, 0]
    for length in char_lengths:
        if length <= 30:
            brackets[0] += 1
        elif length <= 50:
            brackets[1] += 1
        elif length <= 70:
            brackets[2] += 1
        else:
            brackets[3] += 1

    return {
        "char_avg": sum(char_lengths) / len(char_lengths),
        "char_min": min(char_lengths),
        "char_max": max(char_lengths),
        # statistics.median averages the two middle values for even-sized
        # input instead of silently reporting the upper one.
        "char_median": statistics.median(char_lengths),
        "word_avg": sum(word_counts) / len(word_counts),
        "word_min": min(word_counts),
        "word_max": max(word_counts),
        "brackets": brackets,
    }


# A number token: either digits grouped with thousands separators
# ("1,000", "12,345,678") or a plain run of digits. Commas that are not
# thousands separators (as in "1,2,3") split the text into separate numbers.
_NUMBER_TOKEN_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+")


def analyze_numbers(titles: list[str]) -> dict:
    """Summarize how numbers and dollar amounts are used in the titles.

    Args:
        titles: Titles to analyze. Must be non-empty; an empty list raises
            ZeroDivisionError when the percentages are computed.

    Returns:
        A dict with:
          - ``with_number_pct``: percentage of titles containing a digit.
          - ``with_dollar_pct``: percentage of titles containing ``$``.
          - ``common_numbers``: up to 10 ``(number, count)`` pairs, most
            frequent first. Thousands separators are removed before
            counting, so "1,000" and "1000" are the same number.
    """
    total = len(titles)
    titles_with_digit = sum(1 for title in titles if re.search(r"\d", title))
    titles_with_dollar = sum(1 for title in titles if "$" in title)

    number_counts = Counter(
        int(token.replace(",", ""))
        for title in titles
        for token in _NUMBER_TOKEN_RE.findall(title)
    )
    return {
        "with_number_pct": titles_with_digit / total * 100,
        "with_dollar_pct": titles_with_dollar / total * 100,
        "common_numbers": number_counts.most_common(10),
    }


def analyze_words(titles: list[str], top_n: int = 20) -> list[tuple[str, int]]:
    """Return the most frequent meaningful words across all titles.

    Each title is lowercased and split into runs of ASCII letters. Tokens
    that appear in STOP_WORDS or are a single letter long are ignored.

    Args:
        titles: Titles to scan.
        top_n: Maximum number of words to return.

    Returns:
        ``(word, count)`` pairs ordered from most to least frequent.
    """
    tally = Counter()
    for title in titles:
        for token in re.findall(r"[a-zA-Z]+", title.lower()):
            if token in STOP_WORDS or len(token) <= 1:
                continue
            tally[token] += 1
    return tally.most_common(top_n)


def classify_titles(titles: list[str]) -> dict[str, list[str]]:
    """Group titles by the first title-formula category they match.

    Categories are tried in the insertion order of TITLE_PATTERNS, and every
    pattern is searched case-insensitively. A title is placed in the first
    category that has at least one matching pattern; titles that match no
    category are collected under the extra key "其他".

    Args:
        titles: Titles to classify.

    Returns:
        A dict mapping each category name (plus "其他") to the titles
        assigned to it, each list in input order.
    """
    results: dict[str, list[str]] = {cat: [] for cat in TITLE_PATTERNS}
    unmatched: list[str] = []
    for title in titles:
        for cat, patterns in TITLE_PATTERNS.items():
            if any(re.search(p, title, re.IGNORECASE) for p in patterns):
                results[cat].append(title)
                break  # each title belongs only to its first matching category
        else:
            # No category matched. Recording this here avoids rescanning
            # every category list for every title afterwards.
            unmatched.append(title)
    results["其他"] = unmatched
    return results


# Emoji and pictographic symbols: regional-indicator flags, the pictograph /
# emoticon / transport / supplemental-symbol blocks (U+1F300-U+1FAFF), and
# the BMP "Miscellaneous Symbols" and "Dingbats" blocks (U+2600-U+27BF).
_EMOJI_RE = re.compile(
    r"[\U0001F1E6-\U0001F1FF\U0001F300-\U0001FAFF\u2600-\u27BF]"
)
# A word made of two or more consecutive ASCII capitals, e.g. "WOW".
_ALL_CAPS_WORD_RE = re.compile(r"\b[A-Z]{2,}\b")


def analyze_punctuation(titles: list[str]) -> dict:
    """Count punctuation, capitalization and emoji features of the titles.

    Args:
        titles: Titles to analyze.

    Returns:
        A dict of title counts:
          - ``ends_exclamation``: titles ending with ``!``.
          - ``ends_question``: titles ending with ``?``.
          - ``ends_ellipsis``: titles ending with ``...``.
          - ``has_all_caps_word``: titles containing an all-caps word of at
            least two letters.
          - ``has_emoji``: titles containing at least one emoji or
            pictographic symbol.
    """
    return {
        "ends_exclamation": sum(1 for t in titles if t.endswith("!")),
        "ends_question": sum(1 for t in titles if t.endswith("?")),
        "ends_ellipsis": sum(1 for t in titles if t.endswith("...")),
        "has_all_caps_word": sum(1 for t in titles if _ALL_CAPS_WORD_RE.search(t)),
        "has_emoji": sum(1 for t in titles if _EMOJI_RE.search(t)),
    }


def _format_share(count: int, total: int) -> str:
    """Render a count with its percentage of ``total``, e.g. ``3 (37.5%)``."""
    pct = count / total * 100 if total else 0
    return f"{count} ({pct:.1f}%)"


def _escape_table_cell(text: str) -> str:
    """Escape ``|`` so arbitrary text cannot split a Markdown table row."""
    return text.replace("|", "\\|")


def _length_section(stats: dict, total: int) -> list[str]:
    """Build the "title length distribution" section."""
    buckets = stats["brackets"]
    return [
        "## 1. 标题长度分布\n",
        "| 指标 | 字符数 | 词数 |",
        "|------|--------|------|",
        f"| 平均 | {stats['char_avg']:.1f} | {stats['word_avg']:.1f} |",
        f"| 最短 | {stats['char_min']} | {stats['word_min']} |",
        f"| 最长 | {stats['char_max']} | {stats['word_max']} |",
        f"| 中位数 | {stats['char_median']} | - |",
        "",
        f"- 30字符以内: {_format_share(buckets[0], total)}",
        f"- 31-50字符: {_format_share(buckets[1], total)}",
        f"- 51-70字符: {_format_share(buckets[2], total)}",
        f"- 70字符以上: {_format_share(buckets[3], total)}",
        "",
    ]


def _numbers_section(stats: dict) -> list[str]:
    """Build the "number usage" section."""
    lines = [
        "## 2. 数字使用\n",
        f"- 含数字的标题: {stats['with_number_pct']:.1f}%",
        f"- 含$金额的标题: {stats['with_dollar_pct']:.1f}%",
    ]
    if stats["common_numbers"]:
        lines.append("\n常见数字:")
        lines.extend(
            f"  - {num:,}: 出现 {count} 次"
            for num, count in stats["common_numbers"]
        )
    lines.append("")
    return lines


def _words_section(top_words: list[tuple[str, int]], top_n: int) -> list[str]:
    """Build the "most frequent words" table."""
    lines = [
        f"## 3. 高频词汇 (Top {top_n})\n",
        "| 排名 | 词汇 | 出现次数 |",
        "|------|------|----------|",
    ]
    lines.extend(
        f"| {rank} | {word} | {count} |"
        for rank, (word, count) in enumerate(top_words, 1)
    )
    lines.append("")
    return lines


def _categories_section(categories: dict[str, list[str]], total: int) -> list[str]:
    """Build the "title formula" table with one example title per category."""
    lines = [
        "## 4. 标题公式分类\n",
        "| 类型 | 数量 | 占比 | 示例 |",
        "|------|------|------|------|",
    ]
    # Rows follow the order of the classification result, so the table
    # always covers exactly the categories that were produced.
    for cat, items in categories.items():
        pct = len(items) / total * 100 if total else 0
        if items:
            first = items[0]
            example = first[:50] + "..." if len(first) > 50 else first
            # Titles are free text; an unescaped "|" would add columns.
            example = _escape_table_cell(example)
        else:
            example = "-"
        lines.append(f"| {cat} | {len(items)} | {pct:.1f}% | {example} |")
    lines.append("")
    return lines


def _punctuation_section(stats: dict, total: int) -> list[str]:
    """Build the "punctuation and formatting" section."""
    return [
        "## 5. 标点与格式特征\n",
        f"- 感叹号结尾: {_format_share(stats['ends_exclamation'], total)}",
        f"- 问号结尾: {_format_share(stats['ends_question'], total)}",
        f"- 省略号结尾: {_format_share(stats['ends_ellipsis'], total)}",
        f"- 含全大写词: {_format_share(stats['has_all_caps_word'], total)}",
        f"- 含表情符号: {_format_share(stats['has_emoji'], total)}",
        "",
    ]


def _insights_section(
    number_stats: dict,
    length_stats: dict,
    categories: dict[str, list[str]],
    total: int,
) -> list[str]:
    """Build the automatically derived "key insights" section."""
    lines = ["## 6. 关键洞察\n"]
    if number_stats["with_number_pct"] > 60:
        lines.append("- **数字驱动**: 超过60%的标题使用数字，数字是核心吸引力元素")
    if number_stats["with_dollar_pct"] > 30:
        lines.append("- **金钱叙事**: 大量使用$金额，制造价值感和规模感")

    # The catch-all bucket is not a formula, so it cannot be "dominant".
    formula_counts = [
        (cat, len(items)) for cat, items in categories.items() if cat != "其他"
    ]
    if formula_counts:
        dominant_name, dominant_count = max(formula_counts, key=lambda pair: pair[1])
        # Only claim a dominant formula when at least one title matched one.
        if dominant_count > 0:
            lines.append(
                f"- **主导公式**: 「{dominant_name}」是使用最多的标题类型 ({dominant_count}/{total})"
            )

    if length_stats["char_avg"] < 50:
        lines.append("- **简洁风格**: 平均标题长度不到50字符，倾向短标题")
    else:
        lines.append("- **详细风格**: 平均标题超过50字符，倾向描述性标题")
    return lines


def generate_report(titles: list[str], top_n: int) -> str:
    """Generate the Markdown analysis report for a list of titles.

    Runs every analysis (length, numbers, frequent words, formula
    classification, punctuation) and renders one report section per
    analysis, followed by a short list of derived insights.

    Args:
        titles: Titles to analyze; expected to be non-empty.
        top_n: Number of frequent words to request and to show in the
            section heading.

    Returns:
        The complete report as a single Markdown string.
    """
    total = len(titles)
    length_stats = analyze_length(titles)
    number_stats = analyze_numbers(titles)
    top_words = analyze_words(titles, top_n)
    categories = classify_titles(titles)
    punct_stats = analyze_punctuation(titles)

    lines = [
        "# YouTube标题分析报告\n",
        f"共分析 **{total}** 个标题\n",
    ]
    lines.extend(_length_section(length_stats, total))
    lines.extend(_numbers_section(number_stats))
    lines.extend(_words_section(top_words, top_n))
    lines.extend(_categories_section(categories, total))
    lines.extend(_punctuation_section(punct_stats, total))
    lines.extend(_insights_section(number_stats, length_stats, categories, total))

    return "\n".join(lines)


def main():
    """Command-line entry point: parse arguments, analyze, emit the report.

    Reads the titles file given as the positional argument, builds the
    Markdown report, and either writes it to the ``-o/--output`` path or
    prints it to stdout.

    Raises:
        SystemExit: With code 2 for invalid arguments (including a
            non-positive ``--top``), or code 1 when the report file cannot
            be written.
    """
    parser = argparse.ArgumentParser(
        description="分析YouTube频道视频标题模式",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="示例:\n  python analyze_titles.py mrbeast_titles.txt\n  python analyze_titles.py titles.txt -o report.md --top 30",
    )
    parser.add_argument("input", help="标题文本文件（每行一个标题）")
    parser.add_argument("-o", "--output", help="输出报告文件路径（默认打印到终端）")
    parser.add_argument("--top", type=int, default=20, help="显示的高频词数量（默认20）")
    args = parser.parse_args()

    # A zero or negative count would yield a "Top -3" heading over an empty
    # table, so reject it up front with a normal usage error.
    if args.top < 1:
        parser.error("--top 必须是正整数")

    titles = load_titles(args.input)
    report = generate_report(titles, args.top)

    if args.output:
        try:
            Path(args.output).write_text(report, encoding="utf-8")
        except OSError as exc:
            # E.g. missing parent directory or no write permission.
            print(f"[ERROR] 无法写入报告: {args.output} ({exc})", file=sys.stderr)
            sys.exit(1)
        print(f"[OK] 报告已保存到: {args.output}")
    else:
        print(report)


if __name__ == "__main__":
    main()