Kaynağa Gözat

add scripts/ directory with utility scripts, update SKILL.md paths

Move py/sh scripts from references/ to scripts/ for better organization.
Update all script path references in SKILL.md accordingly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
alchain 2 ay önce
ebeveyn
işleme
43ec5b9ad4

+ 4 - 4
SKILL.md

@@ -253,18 +253,18 @@ spawn subagent时,用以下结构给任务(以Agent 1著作为例):
 #### 工具辅助(如可用)
 - 书籍:Z-Library/LibGen搜索下载 → 存入 `sources/books/`
 - 视频字幕获取(已提供脚本,直接调用):
-  - **Step 1 下载字幕**:`bash [skill目录]/references/download_subtitles.sh <YouTube_URL> [输出目录]`
+  - **Step 1 下载字幕**:`bash [skill目录]/scripts/download_subtitles.sh <YouTube_URL> [输出目录]`
     - 自动优先人工字幕 → 中文 → 英文 → 自动生成字幕
     - 输出SRT/VTT文件到指定目录
-  - **Step 2 清洗为纯文本**:`python3 [skill目录]/references/srt_to_transcript.py <input.srt> [output.txt]`
+  - **Step 2 清洗为纯文本**:`python3 [skill目录]/scripts/srt_to_transcript.py <input.srt> [output.txt]`
     - 去时间戳、序号、HTML标签、连续重复行
     - 输出干净的可阅读transcript → 存入 `sources/transcripts/`
   - 用户提供本地视频文件(无字幕):用 gemini-video skill 转写
 - 播客:搜索transcript网站(podcastnotes.org等)
-- 调研摘要生成(Phase 1.5用):`python3 [skill目录]/references/merge_research.py <skill目录>`
+- 调研摘要生成(Phase 1.5用):`python3 [skill目录]/scripts/merge_research.py <skill目录>`
   - 自动扫描 `references/research/01-06.md`,统计来源数、一手/二手占比、关键发现
   - 输出Phase 1.5检查点的markdown表格,无需手动统计
-- 质量自检(Phase 4用):`python3 [skill目录]/references/quality_check.py <SKILL.md路径>`
+- 质量自检(Phase 4用):`python3 [skill目录]/scripts/quality_check.py <SKILL.md路径>`
   - 自动检查6项通过标准:心智模型数量、局限性、表达DNA、诚实边界、内在张力、一手来源占比
   - 输出逐项PASS/FAIL和总结
 

+ 55 - 0
scripts/download_subtitles.sh

@@ -0,0 +1,55 @@
+#!/bin/bash
+# Download subtitles for a YouTube video with yt-dlp.
+# Usage: ./download_subtitles.sh <YouTube_URL> [output_dir]
+# Manual subtitles are tried first, auto-generated subtitles last.
+# Language priority: Chinese, then English.
+
+set -e
+
+URL="$1"
+OUTPUT_DIR="${2:-.}"
+
+if [ -z "$URL" ]; then
+    echo "用法: ./download_subtitles.sh <YouTube_URL> [输出目录]"
+    exit 1
+fi
+
+# Fail early with a clear message instead of silently falling through every attempt.
+if ! command -v yt-dlp >/dev/null 2>&1; then
+    echo "❌ 未找到 yt-dlp,请先安装"
+    exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR"
+
+# Timestamp reference: only subtitle files modified after this marker count as
+# the result of the current attempt, so files that already sat in OUTPUT_DIR are
+# never reported as a fresh download. The marker is removed on exit.
+MARKER="$(mktemp)"
+trap 'rm -f "$MARKER"' EXIT
+
+# Run one yt-dlp subtitle attempt.
+#   $1 - yt-dlp flag selecting the subtitle kind (--write-subs / --write-auto-subs)
+#   $2 - comma-separated language list passed to --sub-langs
+# On success sets FOUND to a downloaded .srt/.vtt path and returns 0.
+# The file check decides success because yt-dlp may exit 0 without writing
+# any subtitle when none of the requested languages exist.
+# NOTE: -newer is a strict comparison; on filesystems with coarse timestamps a
+# file written within the same tick as the marker would be missed.
+try_download() {
+    touch "$MARKER"
+    yt-dlp "$1" --sub-langs "$2" --sub-format "srt/vtt" --skip-download \
+        -o "$OUTPUT_DIR/%(title)s" "$URL" 2>/dev/null || return 1
+    FOUND=$(find "$OUTPUT_DIR" -type f \( -name "*.srt" -o -name "*.vtt" \) -newer "$MARKER" 2>/dev/null | head -1)
+    [ -n "$FOUND" ]
+}
+
+echo ">>> 检查可用字幕..."
+yt-dlp --list-subs --no-download "$URL" 2>/dev/null | tail -20
+
+echo ""
+echo ">>> 尝试下载人工字幕(中文优先)..."
+
+# Attempt 1: manual Chinese subtitles
+if try_download --write-subs "zh-Hans,zh-Hant,zh,zh-CN,zh-TW"; then
+    echo "✅ 下载成功: $FOUND"
+    exit 0
+fi
+
+# Attempt 2: manual English subtitles
+echo ">>> 无中文人工字幕,尝试英文..."
+if try_download --write-subs "en,en-US,en-GB"; then
+    echo "✅ 下载成功: $FOUND"
+    exit 0
+fi
+
+# Attempt 3: auto-generated subtitles, Chinese before English so the
+# documented language priority also holds for the fallback.
+echo ">>> 无人工字幕,尝试自动生成字幕..."
+for AUTO_LANGS in "zh-Hans,zh" "en"; do
+    if try_download --write-auto-subs "$AUTO_LANGS"; then
+        echo "✅ 自动字幕下载成功: $FOUND"
+        exit 0
+    fi
+done
+
+echo "❌ 未找到任何可用字幕"
+exit 1

+ 150 - 0
scripts/merge_research.py

@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+合并6个Agent的调研结果,生成Phase 1.5调研Review检查点的摘要表格。
+扫描 references/research/ 目录下的01-06 md文件,统计每个维度的来源数量、
+一手/二手占比、关键发现。
+
+用法:
+    python3 merge_research.py <skill目录路径>
+
+示例:
+    python3 merge_research.py .claude/skills/elon-musk-perspective
+
+输出: 打印markdown格式的摘要表格到stdout
+"""
+
+import sys
+import re
+from pathlib import Path
+
+AGENTS = {  # research file stem (".md" is appended when reading) -> label shown in the summary table
+    '01-writings': '著作',
+    '02-conversations': '对话',
+    '03-expression-dna': '表达',
+    '04-external-views': '他者',
+    '05-decisions': '决策',
+    '06-timeline': '时间线',
+}
+
+
+def count_sources(content: str) -> dict:
+    """统计来源数量和一手/二手占比"""
+    # 计算URL数量作为来源数
+    urls = re.findall(r'https?://[^\s\)]+', content)
+
+    # 检测一手/二手标记
+    primary_markers = len(re.findall(r'一手|primary|本人|原文|原始|直接引用', content, re.IGNORECASE))
+    secondary_markers = len(re.findall(r'二手|secondary|转述|总结|评论|分析', content, re.IGNORECASE))
+
+    return {
+        'url_count': len(urls),
+        'unique_urls': len(set(urls)),
+        'primary_markers': primary_markers,
+        'secondary_markers': secondary_markers,
+    }
+
+
+def extract_key_findings(content: str, max_items: int = 3) -> list[str]:
+    """提取关键发现(取前几个二级标题或加粗项)"""
+    # 尝试提取##标题
+    headings = re.findall(r'^##\s+(.+)$', content, re.MULTILINE)
+    if headings:
+        return headings[:max_items]
+
+    # fallback: 提取加粗项
+    bolds = re.findall(r'\*\*(.+?)\*\*', content)
+    if bolds:
+        return bolds[:max_items]
+
+    # fallback: 取前3个非空行
+    lines = [l.strip() for l in content.split('\n') if l.strip() and not l.startswith('#')]
+    return [l[:50] + '...' if len(l) > 50 else l for l in lines[:max_items]]
+
+
+def find_contradictions(files: dict[str, str]) -> list[str]:
+    """简单检测跨文件矛盾(同一关键词出现不同判断)"""
+    contradictions = []
+    # 检测「但是」「然而」「相反」「矛盾」等矛盾标记
+    for name, content in files.items():
+        matches = re.findall(r'(?:矛盾|相反|但实际上|然而.*?不同|争议).{0,100}', content)
+        for m in matches:
+            contradictions.append(f"{AGENTS.get(name, name)}: {m[:80]}")
+    return contradictions[:5]  # 最多5条
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("用法: python3 merge_research.py <skill目录路径>")
+        sys.exit(1)
+
+    skill_dir = Path(sys.argv[1])
+    research_dir = skill_dir / 'references' / 'research'
+
+    if not research_dir.exists():
+        print(f"❌ 目录不存在: {research_dir}")
+        sys.exit(1)
+
+    files = {}
+    rows = []
+    total_sources = 0
+    total_primary = 0
+    total_secondary = 0
+    missing = []
+
+    for key, label in AGENTS.items():
+        md_file = research_dir / f"{key}.md"
+        if not md_file.exists():
+            missing.append(label)
+            rows.append(f"│ {label:<12} │ {'❌ 缺失':<8} │ {'—':<24} │")
+            continue
+
+        content = md_file.read_text(encoding='utf-8')
+        files[key] = content
+        stats = count_sources(content)
+        findings = extract_key_findings(content)
+
+        total_sources += stats['unique_urls']
+        total_primary += stats['primary_markers']
+        total_secondary += stats['secondary_markers']
+
+        findings_str = ', '.join(findings) if findings else '—'
+        if len(findings_str) > 40:
+            findings_str = findings_str[:37] + '...'
+
+        rows.append(f"│ {label:<12} │ {stats['unique_urls']:<8} │ {findings_str:<24} │")
+
+    # 矛盾检测
+    contradictions = find_contradictions(files)
+
+    # 输出
+    print("┌──────────────┬──────────┬──────────────────────────┐")
+    print("│ Agent        │ 来源数量  │ 关键发现                  │")
+    print("├──────────────┼──────────┼──────────────────────────┤")
+    for row in rows:
+        print(row)
+    print("├──────────────┼──────────┼──────────────────────────┤")
+
+    primary_ratio = f"{total_primary}/{total_primary + total_secondary}" if (total_primary + total_secondary) > 0 else "未标记"
+    print(f"│ 总来源数      │ {total_sources:<8} │ 一手占比: {primary_ratio:<15} │")
+
+    if contradictions:
+        print(f"│ 矛盾点        │ {len(contradictions)}处      │ {contradictions[0][:24]:<24} │")
+    else:
+        print(f"│ 矛盾点        │ 0处      │ {'—':<24} │")
+
+    if missing:
+        print(f"│ 信息不足维度   │ {len(missing)}个      │ {', '.join(missing):<24} │")
+    else:
+        print(f"│ 信息不足维度   │ 无       │ {'—':<24} │")
+
+    print("└──────────────┴──────────┴──────────────────────────┘")
+
+    # 总结
+    if total_sources < 10:
+        print("\n⚠️ 总来源数 <10,建议降低期望或补充调研")
+    if missing:
+        print(f"\n⚠️ 缺失维度: {', '.join(missing)},建议补充或在诚实边界中标注")
+
+
+if __name__ == '__main__':  # run the CLI only when executed as a script
+    main()

+ 152 - 0
scripts/quality_check.py

@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""
+自动检查生成的SKILL.md是否通过Phase 4质量标准。
+对照通过标准表格逐项检查,输出通过/不通过和具体原因。
+
+用法:
+    python3 quality_check.py <SKILL.md路径>
+
+示例:
+    python3 quality_check.py .claude/skills/elon-musk-perspective/SKILL.md
+"""
+
+import sys
+import re
+from pathlib import Path
+
+
+def check_mental_models(content: str) -> tuple[bool, str]:
+    """检查心智模型数量(3-7个)"""
+    # 匹配 ### 模型N: 或 ### N. 等模式
+    models = re.findall(r'^###\s+(?:模型|Model|心智模型)\s*\d', content, re.MULTILINE)
+    if not models:
+        # fallback: 数「### 」开头的行在心智模型section中
+        in_section = False
+        count = 0
+        for line in content.split('\n'):
+            if re.match(r'^##\s+.*心智模型|Mental Model', line, re.IGNORECASE):
+                in_section = True
+                continue
+            if in_section and re.match(r'^##\s+', line) and '心智模型' not in line:
+                break
+            if in_section and re.match(r'^###\s+', line):
+                count += 1
+        if count > 0:
+            passed = 3 <= count <= 7
+            return passed, f"{count}个心智模型 {'✅' if passed else '❌ (应为3-7个)'}"
+
+    count = len(models)
+    if count == 0:
+        return False, "未检测到心智模型section"
+    passed = 3 <= count <= 7
+    return passed, f"{count}个心智模型 {'✅' if passed else '❌ (应为3-7个)'}"
+
+
+def check_limitations(content: str) -> tuple[bool, str]:
+    """检查每个模型是否有局限性"""
+    has_limitation = bool(re.search(r'局限|失效|不适用|盲区|limitation|blind spot', content, re.IGNORECASE))
+    return has_limitation, "有局限性标注 ✅" if has_limitation else "❌ 未找到局限性描述"
+
+
+def check_expression_dna(content: str) -> tuple[bool, str]:
+    """检查表达DNA辨识度"""
+    dna_section = bool(re.search(r'表达DNA|Expression DNA|表达风格', content, re.IGNORECASE))
+    if not dna_section:
+        return False, "❌ 未找到表达DNA section"
+
+    # 检查是否有具体的风格描述(句式、词汇等)
+    style_markers = len(re.findall(r'句式|词汇|语气|幽默|节奏|确定性|引用|口头禅', content))
+    passed = style_markers >= 3
+    return passed, f"表达DNA特征: {style_markers}项 {'✅' if passed else '❌ (应≥3项)'}"
+
+
+def check_honest_boundary(content: str) -> tuple[bool, str]:
+    """检查诚实边界(至少3条)"""
+    # 找诚实边界section
+    boundary_match = re.search(r'(?:##\s+.*诚实边界|## Honest Boundary)(.*?)(?=\n##\s|\Z)', content, re.DOTALL | re.IGNORECASE)
+    if not boundary_match:
+        return False, "❌ 未找到诚实边界section"
+
+    boundary_text = boundary_match.group(1)
+    # 计算列表项
+    items = re.findall(r'^[-*]\s+', boundary_text, re.MULTILINE)
+    count = len(items)
+    passed = count >= 3
+    return passed, f"诚实边界: {count}条 {'✅' if passed else '❌ (应≥3条)'}"
+
+
+def check_tensions(content: str) -> tuple[bool, str]:
+    """检查内在张力(至少2对)"""
+    tension_markers = len(re.findall(r'张力|矛盾|tension|paradox|一方面.*另一方面|既.*又', content, re.IGNORECASE))
+    passed = tension_markers >= 2
+    return passed, f"内在张力: {tension_markers}处 {'✅' if passed else '❌ (应≥2处)'}"
+
+
+def check_primary_sources(content: str) -> tuple[bool, str]:
+    """检查一手来源占比"""
+    # 找调研来源section
+    source_section = re.search(r'(?:##\s+.*来源|## Source|## Reference)(.*?)(?=\n##\s|\Z)', content, re.DOTALL | re.IGNORECASE)
+    if not source_section:
+        return True, "未找到来源section(跳过检查)"
+
+    source_text = source_section.group(1)
+    primary = len(re.findall(r'一手|primary|本人著作|原始', source_text, re.IGNORECASE))
+    secondary = len(re.findall(r'二手|secondary|转述|评论', source_text, re.IGNORECASE))
+    total = primary + secondary
+    if total == 0:
+        return True, "未标记来源类型(跳过检查)"
+
+    ratio = primary / total
+    passed = ratio > 0.5
+    return passed, f"一手来源占比: {primary}/{total} ({ratio:.0%}) {'✅' if passed else '❌ (应>50%)'}"
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("用法: python3 quality_check.py <SKILL.md路径>")
+        sys.exit(1)
+
+    skill_path = Path(sys.argv[1])
+    if not skill_path.exists():
+        print(f"❌ 文件不存在: {skill_path}")
+        sys.exit(1)
+
+    content = skill_path.read_text(encoding='utf-8')
+
+    checks = [
+        ("心智模型数量", check_mental_models),
+        ("模型局限性", check_limitations),
+        ("表达DNA辨识度", check_expression_dna),
+        ("诚实边界", check_honest_boundary),
+        ("内在张力", check_tensions),
+        ("一手来源占比", check_primary_sources),
+    ]
+
+    print(f"质量检查: {skill_path.name}")
+    print("=" * 50)
+
+    passed_count = 0
+    total = len(checks)
+
+    for name, check_fn in checks:
+        passed, detail = check_fn(content)
+        status = "✅ PASS" if passed else "❌ FAIL"
+        print(f"  {name:<12} {status}  {detail}")
+        if passed:
+            passed_count += 1
+
+    print("=" * 50)
+    print(f"结果: {passed_count}/{total} 通过")
+
+    if passed_count == total:
+        print("🎉 全部通过,可以交付")
+    elif passed_count >= total - 1:
+        print("⚠️ 基本通过,建议修复不通过项后交付")
+    else:
+        print("❌ 多项不通过,建议回到Phase 2迭代")
+
+    sys.exit(0 if passed_count == total else 1)
+
+
+if __name__ == '__main__':  # run the CLI only when executed as a script
+    main()

+ 108 - 0
scripts/srt_to_transcript.py

@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+将SRT/VTT字幕文件清洗为干净的纯文本transcript。
+去除时间戳、序号、重复行、HTML标签,输出可直接阅读的文本。
+
+用法:
+    python3 srt_to_transcript.py input.srt [output.txt]
+    python3 srt_to_transcript.py input.vtt [output.txt]
+
+如果不指定输出文件,默认输出到 input_transcript.txt
+"""
+
+import sys
+import re
+from pathlib import Path
+
+
+def clean_srt(content: str) -> str:
+    """清洗SRT格式字幕"""
+    lines = content.strip().split('\n')
+    texts = []
+
+    for line in lines:
+        line = line.strip()
+        # 跳过序号行(纯数字)
+        if re.match(r'^\d+$', line):
+            continue
+        # 跳过时间戳行
+        if re.match(r'\d{2}:\d{2}:\d{2}', line):
+            continue
+        # 跳过空行
+        if not line:
+            continue
+        # 去除HTML标签
+        line = re.sub(r'<[^>]+>', '', line)
+        # 去除VTT的position标记
+        line = re.sub(r'align:.*$|position:.*$', '', line).strip()
+        if line:
+            texts.append(line)
+
+    # 去重(自动字幕常有连续重复行)
+    deduped = []
+    for text in texts:
+        if not deduped or text != deduped[-1]:
+            deduped.append(text)
+
+    # 合并成段落:连续的短句合并,遇到句末标点或长停顿换行
+    result = []
+    current = []
+
+    for text in deduped:
+        current.append(text)
+        # 如果当前累积文本够长或遇到句末标点,形成一个段落
+        joined = ' '.join(current)
+        if len(joined) > 200 or re.search(r'[。!?.!?]$', text):
+            result.append(joined)
+            current = []
+
+    if current:
+        result.append(' '.join(current))
+
+    return '\n\n'.join(result)
+
+
+def clean_vtt(content: str) -> str:
+    """清洗VTT格式字幕(先去掉VTT头部,然后按SRT逻辑处理)"""
+    # 去掉WEBVTT头部
+    content = re.sub(r'^WEBVTT.*?\n\n', '', content, flags=re.DOTALL)
+    # 去掉NOTE块
+    content = re.sub(r'NOTE.*?\n\n', '', content, flags=re.DOTALL)
+    return clean_srt(content)
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("用法: python3 srt_to_transcript.py <input.srt|input.vtt> [output.txt]")
+        sys.exit(1)
+
+    input_path = Path(sys.argv[1])
+    if not input_path.exists():
+        print(f"❌ 文件不存在: {input_path}")
+        sys.exit(1)
+
+    # 默认输出文件名
+    if len(sys.argv) >= 3:
+        output_path = Path(sys.argv[2])
+    else:
+        output_path = input_path.parent / f"{input_path.stem}_transcript.txt"
+
+    # 读取并检测格式
+    content = input_path.read_text(encoding='utf-8')
+
+    if input_path.suffix.lower() == '.vtt' or content.startswith('WEBVTT'):
+        transcript = clean_vtt(content)
+    else:
+        transcript = clean_srt(content)
+
+    output_path.write_text(transcript, encoding='utf-8')
+
+    # 统计
+    word_count = len(transcript)
+    line_count = transcript.count('\n') + 1
+    print(f"✅ 转换完成: {output_path}")
+    print(f"   字数: {word_count}  段落数: {line_count}")
+
+
+if __name__ == '__main__':  # run the CLI only when executed as a script
+    main()