1
0

merge_research.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #!/usr/bin/env python3
  2. """
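Merge the research results of the six agents into the summary table used at
the Phase 1.5 research review checkpoint.

Scans the 01-06 markdown files under references/research/ and reports, per
dimension, the number of unique source URLs and the key findings, plus the
overall primary-source share, detected contradiction markers and missing
dimensions.

Usage:
    python3 merge_research.py <skill directory path>
Example:
    python3 merge_research.py .claude/skills/elon-musk-perspective
Output: prints a box-drawing summary table (plain text, not Markdown table
syntax) to stdout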
  11. """
  12. import sys
  13. import re
  14. from pathlib import Path
  15. AGENTS = {
  16. '01-writings': '著作',
  17. '02-conversations': '对话',
  18. '03-expression-dna': '表达',
  19. '04-external-views': '他者',
  20. '05-decisions': '决策',
  21. '06-timeline': '时间线',
  22. }
  23. def count_sources(content: str) -> dict:
  24. """统计来源数量和一手/二手占比"""
  25. # 计算URL数量作为来源数
  26. urls = re.findall(r'https?://[^\s\)]+', content)
  27. # 检测一手/二手标记
  28. primary_markers = len(re.findall(r'一手|primary|本人|原文|原始|直接引用', content, re.IGNORECASE))
  29. secondary_markers = len(re.findall(r'二手|secondary|转述|总结|评论|分析', content, re.IGNORECASE))
  30. return {
  31. 'url_count': len(urls),
  32. 'unique_urls': len(set(urls)),
  33. 'primary_markers': primary_markers,
  34. 'secondary_markers': secondary_markers,
  35. }
  36. def extract_key_findings(content: str, max_items: int = 3) -> list[str]:
  37. """提取关键发现(取前几个二级标题或加粗项)"""
  38. # 尝试提取##标题
  39. headings = re.findall(r'^##\s+(.+)$', content, re.MULTILINE)
  40. if headings:
  41. return headings[:max_items]
  42. # fallback: 提取加粗项
  43. bolds = re.findall(r'\*\*(.+?)\*\*', content)
  44. if bolds:
  45. return bolds[:max_items]
  46. # fallback: 取前3个非空行
  47. lines = [l.strip() for l in content.split('\n') if l.strip() and not l.startswith('#')]
  48. return [l[:50] + '...' if len(l) > 50 else l for l in lines[:max_items]]
  49. def find_contradictions(files: dict[str, str]) -> list[str]:
  50. """简单检测跨文件矛盾(同一关键词出现不同判断)"""
  51. contradictions = []
  52. # 检测「但是」「然而」「相反」「矛盾」等矛盾标记
  53. for name, content in files.items():
  54. matches = re.findall(r'(?:矛盾|相反|但实际上|然而.*?不同|争议).{0,100}', content)
  55. for m in matches:
  56. contradictions.append(f"{AGENTS.get(name, name)}: {m[:80]}")
  57. return contradictions[:5] # 最多5条
  58. def main():
  59. if len(sys.argv) < 2:
  60. print("用法: python3 merge_research.py <skill目录路径>")
  61. sys.exit(1)
  62. skill_dir = Path(sys.argv[1])
  63. research_dir = skill_dir / 'references' / 'research'
  64. if not research_dir.exists():
  65. print(f"❌ 目录不存在: {research_dir}")
  66. sys.exit(1)
  67. files = {}
  68. rows = []
  69. total_sources = 0
  70. total_primary = 0
  71. total_secondary = 0
  72. missing = []
  73. for key, label in AGENTS.items():
  74. md_file = research_dir / f"{key}.md"
  75. if not md_file.exists():
  76. missing.append(label)
  77. rows.append(f"│ {label:<12} │ {'❌ 缺失':<8} │ {'—':<24} │")
  78. continue
  79. content = md_file.read_text(encoding='utf-8')
  80. files[key] = content
  81. stats = count_sources(content)
  82. findings = extract_key_findings(content)
  83. total_sources += stats['unique_urls']
  84. total_primary += stats['primary_markers']
  85. total_secondary += stats['secondary_markers']
  86. findings_str = ', '.join(findings) if findings else '—'
  87. if len(findings_str) > 40:
  88. findings_str = findings_str[:37] + '...'
  89. rows.append(f"│ {label:<12} │ {stats['unique_urls']:<8} │ {findings_str:<24} │")
  90. # 矛盾检测
  91. contradictions = find_contradictions(files)
  92. # 输出
  93. print("┌──────────────┬──────────┬──────────────────────────┐")
  94. print("│ Agent │ 来源数量 │ 关键发现 │")
  95. print("├──────────────┼──────────┼──────────────────────────┤")
  96. for row in rows:
  97. print(row)
  98. print("├──────────────┼──────────┼──────────────────────────┤")
  99. primary_ratio = f"{total_primary}/{total_primary + total_secondary}" if (total_primary + total_secondary) > 0 else "未标记"
  100. print(f"│ 总来源数 │ {total_sources:<8} │ 一手占比: {primary_ratio:<15} │")
  101. if contradictions:
  102. print(f"│ 矛盾点 │ {len(contradictions)}处 │ {contradictions[0][:24]:<24} │")
  103. else:
  104. print(f"│ 矛盾点 │ 0处 │ {'—':<24} │")
  105. if missing:
  106. print(f"│ 信息不足维度 │ {len(missing)}个 │ {', '.join(missing):<24} │")
  107. else:
  108. print(f"│ 信息不足维度 │ 无 │ {'—':<24} │")
  109. print("└──────────────┴──────────┴──────────────────────────┘")
  110. # 总结
  111. if total_sources < 10:
  112. print("\n⚠️ 总来源数 <10,建议降低期望或补充调研")
  113. if missing:
  114. print(f"\n⚠️ 缺失维度: {', '.join(missing)},建议补充或在诚实边界中标注")
  115. if __name__ == '__main__':
  116. main()