srt_to_transcript.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
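#!/usr/bin/env python3
"""
Clean an SRT/VTT subtitle file into a plain-text transcript.

Strips timestamps, cue numbers, repeated lines and HTML tags, and writes
text that can be read directly.

Usage:
    python3 srt_to_transcript.py input.srt [output.txt]
    python3 srt_to_transcript.py input.vtt [output.txt]

If no output file is given, the result is written to input_transcript.txt
(the input file's stem plus "_transcript.txt", next to the input file).
"""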
  10. import sys
  11. import re
  12. from pathlib import Path
  13. def clean_srt(content: str) -> str:
  14. """清洗SRT格式字幕"""
  15. lines = content.strip().split('\n')
  16. texts = []
  17. for line in lines:
  18. line = line.strip()
  19. # 跳过序号行(纯数字)
  20. if re.match(r'^\d+$', line):
  21. continue
  22. # 跳过时间戳行
  23. if re.match(r'\d{2}:\d{2}:\d{2}', line):
  24. continue
  25. # 跳过空行
  26. if not line:
  27. continue
  28. # 去除HTML标签
  29. line = re.sub(r'<[^>]+>', '', line)
  30. # 去除VTT的position标记
  31. line = re.sub(r'align:.*$|position:.*$', '', line).strip()
  32. if line:
  33. texts.append(line)
  34. # 去重(自动字幕常有连续重复行)
  35. deduped = []
  36. for text in texts:
  37. if not deduped or text != deduped[-1]:
  38. deduped.append(text)
  39. # 合并成段落:连续的短句合并,遇到句末标点或长停顿换行
  40. result = []
  41. current = []
  42. for text in deduped:
  43. current.append(text)
  44. # 如果当前累积文本够长或遇到句末标点,形成一个段落
  45. joined = ' '.join(current)
  46. if len(joined) > 200 or re.search(r'[。!?.!?]$', text):
  47. result.append(joined)
  48. current = []
  49. if current:
  50. result.append(' '.join(current))
  51. return '\n\n'.join(result)
  52. def clean_vtt(content: str) -> str:
  53. """清洗VTT格式字幕(先去掉VTT头部,然后按SRT逻辑处理)"""
  54. # 去掉WEBVTT头部
  55. content = re.sub(r'^WEBVTT.*?\n\n', '', content, flags=re.DOTALL)
  56. # 去掉NOTE块
  57. content = re.sub(r'NOTE.*?\n\n', '', content, flags=re.DOTALL)
  58. return clean_srt(content)
  59. def main():
  60. if len(sys.argv) < 2:
  61. print("用法: python3 srt_to_transcript.py <input.srt|input.vtt> [output.txt]")
  62. sys.exit(1)
  63. input_path = Path(sys.argv[1])
  64. if not input_path.exists():
  65. print(f"❌ 文件不存在: {input_path}")
  66. sys.exit(1)
  67. # 默认输出文件名
  68. if len(sys.argv) >= 3:
  69. output_path = Path(sys.argv[2])
  70. else:
  71. output_path = input_path.parent / f"{input_path.stem}_transcript.txt"
  72. # 读取并检测格式
  73. content = input_path.read_text(encoding='utf-8')
  74. if input_path.suffix.lower() == '.vtt' or content.startswith('WEBVTT'):
  75. transcript = clean_vtt(content)
  76. else:
  77. transcript = clean_srt(content)
  78. output_path.write_text(transcript, encoding='utf-8')
  79. # 统计
  80. word_count = len(transcript)
  81. line_count = transcript.count('\n') + 1
  82. print(f"✅ 转换完成: {output_path}")
  83. print(f" 字数: {word_count} 段落数: {line_count}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()