| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- #!/usr/bin/env python3
- """
- 将SRT/VTT字幕文件清洗为干净的纯文本transcript。
- 去除时间戳、序号、重复行、HTML标签,输出可直接阅读的文本。
- 用法:
- python3 srt_to_transcript.py input.srt [output.txt]
- python3 srt_to_transcript.py input.vtt [output.txt]
- 如果不指定输出文件,默认输出到 input_transcript.txt
- """
- import sys
- import re
- from pathlib import Path
- def clean_srt(content: str) -> str:
- """清洗SRT格式字幕"""
- lines = content.strip().split('\n')
- texts = []
- for line in lines:
- line = line.strip()
- # 跳过序号行(纯数字)
- if re.match(r'^\d+$', line):
- continue
- # 跳过时间戳行
- if re.match(r'\d{2}:\d{2}:\d{2}', line):
- continue
- # 跳过空行
- if not line:
- continue
- # 去除HTML标签
- line = re.sub(r'<[^>]+>', '', line)
- # 去除VTT的position标记
- line = re.sub(r'align:.*$|position:.*$', '', line).strip()
- if line:
- texts.append(line)
- # 去重(自动字幕常有连续重复行)
- deduped = []
- for text in texts:
- if not deduped or text != deduped[-1]:
- deduped.append(text)
- # 合并成段落:连续的短句合并,遇到句末标点或长停顿换行
- result = []
- current = []
- for text in deduped:
- current.append(text)
- # 如果当前累积文本够长或遇到句末标点,形成一个段落
- joined = ' '.join(current)
- if len(joined) > 200 or re.search(r'[。!?.!?]$', text):
- result.append(joined)
- current = []
- if current:
- result.append(' '.join(current))
- return '\n\n'.join(result)
- def clean_vtt(content: str) -> str:
- """清洗VTT格式字幕(先去掉VTT头部,然后按SRT逻辑处理)"""
- # 去掉WEBVTT头部
- content = re.sub(r'^WEBVTT.*?\n\n', '', content, flags=re.DOTALL)
- # 去掉NOTE块
- content = re.sub(r'NOTE.*?\n\n', '', content, flags=re.DOTALL)
- return clean_srt(content)
- def main():
- if len(sys.argv) < 2:
- print("用法: python3 srt_to_transcript.py <input.srt|input.vtt> [output.txt]")
- sys.exit(1)
- input_path = Path(sys.argv[1])
- if not input_path.exists():
- print(f"❌ 文件不存在: {input_path}")
- sys.exit(1)
- # 默认输出文件名
- if len(sys.argv) >= 3:
- output_path = Path(sys.argv[2])
- else:
- output_path = input_path.parent / f"{input_path.stem}_transcript.txt"
- # 读取并检测格式
- content = input_path.read_text(encoding='utf-8')
- if input_path.suffix.lower() == '.vtt' or content.startswith('WEBVTT'):
- transcript = clean_vtt(content)
- else:
- transcript = clean_srt(content)
- output_path.write_text(transcript, encoding='utf-8')
- # 统计
- word_count = len(transcript)
- line_count = transcript.count('\n') + 1
- print(f"✅ 转换完成: {output_path}")
- print(f" 字数: {word_count} 段落数: {line_count}")
- if __name__ == '__main__':
- main()
|