1
0

narrate-pipeline.mjs 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. #!/usr/bin/env node
  2. /**
  3. * narrate-pipeline.mjs · L2 长解说总指挥
  4. *
  5. * 输入:markdown 解说稿(## scene-id 分段,[[cue:id]] 标关键句)
  6. * 输出:voiceover.mp3(拼接好的整段人声)+ timeline.json(每段 start/end + cues 绝对时间)
  7. *
  8. * 用法:
  9. * node scripts/narrate-pipeline.mjs --script demo.md --out-dir _narration_demo
  10. *
  11. * 解说稿格式:
  12. * ---
  13. * title: 什么是 LLM
  14. * voice: S_JSdgdWk22 # 可选,不填走 .env
  15. * speed: 1.0 # 可选
  16. * gap: 0.3 # 段间静音秒数,默认 0.3
  17. * ---
  18. *
  19. * ## intro
  20. * 大家好,我是花叔。今天我们 5 分钟讲清楚 LLM 是什么。
  21. *
  22. * ## what-is
  23. * LLM 全称 Large Language Model,[[cue:bigmodel]]它是一个有几千亿参数的神经网络。
  24. * 本质是一个文字接龙的预测器。
  25. *
  26. * 输出文件结构(out-dir 下):
  27. * audio/
  28. * intro.mp3
  29. * what-is.mp3
  30. * voiceover.mp3 拼接全部 scene 的整段人声
  31. * timeline.json schema 见 references/voiceover-pipeline.md
  32. *
  33. * 依赖:tts-doubao.mjs、ffmpeg、ffprobe
  34. */
  35. import fs from 'node:fs';
  36. import path from 'node:path';
  37. import { execFileSync, execSync } from 'node:child_process';
  38. import { fileURLToPath } from 'node:url';
  // Directory that contains this script (ES modules have no built-in __dirname).
  const __dirname = path.dirname(fileURLToPath(new URL(import.meta.url)));
  // Parent directory of the scripts folder.
  const SKILL_ROOT = path.dirname(__dirname);
  // The TTS helper script lives next to this file.
  const TTS_SCRIPT = path.resolve(__dirname, 'tts-doubao.mjs');
  /**
   * Read the command-line flags this tool understands.
   *
   * Recognised flags: `--script <path>`, `--out-dir <path>`, `--help` / `-h`.
   * Anything else is ignored. A value flag consumes the next argument
   * verbatim, even if it is missing or looks like another flag.
   *
   * @param {string[]} argv - Full argument vector (the first two entries are skipped).
   * @returns {{script?: string, outDir?: string, help?: boolean}} Parsed flags.
   */
  function parseArgs(argv) {
    const parsed = {};
    let pos = 2;
    while (pos < argv.length) {
      switch (argv[pos]) {
        case '--script':
          pos += 1;
          parsed.script = argv[pos];
          break;
        case '--out-dir':
          pos += 1;
          parsed.outDir = argv[pos];
          break;
        case '--help':
        case '-h':
          parsed.help = true;
          break;
        default:
          break;
      }
      pos += 1;
    }
    return parsed;
  }
  /**
   * Print the command-line help to stderr and terminate the process with
   * exit status 1. Does not return.
   */
  function usage() {
    const helpLines = [
      'narrate-pipeline.mjs · L2 长解说总指挥',
      '--script <path> 解说稿 .md 文件(必填)',
      '--out-dir <path> 输出目录(必填)',
      '输出:<out-dir>/voiceover.mp3 + <out-dir>/timeline.json',
    ];
    console.error(helpLines.join('\n'));
    process.exit(1);
  }
  /**
   * Parse the optional frontmatter and the `## scene-id` blocks of a narration script.
   *
   * Frontmatter is a leading `---` ... `---` section made of `key: value` lines.
   * Values are kept as trimmed strings. A `#` at the start of a value or preceded
   * by whitespace begins a comment that is dropped, so the documented form
   * `voice: S_xxx # optional` yields just `S_xxx`. Lines that start with `#` or
   * contain no colon are ignored. LF and CRLF line endings are both accepted and
   * a leading byte-order mark is skipped.
   *
   * A scene starts at a line `## <id>` (id made of word characters and `-`) and
   * runs until the next such heading or the end of the input.
   *
   * @param {string} md - Full markdown source of the narration script.
   * @returns {{meta: Object<string, string>, scenes: Array<{id: string, raw: string}>}}
   *   `meta` holds the frontmatter pairs; each scene carries its id and trimmed body text.
   */
  function parseScript(md) {
    const meta = {};
    // A BOM would stop the frontmatter regex from anchoring at the first `---`.
    const source = md.replace(/^\uFEFF/, '');
    let body = source;

    const fmMatch = source.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n/);
    if (fmMatch) {
      for (const line of fmMatch[1].split(/\r?\n/)) {
        if (line.trimStart().startsWith('#')) continue; // whole-line comment
        const idx = line.indexOf(':');
        if (idx < 0) continue;
        const key = line.slice(0, idx).trim();
        // Strip a trailing ` # comment`; a `#` glued to other text (e.g. `C#`) is kept.
        const val = line.slice(idx + 1).replace(/(?:^|\s)#.*$/, '').trim();
        meta[key] = val;
      }
      body = source.slice(fmMatch[0].length);
    }

    const scenes = [];
    // The lookahead ends a scene at the next `## id` heading or at the true end of input
    // (`$` alone would also match before every newline because of the `m` flag).
    const re = /^##\s+([\w-]+)\s*\n([\s\S]*?)(?=^##\s+[\w-]+\s*\n|$(?![\r\n]))/gm;
    let m;
    while ((m = re.exec(body)) !== null) {
      scenes.push({ id: m[1], raw: m[2].trim() });
    }
    return { meta, scenes };
  }
  /**
   * Cut a scene's text at every `[[cue:id]]` marker.
   *
   * Returns an array of chunks in reading order. Each chunk has the trimmed
   * `text` that precedes a marker plus `cueAfter`, the id of that marker (the
   * cue fires where the chunk ends). The text after the last marker becomes a
   * final chunk without `cueAfter`. Chunks with neither text nor a cue are
   * dropped; a chunk with empty text but a cue is kept so the cue is not lost.
   *
   * Example: "A[[cue:x]]B[[cue:y]]C" =>
   *   [
   *     { text: "A", cueAfter: "x" },
   *     { text: "B", cueAfter: "y" },
   *     { text: "C" }
   *   ]
   *
   * @param {string} text - Raw scene text, possibly containing cue markers.
   * @returns {Array<{text: string, cueAfter?: string}>} Chunks in order.
   */
  function splitByCues(text) {
    const pieces = [];
    let consumed = 0;
    for (const hit of text.matchAll(/\[\[cue:([\w-]+)\]\]/g)) {
      const [marker, cueId] = hit;
      pieces.push({
        text: text.slice(consumed, hit.index).trim(),
        cueAfter: cueId,
      });
      consumed = hit.index + marker.length;
    }
    pieces.push({ text: text.slice(consumed).trim() });
    // Keep a piece if it carries a cue or has something to say.
    return pieces.filter((piece) => piece.cueAfter || piece.text.length > 0);
  }
  /**
   * Measure the duration of a media file by asking ffprobe for the container's
   * `format=duration` value.
   *
   * @param {string} filePath - Path of the media file to inspect.
   * @returns {number} Duration in seconds.
   * @throws {Error} If ffprobe cannot be run or exits non-zero, or if its output
   *   is not a finite number (for example `N/A`), so that NaN never reaches the
   *   timeline arithmetic of callers.
   */
  function getDuration(filePath) {
    const out = execFileSync('ffprobe', [
      '-v', 'error',
      '-show_entries', 'format=duration',
      // Print the bare value only, without the section wrapper or the key name.
      '-of', 'default=noprint_wrappers=1:nokey=1',
      filePath,
    ], { encoding: 'utf8' });
    const reported = out.trim();
    const seconds = Number.parseFloat(reported);
    if (!Number.isFinite(seconds)) {
      throw new Error(`ffprobe reported no usable duration for ${filePath} (got "${reported}")`);
    }
    return seconds;
  }
  /**
   * Synthesize one piece of text by running the TTS helper script as a child
   * process and parsing the JSON it prints on stdout.
   *
   * The helper is started with the same Node.js binary that runs this script
   * (`process.execPath`) instead of whatever `node` happens to be on PATH. Its
   * stderr is passed straight through to the terminal.
   *
   * @param {string} text - Text to speak.
   * @param {string} outPath - Where the helper should write the audio file.
   * @param {{voice?: string, speed?: number}} [opts] - Optional overrides; falsy
   *   values are not forwarded, leaving the helper's own defaults in effect.
   * @returns {{duration: number}} The helper's parsed JSON result. Only
   *   `duration` is checked here (it must be a finite number); any other fields
   *   are passed through untouched.
   * @throws {Error} If the helper fails, prints something that is not JSON, or
   *   reports no numeric `duration`.
   */
  function callTTS(text, outPath, opts = {}) {
    const args = ['--text', text, '--out', outPath];
    if (opts.voice) args.push('--voice', opts.voice);
    if (opts.speed) args.push('--speed', String(opts.speed));

    const out = execFileSync(process.execPath, [TTS_SCRIPT, ...args], {
      encoding: 'utf8',
      stdio: ['ignore', 'pipe', 'inherit'],
    });

    let result;
    try {
      result = JSON.parse(out.trim());
    } catch (err) {
      // A bare SyntaxError says nothing about which step produced the bad output.
      throw new Error(
        `TTS helper printed invalid JSON for ${outPath}: ${out.trim().slice(0, 200)}`,
        { cause: err },
      );
    }
    if (typeof result?.duration !== 'number' || !Number.isFinite(result.duration)) {
      throw new Error(`TTS helper returned no numeric "duration" for ${outPath}`);
    }
    return result;
  }
  /**
   * Join audio files into one using ffmpeg's concat demuxer with stream copy
   * (no re-encoding, so the inputs are expected to share the same encoding).
   *
   * A temporary list file `<output>.list` is written and always removed again,
   * even when ffmpeg fails. ffmpeg is spawned without a shell, so paths that
   * contain quotes, `$` or backticks cannot break or inject into the command.
   *
   * @param {string[]} inputs - Files to join, in playback order.
   * @param {string} output - Path of the merged file (overwritten if present).
   * @throws {Error} If `inputs` is empty, or if ffmpeg cannot be run or exits non-zero.
   */
  function ffmpegConcat(inputs, output) {
    if (!Array.isArray(inputs) || inputs.length === 0) {
      throw new Error(`ffmpegConcat: no input files to merge into ${output}`);
    }

    const listFile = `${output}.list`;
    // Inside the list file a literal single quote is written as '\'' (close, escaped quote, reopen).
    const listBody = inputs
      .map((p) => `file '${p.replace(/'/g, "'\\''")}'`)
      .join('\n');
    fs.writeFileSync(listFile, listBody);

    try {
      execFileSync(
        'ffmpeg',
        ['-y', '-f', 'concat', '-safe', '0', '-i', listFile, '-c', 'copy', output],
        { stdio: ['ignore', 'pipe', 'pipe'] },
      );
    } finally {
      // force: never let cleanup mask the real ffmpeg error.
      fs.rmSync(listFile, { force: true });
    }
  }
  /**
   * Render a silent MP3 (24 kHz mono, libmp3lame, quality 9) of the given length.
   *
   * ffmpeg is spawned without a shell, so the output path is passed verbatim
   * and cannot break or inject into the command line.
   *
   * @param {number} duration - Length of the silence in seconds; must be a finite number greater than 0.
   * @param {string} outPath - Destination file (overwritten if present).
   * @throws {RangeError} If `duration` is not a finite positive number.
   * @throws {Error} If ffmpeg cannot be run or exits non-zero.
   */
  function makeSilence(duration, outPath) {
    const seconds = Number(duration);
    if (!Number.isFinite(seconds) || seconds <= 0) {
      throw new RangeError(`makeSilence: duration must be a positive number of seconds, got "${duration}"`);
    }
    execFileSync(
      'ffmpeg',
      [
        '-y',
        '-f', 'lavfi',
        '-i', 'anullsrc=r=24000:cl=mono',
        '-t', String(seconds),
        '-q:a', '9',
        '-acodec', 'libmp3lame',
        outPath,
      ],
      { stdio: ['ignore', 'pipe', 'pipe'] },
    );
  }
  /**
   * Entry point: turn a narration script into per-scene audio, one merged
   * voiceover track and a timeline.json describing where every scene, chunk and
   * cue lands on that track.
   *
   * Steps:
   *   1. Parse CLI flags and the script; validate before touching the disk.
   *   2. For each scene, synthesize every text chunk between cue markers, merge
   *      the chunks into `<out-dir>/audio/<scene-id>.mp3` and record timings.
   *   3. Concatenate all scenes (with a silent gap between them) into
   *      `<out-dir>/voiceover.mp3` and write `<out-dir>/timeline.json`.
   *
   * The scratch directory `<out-dir>/.tmp` is removed on success and on failure.
   *
   * @returns {Promise<void>}
   * @throws {Error} On duplicate scene ids, a scene without speakable text,
   *   invalid `speed`/`gap` frontmatter, or any TTS/ffmpeg/ffprobe failure.
   */
  async function main() {
    const args = parseArgs(process.argv);
    if (args.help || !args.script || !args.outDir) usage();

    const scriptPath = path.resolve(args.script);
    const outDir = path.resolve(args.outDir);
    const audioDir = path.join(outDir, 'audio');
    const tmpDir = path.join(outDir, '.tmp');
    const voiceoverPath = path.join(outDir, 'voiceover.mp3');
    const timelinePath = path.join(outDir, 'timeline.json');

    // Read and validate the script first so bad input leaves no directories behind.
    const md = fs.readFileSync(scriptPath, 'utf8');
    const { meta, scenes } = parseScript(md);
    if (scenes.length === 0) {
      console.error('错:解说稿没有 ## scene 段,至少一段。');
      process.exit(1);
    }

    // Scene ids become file names; a repeated id would silently overwrite earlier audio.
    const seenIds = new Set();
    for (const { id } of scenes) {
      if (seenIds.has(id)) {
        throw new Error(`scene id 重复:"${id}"。每个 ## 段的 id 必须唯一,否则音频文件会互相覆盖。`);
      }
      seenIds.add(id);
    }

    const voice = meta.voice || undefined;
    const speed = meta.speed ? parseFloat(meta.speed) : 1.0;
    const gap = meta.gap ? parseFloat(meta.gap) : 0.3;
    // NaN would otherwise be serialized as null in timeline.json and quietly disable the gap.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new Error(`frontmatter 的 speed 无效:"${meta.speed}"(需要大于 0 的数字)`);
    }
    if (!Number.isFinite(gap) || gap < 0) {
      throw new Error(`frontmatter 的 gap 无效:"${meta.gap}"(需要不小于 0 的数字)`);
    }
    console.error(`[narrate] script=${path.basename(scriptPath)} scenes=${scenes.length} voice=${voice || '(env)'} speed=${speed} gap=${gap}s`);

    const timeline = {
      title: meta.title || path.basename(scriptPath, '.md'),
      voice: voice || null,
      speed,
      gap,
      totalDuration: 0,
      scenes: [],
    };

    fs.mkdirSync(audioDir, { recursive: true });
    fs.mkdirSync(tmpDir, { recursive: true });

    try {
      // One shared silence file is reused between every pair of scenes.
      const gapFile = path.join(tmpDir, 'gap.mp3');
      if (gap > 0) makeSilence(gap, gapFile);

      let cursor = 0; // absolute position on the final track, in seconds
      const sceneAudioFiles = [];

      for (const [i, scene] of scenes.entries()) {
        console.error(`[narrate] (${i + 1}/${scenes.length}) scene="${scene.id}"`);
        const chunks = splitByCues(scene.raw);
        if (!chunks.some((chunk) => chunk.text)) {
          throw new Error(`scene "${scene.id}" 没有可朗读的文本,无法生成音频。`);
        }

        const chunkFiles = [];
        const cueRecords = [];
        const chunkRecords = []; // measured start/end of each chunk within the scene, for subtitles
        let sceneInternalCursor = 0;

        for (const [j, chunk] of chunks.entries()) {
          if (chunk.text) {
            const chunkPath = path.join(tmpDir, `${scene.id}-${j}.mp3`);
            const result = callTTS(chunk.text, chunkPath, { voice, speed });
            const chunkStart = sceneInternalCursor;
            chunkFiles.push(chunkPath);
            sceneInternalCursor += result.duration;
            chunkRecords.push({
              text: chunk.text,
              start: chunkStart,
              end: sceneInternalCursor,
              duration: result.duration,
            });
            console.error(` chunk ${j}: ${result.duration.toFixed(2)}s · ${chunk.text.length} 字 · ${chunk.text.slice(0, 30)}${chunk.text.length > 30 ? '…' : ''}`);
          }
          // A cue fires where its chunk ends. For an empty chunk (cue directly at the
          // start, or two adjacent cues) that is the current position; no TTS is needed.
          if (chunk.cueAfter) {
            cueRecords.push({ id: chunk.cueAfter, offset: sceneInternalCursor });
          }
        }

        // Merge the chunk files into the scene's audio file.
        const sceneAudio = path.join(audioDir, `${scene.id}.mp3`);
        if (chunkFiles.length === 1) {
          fs.copyFileSync(chunkFiles[0], sceneAudio);
        } else {
          ffmpegConcat(chunkFiles, sceneAudio);
        }
        const sceneDuration = getDuration(sceneAudio);

        // Append to the full track: a gap first (except before the first scene), then the scene.
        if (i > 0 && gap > 0) {
          sceneAudioFiles.push(gapFile);
          cursor += gap;
        }
        sceneAudioFiles.push(sceneAudio);

        timeline.scenes.push({
          id: scene.id,
          start: cursor,
          end: cursor + sceneDuration,
          duration: sceneDuration,
          audio: path.relative(outDir, sceneAudio),
          text: scene.raw.replace(/\[\[cue:[\w-]+\]\]/g, ''),
          // chunks drive sentence-by-sentence subtitles: start/end are relative to the
          // scene, absoluteStart/absoluteEnd are positions on the full track.
          chunks: chunkRecords.map((c) => ({
            text: c.text,
            start: c.start,
            end: c.end,
            absoluteStart: cursor + c.start,
            absoluteEnd: cursor + c.end,
          })),
          cues: cueRecords.map((c) => ({
            id: c.id,
            offset: c.offset,
            absoluteTime: cursor + c.offset,
          })),
        });
        cursor += sceneDuration;
      }

      // Merge everything into the full track; the gap file must still exist at this point.
      ffmpegConcat(sceneAudioFiles, voiceoverPath);
      timeline.totalDuration = getDuration(voiceoverPath);
      timeline.voiceover = 'voiceover.mp3';
      fs.writeFileSync(timelinePath, JSON.stringify(timeline, null, 2));
    } finally {
      // Remove scratch files whether the run succeeded or failed.
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }

    console.error(`\n[narrate] 完成。`);
    console.error(` voiceover: ${voiceoverPath}`);
    console.error(` timeline: ${timelinePath}`);
    console.error(` 总时长: ${timeline.totalDuration.toFixed(2)}s (${(timeline.totalDuration / 60).toFixed(2)} min)`);
    console.error(` 段数: ${timeline.scenes.length}`);
    const totalCues = timeline.scenes.reduce((sum, s) => sum + s.cues.length, 0);
    console.error(` cue 数: ${totalCues}`);
  }
  // Run the pipeline; on any failure report the message and stack trace on
  // stderr and exit with status 1.
  main().catch((failure) => {
    const { message, stack } = failure;
    console.error(`narrate-pipeline 失败:${message}`);
    console.error(stack);
    process.exit(1);
  });